# Определение траснпортных узлов

## Подготовка данных

In [1]:
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.image as img
from scipy.spatial.distance import pdist

%matplotlib inline

matplotlib.rcParams.update({'font.size': 20})

# Построение модели классификации по всей МСК

**Загрузка данных**

In [4]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [38]:
df = pd.read_csv('stats/nearest_objects_all_msc.csv', header=0, sep=',', error_bad_lines=False)
df.fillna(0, inplace=True)
df.head()

b'Skipping line 8657: expected 8 fields, saw 9\n'


Unnamed: 0,id,name,division_nm,type,latitude,longitude,avg_permit_day,num_near_object
0,1000400,Камчатская ул.,Восточный,tat,55.828272,37.824554,7810.01,40
1,1000502,Ул. Приорова,Северный,tat,55.820821,37.525614,11964.51,37
2,1001386,"\Метро \""\""Китай-город\""\""\""""",Центральный,tat,55.754188,37.634137,76585.78,71
3,1002511,Химкинская больница,Северный,tat,55.882864,37.452565,1519.54,14
4,1002606,Школа Летово,Новомосковский,tat,55.560648,37.420421,108.95,8


#### Ручная разметка транспортных узлов 

In [39]:
df['is_node'] = 0

# узел планерная 
df.loc[(df['latitude'] < 55.864135) & (df['longitude'] > 37.431944) &
       (df['latitude'] > 55.858837) & (df['longitude'] < 37.4381604), 'is_node' ] = 1

# узел тушинская 
df.loc[(df['latitude'] < 55.828734) & (df['longitude'] > 37.435055) &
       (df['latitude'] > 55.823351) & (df['longitude'] < 37.440195), 'is_node' ] = 1

# узел сходненская
df.loc[(df['latitude'] < 55.852601) & (df['longitude'] > 37.435957) &
       (df['latitude'] > 55.848749) & (df['longitude'] < 37.443066), 'is_node' ] = 1

# узел щукинская
df.loc[(df['latitude'] < 55.811426) & (df['longitude'] > 37.458159) &
       (df['latitude'] > 55.807198) & (df['longitude'] < 37.467520), 'is_node' ] = 1

# узел строгино
df.loc[(df['latitude'] < 55.805215) & (df['longitude'] > 37.398673) &
       (df['latitude'] > 55.800818) & (df['longitude'] < 37.409324), 'is_node' ] = 1

# узел пятницкое
df.loc[(df['latitude'] < 55.858373) & (df['longitude'] > 37.352029) &
       (df['latitude'] > 55.853828) & (df['longitude'] < 37.356751), 'is_node' ] = 1

# узел тат маршала жукова
df.loc[(df['latitude'] < 55.780982) & (df['longitude'] > 37.452453) &
       (df['latitude'] > 55.778892) & (df['longitude'] < 37.455643), 'is_node' ] = 1

# узел волоколамская
df.loc[(df['latitude'] < 55.837754) & (df['longitude'] > 37.380501) &
       (df['latitude'] > 55.832266) & (df['longitude'] < 37.385652), 'is_node' ] = 1

In [40]:
# Обработка наименований станций
df['name'] = df['name'].apply(lambda x: change_name(x))

Обучающая выборка на Северо-Западном округе

In [56]:
df_train = df[df['division_nm'] == 'Северо-Западный'].copy()

In [60]:
df_clf = df_train[['division_nm', 'latitude', 'longitude', 'type', 'avg_permit_day', 'num_near_object', 'is_node']].copy()
col = ['avg_permit_day', 'num_near_object', 'is_node']
df_clf_std = pd.DataFrame(MinMaxScaler().fit_transform(df_clf[col]), columns=col)

In [61]:
df_clf_std.head(2)

Unnamed: 0,avg_permit_day,num_near_object,is_node
0,0.156915,0.36,0.0
1,0.041842,0.4,0.0


In [62]:
%%time
model = AdaBoostClassifier()

model.fit(df_clf_std.drop('is_node', axis=1), df_clf_std['is_node'])

Wall time: 101 ms


AdaBoostClassifier()

Тестовая выборка по всей Мск

In [66]:
df_test = df.copy()
col = ['avg_permit_day', 'num_near_object', 'is_node']
df_test = pd.DataFrame(MinMaxScaler().fit_transform(df_test[col]), columns=col)

In [67]:
proba = model.predict_proba(df_test.drop('is_node', axis=1))[:, 1:]

In [69]:
df['proba'] = proba
df.head(2)

Unnamed: 0,id,name,division_nm,type,latitude,longitude,avg_permit_day,num_near_object,is_node,proba
0,1000400,камчатская,Восточный,tat,55.828272,37.824554,7810.01,40,0,0.354865
1,1000502,приорова,Северный,tat,55.820821,37.525614,11964.51,37,0,0.349218


In [87]:
lower_proba = df.loc[df['is_node'] == 1, 'proba'].sort_values()[:1].values[0]
lower_proba = df.groupby('is_node')['proba'].median()[1]
lower_proba = 0.486

In [88]:
df['proba_node'] = 0
df.loc[df['proba'] > lower_proba, 'proba_node'] = 1
df.head(2)

Unnamed: 0,id,name,division_nm,type,latitude,longitude,avg_permit_day,num_near_object,is_node,proba,proba_node
0,1000400,камчатская,Восточный,tat,55.828272,37.824554,7810.01,40,0,0.354865,0
1,1000502,приорова,Северный,tat,55.820821,37.525614,11964.51,37,0,0.349218,0


In [89]:
df[['name','latitude','longitude','type','proba_node']].to_csv('CLF_all_msc.csv', header=True, index=False)

## Кластеризация по всей МСК

In [90]:
from sklearn.cluster import DBSCAN, AffinityPropagation, AgglomerativeClustering
import sklearn.utils

In [91]:
df.head(2)

Unnamed: 0,id,name,division_nm,type,latitude,longitude,avg_permit_day,num_near_object,is_node,proba,proba_node
0,1000400,камчатская,Восточный,tat,55.828272,37.824554,7810.01,40,0,0.354865,0
1,1000502,приорова,Северный,tat,55.820821,37.525614,11964.51,37,0,0.349218,0


In [92]:
df_db = df[['division_nm', 'latitude', 'longitude', 'avg_permit_day', 'num_near_object']].copy()
col = ['latitude', 'longitude', 'avg_permit_day', 'num_near_object']
df_db_std = pd.DataFrame(MinMaxScaler().fit_transform(df_db[col]), columns=col)
# df_db_std = pd.concat([df_clf_std, pd.get_dummies(df_db[['division_nm']])], axis=1)

In [118]:
# df3_std = StandardScaler().fit_transform(df3_db)
db = DBSCAN(eps=0.0394, min_samples=4).fit(df_db_std)

In [119]:
df['label'] = db.labels_
df.head(2)

Unnamed: 0,id,name,division_nm,type,latitude,longitude,avg_permit_day,num_near_object,is_node,proba,proba_node,label,label_nodes
0,1000400,камчатская,Восточный,tat,55.828272,37.824554,7810.01,40,0,0.354865,0,0,0
1,1000502,приорова,Северный,tat,55.820821,37.525614,11964.51,37,0,0.349218,0,0,0


In [120]:
df['label'].unique()

array([ 0,  1,  2,  3,  4, 79,  5,  6,  7, 28,  8, -1,  9, 10, 81, 11, 12,
       13, 14, 15, 16, 36, 17, 18, 19, 20, 21, 22, 71, 23, 24, 25, 26, 27,
       72, 29, 30, 31, 32, 33, 38, 34, 35, 37, 68, 39, 40, 41, 42, 43, 44,
       45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 61, 56, 57, 58, 89, 59,
       60, 62, 88, 86, 63, 64, 65, 66, 67, 82, 69, 70, 73, 74, 75, 76, 77,
       78, 80, 83, 84, 85, 87], dtype=int64)

In [121]:
df[['name', 'type', 'latitude', 'longitude', 'label']].to_csv('DB_12.csv', header=True, index=False)

Оценка модели

In [96]:
df.head(2)

Unnamed: 0,id,name,division_nm,type,latitude,longitude,avg_permit_day,num_near_object,is_node,proba,proba_node,label
0,1000400,камчатская,Восточный,tat,55.828272,37.824554,7810.01,40,0,0.354865,0,0
1,1000502,приорова,Северный,tat,55.820821,37.525614,11964.51,37,0,0.349218,0,0


In [102]:
df['label_nodes'] = 0
df.loc[df['label'] > 0 , 'label_nodes'] = 1 

In [103]:
rec = recall_score(df['label_nodes'], df['proba_node'], average='binary')
print("recall: %.2f%%" % (rec * 100.0)) 
prc = precision_score(df['label_nodes'], df['proba_node'], average='binary')
print("precision: %.2f%%" % (prc * 100.0)) 

recall: 15.48%
precision: 16.36%


In [116]:
np.linspace(0.035, 0.045, 10)

array([0.035     , 0.03611111, 0.03722222, 0.03833333, 0.03944444,
       0.04055556, 0.04166667, 0.04277778, 0.04388889, 0.045     ])

In [117]:
for eps in np.linspace(0.035, 0.045, 10):
    db = DBSCAN(eps=eps, min_samples=4).fit(df_db_std)

    df['label'] = db.labels_

    df['label_nodes'] = 0
    df.loc[df['label'] > 0 , 'label_nodes'] = 1 

    rec = recall_score(df['label_nodes'], df['proba_node'], average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(df['label_nodes'], df['proba_node'], average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 
    print("eps: %.4f%%" % (eps)) 
    print("*"*30)

recall: 19.50%
precision: 52.27%
eps: 0.0350%
******************************
recall: 17.66%
precision: 40.01%
eps: 0.0361%
******************************
recall: 17.57%
precision: 34.67%
eps: 0.0372%
******************************
recall: 19.13%
precision: 33.96%
eps: 0.0383%
******************************
recall: 21.13%
precision: 31.91%
eps: 0.0394%
******************************
recall: 19.53%
precision: 25.32%
eps: 0.0406%
******************************
recall: 15.48%
precision: 16.36%
eps: 0.0417%
******************************
recall: 13.26%
precision: 11.34%
eps: 0.0428%
******************************
recall: 13.53%
precision: 11.34%
eps: 0.0439%
******************************
recall: 16.25%
precision: 11.50%
eps: 0.0450%
******************************
