In [10]:
import numpy as np
import geopandas as gpd
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.tree import export_graphviz
from IPython.display import SVG
from graphviz import Source

In [220]:
def missing_and_unique(df, col):
    n_missing = df[col].isna().sum()
    unique_vals = df[col].unique()
    print(f'Number of missing values: {n_missing}')
    print(f'Number of unique values: {len(unique_vals)}')
    print('They are:', unique_vals)

# Pre-Processing

In [221]:
# read files
df_filtered = gpd.read_file('./data/network/trt_network_filtered.shp')
df_easy_features = gpd.read_file('./data/network/network_w_easy_features.shp')
df_twoway = gpd.read_file('./data/network/trt_network_twoway_LTS.shp')

In [222]:
# process the easy feature file
df_easy_features.loc[24513, 'LTS'] = 4
df_easy_features = df_easy_features[df_easy_features['length_in_'] >= 50]

In [223]:
# process filtered features
df_filtered.index = df_filtered['OBJECTID'] - 1

In [224]:
# file stats
print(f'Number of links in filtered: {len(df_filtered)}, in easy: {len(df_easy_features)}')

Number of links in filtered: 41533, in easy: 41533


In [225]:
df_filtered.columns

Index(['OBJECTID', 'GEO_ID', 'LFN_ID', 'LF_NAME', 'FNODE', 'TNODE',
       'ONE_WAY_DI', 'length_in_', 'GEO_ID_1', 'LTS', 'Shape_Leng', 'ONEWAY',
       'Shape_Le_1', 'LTS_less4', 'geometry'],
      dtype='object')

In [226]:
# join and select relevant columns
feature_cols = ['FCODE_DESC', 'SPEED', 'NBRLANES_2', 'CP_TYPE']
df = df_filtered.join(df_easy_features[feature_cols])
flags = df[(df['LTS']==1) & (df['LF_NAME'] == 'Yonge St')].index
df.loc[flags, 'CP_TYPE'] = 'Cycle Tracks'
df = df[['GEO_ID'] + feature_cols + ['LTS']]
df.columns = ['GEO_ID', 'road_type', 'speed_limit', 'nlanes', 'infras_type', 'lts']
df

Unnamed: 0_level_0,GEO_ID,road_type,speed_limit,nlanes,infras_type,lts
OBJECTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,30079678,Trail,0,0,Major Multi-use Pathway,1
3,30082310,Other,0,2,,1
4,30008940,Trail,0,0,Major Multi-use Pathway,1
7,9950476,Major Arterial,60,4,,3
8,7641209,Local,50,2,,2
...,...,...,...,...,...,...
59544,14253385,Major Arterial,50,4,,4
59545,107745,Major Arterial,60,5,,4
59551,30113300,Other,40,2,,1
59552,30012613,Major Arterial,60,4,,4


In [227]:
df_twoway.columns

Index(['Join_Count', 'TARGET_FID', 'JOIN_FID', 'GEO_ID', 'LFN_ID', 'LF_NAME',
       'ADDRESS_L', 'ADDRESS_R', 'OE_FLAG_L', 'OE_FLAG_R', 'LONUML', 'HINUML',
       'LONUMR', 'HINUMR', 'FNODE', 'TNODE', 'ONE_WAY_DI', 'DIR_CODE_D',
       'FCODE', 'FCODE_DESC', 'JURIS_CODE', 'OBJECTID', 'CP_TYPE',
       'NRN_SpeedL', 'NBRLANES_2', 'Left_Parki', 'Right_Park', 'GTASpeed_1',
       'GTASpeed_2', 'GTASPeed_3', 'Shape_Leng', 'Final_Spee', 'BL_width',
       'BL_speed', 'with_parki', 'LTS_BL', 'LTS_mixed', 'LTS', 'geometry'],
      dtype='object')

In [228]:
df_twoway = df_twoway[['GEO_ID', 'NRN_SpeedL', 'Final_Spee']]
df_twoway.columns = ['GEO_ID', 'speed_limit_verification', 'speed']
df_twoway = df_twoway.groupby('GEO_ID').max()
df_twoway

Unnamed: 0_level_0,speed_limit_verification,speed
GEO_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
108,40,40.0
117,40,40.0
118,40,40.0
120,40,40.0
121,40,40.0
...,...,...
30113710,0,0.0
30113711,0,0.0
30113820,0,0.0
30113823,0,0.0


## Actual speed

In [229]:
missing_and_unique(df_twoway, 'speed')

Number of missing values: 0
Number of unique values: 1731
They are: [40.   47.49 17.91 ... 37.35 32.75 23.11]


In [230]:
missing_and_unique(df_twoway, 'speed_limit_verification')

Number of missing values: 0
Number of unique values: 12
They are: [40 50 60  0 90 30 15 20 10 25 70 80]


In [231]:
df = df.set_index('GEO_ID')

In [232]:
df = df.join(df_twoway, how='left', rsuffix='_r')
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,road_type,speed_limit,nlanes,infras_type,lts,speed_limit_verification,speed
0,Trail,0,0,Major Multi-use Pathway,1,0,0.00
1,Other,0,2,,1,0,0.00
2,Trail,0,0,Major Multi-use Pathway,1,0,0.00
3,Major Arterial,60,4,,3,60,37.62
4,Local,50,2,,2,50,50.00
...,...,...,...,...,...,...,...
41528,Major Arterial,50,4,,4,50,49.54
41529,Major Arterial,60,5,,4,60,58.89
41530,Other,40,2,,1,40,40.00
41531,Major Arterial,60,4,,4,60,41.29


In [233]:
df['speed_limit'] = df['speed_limit_verification']
df = df[['road_type', 'speed_limit', 'nlanes', 'infras_type', 'speed', 'lts']]
df.columns = [['road_type', 'speed_limit', 'nlanes', 'infras_type', 'speed_actual', 'lts']]
df

Unnamed: 0,road_type,speed_limit,nlanes,infras_type,speed_actual,lts
0,Trail,0,0,Major Multi-use Pathway,0.00,1
1,Other,0,2,,0.00,1
2,Trail,0,0,Major Multi-use Pathway,0.00,1
3,Major Arterial,60,4,,37.62,3
4,Local,50,2,,50.00,2
...,...,...,...,...,...,...
41528,Major Arterial,50,4,,49.54,4
41529,Major Arterial,60,5,,58.89,4
41530,Other,40,2,,40.00,1
41531,Major Arterial,60,4,,41.29,4


In [237]:
df.to_csv('./data/network/intermediate.csv', index=False)

In [238]:
df = pd.read_csv('./data/network/intermediate.csv')
df

Unnamed: 0,road_type,speed_limit,nlanes,infras_type,speed_actual,lts
0,Trail,0,0,Major Multi-use Pathway,0.00,1
1,Other,0,2,,0.00,1
2,Trail,0,0,Major Multi-use Pathway,0.00,1
3,Major Arterial,60,4,,37.62,3
4,Local,50,2,,50.00,2
...,...,...,...,...,...,...
41528,Major Arterial,50,4,,49.54,4
41529,Major Arterial,60,5,,58.89,4
41530,Other,40,2,,40.00,1
41531,Major Arterial,60,4,,41.29,4


## Road Type

In [239]:
# road type column
missing_and_unique(df, 'road_type')

Number of missing values: 0
Number of unique values: 15
They are: ['Trail' 'Other' 'Major Arterial' 'Local' 'Collector' 'Laneway' 'Walkway'
 'Minor Arterial' 'Pending' 'Major Arterial Ramp' 'Access Road'
 'Collector Ramp' 'Other Ramp' 'Busway' 'Minor Arterial Ramp']


In [240]:
# replace
road_type_mapping = {'Major Arterial': 'Arterial', 'Minor Arterial': 'Arterial', 
                     'Major Arterial Ramp': 'Arterial', 'Minor Arterial Ramp': 'Ramp',
                     'Collector Ramp': 'Ramp', 'Other Ramp': 'Other', 'Busway': 'Other',
                     'Pending': 'Other'}
df['road_type'].replace(road_type_mapping, inplace=True)
missing_and_unique(df, 'road_type')

Number of missing values: 0
Number of unique values: 9
They are: ['Trail' 'Other' 'Arterial' 'Local' 'Collector' 'Laneway' 'Walkway'
 'Access Road' 'Ramp']


In [241]:
# one-hot-encoding
df = pd.concat([df, pd.get_dummies(df['road_type'])], axis=1)

In [242]:
df

Unnamed: 0,road_type,speed_limit,nlanes,infras_type,speed_actual,lts,Access Road,Arterial,Collector,Laneway,Local,Other,Ramp,Trail,Walkway
0,Trail,0,0,Major Multi-use Pathway,0.00,1,0,0,0,0,0,0,0,1,0
1,Other,0,2,,0.00,1,0,0,0,0,0,1,0,0,0
2,Trail,0,0,Major Multi-use Pathway,0.00,1,0,0,0,0,0,0,0,1,0
3,Arterial,60,4,,37.62,3,0,1,0,0,0,0,0,0,0
4,Local,50,2,,50.00,2,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41528,Arterial,50,4,,49.54,4,0,1,0,0,0,0,0,0,0
41529,Arterial,60,5,,58.89,4,0,1,0,0,0,0,0,0,0
41530,Other,40,2,,40.00,1,0,0,0,0,0,1,0,0,0
41531,Arterial,60,4,,41.29,4,0,1,0,0,0,0,0,0,0


## Speed limit

In [243]:
missing_and_unique(df, 'speed_limit')

Number of missing values: 0
Number of unique values: 11
They are: [ 0 60 50 40 20 30 10 70 80 15 25]


In [244]:
df = pd.concat([df, pd.get_dummies(df['speed_limit'])], axis=1)

In [245]:
df['s_leq_40'] = np.minimum(df['speed_limit'].values, 40)
df['s_leq_56'] = np.minimum(np.maximum(df['speed_limit'].values - 40, 0), 16)
df['s_geq_56'] = np.maximum(df['speed_limit'].values - 56, 0)

## nlanes

In [246]:
missing_and_unique(df, 'nlanes')

Number of missing values: 0
Number of unique values: 11
They are: [ 0  2  4  5  6  3  1  7 21  8 44]


In [247]:
df['nlanes'] = df['nlanes'].replace({44: 4, 21: 2})
missing_and_unique(df, 'nlanes')

Number of missing values: 0
Number of unique values: 9
They are: [0 2 4 5 6 3 1 7 8]


In [314]:
n_lanes_onehot = pd.get_dummies(df['nlanes']).values

# infras type

In [248]:
missing_and_unique(df, 'infras_type')

Number of missing values: 36817
Number of unique values: 10
They are: ['Major Multi-use Pathway' nan 'Suggested On-Street Routes'
 'Signed Routes' 'Bike Lanes' 'Park Roads Cycling Connections'
 'Minor Multi-use Pathway' 'Quiet Street Cycling Routes' 'Cycle Tracks'
 'Informal Dirt Footpath']


In [249]:
cp_mapping = {val: 0 for val in df['infras_type'].unique()}
cp_mapping['Bike Lanes'] = 1
cp_mapping['Cycle Tracks'] = 1
df['infras_binary'] = df['infras_type'].replace(cp_mapping)
df.index = range(len(df))
df

Unnamed: 0,road_type,speed_limit,nlanes,infras_type,speed_actual,lts,Access Road,Arterial,Collector,Laneway,...,30,40,50,60,70,80,s_leq_40,s_leq_56,s_geq_56,infras_binary
0,Trail,0,0,Major Multi-use Pathway,0.00,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Other,0,2,,0.00,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Trail,0,0,Major Multi-use Pathway,0.00,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Arterial,60,4,,37.62,3,0,1,0,0,...,0,0,0,1,0,0,40,16,4,0
4,Local,50,2,,50.00,2,0,0,0,0,...,0,0,1,0,0,0,40,10,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41528,Arterial,50,4,,49.54,4,0,1,0,0,...,0,0,1,0,0,0,40,10,0,0
41529,Arterial,60,5,,58.89,4,0,1,0,0,...,0,0,0,1,0,0,40,16,4,0
41530,Other,40,2,,40.00,1,0,0,0,0,...,0,1,0,0,0,0,40,0,0,0
41531,Arterial,60,4,,41.29,4,0,1,0,0,...,0,0,0,1,0,0,40,16,4,0


# Save data

In [300]:
road_type = df[['Access Road', 'Arterial', 'Collector', 'Laneway', 'Local', 'Other', 'Ramp', 'Trail', 'Walkway']].values
speed_lmt = df['speed_limit'].values
speed_lmt_onehot = df[[0, 10, 15, 20, 25, 30, 40, 50, 60, 70, 80]].astype(int).values
speed_lmt_thred = df[['s_leq_40', 's_leq_56', 's_geq_56']].values
n_lanes = df['nlanes'].values
infras = df['infras_binary'].values
speed_actual = np.round(df['speed_actual'].values, 4)

In [98]:
np.savetxt('./data/road/road_type.txt', road_type, delimiter=',')
np.savetxt('./data/road/speed_limit.txt', speed_lmt, delimiter=',')
np.savetxt('./data/road/speed_limit_onehot.txt', speed_lmt_onehot, delimiter=',')
np.savetxt('./data/road/speed_limit_thred.txt', speed_lmt_thred, delimiter=',')
np.savetxt('./data/road/n_lanes.txt', n_lanes, delimiter=',')
np.savetxt('./data/road/n_lanes_onehot.txt', n_lanes_onehot, delimiter=',')
np.savetxt('./data/road/cyc_infras.txt', infras, delimiter=',')
np.savetxt('./data/road/speed_actual.txt', speed_actual, delimiter=',')

In [322]:
np.loadtxt('./data/road/n_lanes_onehot.txt', delimiter=',')

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [319]:
np.savetxt('./data/road/n_lanes_onehot.txt', n_lanes_onehot, delimiter=',')

In [254]:
df.to_csv('./data/network/feature_final.csv', index=False)

In [272]:
np.genfromtxt('./data/road/speed_actual.txt')

array([ 0.  ,  0.  ,  0.  , ..., 40.  , 41.29,  0.  ])

# Naive Attri Pred Baseline

In [328]:
# road type
np.max(road_type.sum(axis=0)) / len(road_type)

0.49064599234343775

In [331]:
# Cyc Infras
(infras == 0).sum() / len(infras)

0.974983747863145

In [337]:
# n lanes onehot
print('acc', np.max(n_lanes_onehot.sum(axis=0)) / len(n_lanes_onehot))
maj = np.argmax(n_lanes_onehot.sum(axis=0))
print('mae', np.abs(n_lanes - maj).mean())
print('mae', ((n_lanes - maj) ** 2).mean())

acc 0.6777020682348975
mae 0.606457515710399
mae 1.3522259408181445


In [339]:
# speed actual
mean_speed = speed_actual.mean()
print('mae', np.abs(n_lanes - mean_speed).mean())
print('mae', ((n_lanes - mean_speed) ** 2).mean())

mae 35.51543399224713
mae 1262.6980401478418


# Train Test Split

In [255]:
def load_data(df, features):
    f2c = {'speed_limit': ['speed_limit'], 'nlanes': ['nlanes'],
           'speed_limit_onehot': [0, 10, 15, 20, 25, 30, 40, 50, 60, 70, 80],
           'road_type': ['Access Road', 'Arterial', 'Collector', 'Laneway', 'Local', 'Other', 'Ramp', 'Trail', 'Walkway'],
           'infras': ['infras_binary'],
           'speed_actual': ['speed_actual']}
    cols = []
    for f in features:
        cols += f2c[f]
    return df[cols], df['lts']

In [24]:
def load_emb(encoder_name, purpose):
    root = '/Users/bolin/Library/CloudStorage/OneDrive-UniversityofToronto/AutoLTS/'
    X = np.loadtxt(f'{root}/emb/{encoder_name}_{purpose}.txt', delimiter=',').astype(float)
    y = np.loadtxt(f'{root}/data/LTS/lts_labels.txt').astype(int)
    indi = np.loadtxt(f'{root}/data/{purpose}_idx.txt').astype(int)
    y = y[indi]
    return X, y

In [256]:
indi_train = np.loadtxt(f'./data/training_idx.txt').astype(int)
indi_vali = np.loadtxt(f'./data/validation_idx.txt').astype(int)
# indi_train = np.concatenate([indi_train, indi_vali])
indi_test = np.loadtxt(f'./data/test_idx.txt').astype(int)

In [257]:
def model_eval(y_true, y_pred):
    # acc
    acc = (y_pred == y_true).sum() / y_true.shape[0] * 100
    # h/l acc
    flag_pred = (y_pred <= 2)
    flag_true = (y_true <= 2)
    hl_acc = (flag_pred == flag_true).sum() / y_true.shape[0] * 100
    # mae
    mae = np.abs(y_true - y_pred).mean()
    # mse
    mse = ((y_true - y_pred) ** 2).mean()
    # kt
    pred_mat = np.sign(y_pred.reshape((-1, 1)) - y_pred.reshape((1, -1)))
    true_mat = np.sign(y_true.reshape((-1, 1)) - y_true.reshape((1, -1)))
    kt = ((pred_mat == true_mat).sum() - y_true.shape[0])/2
    kt /= ((y_pred.shape[0] - 1)** 2)
    # flr
    n_high_stress = (y_true >= 3).sum()
    false_low_stress = ((y_pred <= 2) * (y_true >= 3)).sum()
    flr = false_low_stress / n_high_stress * 100
    # fhr
    n_low_stress = (y_true <= 2).sum()
    false_high_stress = ((y_pred >= 3) * (y_true <= 2)).sum()
    fhr = false_high_stress / n_low_stress * 100
    # record generation
    return pd.Series([acc, hl_acc, flr, fhr, kt], index=['Accuracy', 'H/L Accuracy', 'FLR', 'FHR', 'KT']).round(2)

# CART - speed actual + lanes + road type + cyc infras

In [264]:
X, y = load_data(df, ['infras', 'road_type', 'speed_actual', 'nlanes'])
X_train = X.iloc[indi_train, :]
y_train = y.iloc[indi_train]
X_test = X.iloc[indi_test, :]
y_test = y.iloc[indi_test]

In [265]:
clf = DecisionTreeClassifier(random_state=0, max_depth=10)
clf.fit(X_train, y_train)
clf.score(X_train, y_train), clf.score(X_test, y_test)

(0.95968183310833, 0.9550485271581816)

In [266]:
confusion_matrix(y_test, clf.predict(X_test), normalize='true')

array([[0.98089701, 0.00830565, 0.00872093, 0.00207641],
       [0.01466993, 0.95305623, 0.03227384, 0.        ],
       [0.04737516, 0.04993598, 0.90012804, 0.00256082],
       [0.00312989, 0.0172144 , 0.0485133 , 0.93114241]])

In [267]:
model_eval(y_test.values, clf.predict(X_test))

Accuracy        95.50
H/L Accuracy    96.92
FLR              6.27
FHR              2.07
KT               0.47
dtype: float64

# CART - speed actual + n lanes

In [303]:
X, y = load_data(df, ['speed_actual', 'nlanes'])
X_train = X.iloc[indi_train, :]
y_train = y.iloc[indi_train]
X_test = X.iloc[indi_test, :]
y_test = y.iloc[indi_test]

In [308]:
clf = DecisionTreeClassifier(random_state=0, max_depth=10)
clf.fit(X_train, y_train)
clf.score(X_train, y_train), clf.score(X_test, y_test)

(0.850147772466888, 0.8399455133662523)

In [309]:
confusion_matrix(y_test, clf.predict(X_test), normalize='true')

array([[0.97383721, 0.00622924, 0.01453488, 0.00539867],
       [0.1594132 , 0.82102689, 0.01711491, 0.00244499],
       [0.10243278, 0.49295775, 0.38924456, 0.01536492],
       [0.00312989, 0.0172144 , 0.03286385, 0.94679186]])

In [310]:
model_eval(y_test.values, clf.predict(X_test))

Accuracy        83.99
H/L Accuracy    90.36
FLR             33.66
FHR              1.98
KT               0.41
dtype: float64

# CART - speed limit + lanes

In [258]:
X, y = load_data(df, ['speed_limit', 'nlanes'])
X_train = X.iloc[indi_train, :]
y_train = y.iloc[indi_train]
X_test = X.iloc[indi_test, :]
y_test = y.iloc[indi_test]

In [259]:
clf = DecisionTreeClassifier(random_state=0, max_depth=10)
clf.fit(X_train, y_train)
clf.score(X_train, y_train), clf.score(X_test, y_test)

(0.797205093589229, 0.7905670015324365)

In [33]:
confusion_matrix(y_test, clf.predict(X_test), normalize='true')

array([[9.72591362e-01, 1.49501661e-02, 8.30564784e-04, 1.16279070e-02],
       [1.68215159e-01, 8.22493888e-01, 9.77995110e-04, 8.31295844e-03],
       [1.44686300e-01, 5.68501921e-01, 2.04865557e-02, 2.66325224e-01],
       [1.09546166e-02, 4.06885759e-02, 4.69483568e-03, 9.43661972e-01]])

In [34]:
model_eval(y_test.values, clf.predict(X_test))

Accuracy        79.06
H/L Accuracy    89.12
FLR             41.55
FHR              1.10
KT               0.40
dtype: float64

## Grid Search

In [150]:
dt_clf = DecisionTreeClassifier(random_state=0)
params = {'criterion': ['gini', 'entropy'],
          'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
          'min_samples_split': [0.01, 0.03, 0.05, 0.1, 0.15, 0.2]}

clf = GridSearchCV(dt_clf, params, cv=5)
search = clf.fit(X_train, y_train)
search.best_params_

{'criterion': 'gini', 'max_depth': 6, 'min_samples_split': 0.01}

In [151]:
dt_clf = DecisionTreeClassifier(random_state=0, criterion='gini', max_depth=6, min_samples_split=0.01)
dt_clf.fit(X_train, y_train)
dt_clf.score(X_train, y_train), clf.score(X_test, y_test)

(0.7522895610610428, 0.743231738464158)

In [152]:
model_eval(y_test.values, clf.predict(X_test))

Accuracy        74.32
H/L Accuracy    82.55
FLR             71.13
FHR              0.34
KT               0.36
dtype: float64

# CART - Type + Cycle

In [153]:
X, y = load_data(df, ['infras', 'road_type'])
X_train = X.iloc[indi_train, :]
y_train = y.iloc[indi_train]
X_test = X.iloc[indi_test, :]
y_test = y.iloc[indi_test]

In [154]:
clf = DecisionTreeClassifier(random_state=0, max_depth=10)
clf.fit(X_train, y_train)
clf.score(X_train, y_train), clf.score(X_test, y_test)

(0.5718247163133506, 0.575174527498723)

In [155]:
confusion_matrix(y_test, clf.predict(X_test), normalize='true')

array([[0.34260797, 0.62167774, 0.02491694, 0.01079734],
       [0.01515892, 0.79217604, 0.15207824, 0.0405868 ],
       [0.04993598, 0.05633803, 0.45070423, 0.44302177],
       [0.01408451, 0.03442879, 0.04225352, 0.90923318]])

## Grid Search

In [156]:
dt_clf = DecisionTreeClassifier(random_state=0)
params = {'criterion': ['gini', 'entropy'],
          'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
          'min_samples_split': [0.01, 0.03, 0.05, 0.1, 0.15, 0.2]}

clf = GridSearchCV(dt_clf, params, cv=5)
search = clf.fit(X_train, y_train)
search.best_params_

{'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 0.05}

In [158]:
dt_clf = DecisionTreeClassifier(random_state=0, criterion='gini', max_depth=5, min_samples_split=0.05)
dt_clf.fit(X_train, y_train)
dt_clf.score(X_train, y_train), clf.score(X_test, y_test)

(0.5717152552267669, 0.5750042567682615)

In [159]:
model_eval(y_test.values, clf.predict(X_test))

Accuracy        57.50
H/L Accuracy    89.87
FLR              8.17
FHR             10.76
KT               0.33
dtype: float64

# Random Forest - speed + nlanes

In [35]:
X, y = load_data(df, ['speed_limit', 'nlanes'])
X_train = X.iloc[indi_train, :]
y_train = y.iloc[indi_train]
X_test = X.iloc[indi_test, :]
y_test = y.iloc[indi_test]

In [52]:
clf = RandomForestClassifier(random_state=0, n_estimators=50, max_depth=6)
clf.fit(X_train, y_train)
clf.score(X_train, y_train), clf.score(X_test, y_test)

(0.797205093589229, 0.7902264600715138)

In [53]:
model_eval(y_test.values, clf.predict(X_test))

Accuracy        79.02
H/L Accuracy    89.12
FLR             41.55
FHR              1.10
KT               0.40
dtype: float64

# Random Forest - road type + cyc infras

## Naive

In [None]:
X, y = load_data(df, ['infras', 'road_type'])
X_train = X.iloc[indi_train, :]
y_train = y.iloc[indi_train]
X_test = X.iloc[indi_test, :]
y_test = y.iloc[indi_test]

In [164]:
clf = RandomForestClassifier(random_state=0, n_estimators=100)
clf.fit(X_train, y_train)
clf.score(X_train, y_train), clf.score(X_test, y_test)

(0.5718247163133506, 0.575174527498723)

## Grid Search

In [None]:
rf_clf = RandomForestClassifier(random_state=0)
params = {'criterion': ['gini', 'entropy'],
          'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
          'min_samples_split': [0.01, 0.03, 0.05, 0.1, 0.15, 0.2],
          'max_features': ['sqrt', 'log2'],
          'n_estimators': [10, 30, 50, 70, 100, 130, 150, 170, 200]}

clf = GridSearchCV(rf_clf, params, cv=5)
search = clf.fit(X_train, y_train)
search.best_params_

In [163]:
clf = RandomForestClassifier(random_state=0, n_estimators=10, max_features='sqrt', min_samples_split=0.03, max_depth=6, criterion='gini')
clf.fit(X_train, y_train)
clf.score(X_train, y_train), clf.score(X_test, y_test)

(0.5717152552267669, 0.5748339860378001)

In [165]:
model_eval(y_test.values, clf.predict(X_test))

Accuracy        57.52
H/L Accuracy    89.89
FLR              8.03
FHR             10.78
KT               0.33
dtype: float64