In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the sample table from the 'processed' data folder (path is relative to the
# notebook's working directory).
# NOTE(review): unpickling can execute arbitrary code — only load this file if it
# comes from a trusted source.
data = pd.read_pickle('../../data/processed/all_samples.pickle')

In [3]:
# Column names used as model inputs. Order matters: it fixes the column order of
# the feature matrix selected from `data`.
# Rows are grouped by apparent theme, judging only by the column names.
features = [
    'daylight_yn', 'holiday_yn', 'rush_hour_yn',
    'temp', 'wind_speed', 'precipitation',
    'road_length',
    'class_freeway', 'class_local', 'class_major', 'class_other', 'class_unimproved',
]

# Name of the target column.
labels = 'accident_yn'

In [4]:
# Select the predictor columns (X) and the target column (y) from the loaded table.
X, y = data[features], data[labels]

In [5]:
# Hold out 40% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

In [6]:
# Random forest with 100 trees.
# random_state is pinned (same seed as the train/test split) so the fitted trees,
# and therefore the metrics and importances printed below, are reproducible on a
# fresh Restart & Run All instead of drifting from run to run.
forest_100 = RandomForestClassifier(n_estimators=100, random_state=42)

forest_100.fit(X_train, y_train)

# Predictions on the held-out rows only.
y_pred_100 = forest_100.predict(X_test)

print('Random Forest (n=100)')

print('Accuracy:', metrics.accuracy_score(y_test, y_pred_100))
print('Precision:', metrics.precision_score(y_test, y_pred_100))
print('Recall:', metrics.recall_score(y_test, y_pred_100))

# Per-feature importances from the fitted forest, labelled with the input column
# names and ranked from most to least important.
feature_importance_100 = pd.Series(forest_100.feature_importances_, index=X.columns).sort_values(ascending=False)
print(feature_importance_100)

Random Forest (n=100)
Accuracy: 0.8211645646353579
Precision: 0.656999452799215
Recall: 0.5889247839250377
road_length         0.550893
temp                0.163282
class_major         0.087130
wind_speed          0.078128
class_local         0.061115
daylight_yn         0.018803
precipitation       0.013709
class_freeway       0.009210
rush_hour_yn        0.007006
class_unimproved    0.006917
holiday_yn          0.003110
class_other         0.000698
dtype: float64


In [7]:
# Random forest with 500 trees, evaluated on the same train/test split as the
# smaller forest.
# random_state is pinned (same seed as the train/test split) so the fitted trees,
# and therefore the metrics and importances printed below, are reproducible on a
# fresh Restart & Run All instead of drifting from run to run.
forest_500 = RandomForestClassifier(n_estimators=500, random_state=42)

forest_500.fit(X_train, y_train)

# Predictions on the held-out rows only.
y_pred_500 = forest_500.predict(X_test)

print('Random Forest (n=500)')

print('Accuracy:', metrics.accuracy_score(y_test, y_pred_500))
print('Precision:', metrics.precision_score(y_test, y_pred_500))
print('Recall:', metrics.recall_score(y_test, y_pred_500))

# Per-feature importances from the fitted forest, labelled with the input column
# names and ranked from most to least important.
feature_importance_500 = pd.Series(forest_500.feature_importances_, index=X.columns).sort_values(ascending=False)
print(feature_importance_500)

Random Forest (n=500)
Accuracy: 0.8204783875258895
Precision: 0.654404047976012
Recall: 0.5906161730629366
road_length         0.537971
temp                0.158611
class_major         0.090141
wind_speed          0.077465
class_local         0.073265
daylight_yn         0.018672
precipitation       0.013502
class_freeway       0.010782
class_unimproved    0.008533
rush_hour_yn        0.007070
holiday_yn          0.003076
class_other         0.000913
dtype: float64
