In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the processed sample table, then append a parsed `datetime` column
# built from the existing `date` column.
# NOTE(review): unpickling can execute arbitrary code - only load files this
# project produced itself.
data = pd.read_pickle('../../data/processed/all_samples.pickle')
data = data.assign(datetime=pd.to_datetime(data['date']))

In [3]:
# Derive the weekday name of every sample and one-hot encode it.
# `Series.dt.weekday_name` was deprecated in pandas 0.23 and removed in 1.0,
# so it raises AttributeError on current pandas; `dt.day_name()` is the
# supported replacement and yields the same English names ('Monday', ...).
data['day'] = data.datetime.dt.day_name()

# get_dummies replaces `day` with one `day_<Name>` column per weekday that
# actually occurs in the data.
data = pd.get_dummies(data, prefix='day', columns=['day'])

In [4]:
# Predictor columns for the full model. Order matters: it fixes the column
# order of X and therefore the index of the feature-importance tables.
features = [
    'hour', 'daylight_yn', 'holiday_yn', 'rush_hour_yn',
    'temp', 'wind_speed', 'precipitation',
    'road_length',
    # class_* columns (presumably one-hot road-class indicators - confirm upstream)
    *['class_' + kind for kind in ('freeway', 'local', 'major', 'other', 'unimproved')],
    # day_* columns are the weekday dummies created by get_dummies(prefix='day')
    *['day_' + name for name in ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                                 'Friday', 'Saturday', 'Sunday')],
]

# Name of the target column.
labels = 'accident_yn'

In [5]:
# Design matrix (all columns listed in `features`) and target vector.
X = data.loc[:, features]
y = data.loc[:, labels]

In [6]:
# Hold out 40% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

In [7]:
# Baseline model: 100-tree random forest on every column in `features`.
# random_state is pinned (same seed as the train/test split) so that a
# Restart & Run All reproduces the same trees, metrics and importances.
# Outputs recorded from earlier unseeded runs may differ slightly.
forest = RandomForestClassifier(n_estimators=100, random_state=42)

forest.fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = forest.predict(X_test)

print('Random Forest (n=100)')

# precision_score / recall_score are called with their defaults
# (assumes a binary 0/1 target with 1 as the positive class - TODO confirm).
print('Accuracy:', metrics.accuracy_score(y_test, y_pred))
print('Precision:', metrics.precision_score(y_test, y_pred))
print('Recall:', metrics.recall_score(y_test, y_pred))

# Importances labelled by column name, largest first.
feature_importance = pd.Series(forest.feature_importances_, index=X.columns).sort_values(ascending=False)
print(feature_importance)

Random Forest (n=100)
Accuracy: 0.8287672386212471
Precision: 0.6810197633112869
Recall: 0.5869120308509379
road_length         0.423833
temp                0.154762
wind_speed          0.099991
class_major         0.087232
hour                0.080630
class_local         0.071125
precipitation       0.013509
daylight_yn         0.012763
class_freeway       0.010974
class_unimproved    0.007695
holiday_yn          0.006710
rush_hour_yn        0.005353
day_Monday          0.003701
day_Wednesday       0.003630
day_Tuesday         0.003605
day_Sunday          0.003588
day_Thursday        0.003578
day_Friday          0.003343
day_Saturday        0.003173
class_other         0.000805
dtype: float64


In [10]:
# Same feature set as the baseline, but with 1000 trees, to check whether a
# larger ensemble changes the metrics or the importance ranking.
# random_state is pinned so the run is reproducible; outputs recorded from
# earlier unseeded runs may differ slightly.
forest_1k = RandomForestClassifier(n_estimators=1000, random_state=42)

forest_1k.fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred_1k = forest_1k.predict(X_test)

# Label fixed: this model has 1000 estimators (the old text read 'n=0100').
print('Random Forest (n=1000)')

print('Accuracy:', metrics.accuracy_score(y_test, y_pred_1k))
print('Precision:', metrics.precision_score(y_test, y_pred_1k))
print('Recall:', metrics.recall_score(y_test, y_pred_1k))

# Importances labelled by column name, largest first.
# Note: this rebinds `feature_importance`, replacing any earlier value.
feature_importance = pd.Series(forest_1k.feature_importances_, index=X.columns).sort_values(ascending=False)
print(feature_importance)

Random Forest (n=0100)
Accuracy: 0.8289608836950848
Precision: 0.680050627981696
Recall: 0.5907007425198315
road_length         0.421045
temp                0.155326
wind_speed          0.099657
class_major         0.090212
hour                0.081451
class_local         0.069730
precipitation       0.013593
daylight_yn         0.012632
class_freeway       0.010737
class_unimproved    0.008278
holiday_yn          0.006628
rush_hour_yn        0.005235
day_Monday          0.003692
day_Thursday        0.003615
day_Sunday          0.003610
day_Tuesday         0.003590
day_Wednesday       0.003531
day_Friday          0.003319
day_Saturday        0.003096
class_other         0.001022
dtype: float64


**n=1,000 results**

*Metrics*

| Metric    | Value              |
| --------- | ------------------ |
| Accuracy  | 0.8289608836950848 |
| Precision | 0.680050627981696  |
| Recall    | 0.5907007425198315 |

*Feature Importance*

| Feature          | Relative Importance |
| ---------------- | ------------------- |
| road_length      | 0.421045            |
| temp             | 0.155326            |
| wind_speed       | 0.099657            |
| class_major      | 0.090212            |
| hour             | 0.081451            |
| class_local      | 0.069730            |
| precipitation    | 0.013593            |
| daylight_yn      | 0.012632            |
| class_freeway    | 0.010737            |
| class_unimproved | 0.008278            |
| holiday_yn       | 0.006628            |
| rush_hour_yn     | 0.005235            |
| day_Monday       | 0.003692            |
| day_Thursday     | 0.003615            |
| day_Sunday       | 0.003610            |
| day_Tuesday      | 0.003590            |
| day_Wednesday    | 0.003531            |
| day_Friday       | 0.003319            |
| day_Saturday     | 0.003096            |
| class_other      | 0.001022            |

In [12]:
# Reduced feature set: the nine columns whose importance in the n=1,000
# table is above roughly 0.01.
important_features = [
    'hour', 'daylight_yn',
    'temp', 'wind_speed', 'precipitation',
    'road_length',
    'class_freeway', 'class_local', 'class_major',
]

In [13]:
# Design matrix restricted to the reduced feature set.
X_important = data.loc[:, important_features]

In [14]:
# Same 60/40 split and seed as the full-feature model, applied to the
# reduced design matrix.
X_train_imp, X_test_imp, y_train_imp, y_test_imp = train_test_split(
    X_important,
    y,
    test_size=0.4,
    random_state=42,
)

In [15]:
# 100-tree forest trained on the reduced (`important_features`) column set.
# random_state is pinned so the run is reproducible; with an unseeded forest,
# tree-sampling noise is mixed into any comparison between feature sets.
# Outputs recorded from earlier unseeded runs may differ slightly.
forest_imp = RandomForestClassifier(n_estimators=100, random_state=42)

forest_imp.fit(X_train_imp, y_train_imp)

# Hard class predictions for the held-out rows.
y_pred_imp = forest_imp.predict(X_test_imp)

print('Random Forest (n=100)')

print('Accuracy:', metrics.accuracy_score(y_test_imp, y_pred_imp))
print('Precision:', metrics.precision_score(y_test_imp, y_pred_imp))
print('Recall:', metrics.recall_score(y_test_imp, y_pred_imp))

# Importances labelled by column name, largest first.
feature_importance_imp = pd.Series(forest_imp.feature_importances_, index=X_important.columns).sort_values(ascending=False)
print(feature_importance_imp)

Random Forest (n=100)
Accuracy: 0.8359952514860155
Precision: 0.6983864620228256
Recall: 0.6003078328230976
road_length      0.460490
temp             0.163775
wind_speed       0.097254
class_major      0.093245
hour             0.081035
class_local      0.067866
precipitation    0.013984
class_freeway    0.011831
daylight_yn      0.010521
dtype: float64


In [16]:
# Smallest feature set: the six columns whose importance in the n=1,000
# table is above roughly 0.05.
most_important_features = [
    'hour',
    'temp', 'wind_speed',
    'road_length',
    'class_local', 'class_major',
]

In [17]:
# Design matrix restricted to the six-column feature set.
X_most_important = data.loc[:, most_important_features]

In [18]:
# Same 60/40 split and seed as the other models, applied to the six-column
# design matrix.
X_train_most, X_test_most, y_train_most, y_test_most = train_test_split(
    X_most_important,
    y,
    test_size=0.4,
    random_state=42,
)

In [19]:
# 100-tree forest trained on the six-column (`most_important_features`) set.
# random_state is pinned so the run is reproducible; with an unseeded forest,
# tree-sampling noise is mixed into any comparison between feature sets.
# Outputs recorded from earlier unseeded runs may differ slightly.
forest_most = RandomForestClassifier(n_estimators=100, random_state=42)

forest_most.fit(X_train_most, y_train_most)

# Hard class predictions for the held-out rows.
y_pred_most = forest_most.predict(X_test_most)

print('Random Forest (n=100)')

print('Accuracy:', metrics.accuracy_score(y_test_most, y_pred_most))
print('Precision:', metrics.precision_score(y_test_most, y_pred_most))
print('Recall:', metrics.recall_score(y_test_most, y_pred_most))

# Importances labelled by column name, largest first.
feature_importance_most = pd.Series(forest_most.feature_importances_, index=X_most_important.columns).sort_values(ascending=False)
print(feature_importance_most)

Random Forest (n=100)
Accuracy: 0.833233704346069
Precision: 0.6914576217956268
Recall: 0.5958256516076654
road_length    0.495734
temp           0.163123
hour           0.099266
class_major    0.097829
wind_speed     0.085692
class_local    0.058355
dtype: float64
