In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint

In [2]:
data = pd.read_pickle('../../data/processed/all_samples.pickle')
data['datetime'] = pd.to_datetime(data.date)
data['day'] = data.datetime.dt.weekday_name
data = pd.get_dummies(data, prefix='day', columns=['day'])

In [3]:
features = ['hour',
            'daylight_yn',
            'holiday_yn',
            'rush_hour_yn',
            'temp',
            'wind_speed',
            'precipitation',
            'road_length',
            'class_freeway',
            'class_local',
            'class_major',
            'class_other',
            'class_unimproved',
            'day_Monday',
            'day_Tuesday',
            'day_Wednesday',
            'day_Thursday',
            'day_Friday',
            'day_Saturday',
            'day_Sunday']

labels = 'accident_yn'

In [4]:
X = data[features]
y = data[labels]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [8]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    accuracy = 100 * metrics.accuracy_score(test_labels, predictions)
    precision = 100 * metrics.precision_score(test_labels, predictions)
    recall = 100 * metrics.recall_score(test_labels, predictions)
    f1 = metrics.f1_score(test_labels, predictions)
    roc_auc = metrics.roc_auc_score(test_labels, predictions)

    print('Model Performance')
    print('Accuracy:\t{:0.2f}%'.format(accuracy))
    print('Precision:\t{:0.2f}%'.format(precision))
    print('Recall:\t\t{:0.2f}%'.format(recall))
    print('F1 Score:\t{:0.2f}'.format(f1))
    print('ROC AUC:\t{:0.2f}'.format(roc_auc))

    return (accuracy, precision, recall)


In [None]:
base_model = RandomForestClassifier(n_estimators=100, random_state=42)
base_model.fit(X_train, y_train)
base_performance = evaluate(base_model, X_test, y_test)

In [14]:
rf = RandomForestClassifier()

In [16]:
from sklearn.model_selection import GridSearchCV

In [17]:
grid_n_estimators = {'n_estimators': [5, 7, 10]}
scores = ['accuracy', 'precision', 'recall']

# GridSearch on n_estimators
search_n_estimators = GridSearchCV(rf, 
                                   param_grid=grid_n_estimators, 
                                   n_jobs=-1, 
                                   cv=3, 
                                   scoring=scores, 
                                   refit=False,
                                   verbose=2)

search_n_estimators.fit(X_train, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed:   25.6s remaining:    7.2s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:   31.1s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [5, 7, 10]}, pre_dispatch='2*n_jobs',
       refit=False, return_train_score='warn',
       scoring=['accuracy', 'precision', 'recall'], verbose=2)

In [None]:
search_n_estimators.

In [30]:
list(search_n_estimators.cv_results_['param_n_estimators'])

[5, 7, 10]

In [None]:
best_random = rf_random.best_estimator_
random_performance = evaluate(best_random, X_test, y_test)

In [None]:
improvement_accuracy = 100 * (random_performance[0] - base_performance[0]) / base_performance[0]
improvement_precision = 100 * (random_performance[1] - base_performance[1]) / base_performance[1]
improvement_recall = 100 * (random_performance[2] - base_performance[2]) / base_performance[2]

print('Accuracy Improvement:\t{:0.2f}%.'.format(improvement_accuracy))
print('Precision Improvement:\t{:0.2f}%.'.format(improvement_precision))
print('Recall Improvement:\t{:0.2f}%.'.format(improvement_recall))

In [12]:
pprint(base_model.get_params())

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [None]:
max_depth_default = None
min_samples_leaf_default = 1
min_samples_split_default = 2
n_estimators_default = 100

In [13]:
max_depth_range: [10, 32, 55, 77, 100, None]
min_samples_leaf_range: [1, 2, 4, 6, 7, 10]
min_samples_split_range: [2, 5, 8, 12]
n_estimators_range: [50, 75, 100, 125, 150]

In [None]:
def plot_results(model, param = 'n_estimators', name = 'Num Trees'):
    param_name = 'param_%s' % param

    # Extract information from the cross validation model
    train_scores = model.cv_results_['mean_train_score']
    test_scores = model.cv_results_['mean_test_score']
    train_time = model.cv_results_['mean_fit_time']
    param_values = list(model.cv_results_[param_name])
    
    # Plot the scores over the parameter
    plt.subplots(1, 2, figsize=(10, 6))
    plt.subplot(121)
    plt.plot(param_values, train_scores, 'bo-', label = 'train')
    plt.plot(param_values, test_scores, 'go-', label = 'test')
    plt.ylim(ymin = -10, ymax = 0)
    plt.legend()
    plt.xlabel(name)
    plt.ylabel('Neg Mean Absolute Error')
    plt.title('Score vs %s' % name)
    
    plt.subplot(122)
    plt.plot(param_values, train_time, 'ro-')
    plt.ylim(ymin = 0.0, ymax = 2.0)
    plt.xlabel(name)
    plt.ylabel('Train Time (sec)')
    plt.title('Training Time vs %s' % name)
    
    
    plt.tight_layout(pad = 4)