# Car accidents in the U.K.

Classifying fatal and minor car accidents. Dataset provides 251,832 incidents, and a range of features describing the driver, car type, weather condition, timing, and location of the accidents. Only features involving the car or driver will be used, as a way of determining which insurance policies should have elevated rates based on the prospective customer's applications, as they present a higher likeliness of severe car collisions.

## importing

In [1]:
# imports
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import pandas_profiling
from xgboost import XGBClassifier, plot_importance
import matplotlib.pyplot as plt
from matplotlib import pyplot
import seaborn as sns
from sklearn import svm, tree
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, roc_curve, auc, r2_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
scaler = MinMaxScaler()
%matplotlib inline
import pprint
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

In [2]:
# model diagnostics and confusion matrix
def model_report(model): 
    train_score = model.score(X_train, y_train)
    print("train score: {:.4}%".format(train_score * 100))

    test_score = model.score(X_test, y_test)
    print("test score: {:.4}%".format(test_score * 100))

    cv_score = np.mean(cross_val_score(model, X, y, cv=3))
    print("cross val score: {:.4}%".format(cv_score * 100))

    false_positive_rate, true_positive_rate, thresholds = roc_curve(actuals, predictions)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    print("AUC  : {:.4}%".format(roc_auc * 100))

    print("\nClassification Report:")
    print(classification_report(actuals, predictions))
    
    print("\nConfusion Matrix: 1=Fatal, 0=Minor")
    display(pd.crosstab(actuals, predictions, rownames=['Actual'], colnames=['Predicted'], margins=True))

In [3]:
# Accident data
df = pd.read_csv('Accidents_categorical.csv')

## eda

In [4]:
df.shape

(251832, 33)

In [5]:
df.head()

In [6]:
pandas_profiling.ProfileReport(df)

Are any areas impacted significantly more than others?

In [7]:
df.plot(kind="scatter", x="Longitude", y="Latitude", alpha= 0.1)

Are feature data normally distributed?

In [8]:
df.hist(figsize=(20,15), bins=30);

In [10]:
#pairplot and heatmap corr plot

## data prep

In [11]:
#taking % of data for speedier results before committing to model
df = df.sample(frac=.25, random_state=3)

In [12]:
# assign 1 and 0 to fatal and slight accidents in preparation of further analysis
df["Accident_Severity"]= df['Accident_Severity'].replace("Fatal_Serious", 1)
df["Accident_Severity"]= df['Accident_Severity'].replace("Slight", 0)

In [13]:
# assigning target as accident severity
target = df['Accident_Severity']

In [14]:
# reduce time of day to hour as opposed to minute and hour
df['Hour_of_Day'] = df['Hour_of_Day'].apply(lambda x: round(x*24))
#df['Hour_of_Day'].unique()

In [15]:
# assign target
y = df["Accident_Severity"]

In [9]:
df.columns

Index(['Accident_Index', 'Latitude', 'Longitude', 'Region',
       'Urban_or_Rural_Area', 'X1st_Road_Class', 'Driver_IMD_Decile',
       'Speed_limit', 'Road_Type', 'Road_Surface_Conditions', 'Weather',
       'High_Wind', 'Lights', 'Datetime', 'Year', 'Season', 'Month_of_Year',
       'Day_of_Month', 'Day_of_Week', 'Hour_of_Day', 'Number_of_Vehicles',
       'Age_of_Driver', 'Age_of_Vehicle', 'Junction_Detail',
       'Junction_Location', 'X1st_Point_of_Impact', 'Driver_Journey_Purpose',
       'Engine_CC', 'Propulsion_Code', 'Vehicle_Make', 'Vehicle_Category',
       'Vehicle_Manoeuvre', 'Accident_Severity'],
      dtype='object')

In [16]:
#dropping columns relating to time and weather or road conditions.
df.drop(columns=
    [
        'Accident_Index','Datetime'
        ,'Accident_Severity','Year','Latitude', 'Longitude', 'Region','X1st_Road_Class'
       ,'Speed_limit', 'Road_Type', 'Road_Surface_Conditions', 'Weather'
       ,'High_Wind', 'Lights', 'Season', 'Month_of_Year'
       ,'Day_of_Month','Day_of_Week','Hour_of_Day', 'Number_of_Vehicles'
        ,'Junction_Detail','Junction_Location','X1st_Point_of_Impact','Vehicle_Manoeuvre'
        ]
, axis=1, inplace=True)

In [17]:
# #selecting categorical features for dummies
# col_list = list(df.select_dtypes(include=['object']).columns)
# for col in ('Season','Day_of_Month','Day_of_Week','Month_of_Year','Hour_of_Day'):
#     col_list.append(col)

In [18]:
col_list = list(df.select_dtypes(include=['object']).columns)

In [19]:
## list of categorical features to dummy
#col_list

In [20]:
# create dummies for categorical features, assign to X
df = pd.DataFrame(pd.get_dummies(data=df,columns=col_list, drop_first=True
                    , prefix=col_list))
X = df

In [21]:
#train-test split 80%/20%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

In [22]:
# y_train_fatal = (y_train=="1")
# y_test_fatal = (y_test=="1")

In [23]:
# scale train and test features seperately
X_train = StandardScaler().fit_transform(X_train)
X_test = StandardScaler().fit_transform(X_test)

## feature importance

In [24]:
# model = XGBClassifier()
# model.fit(X, y)
# ax = plot_importance(model)
# fig = ax.figure
# ax.grid(False)
# fig.set_size_inches(10,20)
# plt.show()

In [25]:
# from sklearn.ensemble import ExtraTreesClassifier
# model = ExtraTreesClassifier()
# model.fit(X,y)
# #plot graph of feature importances
# feat_importances = pd.Series(model.feature_importances_, index=X.columns)
# feat_importances.nlargest(20).plot(kind='barh')
# plt.show()

## choosing model

In [26]:
X_train.shape

(50366, 39)

In [30]:
# looking to keep enough to maintain 80% explained variance
pca = PCA(n_components=26)
principalComponents = pca.fit_transform(X_train)
print(np.sum(pca.explained_variance_ratio_))

0.8085629866377877


In [31]:
num_feats = 26

In [32]:
#assign pipelines
pipe_kn = Pipeline([('kn_model', KNeighborsClassifier())])

pipe_tree = Pipeline([('dt_model', DecisionTreeClassifier(random_state=3))])

pipe_bag = Pipeline([('bag_model', BaggingClassifier(DecisionTreeClassifier(random_state=3)))])

pipe_rf = Pipeline([('rf_model', RandomForestClassifier(random_state=3))])

pipe_ada = Pipeline([('ada_model', AdaBoostClassifier(random_state=3))])

pipe_gbt = Pipeline([('gbt_model', GradientBoostingClassifier(random_state=3))])

pipe_xgb = Pipeline([('xgb_model', XGBClassifier(random_state=3))])

pipe_svc = Pipeline([('svm_model', svm.SVC(random_state=3))])

In [33]:
# # initiating baseline model, cross val scores
# pipelines = [pipe_kn, pipe_tree, pipe_bag, pipe_rf, pipe_ada, pipe_gbt, pipe_xgb, pipe_svc]
# pipeline_names = ['KNeighbors','Decision Tree', 'Bagged Trees', 'Random Forest', 'AdaBoost', 'Gradient Boosting', 'XGBoost', 'Logistic Regression', 'Support Vector']

# for pipeline, name in zip(pipelines, pipeline_names):
#     print(name, ':')
#     print(pipeline)
#     pipeline.fit(X_train, y_train)
    
#     print()
#     recall = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='recall')
#     print('Recall', np.mean(recall))
#     precision = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='precision')
#     print('Precision', np.mean(precision))
#     f1 = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='f1')
#     print('F1', np.mean(f1))
#     print()
#     print()

In [34]:
# #baseline model accuracy scores

# for classifier, pl in zip((XGBClassifier, LogisticRegression, RandomForestClassifier,DecisionTreeClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier, svm.SVC),('xgbc','logistic regression', 'random forest', 'decision tree', 'adaboost', 'gradient', 'bagging', 'support vector')):

#     pipe = Pipeline([('pca', PCA(n_components=num_feats)),
#                     ('clf', classifier(random_state=3))])
#     pipe.fit(X_train, y_train)
    
#     print(pl)
#     print(pipe.score(X_test, y_test))
#     print()

In [35]:
for classifier, cls in zip((XGBClassifier(), LogisticRegression(), RandomForestClassifier(),DecisionTreeClassifier(), AdaBoostClassifier(),GradientBoostingClassifier(), BaggingClassifier(), svm.SVC() ),('xg','logistic regression', 'random forest', 'decision tree', 'adaboost', 'gradient', 'bagging', 'support vector')):
    
    
    pipe_lr = Pipeline([('pca', PCA(n_components=num_feats, random_state=3)),
         ('clf', classifier)])

    pipe_lr.fit(X_train, y_train)
    
    
    print(cls, ":")
    print('Accuracy',pipe_lr.score(X_train, y_train))
    recall = cross_val_score(classifier, X_train, y_train, cv=5, scoring='recall')
    print('Recall', np.mean(recall))
    precision = cross_val_score(classifier, X_train, y_train, cv=5, scoring='precision')
    print('Precision', np.mean(precision))
    f1 = cross_val_score(classifier, X_train, y_train, cv=5, scoring='f1')
    print('F1', np.mean(f1))
    print()

xg :
Accuracy 0.761783743001231
Recall 0.07897271268057784
Precision 0.619021858188776
F1 0.1400191670664559

logistic regression :
Accuracy 0.7556089425406027
Recall 0.0747191011235955
Precision 0.6276184200921654
F1 0.13352515239750679

random forest :
Accuracy 0.9471667394671008
Recall 0.15818619582664525
Precision 0.35075127724521943
F1 0.21691327623614934

decision tree :
Accuracy 0.9643608783703292
Recall 0.2918940609951846
Precision 0.28193109533087163
F1 0.28705289445940807

adaboost :
Accuracy 0.7594011833379661
Recall 0.08956661316211878
Precision 0.6121638685582239
F1 0.15616146888997534

gradient :
Accuracy 0.7629154588412819
Recall 0.08081861958266452
Precision 0.6197332665665177
F1 0.1429406248729658

bagging :
Accuracy 0.9486955485843624
Recall 0.17383627608346708
Precision 0.32617443887415154
F1 0.2292229238773114

support vector :
Accuracy 0.7610094111106699
Recall 0.08739967897271268
Precision 0.6027729911469525
F1 0.15258103295073489



## decision tree

In [36]:
# initialize and fit decision tree
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)

predictions = dt_model.predict(X_test)
actuals = y_test

In [37]:
model_report(dt_model)

train score: 97.68%
test score: 63.93%
cross val score: 63.92%
AUC  : 52.35%

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.76      0.76      9397
           1       0.29      0.29      0.29      3195

   micro avg       0.64      0.64      0.64     12592
   macro avg       0.52      0.52      0.52     12592
weighted avg       0.64      0.64      0.64     12592


Confusion Matrix: 1=Fatal, 0=Minor


Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,7129,2268,9397
1,2274,921,3195
All,9403,3189,12592


In [38]:
# grid search params
param_grid_dt = [
    {'dt_model__criterion': ['entropy','gini'],
    'dt_model__max_depth': [2,3,4],
    'dt_model__min_samples_split': [0.01, 0.001],
    'dt_model__min_samples_leaf': [0.01, 0.001]}
]

# grid search
gs_dt = GridSearchCV(estimator=pipe_tree,
                    param_grid=param_grid_dt,
                    scoring=('accuracy'),
                    cv=5, n_jobs=-1, verbose=1)

# fitting grid search
gs_dt.fit(X_train, y_train)

# best parameters
print('Best accuracy: {:.4}%'.format(gs_dt.best_score_ * 100))
print('params:\n', gs_dt.best_params_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    8.3s


Best accuracy: 76.03%
params:
 {'dt_model__criterion': 'entropy', 'dt_model__max_depth': 3, 'dt_model__min_samples_leaf': 0.01, 'dt_model__min_samples_split': 0.01}


[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   16.0s finished


In [39]:
## recall scoring grid search
# gs_dt = GridSearchCV(estimator=pipe_tree,
#                     param_grid=param_grid_dt,
#                     scoring=('recall'),
#                     cv=4, n_jobs=-1, verbose=1)
# # fitting grid search
# gs_dt.fit(X_train, y_train)

# # best parameters
# print('Best accuracy: {:.4}%'.format(gs_dt.best_score_ * 100))
# print('params:\n', gs_dt.best_params_)

In [40]:
# using best params to fit model
dt_gs_model = DecisionTreeClassifier(criterion='entropy',
                               max_depth=3,
                               min_samples_leaf=0.01,
                               min_samples_split=0.01)
dt_gs_model.fit(X_train, y_train)

predictions = dt_gs_model.predict(X_test)
actuals = y_test

In [41]:
# new model report
model_report(dt_gs_model)

train score: 76.06%
test score: 75.56%
cross val score: 75.96%
AUC  : 52.94%

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.99      0.86      9397
           1       0.68      0.07      0.13      3195

   micro avg       0.76      0.76      0.76     12592
   macro avg       0.72      0.53      0.49     12592
weighted avg       0.74      0.76      0.67     12592


Confusion Matrix: 1=Fatal, 0=Minor


Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,9290,107,9397
1,2971,224,3195
All,12261,331,12592


## Random Forest

In [42]:
#initialize and fit random forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

predictions = rf_model.predict(X_test)
actuals = y_test

In [43]:
#baseline model report
model_report(rf_model)

train score: 95.11%
test score: 71.59%
cross val score: 71.46%
AUC  : 52.93%

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.91      0.83      9397
           1       0.36      0.15      0.21      3195

   micro avg       0.72      0.72      0.72     12592
   macro avg       0.56      0.53      0.52     12592
weighted avg       0.66      0.72      0.67     12592


Confusion Matrix: 1=Fatal, 0=Minor


Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,8533,864,9397
1,2714,481,3195
All,11247,1345,12592


In [44]:
# grid search params
param_grid_rf = [
    {'rf_model__criterion': ['entropy','gini'],
    'rf_model__n_estimators': [100,150],
    'rf_model__max_depth': [None],
    'rf_model__min_samples_split': [0.01,0.001],
    'rf_model__min_samples_leaf': [0.01,0.001]}
]

# grid search
gs_rf = GridSearchCV(estimator=pipe_rf,
                    param_grid=param_grid_rf,
                    scoring='accuracy',
                    cv=5, n_jobs=-1, verbose=1)

# fitting grid search
gs_rf.fit(X_train, y_train)

# best parameters
print('best accuracy: {:.4}%'.format(gs_rf.best_score_ * 100))
print('params:\n', gs_rf.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed: 18.1min finished


best accuracy: 76.07%
params:
 {'rf_model__criterion': 'entropy', 'rf_model__max_depth': None, 'rf_model__min_samples_leaf': 0.001, 'rf_model__min_samples_split': 0.001, 'rf_model__n_estimators': 100}


In [45]:
# fitting model using best params
rf_gs_model = RandomForestClassifier(criterion='entropy',
                                 max_depth=None,
                                 min_samples_leaf=0.001,
                                 min_samples_split=0.001,
                                 n_estimators=100)
rf_gs_model.fit(X_train, y_train)

predictions = rf_gs_model.predict(X_test)
actuals = y_test

In [46]:
# new model report
model_report(rf_gs_model)

train score: 76.13%
test score: 75.54%
cross val score: 75.96%
AUC  : 52.84%

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.99      0.86      9397
           1       0.68      0.07      0.12      3195

   micro avg       0.76      0.76      0.76     12592
   macro avg       0.72      0.53      0.49     12592
weighted avg       0.74      0.76      0.67     12592


Confusion Matrix: 1=Fatal, 0=Minor


Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,9296,101,9397
1,2979,216,3195
All,12275,317,12592


## adaboost

In [47]:
#initialize and fit model
ada_model = AdaBoostClassifier()
ada_model.fit(X_train, y_train)

predictions = ada_model.predict(X_test)
actuals = y_test

In [48]:
#baseline model report
model_report(ada_model)

train score: 76.11%
test score: 75.44%
cross val score: 75.98%
AUC  : 53.35%

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.98      0.86      9397
           1       0.62      0.08      0.15      3195

   micro avg       0.75      0.75      0.75     12592
   macro avg       0.69      0.53      0.50     12592
weighted avg       0.72      0.75      0.68     12592


Confusion Matrix: 1=Fatal, 0=Minor


Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,9229,168,9397
1,2924,271,3195
All,12153,439,12592


In [49]:
param_grid_ada = [
    {'ada_model__n_estimators': [50,70],
    'ada_model__learning_rate': [0.1,0.2]}
]

# Construct Grid Search
gs_ada = GridSearchCV(estimator=pipe_ada,
                    param_grid=param_grid_ada,
                    scoring='accuracy',
                    cv=5, n_jobs=-1, verbose=1)

# Fit using grid search
gs_ada.fit(X_train, y_train)

# Best accuracy and parameters
print('best accuracy: {:.4}%'.format(gs_ada.best_score_ * 100))
print('params:\n', gs_ada.best_params_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  4.2min finished


best accuracy: 76.04%
params:
 {'ada_model__learning_rate': 0.2, 'ada_model__n_estimators': 70}


In [50]:
ada_gs_model = AdaBoostClassifier(n_estimators=70,
                                   learning_rate=0.2)
ada_gs_model.fit(X_train, y_train)

predictions = ada_gs_model.predict(X_test)
actuals = y_test


In [51]:
model_report(ada_gs_model)

train score: 76.07%
test score: 75.6%
cross val score: 75.97%
AUC  : 53.04%

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.99      0.86      9397
           1       0.68      0.07      0.13      3195

   micro avg       0.76      0.76      0.76     12592
   macro avg       0.72      0.53      0.49     12592
weighted avg       0.74      0.76      0.67     12592


Confusion Matrix: 1=Fatal, 0=Minor


Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,9289,108,9397
1,2964,231,3195
All,12253,339,12592


## gradient boost

In [52]:
#initialize and fit gradient boost
gbt_model = GradientBoostingClassifier()
gbt_model.fit(X_train, y_train)

predictions = gbt_model.predict(X_test)
actuals = y_test

In [53]:
#baseline model report
model_report(gbt_model)

train score: 76.17%
test score: 75.56%
cross val score: 75.93%
AUC  : 53.35%

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.98      0.86      9397
           1       0.64      0.08      0.15      3195

   micro avg       0.76      0.76      0.76     12592
   macro avg       0.70      0.53      0.50     12592
weighted avg       0.73      0.76      0.68     12592


Confusion Matrix: 1=Fatal, 0=Minor


Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,9251,146,9397
1,2931,264,3195
All,12182,410,12592


In [54]:
# grid search params
param_grid_gbt = [
    {'gbt_model__n_estimators': [40,50],
    'gbt_model__learning_rate': [0.15,0.1],
    'gbt_model__min_samples_split': [0.2,0.1],
    'gbt_model__min_samples_leaf': [0.1,1],
    'gbt_model__max_depth': [3,4]}
]

# grid search
gs_gbt = GridSearchCV(estimator=pipe_gbt,
                    param_grid=param_grid_gbt,
                    scoring='accuracy',
                    cv=5, n_jobs=-1, verbose=1)

# fit grid search
gs_gbt.fit(X_train, y_train)

# best params
print('best accuracy: {:.4}%'.format(gs_gbt.best_score_ * 100))
print('params:\n', gs_gbt.best_params_)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed: 41.8min finished


best accuracy: 76.09%
params:
 {'gbt_model__learning_rate': 0.15, 'gbt_model__max_depth': 4, 'gbt_model__min_samples_leaf': 1, 'gbt_model__min_samples_split': 0.1, 'gbt_model__n_estimators': 50}


In [55]:
gbt_gs_model = GradientBoostingClassifier(learning_rate=0.15,
                                         max_depth=4,
                                         min_samples_leaf=1,
                                         min_samples_split=0.1,
                                         n_estimators=50)
gbt_gs_model.fit(X_train, y_train)

predictions = gbt_gs_model.predict(X_test)
actuals = y_test

In [56]:
# new model report
model_report(gbt_gs_model)

train score: 75.9%
test score: 75.44%
cross val score: 75.79%
AUC  : 52.53%

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.99      0.86      9397
           1       0.68      0.06      0.11      3195

   micro avg       0.75      0.75      0.75     12592
   macro avg       0.72      0.53      0.48     12592
weighted avg       0.74      0.75      0.67     12592


Confusion Matrix: 1=Fatal, 0=Minor


Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,9308,89,9397
1,3003,192,3195
All,12311,281,12592


## XGB

In [57]:
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)

predictions = xgb_model.predict(X_test)
actuals = y_test

In [58]:
model_report(xgb_model)

train score: 76.13%
test score: 75.62%
cross val score: 75.96%
AUC  : 53.24%

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.99      0.86      9397
           1       0.67      0.08      0.14      3195

   micro avg       0.76      0.76      0.76     12592
   macro avg       0.71      0.53      0.50     12592
weighted avg       0.74      0.76      0.68     12592


Confusion Matrix: 1=Fatal, 0=Minor


Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,9273,124,9397
1,2946,249,3195
All,12219,373,12592


In [59]:
param_grid_xgb = [
    {'xgb_model__min_child_weight': [.01,1e-3],
    'xgb_model__learning_rate': [0.01,0.1],
    'xgb_model__n_estimators': [40,50],
    'xgb_model__max_depth': [2,3]}
]

# Construct Grid Search
gs_xgb = GridSearchCV(estimator=pipe_xgb,
                    param_grid=param_grid_xgb,
                    scoring='accuracy',
                    cv=5, n_jobs=-1, verbose=1)

# Fit using grid search
gs_xgb.fit(X, y)

# Best accuracy and parameters
print('Best accuracy: {:.4}%'.format(gs_xgb.best_score_ * 100))
print('params:\n', gs_xgb.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed: 18.4min finished


Best accuracy: 75.97%
params:
 {'xgb_model__learning_rate': 0.1, 'xgb_model__max_depth': 2, 'xgb_model__min_child_weight': 0.01, 'xgb_model__n_estimators': 50}


In [60]:
xgb_gs_model = XGBClassifier(learning_rate=0.1,
                             max_depth=2,
                             min_child_weight=0.01,
                             n_estimators=50)
xgb_gs_model.fit(X_train, y_train)

predictions = xgb_gs_model.predict(X_test)
actuals = y_test

In [61]:
model_report(xgb_gs_model)

train score: 76.08%
test score: 75.57%
cross val score: 75.97%
AUC  : 53.02%

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.99      0.86      9397
           1       0.67      0.07      0.13      3195

   micro avg       0.76      0.76      0.76     12592
   macro avg       0.72      0.53      0.49     12592
weighted avg       0.74      0.76      0.67     12592


Confusion Matrix: 1=Fatal, 0=Minor


Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,9285,112,9397
1,2964,231,3195
All,12249,343,12592


In [62]:
# xgb_gs_model_train_score = xgb_gs_model.score(X_train, y_train)
# print("train score: {:.4}%".format(xgb_gs_model_train_score * 100))

# xgb_gs_model_test_score = xgb_gs_model.score(X_test, y_test)
# print("test score: {:.4}%".format(xgb_gs_model_test_score * 100))

# xgb_gs_cv_score = np.mean(cross_val_score(gbt_gs_model, X, y, cv=3))
# print("cross val score: {:.4}%".format(xgb_gs_cv_score * 100))

# false_positive_rate, true_positive_rate, thresholds = roc_curve(actuals, predictions)
# roc_auc = auc(false_positive_rate, true_positive_rate)
# print("AUC : {:.4}%".format(roc_auc * 100))

# print("\nClassification Report:")
# print(classification_report(actuals, predictions))

# print("\nConfusion Matrix: 1=Fatal, 0=Minor")
# pd.crosstab(actuals, predictions, rownames=['Actual'], colnames=['Predicted'], margins=True)

In [63]:
# model = XGBClassifier(random_state=3)
# model.fit(X_train, y_train)
# train_preds = model.predict(X_train)
# val_preds = model.predict(X_test)
# training_accuracy = accuracy_score(y_train, train_preds)
# val_accuracy = accuracy_score(y_test, val_preds)
# training_precision = precision_score(y_train, train_preds)
# val_precision = precision_score(y_test, val_preds)

# print("Training Accuracy:   {:.4}%".format(training_accuracy * 100))
# print("Validation Accuracy: {:.4}%".format(val_accuracy * 100))
# print('----------------------------')
# print("Training precision:   {:.4}%".format(training_precision * 100))
# print("Validation precision: {:.4}%".format(val_precision * 100))
# print('----------------------------')
# print("Training Confusion Matrix:")
# print(confusion_matrix(y_train, train_preds))
# print("Validation Confusion Matrix:")
# print(confusion_matrix(y_test, val_preds))

In [64]:
# param_grid_xgb = [
#     {'xgb_model__min_child_weight': [0.0001, 0.001],
#     'xgb_model__learning_rate': [0.1],
#     'xgb_model__n_estimators': [50],
#     'xgb_model__max_depth': [2, 3]}
# ]

# # Construct Grid Search
# gs_xgb = GridSearchCV(estimator=pipe_xgb,
#                     param_grid=param_grid_xgb,
#                     scoring='accuracy',
#                     cv=4, n_jobs=-1, verbose=1)

# # Fit using grid search
# gs_xgb.fit(X_train, y_train)

# # Best accuracy and parameters
# print('Best accuracy: {:.4}%'.format(gs_xgb.best_score_ * 100))
# print('Best Params:\n', gs_xgb.best_params_)

In [65]:
# xgb_gs_model = XGBClassifier(learning_rate=0.1,
#                              max_depth=3,
#                              min_child_weight=0.001,
#                              n_estimators=50)
# xgb_gs_model.fit(X_train, y_train)

# predictions = xgb_gs_model.predict(X_test)
# actuals = y_test

In [66]:
# xgb_gs_model_train_score = xgb_gs_model.score(X_train, y_train)
# print("Training Accuracy: {:.4}%".format(xgb_gs_model_train_score * 100))

# xgb_gs_model_test_score = xgb_gs_model.score(X_test, y_test)
# print("Testing Accuracy: {:.4}%".format(xgb_gs_model_test_score * 100))

# xgb_gs_cv_score = np.mean(cross_val_score(gbt_gs_model, X_train, y_train, cv=3))
# print("Cross Val Score: {:.4}%".format(xgb_gs_cv_score * 100))

# false_positive_rate, true_positive_rate, thresholds = roc_curve(actuals, predictions)
# roc_auc = auc(false_positive_rate, true_positive_rate)
# print("AUC is : {:.4}%".format(roc_auc * 100))

# print("\nClassification Report:")
# print(classification_report(actuals, predictions))

# print("\nConfusion Matrix: 1=Fatal, 0=Minor")
# pd.crosstab(actuals, predictions, rownames=['Actual'], colnames=['Predicted'], margins=True)

## SVC

In [67]:
svm_model = svm.SVC()
svm_model.fit(X_train, y_train)

predictions = gbt_model.predict(X_test)
actuals = y_test

In [68]:
model_report(svm_model)

train score: 76.08%
test score: 75.56%
cross val score: 75.51%
AUC  : 53.35%

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.98      0.86      9397
           1       0.64      0.08      0.15      3195

   micro avg       0.76      0.76      0.76     12592
   macro avg       0.70      0.53      0.50     12592
weighted avg       0.73      0.76      0.68     12592


Confusion Matrix: 1=Fatal, 0=Minor


Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,9251,146,9397
1,2931,264,3195
All,12182,410,12592


In [69]:
# Construct pipeline for support vector grid search

pipe_svm = Pipeline([('pca', PCA(n_components=26)),
            ('clf', svm.SVC(random_state=3))])

# Set grid search params
param_grid_svm = [
 # {'clf__C': [0.1, 1]  , 'clf__kernel': ['linear']},
  {'clf__C': [0.1, 1], 'clf__gamma': [0.1, 0.01], 'clf__kernel': ['rbf','linear']},
 ]

# Construct grid search
gs_svm = GridSearchCV(estimator=pipe_svm,
            param_grid=param_grid_svm,
            scoring='recall',
            cv=3, verbose=2, return_train_score = True)

# Fit using grid search
gs_svm.fit(X_train, y_train)

# Best accuracy
print('Best accuracy: %.3f' % gs_svm.best_score_)

# Best params
print('\nBest params:\n', gs_svm.best_params_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] clf__C=0.1, clf__gamma=0.1, clf__kernel=rbf .....................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ...... clf__C=0.1, clf__gamma=0.1, clf__kernel=rbf, total= 1.9min
[CV] clf__C=0.1, clf__gamma=0.1, clf__kernel=rbf .....................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.3min remaining:    0.0s


[CV] ...... clf__C=0.1, clf__gamma=0.1, clf__kernel=rbf, total= 1.8min
[CV] clf__C=0.1, clf__gamma=0.1, clf__kernel=rbf .....................
[CV] ...... clf__C=0.1, clf__gamma=0.1, clf__kernel=rbf, total= 1.8min
[CV] clf__C=0.1, clf__gamma=0.1, clf__kernel=linear ..................
[CV] ... clf__C=0.1, clf__gamma=0.1, clf__kernel=linear, total=  35.3s
[CV] clf__C=0.1, clf__gamma=0.1, clf__kernel=linear ..................
[CV] ... clf__C=0.1, clf__gamma=0.1, clf__kernel=linear, total=  35.4s
[CV] clf__C=0.1, clf__gamma=0.1, clf__kernel=linear ..................
[CV] ... clf__C=0.1, clf__gamma=0.1, clf__kernel=linear, total=  35.7s
[CV] clf__C=0.1, clf__gamma=0.01, clf__kernel=rbf ....................
[CV] ..... clf__C=0.1, clf__gamma=0.01, clf__kernel=rbf, total= 1.2min
[CV] clf__C=0.1, clf__gamma=0.01, clf__kernel=rbf ....................
[CV] ..... clf__C=0.1, clf__gamma=0.01, clf__kernel=rbf, total= 1.2min
[CV] clf__C=0.1, clf__gamma=0.01, clf__kernel=rbf ....................
[CV] .

[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed: 44.7min finished


Best accuracy: 0.084

Best params:
 {'clf__C': 1, 'clf__gamma': 0.1, 'clf__kernel': 'rbf'}


In [70]:
svm_gs_model = XGBClassifier(clf_C=1,
                             clf_gamma=0.1,
                             clf_kernel='rbf',
                             )
svm_gs_model.fit(X_train, y_train)

predictions = svm_gs_model.predict(X_test)
actuals = y_test

In [71]:
model_report(svm_gs_model)

train score: 76.13%
test score: 75.62%
cross val score: 75.96%
AUC  : 53.24%

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.99      0.86      9397
           1       0.67      0.08      0.14      3195

   micro avg       0.76      0.76      0.76     12592
   macro avg       0.71      0.53      0.50     12592
weighted avg       0.74      0.76      0.68     12592


Confusion Matrix: 1=Fatal, 0=Minor


Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,9273,124,9397
1,2946,249,3195
All,12219,373,12592


In [72]:
# #from sklearn import svm, grid_search
# def svc_param_selection(X, y, nfolds):
#     Cs = [0.001, 0.01, 0.1, 1, 10]
#     gammas = [0.001, 0.01, 0.1, 1]
#     param_grid = {'C': Cs, 'gamma' : gammas}
#     grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=nfolds)
#     grid_search.fit(X, y)
#     grid_search.best_params_
#     return grid_search.best_params_

In [73]:
# svc_param_selection(X_train, y_train, 3)

## model reports

In [74]:
# svc_param_selection(X_train, y_train, 5)

In [75]:
# def plot_feature_importances(model):
#     n_features = X.shape[1]
#     plt.figure(figsize=(8,8))
#     plt.barh(range(n_features), model.feature_importances_, align='center') 
#     plt.yticks(np.arange(n_features), X.columns.values) 
#     plt.xlabel("Feature importance")
#     plt.ylabel("Feature")

In [76]:
# for index, val in enumerate(pipelines):
#     print('%s pipeline test accuracy: %.3f' % (pipeline_names[index], val.score(X_test, y_test)))

In [77]:
#gs_rf.cv_results_

In [78]:
#gs_ab.cv_results_

In [79]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import classification_report
from sklearn.svm import SVC

In [80]:
#gs_svm.cv_results_

In [81]:
# from sklearn.linear_model import SGDClassifier
# sgd_clf = SGDClassifier(random_state=123)
# sgd_clf.fit(X_train, y_train_fatal)

In [82]:
# from sklearn.model_selection import cross_val_score
# cross_val_score(sgd_clf, X_train, y_train_fatal, cv=3, scoring="accuracy")

In [83]:
# y_scores = cross_val_predict(sgd_clf, X_train, y_train_fatal, cv=3, method="decision_function")

In [84]:
# from sklearn.metrics import precision_recall_curve
# precisions, recalls, thresholds = precision_recall_curve(y_train_fatal, y_scores)

In [85]:
# def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
#     plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
#     plt.plot(thresholds, recalls[:-1], "g--", label="Recall")

In [86]:
# plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
# plt.show()

In [87]:
# from sklearn.metrics import roc_curve
# fpr, tpr, thresholds = roc_curve(y_train_fatal, y_scores)

In [88]:
# def plot_roc_curve(fpr, tpr, label=None):
#     plt.plot(fpr, tpr, linewidth=2, label=label)
#     plt.plot([0,1], [0,1], 'k--')

In [89]:
# plot_roc_curve(fpr, tpr)

In [90]:
# from sklearn.metrics import roc_auc_score
# roc_auc_score(y_train_fatal, y_scores)

In [91]:
# from sklearn.ensemble import RandomForestClassifier
# forest_clf = RandomForestClassifier(random_state=42)
# y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_fatal, cv=3, method="predict_proba")

In [92]:
# y_scores_forest = y_probas_forest[:, 1]
# fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_fatal, y_scores_forest)

In [93]:
# plt.plot(fpr, tpr, "b:", label='SGD')
# plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
# plt.legend(loc="lower right")
# plt.show()

In [94]:
# roc_auc_score(y_train_fatal, y_scores_forest)

In [95]:
# # Import the three supervised learning models from sklearn
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.naive_bayes import GaussianNB


# clf_A = RandomForestClassifier(random_state=random_state)
# clf_B = GaussianNB()
# clf_C = SVC(random_state=random_state)

# # Calculate the number of samples for 1%, 10%, and 100% of the training data
# samples_100 = len(y_train)
# samples_10 = int(len(y_train)/10)
# samples_1 = int(len(y_train)/100)

# # Collect results on the learners
# results = {}
# for clf in [clf_A, clf_B, clf_C]:
#     clf_name = clf.__class__.__name__
#     results[clf_name] = {}
#     for i, samples in enumerate([samples_1, samples_10,   samples_100]):
#         results[clf_name][i] = \
#         y_train_pred(clf, samples, X_train, y_train, X_test, y_test)

# # Run metrics visualization for the three supervised learning models chosen
# vs.evaluate(results, accuracy, fscore)