In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Importing necessary libraries and reading data

In [None]:
#Import modules
import numpy as np
import holidays
import pandas as pd
import seaborn as sns
import pickle
import time
import timeit


import matplotlib.pyplot as plt
plt.style.use('dark_background')
%matplotlib inline

import datetime
import math
from collections import Counter

#scipy
import scipy.stats as stats
from scipy import stats
from scipy.stats import chi2_contingency

#sklearn
import sklearn
from sklearn import ensemble
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, log_loss, recall_score 
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample

#for clustering
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score

#other learners
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


#imblearn
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

#webscraping
import requests
from bs4 import BeautifulSoup
import re
import urllib
from IPython.core.display import HTML

#time series
import statsmodels.api as sm
from pylab import rcParams
import itertools
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima_model import ARIMA


#warning ignorer
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the pre-processed and transformed accident data
file = '/content/drive/MyDrive/Trial/data/Accidents/Visualized_and_manipulated.csv'
df = pd.read_csv(file, low_memory=False)
# Discard the first (unnamed) column
df.drop(columns=df.columns[0], inplace=True)
df.head()

# Machine Learning

In [None]:
# Build a separate dataframe, indexed by Accident_Index, for the modelling steps.
# `set_index` without `inplace=True` returns a NEW frame, so `df` is left
# untouched and this cell can be re-run safely. (A plain `df1 = df` only creates
# a second name for the same object, so an in-place set_index would also change
# `df` and would fail with a KeyError on the second run.)
df1 = df.set_index('Accident_Index')
df1.head()

In [None]:
df1.shape

In [None]:
df1.info()

In [None]:
# Collapse the multi-class severity label into a binary target, to handle the
# imbalanced dataset and simplify the analysis:
#   0.0 -> 'Slight', 1.0 -> every other severity value
df1['Target_Severe_Indicator'] = np.where(df1.Accident_Severity == 'Slight', 0.0, 1.0)

In [None]:
df1["Target_Severe_Indicator"].value_counts()


In [None]:
df1["Accident_Severity"].value_counts()

In [None]:
df1.head()

In [None]:
print(df1.columns)

# Converting 'Object' to 'category' dtype - Saves memory

In [None]:
# Columns that describe() summarises (by default the numeric ones) are left
# alone; every other column is re-stored as a pandas categorical.
summarised_cols = set(df1.describe().columns)
for col in df1.columns:
    if col not in summarised_cols:
        df1[col] = df1[col].astype('category')

In [None]:
# Inspect the frame whose columns were just converted (df1), so the dtypes and
# memory usage shown here reflect the category conversion.
df1.info()

# Random Sampling - rows removed at random to speed up model run times (for testing purposes only)
Because the classes of the target variable are imbalanced, stratified random sampling would be preferable, as it preserves the class proportions of the original dataset. (Stratified sampling is not carried out here, however.)

In [None]:
# Seed NumPy's global RNG so the same rows are dropped on every run
np.random.seed(150)

remove_n = 85342 #Sample size to remove from original dataset
# NOTE: this rebinds `df` to the same object as `df1` (an alias, not a copy)
df = df1
# Pick `remove_n` distinct index labels at random (no replacement);
# np.random.choice raises ValueError if the frame has fewer rows than `remove_n`
drop_indices = np.random.choice(df.index, remove_n, replace=False)
# `drop` returns a new frame; `df` / `df1` keep all rows
df_subset = df.drop(drop_indices)

In [None]:
df_subset.shape
df_subset.head()

In [None]:
# 85% to 15% distribution of target class - Proportionality of the original dataset is still maintained
df_subset['Target_Severe_Indicator'].value_counts()

# Splitting target variable from predictor variables

In [None]:
# Separate the predictors from the target.
# 'Accident_Severity' is dropped together with the target: the target column
# was derived directly from it ('Slight' -> 0, anything else -> 1), so leaving
# it among the predictors would leak the answer into every model.
df_X = df_subset.drop(columns=['Target_Severe_Indicator', 'Accident_Severity'])
df_Y = df_subset['Target_Severe_Indicator']

In [None]:
# Converting independent categorical features to Numerical by creating Dummy variables

df_X_dummy = pd.get_dummies(df_X)
#print(dataset_X_dummy.head())

In [None]:
df_X_dummy.shape

# Feature Selection

#  Applying VarianceThreshold filter

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Variance cut-off for the 0/1 dummy features.
# A binary column in which one value occurs in a share p of the rows has
# variance p * (1 - p); the value below corresponds to p = 85%, i.e. a dummy
# column where 85% or more of the rows hold the same value varies too little
# to be useful in the prediction.
thresh=(.85 * (1 - .85))

In [None]:
# Wrapper function to identify low variance features and remove them from the dataframe 

def get_low_variance_columns(dframe=None, columns=None,
                             skip_columns=None, thresh=0.0,
                             autoremove=False):
    """Identify, and optionally drop, the low-variance columns of a dataframe.

    A ``VarianceThreshold(threshold=thresh)`` filter is fitted on every column
    of ``dframe`` that is not listed in ``skip_columns``.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Frame to inspect.
    columns : object, optional
        Unused; accepted only so existing positional calls keep working.
    skip_columns : list-like of column labels, optional
        Columns excluded from the variance check and always kept.
        ``None`` (the default) means "skip nothing".
    thresh : float, default 0.0
        Threshold handed to ``VarianceThreshold``.
    autoremove : bool, default False
        When True, return a new frame without the low-variance columns;
        otherwise ``dframe`` is returned unchanged.

    Returns
    -------
    tuple
        ``(frame, removed_features)``. ``frame`` is the reduced frame when
        ``autoremove`` is True and the removal succeeded, else the original
        ``dframe``. The reduced frame keeps the original row index and the
        original relative column order; the checked columns come back with
        whatever dtype ``VarianceThreshold.transform`` returns.
        ``removed_features`` is the list of low-variance column names (empty
        if the analysis failed before they could be determined).

    Notes
    -----
    Errors are reported with ``print`` rather than raised (best-effort
    behaviour); in that case the input frame is returned untouched.
    """
    skip_columns = [] if skip_columns is None else list(skip_columns)
    # Defined up-front so the final `return` is valid even when the body fails
    removed_features = []

    try:
        # get list of all the original df columns
        all_columns = dframe.columns

        # columns that are actually checked for variance
        remaining_columns = all_columns.drop(skip_columns)

        # get dataframe values
        X = dframe.loc[:, remaining_columns].values

        # fit the variance filter
        vt = VarianceThreshold(threshold=thresh)
        vt.fit(X)

        # boolean mask over `remaining_columns`: True for features being kept
        keep_mask = vt.get_support()
        feature_names = list(remaining_columns[keep_mask])

        # get the columns to be removed
        removed_features = list(np.setdiff1d(remaining_columns,
                                             feature_names))
        print("Found {0} low-variance columns."
              .format(len(removed_features)))

        if autoremove:
            print("Removing low-variance features.")
            X_removed = vt.transform(X)

            print("Reassembling the dataframe (with low-variance "
                  "features removed).")
            # keep the original row index so the result still lines up with
            # any target series taken from the same source frame
            reduced = pd.DataFrame(data=X_removed,
                                   columns=feature_names,
                                   index=dframe.index)

            # add back the `skip_columns`, then restore the original
            # relative column order
            for column in skip_columns:
                reduced[column] = dframe[column].values
            kept = set(feature_names).union(skip_columns)
            final_order = [column for column in all_columns
                           if column in kept]

            # only rebind once everything succeeded, so a failure above
            # never returns a half-built frame
            dframe = reduced.loc[:, final_order]
            print("Succesfully removed low-variance columns.")

        # do not remove columns
        else:
            print("No changes have been made to the dataframe.")

    except Exception as e:
        print(e)
        print("Could not remove low-variance features. Something "
              "went wrong.")

    return dframe, removed_features

In [None]:
# retrieve the new dataframe (low-variance features removed) and the names of the removed columns
df_X_new, low_var_col = get_low_variance_columns(df_X_dummy,[],[],thresh, True) 
# the last positional argument is `autoremove`: True removes the low-variance columns

In [None]:
df_X_new.shape

In [None]:
df_X_new.head()

 **Normalizing data** - adjusting values measured on different scales to a notionally common scale (between 0 - 1)

In [None]:
df_X_normalized=(df_X_new-df_X_new.min())/(df_X_new.max()-df_X_new.min())

In [None]:
df_X_normalized.head()

In [None]:
df_X=df_X_normalized.round(3) 

In [None]:
df_X.head(3)

In [None]:
df_Y.value_counts()

In [None]:
# 80 train -20 test split
X_train, X_test, y_train, y_test = train_test_split(df_X, df_Y, test_size=0.2, random_state=42)

# Supervised Learning with classifier machine learning algorithms

In [None]:
#confusion matrix plot function
def cm_plot(var, class_names=('Not Severe', 'Severe')):
    """Plot a 2x2 confusion matrix with TN/FP/FN/TP annotations.

    Parameters
    ----------
    var : 2x2 array-like
        Confusion matrix indexed as ``var[actual][predicted]`` (the layout
        returned by ``sklearn.metrics.confusion_matrix``).
    class_names : sequence of two str, optional
        Tick labels for the negative and positive class, in that order.
        Defaults to the severity labels used in this notebook.
    """
    plt.figure(figsize=(15,5))
    plt.style.use('dark_background')
    plt.clf()
    plt.imshow(var, interpolation='nearest', cmap='tab20')
    plt.title('Confusion Matrix')
    plt.ylabel('Actual\n')
    plt.xlabel('Predicted\n')
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names)
    plt.yticks(tick_marks, class_names)

    # cell captions, laid out like the matrix itself: rows = actual, cols = predicted
    s = [['TN','FP'], ['FN', 'TP']]
    for i in range(2):
        for j in range(2):
            plt.text(j, i, str(s[i][j])+"="+str(var[i][j]), horizontalalignment='center',
                     color='black')
    plt.show()

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
plt.style.use('dark_background')

In [None]:
# Fit several classifiers on the same train/test split and compare them
classifiers = [
    BalancedBaggingClassifier(max_features=df_X.shape[1], n_estimators=500, replacement=True,
                              sampling_strategy='majority', random_state=42),
    GaussianNB(),
    SVC(kernel='linear',
        class_weight='balanced',  # penalize
        probability=True)]

# One row per classifier is collected here and turned into a dataframe once,
# after the loop (DataFrame.append was removed in pandas 2.0 and copies the
# whole frame on every call).
res_cols = ["Classifier", "Accuracy", "precision", "Recall", "Roc Auc"]
result_rows = []

for clf_0 in classifiers:
    clf_0.fit(X_train, y_train)
    name = clf_0.__class__.__name__

    print("\n"*3)
    print(name, "Results:")
    print('~'*40)

    y_pred = clf_0.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy: {:.4%}".format(acc))
    cv = np.mean(cross_val_score(clf_0, X_train, y_train, cv=3))
    print("Cross validation scores:", cv)

    # class probabilities on the TEST set (column 1 = positive class)
    test_proba = clf_0.predict_proba(X_test)
    logloss = log_loss(y_test, test_proba)
    print("Log Loss: {}".format(logloss))

    cm = confusion_matrix(y_test, y_pred)
    cm_plot(cm)

    # FPR and Error Rate setup
    tn, fp, fn, tp = cm.ravel()
    fpr = fp/(tn+fp)
    ers = 1-acc
    rec = recall_score(y_test, y_pred)
    # ROC AUC is a ranking metric: score it on probabilities, not hard labels
    roc = roc_auc_score(y_test, test_proba[:, 1])
    precision = precision_score(y_test, y_pred)
    f1s = f1_score(y_test, y_pred)

    result_rows.append([name, round(acc*100,3), round(precision*100,3),
                        round(rec*100,3), round(roc*100,3)])

results = pd.DataFrame(result_rows, columns=res_cols)
print("*"*40)

### Penalize Algorithms (Cost-Sensitive Training) 

During training, we can use the argument class_weight='balanced'  to penalize mistakes on the minority class by an amount proportional to how under-represented it is.

We also want to include the argument probability=True  if we want to enable probability estimates for SVM algorithms.

Let's train a model using Penalized-SVM on the original imbalanced dataset:

In [None]:
from sklearn.svm import SVC
# Train model
clf_2 = SVC(kernel='linear',
            class_weight='balanced', # penalize
            probability=True)
clf_2.fit(X_train, y_train)
# Label the result row with THIS model's class name
name = clf_2.__class__.__name__

res_cols=["Classifier", "Accuracy", "Log Loss","precision", "Cross Val", "Recall", "Roc Auc","F1",
          "False Positive Rate", "Error Rate"]
print("\n"*3)
print('~'*40)
pred_y_2 = clf_2.predict(X_test)
acc = accuracy_score(y_test, pred_y_2)
print("Accuracy: {:.4%}".format(acc))

cv = np.mean(cross_val_score(clf_2, X_train, y_train, cv=3))
print("Cross validation scores:",cv)

# Class probabilities on the test set; computed once and reused below
train_predictions = clf_2.predict_proba(X_test)
logloss = log_loss(y_test, train_predictions)
print("Log Loss: {}".format(logloss))

cm = confusion_matrix(y_test, pred_y_2)
cm_plot(cm)

#FPR and Error Rate setup
tn, fp, fn, tp = cm.ravel()
fpr = fp/(tn+fp)
ers = 1-acc
rec = recall_score(y_test, pred_y_2)
prob_y_2 = train_predictions[:, 1]  # probability of the positive class
roc = roc_auc_score(y_test, prob_y_2)
print(roc)
prec = precision_score(y_test, pred_y_2)
f1s = f1_score(y_test, pred_y_2)
print("*"*40)

# Keyed by column name so each value lands under its own heading (a positional
# list previously put the cross-validation score under "precision" and the
# precision under "Cross Val").
row = {"Classifier": name,
       "Accuracy": round(acc*100,3),
       "Log Loss": round(logloss,3),
       "precision": round(prec*100,3),
       "Cross Val": round(cv*100,3),
       "Recall": round(rec*100,3),
       "Roc Auc": round(roc*100,3),
       "F1": round(f1s*100,3),
       "False Positive Rate": round(fpr*100,3),
       "Error Rate": round(ers*100,3)}
results_final10 = pd.DataFrame([row], columns=res_cols)
results10 = results_final10.copy()
print("Results Shape",results10.shape)
results10.head(10)

In [None]:
from sklearn.metrics import roc_curve

# probability of the positive class (column 1 of predict_proba) on the test set
prob_y_13 = clf_2.predict_proba(X_test)[:, 1]
# calculate AUC
auc = roc_auc_score(y_test, prob_y_13)
print('AUC: %.3f' % auc)
# calculate roc curve
fpr, tpr, thresholds = roc_curve(y_test, prob_y_13)
# plot no skill
plt.plot([0, 1], [0, 1], linestyle='--', label='No skill')
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.', label='Penalized SVM (AUC = %.3f)' % auc)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve - Penalized SVM')
plt.legend()
# show the plot
plt.show();

### Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

clf_4 = GaussianNB()
clf_4.fit(X_train,y_train)
# Label the result row with THIS model's class name
name = clf_4.__class__.__name__

res_cols=["Classifier", "Accuracy", "Log Loss","precision", "Cross Val", "Recall", "Roc Auc","F1",
          "False Positive Rate", "Error Rate"]
print("\n"*3)
print('~'*40)
pred_y_4 = clf_4.predict(X_test)
acc = accuracy_score(y_test, pred_y_4)
print("Accuracy: {:.4%}".format(acc))

cv = np.mean(cross_val_score(clf_4,X_train,y_train, cv=3))
print("Cross validation scores:",cv)

# Class probabilities on the test set; computed once and reused below
train_predictions = clf_4.predict_proba(X_test)
logloss = log_loss(y_test, train_predictions)
print("Log Loss: {}".format(logloss))

cm = confusion_matrix(y_test, pred_y_4)
cm_plot(cm)

#FPR and Error Rate setup
tn, fp, fn, tp = cm.ravel()
fpr = fp/(tn+fp)
ers = 1-acc
rec = recall_score(y_test, pred_y_4)
prob_y_4 = train_predictions[:, 1]  # probability of the positive class
roc = roc_auc_score(y_test, prob_y_4)
print(roc)
prec = precision_score(y_test, pred_y_4)
f1s = f1_score(y_test,pred_y_4)
print("*"*40)

# Keyed by column name so each value lands under its own heading (a positional
# list previously put the cross-validation score under "precision" and the
# precision under "Cross Val").
row = {"Classifier": name,
       "Accuracy": round(acc*100,3),
       "Log Loss": round(logloss,3),
       "precision": round(prec*100,3),
       "Cross Val": round(cv*100,3),
       "Recall": round(rec*100,3),
       "Roc Auc": round(roc*100,3),
       "F1": round(f1s*100,3),
       "False Positive Rate": round(fpr*100,3),
       "Error Rate": round(ers*100,3)}
results_final11 = pd.DataFrame([row], columns=res_cols)
results11 = results_final11.copy()
print("Results Shape",results11.shape)
results11.head(10)

In [None]:
from sklearn.metrics import roc_curve

# probability of the positive class (column 1 of predict_proba) on the test set
prob_y_13 = clf_4.predict_proba(X_test)[:, 1]
# calculate AUC
auc = roc_auc_score(y_test, prob_y_13)
print('AUC: %.3f' % auc)
# calculate roc curve
fpr, tpr, thresholds = roc_curve(y_test, prob_y_13)
# plot no skill
plt.plot([0, 1], [0, 1], linestyle='--', label='No skill')
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.', label='Gaussian Naive Bayes (AUC = %.3f)' % auc)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve - Gaussian Naive Bayes')
plt.legend()
# show the plot
plt.show();

#  Balanced Bagging Classifier
For the following Balanced algorithms from imblearn we will be using the standard testing and training sets (X_train, X_test, y_train, y_test) and will allow the algorithms to do the resampling.<br> <br>For the sampling_strategy, we will be using majority as the solution.<br><br>'majority': resample only the majority class

We will then gather the results of some scoring metrics (Accuracy, Log Loss, Cross Validation, Recall, Roc Auc, F1, False Positive Rate, Error Rate), and put those scores into a dataframe.

In [None]:
clf_5 = BalancedBaggingClassifier(max_features=df_X.shape[1], n_estimators=500, replacement=True,
                              sampling_strategy='majority', random_state=42)
clf_5.fit(X_train,y_train)
# Label the result row with THIS model's class name
name = clf_5.__class__.__name__

res_cols=["Classifier", "Accuracy", "Log Loss","precision", "Cross Val", "Recall", "Roc Auc","F1",
          "False Positive Rate", "Error Rate"]
print("\n"*3)
print('~'*40)
pred_y_5 = clf_5.predict(X_test)
acc = accuracy_score(y_test, pred_y_5)
print("Accuracy: {:.4%}".format(acc))

cv = np.mean(cross_val_score(clf_5,X_train,y_train, cv=3))
print("Cross validation scores:",cv)

# Class probabilities on the test set; computed once and reused below
train_predictions = clf_5.predict_proba(X_test)
logloss = log_loss(y_test, train_predictions)
print("Log Loss: {}".format(logloss))

cm = confusion_matrix(y_test, pred_y_5)
cm_plot(cm)

#FPR and Error Rate setup
tn, fp, fn, tp = cm.ravel()
fpr = fp/(tn+fp)
ers = 1-acc
rec = recall_score(y_test, pred_y_5)
# Positive-class probabilities of clf_5 itself (the previous code scored the
# GaussianNB model `clf_4` here, so the reported ROC AUC belonged to another model)
prob_y_5 = train_predictions[:, 1]
roc = roc_auc_score(y_test, prob_y_5)
print(roc)
prec = precision_score(y_test, pred_y_5)
f1s = f1_score(y_test,pred_y_5)
print("*"*40)

# Keyed by column name so each value lands under its own heading (a positional
# list previously put the cross-validation score under "precision" and the
# precision under "Cross Val").
row = {"Classifier": name,
       "Accuracy": round(acc*100,3),
       "Log Loss": round(logloss,3),
       "precision": round(prec*100,3),
       "Cross Val": round(cv*100,3),
       "Recall": round(rec*100,3),
       "Roc Auc": round(roc*100,3),
       "F1": round(f1s*100,3),
       "False Positive Rate": round(fpr*100,3),
       "Error Rate": round(ers*100,3)}
results_final = pd.DataFrame([row], columns=res_cols)
results = results_final.copy()
print("Results Shape",results.shape)
results.head(10)

In [None]:
from sklearn.metrics import roc_curve

# probability of the positive class (column 1 of predict_proba) on the test set
prob_y_13 = clf_5.predict_proba(X_test)[:, 1]
# calculate AUC
auc = roc_auc_score(y_test, prob_y_13)
print('AUC: %.3f' % auc)
# calculate roc curve
fpr, tpr, thresholds = roc_curve(y_test, prob_y_13)
# plot no skill
plt.plot([0, 1], [0, 1], linestyle='--', label='No skill')
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.', label='Balanced Bagging (AUC = %.3f)' % auc)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve - Balanced Bagging Classifier')
plt.legend()
# show the plot
plt.show();

# Choice
Based on the visualizations and scores above, the Balanced Bagging Classifier from imblearn is the algorithm of choice for this data. While some of the scores were close, the Balanced Bagging Classifier had higher scores for Accuracy, Cross Validation, and Specificity. It also had the lowest Error Rate and False Positive Rate of the group.

In [None]:

# Wall-clock timer for the whole cell (fit, predict and plotting)
start_res_bbag_w_lgbm = time.time()

# Balanced Bagging ensemble with a LightGBM learner as its base estimator
lgbm_params = dict(learning_rate=0.03,
                   max_depth=40,
                   min_data_in_leaf=10,
                   n_estimators=500,
                   num_leaves=50,
                   random_state=42)
res_bbag_w_lgbm = BalancedBaggingClassifier(
    base_estimator=LGBMClassifier(**lgbm_params),
    max_features=df_X.shape[1],
    n_estimators=500,
    replacement=True,
    random_state=42,
)
res_bbag_w_lgbm.fit(X_train, y_train)
pred_res_bbag_w_lgbm = res_bbag_w_lgbm.predict(X_test)

# Confusion matrix, wrapped in a labelled dataframe for the heatmap
severity_labels = ['Not Severe', 'Severe']
res_bbag_w_lgbm_cm = confusion_matrix(y_test, pred_res_bbag_w_lgbm)
res_bbag_w_lgbm_cm_df = pd.DataFrame(res_bbag_w_lgbm_cm,
                                     index=severity_labels,
                                     columns=severity_labels)

plt.figure(figsize=(15,5))
plt.style.use('dark_background')
sns.heatmap(res_bbag_w_lgbm_cm_df, annot=True, fmt="d", cmap='viridis',
            linecolor='black', linewidths=1)
heatmap_title = 'Balanced Bagging with LightGBM Accuracy: {0:.2f}%'.format(
    accuracy_score(y_test, pred_res_bbag_w_lgbm)*100)
plt.title(heatmap_title, fontsize=15)
plt.ylabel('Actual\n')
plt.xlabel('Predicted\n')
plt.show()
print('\n')

# Report the elapsed time
end_res_bbag_w_lgbm = time.time()
print("\n Balanced Bagging with LightGBM Time: ", end_res_bbag_w_lgbm - start_res_bbag_w_lgbm)


In [None]:
# Extract true negatives, false positives, false negatives and true positives
tn, fp, fn, tp = confusion_matrix(y_test,pred_res_bbag_w_lgbm).ravel()

# All four rates below are expressed as percentages
accuracy = accuracy_score(y_test,pred_res_bbag_w_lgbm)*100
specificity = tn/(tn+fp)*100
fpr = fp/(tn+fp)*100
ers = 100-accuracy

# Class probabilities on the test set (column 1 = positive class)
train_predictions2 = res_bbag_w_lgbm.predict_proba(X_test)

print(" Balanced Bagging Classifier with LightGBM Specificity Score: {0:.2f}%".format(specificity))
print(" Balanced Bagging Classifier with LightGBM False Positive Rate Score: {0:.2f}%".format(fpr))
print(" Balanced Bagging Classifier with LightGBM Error Rate Score: {0:.2f}%".format(ers))

#Check scores
print("Balanced Bagging Classifier with LightGBM Accuracy Score: {:0.2f}%"
      .format(accuracy))
print("Balanced Bagging Classifier with LightGBM F1 Score: {:0.2f}%"
      .format(f1_score(y_test, pred_res_bbag_w_lgbm,average="macro")*100))
print("Balanced Bagging Classifier with LightGBM Precision Score: {:0.2f}%"
      .format(precision_score(y_test, pred_res_bbag_w_lgbm, average="macro")*100))
print("Balanced Bagging Classifier with LightGBM Recall Score: {:0.2f}%"
      .format(recall_score(y_test, pred_res_bbag_w_lgbm, average="macro")*100))
# ROC AUC is a ranking metric: score it on the positive-class probabilities,
# not on the hard 0/1 predictions
print("Balanced Bagging Classifier with LightGBM Roc Auc Score: {0:.2f}%"
      .format(roc_auc_score(y_test, train_predictions2[:, 1])*100))
# Log loss is unbounded above, so it is reported as a raw value, not a percentage
print("Balanced Bagging Classifier with LightGBM Log Loss: {0:.4f}"
      .format(log_loss(y_test, train_predictions2)))

In [None]:
from sklearn.metrics import roc_curve

# probability of the positive class (column 1 of predict_proba) on the test set
prob_y_14 = res_bbag_w_lgbm.predict_proba(X_test)[:, 1]
# calculate AUC
auc = roc_auc_score(y_test, prob_y_14)
print('AUC: %.3f' % auc)
# calculate roc curve
fpr, tpr, thresholds = roc_curve(y_test, prob_y_14)
# plot no skill
plt.plot([0, 1], [0, 1], linestyle='--', label='No skill')
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.', label='Balanced Bagging + LightGBM (AUC = %.3f)' % auc)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve - Balanced Bagging with LightGBM')
plt.legend()
# show the plot
plt.show();