In [None]:
# Colab only: mount Google Drive so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install kmodes

In [None]:
#Import modules
import numpy as np
import holidays
import pandas as pd
import seaborn as sns
import pickle
import time
import timeit


import matplotlib.pyplot as plt

%matplotlib inline

import datetime
import math
from collections import Counter

#scipy
import scipy.stats as stats
from scipy import stats
from scipy.stats import chi2_contingency

#sklearn
import sklearn
from sklearn import ensemble
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, log_loss, recall_score 
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample
import joblib
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             fbeta_score, make_scorer, classification_report, confusion_matrix)

#for clustering
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score

#other learners
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from kmodes.kmodes import KModes

#imblearn
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

#webscraping
import requests
from bs4 import BeautifulSoup
import re
import urllib
from IPython.core.display import HTML

#time series
import statsmodels.api as sm
from pylab import rcParams
import itertools
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima_model import ARIMA


#warning ignorer
import warnings
warnings.filterwarnings("ignore")

from IPython.display import Image

# Undersampling and Oversampling for class imbalance
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# Classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the pre-processed and transformed accident data from Drive.
file = '/content/drive/MyDrive/Trial/data/Accidents/Visualized_and_manipulated.csv'
df = pd.read_csv(file, low_memory=False)
# Discard the first column by position (the "unnamed" column; presumably a
# saved row index - verify against the CSV).
df = df.drop(columns=df.columns[0])
df.head()

In [None]:
# Build a separate, index-keyed frame for modelling.
# `set_index` without `inplace` returns a NEW DataFrame, so `df` is left
# untouched and re-running this cell does not fail on an already-consumed
# 'Accident_Index' column (a plain `df1 = df` would only alias the same object).
df1 = df.set_index('Accident_Index')
df1.head()

In [None]:
# (rows, columns) once Accident_Index has become the index.
df1.shape

In [None]:
# Column dtypes and non-null counts before the binary target is derived.
df1.info()

In [None]:
# Collapse the multi-class severity label into a binary target to handle the
# imbalanced dataset and simplify analysis:
#   0 -> 'Slight' accidents, 1 -> every other severity.
is_slight = df1['Accident_Severity'] == 'Slight'
df1.loc[~is_slight, 'Target_Severe_Indicator'] = 1
df1.loc[is_slight, 'Target_Severe_Indicator'] = 0

In [None]:
# Class counts of the new binary target.
df1["Target_Severe_Indicator"].value_counts()


In [None]:
# Class counts of the original multi-class label, for comparison with the binary target.
df1["Accident_Severity"].value_counts()

In [None]:
# Spot-check the frame now that Target_Severe_Indicator has been added.
df1.head()

In [None]:
# List every column name currently in df1.
print(df1.columns)

In [None]:
# Cast every column that describe() does not summarise to the `category`
# dtype (by default describe() reports only numeric columns of a mixed frame,
# per the pandas documentation).
numeric_cols = set(df1.describe().columns)
for col in df1.columns:
    if col not in numeric_cols:
        df1[col] = df1[col].astype('category')

In [None]:
# Inspect the frame that was just converted (df1) directly rather than through
# another variable name, so the dtypes shown are the ones produced above.
df1.info()

# Random Sampling - removed rows at random to speed up model run times (for testing purposes only) 
Given the class imbalance in the target variable, stratified random sampling would be worth using here to preserve the class proportions of the original dataset (stratified sampling is not carried out in this notebook, however).

In [None]:
# Seed NumPy's global RNG so the same rows are selected on every fresh run.
np.random.seed(150)

remove_n = 85342 # Number of rows to remove from the original dataset
# NOTE: plain assignment - `df` becomes another name for `df1`, not a copy.
df = df1
# Sample `remove_n` entries of the index without replacement.
drop_indices = np.random.choice(df.index, remove_n, replace=False)
# drop() returns a new frame; `df` / `df1` keep all rows.
# NOTE(review): drop() removes every row carrying a selected label, so exactly
# `remove_n` rows are removed only if Accident_Index is unique - confirm.
df_subset = df.drop(drop_indices)

In [None]:
# Only the last expression of a cell is rendered, so print the shape
# explicitly; otherwise it is evaluated and silently discarded.
print(df_subset.shape)
df_subset.head()

In [None]:
# Class counts of the target in the down-sampled data. The author's note
# reports roughly an 85% / 15% split, in line with the original dataset -
# check it against the counts printed here.
df_subset['Target_Severe_Indicator'].value_counts()

# Splitting target variable from predictor variables

In [None]:
# Predictors: remove the binary target AND 'Accident_Severity', the column the
# target was derived from. Leaving the severity label in would one-hot encode
# the answer into the feature matrix (target leakage).
df_X = df_subset.drop(columns=['Target_Severe_Indicator', 'Accident_Severity'])
# Binary target: 1 = not 'Slight', 0 = 'Slight'.
df_Y = df_subset['Target_Severe_Indicator']

In [None]:
# Convert the categorical predictors to numeric dummy (one-hot) columns.
# With no `columns` argument, get_dummies encodes object/category columns and
# leaves the remaining columns as they are (per the pandas documentation).

df_X_dummy = pd.get_dummies(df_X)

In [None]:
# (rows, columns) after dummy encoding.
df_X_dummy.shape

# Feature Selection


# Applying VarianceThreshold filter

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Variance cut-off p * (1 - p) with p = 0.85: the variance of a 0/1 column in
# which 85% of the rows share the same value. Columns that vary less than this
# are treated as too uniform to be useful for prediction.
thresh=(.85 * (1 - .85))

In [None]:
# Wrapper function to identify low variance features and remove them from the dataframe

def get_low_variance_columns(dframe=None, columns=None,
                             skip_columns=None, thresh=0.0,
                             autoremove=False):
    """Report, and optionally drop, the low-variance columns of ``dframe``.

    The non-skipped columns are fitted with ``VarianceThreshold(thresh)``;
    columns it does not select are reported as low-variance.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Frame to inspect.
    columns : any
        Not read by this function; kept so existing positional calls
        continue to work.
    skip_columns : label or list of labels, optional
        Columns excluded from the variance check and always retained.
        ``None`` is treated as "no columns to skip".
    thresh : float
        Threshold handed to ``VarianceThreshold``.
    autoremove : bool
        When True a new frame without the low-variance columns is returned;
        when False ``dframe`` is returned unchanged.

    Returns
    -------
    tuple
        ``(frame, removed_features)``. ``removed_features`` is the sorted list
        of low-variance column names. If anything fails, the error is printed
        and the original ``dframe`` is returned together with whatever had
        been found so far (an empty list if the failure happened early).
    """
    # Bound before the try so the final return can never hit an unbound name.
    removed_features = []

    try:
        # Normalise `skip_columns` to a list (None -> nothing skipped).
        if skip_columns is None:
            skip_columns = []
        elif isinstance(skip_columns, str):
            skip_columns = [skip_columns]
        else:
            skip_columns = list(skip_columns)

        # get list of all the original df columns
        all_columns = dframe.columns

        # remove `skip_columns`
        remaining_columns = all_columns.drop(skip_columns)

        # get dataframe values
        X = dframe.loc[:, remaining_columns].values

        # fit the variance filter to the data
        vt = VarianceThreshold(threshold=thresh)
        vt.fit(X)

        # positions of the features that are being kept -> their names
        feature_indices = vt.get_support(indices=True)
        feature_names = list(remaining_columns[feature_indices])

        # get the columns to be removed
        removed_features = list(np.setdiff1d(remaining_columns,
                                             feature_names))
        print("Found {0} low-variance columns."
              .format(len(removed_features)))

        if autoremove:
            print("Removing low-variance features.")
            X_removed = vt.transform(X)

            print("Reassembling the dataframe (with low-variance "
                  "features removed).")
            # Keep the caller's row index so the result stays aligned with
            # any target series taken from the same frame.
            rebuilt = pd.DataFrame(data=X_removed,
                                   columns=feature_names,
                                   index=dframe.index)

            if skip_columns:
                # Add the skipped columns back, then restore the original
                # relative column order.
                for column in skip_columns:
                    rebuilt[column] = dframe[column].values
                retained = set(feature_names).union(skip_columns)
                ordered = [column for column in all_columns
                           if column in retained]
                rebuilt = rebuilt.loc[:, ordered]

            # Rebind only once the rebuild fully succeeded, so a failure above
            # can never hand back a half-assembled frame.
            dframe = rebuilt
            print("Succesfully removed low-variance columns.")

        # do not remove columns
        else:
            print("No changes have been made to the dataframe.")

    except Exception as e:
        # Deliberate best-effort: report the problem and fall through to
        # return the unmodified input.
        print(e)
        print("Could not remove low-variance features. Something "
              "went wrong.")

    return dframe, removed_features

In [None]:
# Retrieve the new dataframe with the low-variance features removed.
# Positional arguments: dframe, columns (not read by the function),
# skip_columns (none), thresh, autoremove.
df_X_new, low_var_col = get_low_variance_columns(df_X_dummy,[],[],thresh, True) 
# autoremove=True removes the low-variance columns; False only reports them.

In [None]:
# (rows, columns) after the variance filter.
df_X_new.shape

In [None]:
# Preview the retained features.
df_X_new.head()

**Normalizing data** - adjusting values measured on different scales to a notionally common scale (between 0 - 1)

In [None]:
# Min-max scale each column to the 0-1 range: (x - min) / (max - min).
# The minima and ranges are taken over every row, i.e. before the train/test
# split that happens later in the notebook.
col_min = df_X_new.min()
col_range = df_X_new.max() - col_min
df_X_normalized = (df_X_new - col_min) / col_range

In [None]:
# Preview the scaled features.
df_X_normalized.head()

In [None]:
# Round the scaled features to 3 decimals.
# NOTE: this rebinds `df_X`, which until now held the un-encoded predictors.
df_X=df_X_normalized.round(3) 

In [None]:
# Preview the final feature matrix.
df_X.head(3)

In [None]:
# Class counts of the target that goes into the train/test split.
df_Y.value_counts()

In [None]:
from sklearn.model_selection import KFold,GridSearchCV

In [None]:
# 80% train / 20% test split.
# `stratify=df_Y` keeps the class ratio of the imbalanced target the same in
# both partitions; `random_state` makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_Y, test_size=0.2, random_state=42, stratify=df_Y)

In [None]:
# Shapes of the training features and training target.
X_train.shape, y_train.shape


In [None]:
# Shapes of the test features and test target.
X_test.shape, y_test.shape

In [None]:
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix  

In [None]:
def _print_classification_metrics(header, y_true, y_pred):
    """Print accuracy, classification report and confusion matrix for one set of predictions."""
    print(header)
    print("accuracy score: {0:.4f}\n".format(accuracy_score(y_true, y_pred)))
    print("Classification Report: \n {}\n".format(classification_report(y_true, y_pred)))
    print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_true, y_pred)))


def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    """Print the accuracy score, classification report and confusion matrix of a classifier.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict``.
    X_train, y_train : training features and labels.
    X_test, y_test : test features and labels.
    train : bool, default True
        Truthy  -> report on the training data, then print the mean and
                   standard deviation of a 10-fold cross-validated accuracy
                   computed with ``cross_val_score`` on the training data.
        Falsy   -> report on the test data only.

    Returns
    -------
    None. Everything is written to stdout.
    """
    if train:
        # Predict once and reuse the result for all three metrics.
        _print_classification_metrics("Train Result:\n", y_train, clf.predict(X_train))

        res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
    else:
        _print_classification_metrics("Test Result:\n", y_test, clf.predict(X_test))

# Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes with default settings, fitted on the training split.
gnv_clf = GaussianNB()
gnv_clf.fit(X_train, y_train)

In [None]:
# Training-set metrics plus 10-fold cross-validated accuracy.
print_score(gnv_clf, X_train, y_train, X_test, y_test, train=True)

In [None]:
# Test-set metrics.
print_score(gnv_clf, X_train, y_train, X_test, y_test, train=False)

# Support Vector Machine

In [None]:
from sklearn.svm import SVC
# Linear-kernel SVM fitted on the training split.
svm_clf = SVC(kernel='linear',
            class_weight='balanced', # penalize: weight classes inversely to their frequency
            probability=True, # enable probability estimates
            random_state=42) # seed the shuffling used for the probability estimates
svm_clf.fit(X_train, y_train)


In [None]:
# Training-set metrics plus 10-fold cross-validated accuracy.
print_score(svm_clf, X_train, y_train, X_test, y_test, train=True)

In [None]:
# Test-set metrics.
print_score(svm_clf, X_train, y_train, X_test, y_test, train=False)

# Balanced Bagging

In [None]:
# Balanced bagging ensemble of 500 estimators, fitted on the training split.
# - max_features is the full column count of df_X, so each estimator is given
#   as many features as the matrix has.
# - sampling_strategy='majority' with replacement=True controls how each
#   bootstrap is re-balanced (only the majority class is resampled, per the
#   imbalanced-learn documentation).
# - No base estimator is passed, so the library default is used.
bbc_clf=BalancedBaggingClassifier(max_features=df_X.shape[1], n_estimators=500, replacement=True,
                              sampling_strategy='majority', random_state=42)
bbc_clf.fit(X_train,y_train)

In [None]:
# Training-set metrics plus 10-fold cross-validated accuracy.
print_score(bbc_clf, X_train, y_train, X_test, y_test, train=True)

In [None]:
# Test-set metrics.
print_score(bbc_clf, X_train, y_train, X_test, y_test, train=False)

# Balanced Bagging + LGBM

In [None]:
# Record the wall-clock start time of the fit.
# NOTE(review): no elapsed time is computed from this value in the cells
# visible here - confirm it is used further down.
start_res_bbag_w_lgbm_clf = time.time()

# Balanced Bagging Classifier wrapping a LightGBM base learner:
# 500 bagging members, each an LGBMClassifier with 500 boosting rounds.
# NOTE(review): newer imbalanced-learn releases rename `base_estimator` to
# `estimator` - confirm against the installed version.
res_bbag_w_lgbm_clf = BalancedBaggingClassifier(base_estimator=LGBMClassifier(learning_rate =0.03, 
                                                                          max_depth=40, 
                                                                          min_data_in_leaf=10,
                                                                          n_estimators=500, 
                                                                          num_leaves=50, 
                                                                          random_state = 42), 
                                            max_features=df_X.shape[1], n_estimators=500, 
                                            replacement=True,
                                            random_state=42)
res_bbag_w_lgbm_clf.fit(X_train, y_train)

In [None]:
# Training-set metrics plus 10-fold cross-validated accuracy.
print_score(res_bbag_w_lgbm_clf, X_train, y_train, X_test, y_test, train=True)

In [None]:
# Test-set metrics.
print_score(res_bbag_w_lgbm_clf, X_train, y_train, X_test, y_test, train=False)