In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the preprocessed accident/vehicle table from its parquet file and
# preview the first rows as a quick sanity check of the columns.
parquet_path = "accident_vehicle_preprocessed.parquet"
df = pd.read_parquet(parquet_path)
df.head()

Unnamed: 0,Age_of_Vehicle,Driver_Home_Area_Type,Driver_IMD_Decile,Engine_Capacity_.CC.,Journey_Purpose_of_Driver,Junction_Location,make,model,Propulsion_Code,Sex_of_Driver,...,Road_Type,Speed_limit,Urban_or_Rural_Area,Weather_Conditions,Year_y,InScotland,Hour,Day,Month,Age_of_Driver
0,3.0,0,5.268217,8268.0,1,6,93,8118,5,1,...,0,30,1,1,2005,0,17,5,1,45.0
1,5.0,3,3.0,8300.0,1,8,93,8118,5,1,...,3,30,1,1,2005,0,0,6,1,35.0
2,10.0,0,5.268217,85.0,2,8,140,8118,8,1,...,3,30,1,1,2005,0,21,10,1,55.0
3,6.402357,0,5.268217,2142.73833,2,0,15,3699,0,1,...,3,30,1,1,2005,0,20,13,1,35.0
4,4.0,3,6.0,4266.0,2,0,240,12714,8,1,...,3,30,1,1,2005,0,20,13,1,45.0


In [3]:
# Work on an independent (deep) copy so the frame loaded from disk stays untouched.
df2 = df.copy(deep=True)

In [4]:
# Fold severity code 2 into code 1, leaving a two-class (0 vs 1) target.
df2['Accident_Severity'] = df2['Accident_Severity'].replace({2: 1})

In [5]:
# Class balance of the recoded target.
severity_counts = df2['Accident_Severity'].value_counts()
severity_counts

0    420767
1     65757
Name: Accident_Severity, dtype: int64

In [6]:
# Target (y) is the severity label; every remaining column is a feature (X).
y = df2['Accident_Severity']
X = df2.drop(columns=['Accident_Severity'])

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
# across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Model Implementation: Random Forest

In [8]:
# Baseline: random forest trained on the original (imbalanced) training split.

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Pin the forest's seed. The train/test split is already seeded, so the
# unseeded forest was the only reason the reported metrics changed between runs.
clf = RandomForestClassifier(n_estimators=100, random_state=0)

# Fit on the training split.
clf.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = clf.predict(X_test)

# Overall accuracy only; the classes are heavily imbalanced, so read this
# together with the per-class precision/recall.
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))


ACCURACY OF THE MODEL:  0.8746210369456863


In [9]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix followed by per-class precision/recall/F1 for the
# predictions currently held in `y_pred`.
baseline_cm = confusion_matrix(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)
print(baseline_cm)
print(baseline_report)

[[83887   276]
 [11924  1218]]
              precision    recall  f1-score   support

           0       0.88      1.00      0.93     84163
           1       0.82      0.09      0.17     13142

    accuracy                           0.87     97305
   macro avg       0.85      0.54      0.55     97305
weighted avg       0.87      0.87      0.83     97305



# Undersampling

In [10]:
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

# Randomly undersample the training split only (the test split is left as-is),
# then report the class counts before and after.
ns = RandomUnderSampler(random_state=33)
X_train_ns, y_train_ns = ns.fit_resample(X_train, y_train)

counts_before_ns = Counter(y_train)
counts_after_ns = Counter(y_train_ns)
print("The number of classes before ns {}".format(counts_before_ns))
print("The number of classes after ns {}".format(counts_after_ns))

The number of classes before ns Counter({0: 336604, 1: 52615})
The number of classes after ns Counter({0: 52615, 1: 52615})


In [11]:
# Random forest fitted on the undersampled training set.
# The seed is pinned so the metrics for this resampling strategy are
# reproducible: the split and the sampler are already seeded, which left the
# forest as the only unseeded random component.
classifier = RandomForestClassifier(random_state=0)
classifier.fit(X_train_ns, y_train_ns)

In [12]:
# Score the untouched test split with the model trained on undersampled data.
y_pred = classifier.predict(X_test)
for report_fn in (confusion_matrix, classification_report):
    print(report_fn(y_test, y_pred))

[[59042 25121]
 [ 4151  8991]]
              precision    recall  f1-score   support

           0       0.93      0.70      0.80     84163
           1       0.26      0.68      0.38     13142

    accuracy                           0.70     97305
   macro avg       0.60      0.69      0.59     97305
weighted avg       0.84      0.70      0.74     97305



# Oversampling

In [13]:
from imblearn.over_sampling import RandomOverSampler

# Randomly oversample the training split only, then report the class counts
# before and after.
# The sampler is bound to `ros` rather than `os` so it does not shadow the
# standard-library `os` module name in the notebook namespace.
ros = RandomOverSampler(random_state=13)
X_train_os, y_train_os = ros.fit_resample(X_train, y_train)
print("The number of classes before os {}".format(Counter(y_train)))
print("The number of classes after os {}".format(Counter(y_train_os)))

The number of classes before os Counter({0: 336604, 1: 52615})
The number of classes after os Counter({0: 336604, 1: 336604})


In [14]:
# Random forest fitted on the randomly oversampled training set.
# The seed is pinned so the metrics for this resampling strategy are
# reproducible: the split and the sampler are already seeded, which left the
# forest as the only unseeded random component.
classifier = RandomForestClassifier(random_state=0)
classifier.fit(X_train_os, y_train_os)

In [15]:
# Score the untouched test split with the model trained on oversampled data.
y_pred = classifier.predict(X_test)
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)

[[83205   958]
 [11195  1947]]
              precision    recall  f1-score   support

           0       0.88      0.99      0.93     84163
           1       0.67      0.15      0.24     13142

    accuracy                           0.88     97305
   macro avg       0.78      0.57      0.59     97305
weighted avg       0.85      0.88      0.84     97305



# Easy Ensemble

In [16]:
from imblearn.ensemble import EasyEnsembleClassifier

# Seeded EasyEnsemble classifier, fitted directly on the original
# (non-resampled) training split.
ez = EasyEnsembleClassifier(random_state=19)
ez.fit(X_train, y_train)

In [17]:
# Score the test split with the fitted EasyEnsemble model.
y_pred = ez.predict(X_test)
for report_fn in (confusion_matrix, classification_report):
    print(report_fn(y_test, y_pred))

[[55515 28648]
 [ 4350  8792]]
              precision    recall  f1-score   support

           0       0.93      0.66      0.77     84163
           1       0.23      0.67      0.35     13142

    accuracy                           0.66     97305
   macro avg       0.58      0.66      0.56     97305
weighted avg       0.83      0.66      0.71     97305



# ADASYN: Adaptive Synthetic Sampling

In [18]:
from imblearn.over_sampling import ADASYN

# Resample the training split with ADASYN (fixed seed), then report the class
# counts before and after.
ada = ADASYN(random_state=130)
X_train_ada, y_train_ada = ada.fit_resample(X_train, y_train)

counts_before_ada = Counter(y_train)
counts_after_ada = Counter(y_train_ada)
print("The number of classes before ada {}".format(counts_before_ada))
print("The number of classes after ada {}".format(counts_after_ada))

The number of classes before ada Counter({0: 336604, 1: 52615})
The number of classes after ada Counter({1: 345494, 0: 336604})


In [19]:
# Random forest fitted on the ADASYN-resampled training set.
# The seed is pinned so the metrics for this resampling strategy are
# reproducible: the split and the sampler are already seeded, which left the
# forest as the only unseeded random component.
classifier = RandomForestClassifier(random_state=0)
classifier.fit(X_train_ada, y_train_ada)

In [20]:
# Score the untouched test split with the model trained on ADASYN data.
y_pred = classifier.predict(X_test)
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)

[[81466  2697]
 [10964  2178]]
              precision    recall  f1-score   support

           0       0.88      0.97      0.92     84163
           1       0.45      0.17      0.24     13142

    accuracy                           0.86     97305
   macro avg       0.66      0.57      0.58     97305
weighted avg       0.82      0.86      0.83     97305



# SMOTE

In [21]:
from imblearn.over_sampling import SMOTE

In [22]:
# Resample the training split with SMOTE (fixed seed), then report the class
# counts before and after.
# `n_jobs` is deliberately not passed: imbalanced-learn deprecated that
# argument in 0.10 and later releases no longer accept it, so
# `SMOTE(..., n_jobs=-1)` fails there. Per the library's deprecation note it
# only configured parallelism of the nearest-neighbours search.
smt = SMOTE(random_state=80)
X_train_smt, y_train_smt = smt.fit_resample(X_train, y_train)
print("The number of classes before smt {}".format(Counter(y_train)))
print("The number of classes after smt {}".format(Counter(y_train_smt)))

The number of classes before smt Counter({0: 336604, 1: 52615})
The number of classes after smt Counter({0: 336604, 1: 336604})


In [23]:
# Random forest fitted on the SMOTE-resampled training set.
# The seed is pinned so the metrics for this resampling strategy are
# reproducible: the split and the sampler are already seeded, which left the
# forest as the only unseeded random component.
classifier = RandomForestClassifier(random_state=0)
classifier.fit(X_train_smt, y_train_smt)

In [24]:
# Score the untouched test split with the model trained on SMOTE data.
y_pred = classifier.predict(X_test)
for report_fn in (confusion_matrix, classification_report):
    print(report_fn(y_test, y_pred))

[[81548  2615]
 [10933  2209]]
              precision    recall  f1-score   support

           0       0.88      0.97      0.92     84163
           1       0.46      0.17      0.25     13142

    accuracy                           0.86     97305
   macro avg       0.67      0.57      0.58     97305
weighted avg       0.82      0.86      0.83     97305

