In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,f1_score, confusion_matrix, classification_report

In [2]:
data = pd.read_csv("data_cleaned.csv")
data.drop(columns=['INCDTTM','year','day','month'],axis=1, inplace=True)

In [3]:
data.head(5)

Unnamed: 0,ADDRTYPE,SEVERITYCODE,COLLISIONTYPE,JUNCTIONTYPE,UNDERINFL,WEATHER,ROADCOND,LIGHTCOND,PEDROWNOTGRNT,SPEEDING,HITPARKEDCAR
0,Intersection,2,Pedestrian,At Intersection (intersection related),0,Clear,Dry,Daylight,Y,0,N
1,Intersection,2,Angles,At Intersection (intersection related),0,Raining,Wet,Dark - Street Lights On,Y,0,N
2,Block,1,Sideswipe,Mid-Block (not related to intersection),0,Clear,Dry,Daylight,Y,0,N
3,Intersection,1,Left Turn,At Intersection (intersection related),0,Raining,Wet,Dark - Street Lights On,Y,0,N
4,Block,1,Other,Mid-Block (not related to intersection),0,Clear,Dry,Dark - Street Lights On,Y,0,N


In [4]:
data['SEVERITYCODE'].value_counts()

1    137485
2     58698
3      3098
4       349
Name: SEVERITYCODE, dtype: int64

In [5]:
data.dtypes

ADDRTYPE         object
SEVERITYCODE      int64
COLLISIONTYPE    object
JUNCTIONTYPE     object
UNDERINFL         int64
WEATHER          object
ROADCOND         object
LIGHTCOND        object
PEDROWNOTGRNT    object
SPEEDING          int64
HITPARKEDCAR     object
dtype: object

In [6]:
cat_columns = ['ADDRTYPE','COLLISIONTYPE','JUNCTIONTYPE','WEATHER','ROADCOND','LIGHTCOND','HITPARKEDCAR','PEDROWNOTGRNT']

In [7]:
data[cat_columns] = data[cat_columns].apply(lambda x: x.astype('category').cat.codes)

In [8]:
data.tail(6)

Unnamed: 0,ADDRTYPE,SEVERITYCODE,COLLISIONTYPE,JUNCTIONTYPE,UNDERINFL,WEATHER,ROADCOND,LIGHTCOND,PEDROWNOTGRNT,SPEEDING,HITPARKEDCAR
199624,1,1,0,3,0,5,0,5,0,0,0
199625,1,1,7,4,0,11,7,5,0,0,0
199626,1,2,7,4,0,2,0,5,0,0,0
199627,2,1,8,1,0,2,0,5,0,0,0
199628,1,1,0,2,0,2,0,5,0,0,0
199629,1,2,5,4,0,2,0,5,0,0,0


In [9]:
X = data.drop(columns=['SEVERITYCODE'],axis=1)
Y = data['SEVERITYCODE']

In [10]:
#apply SelectKBest to extract top 5 best features for housing df
from sklearn.feature_selection import SelectKBest, chi2, f_classif

bestfeatures = SelectKBest(score_func=chi2, k=5)
fit = bestfeatures.fit(X,Y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Features','Score']  #naming the dataframe columns
print(featureScores.nlargest(14,'Score'))  #print 5best features

        Features        Score
1  COLLISIONTYPE  5670.606450
2   JUNCTIONTYPE  5312.475739
4        WEATHER  4875.036603
5       ROADCOND  2745.880663
9   HITPARKEDCAR  2092.978875
0       ADDRTYPE  1363.455201
3      UNDERINFL  1132.955601
8       SPEEDING   892.711942
6      LIGHTCOND   609.365801


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, stratify=Y,random_state=66)

In [12]:
y_train.value_counts()

1    96239
2    41089
3     2169
4      244
Name: SEVERITYCODE, dtype: int64

In [13]:
y_test.value_counts()

1    41246
2    17609
3      929
4      105
Name: SEVERITYCODE, dtype: int64

In [14]:
from sklearn.utils import class_weight

weights = class_weight.compute_class_weight('balanced', np.unique(y_train),y_train)

In [15]:
weights

array([  0.36300512,   0.85023364,  16.10661595, 143.1772541 ])

In [16]:
class_weights = {}
for i in range(4):
    class_weights[i+1] = weights[i]
class_weights

{1: 0.3630051226633693,
 2: 0.8502336391735015,
 3: 16.106615952051637,
 4: 143.17725409836066}

In [17]:
models_info_df_4_class = pd.DataFrame([])

## Trying different models having severity code 1,2,3,4

## Logistic Regression

In [18]:

lr1 = LogisticRegression(class_weight=class_weights,random_state = 0)
lr1.fit(X_train, y_train)
y_pred_lr1 = lr1.predict(X_test)


print('Accuracy on training set:',lr1.score(X_train,y_train))
print('Accuracy on test set:',lr1.score(X_test,y_test))
print("F1 score", f1_score(y_test, y_pred_lr1,average='weighted'))

print (classification_report(y_test, y_pred_lr1))

models_info_df_4_class = models_info_df_4_class.append([("Logistic Regression 4 class",lr1.score(X_train,y_train), lr1.score(X_test,y_test), f1_score(y_test, y_pred_lr1,average='weighted'))])

Accuracy on training set: 0.4841170451048726
Accuracy on test set: 0.48144066523067675
F1 score 0.554102110056551
              precision    recall  f1-score   support

           1       0.78      0.57      0.66     41246
           2       0.41      0.30      0.34     17609
           3       0.02      0.19      0.03       929
           4       0.01      0.45      0.01       105

    accuracy                           0.48     59889
   macro avg       0.30      0.38      0.26     59889
weighted avg       0.66      0.48      0.55     59889



## Random Forest Classifier

In [19]:

rf_clf1 = RandomForestClassifier(n_estimators = 10, random_state = 0,class_weight=class_weights)
rf_clf1.fit(X_train, y_train)
y_pred_rfc1 = rf_clf1.predict(X_test)
print('Accuracy on training set:',rf_clf1.score(X_train,y_train))
print('Accuracy on test set:',rf_clf1.score(X_test,y_test))
print("F1 Score : ",f1_score(y_test,y_pred_rfc1, average='weighted'))
print(classification_report(y_test, y_pred_rfc1))
models_info_df_4_class = models_info_df_4_class.append([("RandomForestClassifier 4 class",rf_clf1.score(X_train,y_train), rf_clf1.score(X_test,y_test), f1_score(y_test,y_pred_rfc1, average='weighted') )])

Accuracy on training set: 0.5717219713613041
Accuracy on test set: 0.5540416437075256
F1 Score :  0.6098873717988984
              precision    recall  f1-score   support

           1       0.88      0.56      0.68     41246
           2       0.41      0.56      0.48     17609
           3       0.05      0.29      0.08       929
           4       0.01      0.46      0.02       105

    accuracy                           0.55     59889
   macro avg       0.34      0.47      0.32     59889
weighted avg       0.73      0.55      0.61     59889



## Decision Tree Classifier

In [20]:

dtc1 = DecisionTreeClassifier(class_weight=class_weights)
dtc1.fit(X_train, y_train)
y_pred_dtc1 = dtc1.predict(X_test)
print('Accuracy on training set',dtc1.score(X_train, y_train))
print('Accuracy on test set:',dtc1.score(X_test, y_test))
print("F1 Score : ",f1_score(y_test,y_pred_dtc1, average='weighted'))
print(classification_report(y_test, y_pred_dtc1))
models_info_df_4_class = models_info_df_4_class.append([("DecisionTreeClassifier 4 class",dtc1.score(X_train, y_train), dtc1.score(X_test, y_test), f1_score(y_test,y_pred_dtc1, average='weighted') )])

Accuracy on training set 0.565782411747447
Accuracy on test set: 0.5461937918482526
F1 Score :  0.6054459686369363
              precision    recall  f1-score   support

           1       0.88      0.55      0.68     41246
           2       0.41      0.55      0.47     17609
           3       0.05      0.30      0.08       929
           4       0.01      0.51      0.02       105

    accuracy                           0.55     59889
   macro avg       0.34      0.48      0.31     59889
weighted avg       0.73      0.55      0.61     59889



## XGBClassifier

In [21]:

xgb_clf1= XGBClassifier(class_weight=class_weights)
xgb_clf1.fit(X_train, y_train)

y_pred_xgb1 = xgb_clf1.predict(X_test)

print('Accuracy score on train_data: ', accuracy_score(y_true = y_train, y_pred = xgb_clf1.predict(X_train).round()))
print('Accuracy score on test_data: ', accuracy_score(y_true = y_test, y_pred = xgb_clf1.predict(X_test).round()))
print("F1 score", f1_score(y_true = y_train, y_pred = xgb_clf1.predict(X_train).round(), average='weighted'))
print(classification_report(y_test, y_pred_xgb1))
models_info_df_4_class = models_info_df_4_class.append([("XGBClassifier 4 class",accuracy_score(y_true = y_train, y_pred = xgb_clf1.predict(X_train).round()),accuracy_score(y_true = y_test, y_pred = xgb_clf1.predict(X_test).round()), f1_score(y_true = y_train, y_pred = xgb_clf1.predict(X_train).round(), average='weighted') )])

Parameters: { class_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Accuracy score on train_data:  0.7397971962416184
Accuracy score on test_data:  0.7345756315850991
F1 score 0.6807308313247457
              precision    recall  f1-score   support

           1       0.74      0.98      0.84     41246
           2       0.72      0.21      0.33     17609
           3       0.38      0.00      0.01       929
           4       0.00      0.00      0.00       105

    accuracy                           0.73     59889
   macro avg       0.46      0.30      0.29     59889
weighted avg       0.72      0.73      0.67     59889



## KNeighborsClassifier

In [22]:

knn_clf1 = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)  
#You can change these hyperparameters like metric etc.
knn_clf1.fit(X_train, y_train)
y_pred_knn1 = knn_clf1.predict(X_test)
print('Accuracy on training set:',knn_clf1.score(X_train,y_train))
print('Accuracy on test set:',knn_clf1.score(X_test,y_test))
print("F1 Score : ",f1_score(y_test,y_pred_knn1, average='weighted'))
print(classification_report(y_test, y_pred_knn1))
models_info_df_4_class = models_info_df_4_class.append([("KNeighborsClassifier 4 class",knn_clf1.score(X_train,y_train), knn_clf1.score(X_test,y_test),f1_score(y_test,y_pred_knn1, average='weighted'))])

Accuracy on training set: 0.7065857550754611
Accuracy on test set: 0.6991100202040441
F1 Score :  0.685062395535323
              precision    recall  f1-score   support

           1       0.77      0.83      0.80     41246
           2       0.50      0.43      0.46     17609
           3       0.21      0.01      0.01       929
           4       0.00      0.00      0.00       105

    accuracy                           0.70     59889
   macro avg       0.37      0.32      0.32     59889
weighted avg       0.68      0.70      0.69     59889



## Trying Different models having severity code 1,2 = 1 and 3,4 = 2

In [23]:
data2 = pd.read_csv("data_cleaned.csv")

data2.drop(columns=['INCDTTM','year','day','month'],axis=1, inplace=True)

In [24]:
cat_columns = ['ADDRTYPE','COLLISIONTYPE','JUNCTIONTYPE','WEATHER','ROADCOND','LIGHTCOND','HITPARKEDCAR','PEDROWNOTGRNT']

In [25]:
data2[cat_columns] = data2[cat_columns].apply(lambda x: x.astype('category').cat.codes)

In [26]:
# # Ordering 1 & 2 as 1 and 3 & 4 as 2
data2['SEVERITYCODE'].replace(2,1,inplace=True)
data2['SEVERITYCODE'].replace(3,2,inplace=True)
data2['SEVERITYCODE'].replace(4,2,inplace=True)
data2['SEVERITYCODE'].value_counts()

1    196183
2      3447
Name: SEVERITYCODE, dtype: int64

In [27]:
X2 = data2.drop(columns=['SEVERITYCODE'],axis=1)
Y2 = data2['SEVERITYCODE']

In [28]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, Y2, test_size=0.30, stratify=Y2,random_state=66)

In [29]:
from sklearn.utils import class_weight

weights2 = class_weight.compute_class_weight('balanced', np.unique(y2_train),y2_train)

In [30]:
weights2

array([ 0.50878554, 28.95586407])

In [31]:
class_weights2 = {}
for i in range(2):
    class_weights2[i+1] = weights2[i]
class_weights2

{1: 0.5087855353605966, 2: 28.955864069622876}

In [32]:
models_info_df_2_class = pd.DataFrame([])

## Logistic Regression

In [33]:

lr2 = LogisticRegression(class_weight=class_weights2,random_state = 0)
lr2.fit(X2_train, y2_train)
y_pred_lr2 = lr2.predict(X2_test)


print('Accuracy on training set:',lr2.score(X2_train,y2_train))
print('Accuracy on test set:',lr2.score(X2_test,y2_test))
print("F1 score", f1_score(y2_test, y_pred_lr2, average='weighted'))

print (classification_report(y2_test, y_pred_lr2))

models_info_df_2_class = models_info_df_2_class.append([("Logistic Regression 2 class",lr2.score(X2_train,y2_train), lr2.score(X2_test,y2_test), f1_score(y2_test, y_pred_lr2,average='weighted'))])

Accuracy on training set: 0.6356044396419089
Accuracy on test set: 0.6349747032009218
F1 score 0.761400059381787
              precision    recall  f1-score   support

           1       0.99      0.64      0.77     58855
           2       0.03      0.61      0.05      1034

    accuracy                           0.63     59889
   macro avg       0.51      0.62      0.41     59889
weighted avg       0.97      0.63      0.76     59889



## Random Forest Clasifier

In [34]:

rf_clf2 = RandomForestClassifier(n_estimators = 10, random_state = 0)
rf_clf2.fit(X2_train, y2_train)
y_pred_rfc2 = rf_clf2.predict(X2_test)
print('Accuracy on training set:',rf_clf2.score(X2_train,y2_train))
print('Accuracy on test set:',rf_clf2.score(X2_test,y2_test))
print("F1 Score : ",f1_score(y2_test,y_pred_rfc2, average='weighted'))
print(classification_report(y2_test, y_pred_rfc2))
models_info_df_2_class = models_info_df_2_class.append([("RandomForestClassifier 2 class",rf_clf2.score(X2_train,y2_train), rf_clf2.score(X2_test,y2_test), f1_score(y2_test,y_pred_rfc2, average='weighted') )])

Accuracy on training set: 0.9833477647934393
Accuracy on test set: 0.9823172869809147
F1 Score :  0.9741292828086912
              precision    recall  f1-score   support

           1       0.98      1.00      0.99     58855
           2       0.14      0.00      0.01      1034

    accuracy                           0.98     59889
   macro avg       0.56      0.50      0.50     59889
weighted avg       0.97      0.98      0.97     59889



## Decision Tree Classifier

In [35]:

dtc2 = DecisionTreeClassifier(class_weight=class_weights2)
dtc2.fit(X2_train, y2_train)
y_pred_dtc2 = dtc2.predict(X2_test)
print('Accuracy on training set',dtc2.score(X2_train, y2_train))
print('Accuracy on test set:',dtc2.score(X2_test, y2_test))
print("F1 Score : ",f1_score(y2_test,y_pred_dtc2, average='weighted'))
print(classification_report(y2_test, y_pred_dtc2))
models_info_df_2_class = models_info_df_2_class.append([("DecisionTreeClassifier 2 class",dtc2.score(X2_train, y2_train), dtc2.score(X2_test, y2_test), f1_score(y2_test,y_pred_dtc2, average='weighted') )])

Accuracy on training set 0.7985058071718393
Accuracy on test set: 0.7941358179298369
F1 Score :  0.870235073849254
              precision    recall  f1-score   support

           1       0.99      0.80      0.88     58855
           2       0.05      0.60      0.09      1034

    accuracy                           0.79     59889
   macro avg       0.52      0.70      0.49     59889
weighted avg       0.98      0.79      0.87     59889



## XGBClassifier

In [36]:

xgb_clf2 = XGBClassifier()
xgb_clf2.fit(X2_train, y2_train)

y_pred_xgb2 = xgb_clf2.predict(X2_test)

print('Accuracy score on train_data: ', accuracy_score(y_true = y2_train, y_pred = xgb_clf2.predict(X2_train).round()))
print('Accuracy score on test_data: ', accuracy_score(y_true = y2_test, y_pred = xgb_clf2.predict(X2_test).round()))
print("F1 score", f1_score(y_true = y2_train, y_pred = xgb_clf2.predict(X2_train).round(), average='weighted'))
print(classification_report(y2_test, y_pred_xgb2))
models_info_df_2_class = models_info_df_2_class.append([("XGBClassifier 2 class",accuracy_score(y_true = y2_train, y_pred = xgb_clf2.predict(X2_train).round()),accuracy_score(y_true = y2_test, y_pred = xgb_clf2.predict(X2_test).round()), f1_score(y_true = y2_train, y_pred = xgb_clf2.predict(X2_train).round(), average='weighted') )])

Accuracy score on train_data:  0.9828683063667786
Accuracy score on test_data:  0.9826345405667151
F1 score 0.974579618482291
              precision    recall  f1-score   support

           1       0.98      1.00      0.99     58855
           2       0.25      0.00      0.01      1034

    accuracy                           0.98     59889
   macro avg       0.62      0.50      0.50     59889
weighted avg       0.97      0.98      0.97     59889



## KNeighborsClassifier

In [37]:

knn_clf2 = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)  
#You can change these hyperparameters like metric etc.
knn_clf2.fit(X2_train, y2_train)
y_pred_knn2 = knn_clf2.predict(X2_test)
print('Accuracy on training set:',knn_clf2.score(X2_train,y2_train))
print('Accuracy on test set:',knn_clf2.score(X2_test,y2_test))
print("F1 Score : ",f1_score(y2_test,y_pred_knn2, average='weighted'))
print(classification_report(y2_test, y_pred_knn2))
models_info_df_2_class = models_info_df_2_class.append([("KNeighborsClassifier 2 class",knn_clf2.score(X2_train,y2_train), knn_clf2.score(X2_test,y2_test),f1_score(y2_test,y_pred_knn2, average='weighted'))])

Accuracy on training set: 0.9816303017725649
Accuracy on test set: 0.9817161749236086
F1 Score :  0.9743891439677576
              precision    recall  f1-score   support

           1       0.98      1.00      0.99     58855
           2       0.22      0.02      0.04      1034

    accuracy                           0.98     59889
   macro avg       0.60      0.51      0.52     59889
weighted avg       0.97      0.98      0.97     59889



In [38]:
models_info_df_4_class.columns = ['models','Training Accuracy','Test Accuracy', 'f1 score']
models_info_df_4_class

Unnamed: 0,models,Training Accuracy,Test Accuracy,f1 score
0,Logistic Regression 4 class,0.484117,0.481441,0.554102
0,RandomForestClassifier 4 class,0.571722,0.554042,0.609887
0,DecisionTreeClassifier 4 class,0.565782,0.546194,0.605446
0,XGBClassifier 4 class,0.739797,0.734576,0.680731
0,KNeighborsClassifier 4 class,0.706586,0.69911,0.685062


In [39]:
models_info_df_2_class.columns = ['models','Training Accuracy','Test Accuracy', 'f1 score']
models_info_df_2_class

Unnamed: 0,models,Training Accuracy,Test Accuracy,f1 score
0,Logistic Regression 2 class,0.635604,0.634975,0.7614
0,RandomForestClassifier 2 class,0.983348,0.982317,0.974129
0,DecisionTreeClassifier 2 class,0.798506,0.794136,0.870235
0,XGBClassifier 2 class,0.982868,0.982635,0.97458
0,KNeighborsClassifier 2 class,0.98163,0.981716,0.974389
