In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, recall_score, classification_report, log_loss, jaccard_similarity_score
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split

In [2]:
df_train = pd.read_csv('df_train.csv', index_col = 0)
df_test = pd.read_csv('df_test.csv', index_col = 0)

In [3]:
#Test set
X_test = df_test.drop(['P_ISEV'], axis =1).values
Y_test = df_test['P_ISEV'].values
print ('Test set:', X_test.shape,  Y_test.shape)

Test set: (165646, 14) (165646,)


### Upsample the minority class

In [4]:
from sklearn.utils import resample
X1 = df_train  # orginal train data will be used for resampling (labeled as X1)

# separate minority and majority classes
no_injury = X1[X1.P_ISEV==1]
injury = X1[X1.P_ISEV==2]
fatal = X1[X1.P_ISEV==3]

# upsample minority
fatal_upsampled = resample(fatal,
                          replace=True, # sample with replacement
                          n_samples=len(injury), # match number in majority class
                          random_state=16) # reproducible results

# combine majority and upsampled minority
upsampled = pd.concat([no_injury, injury, fatal_upsampled])

# check new class counts
print(upsampled.P_ISEV.value_counts())

Y1 = upsampled['P_ISEV'].values
X1 = (upsampled.drop('P_ISEV', axis=1)).values

3    201218
2    201218
1    151205
Name: P_ISEV, dtype: int64


In [5]:
# Split the upsampled train data into train and validation sets
X1_train, X1_validate, Y1_train, Y1_validate = train_test_split( X1, Y1, test_size=0.2, random_state= 4) 
print ('Train set:', X1_train.shape,  Y1_train.shape)
print ('Validation set:', X1_validate.shape,  Y1_validate.shape)

Train set: (442912, 14) (442912,)
Validation set: (110729, 14) (110729,)


### Multinomial Logistic Regression

In [37]:
from sklearn import linear_model

#train model
upsampledMLR = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg').fit(X1_train, Y1_train)

#Validate
upsampledMLR_validate = upsampledMLR.predict(X1_validate)

#Predict on test set
upsampledMLR_pred = upsampledMLR.predict(X_test)

# Checking accuracy of validation set
print("Multinomial Log Regression's accuracy_score: ", accuracy_score(Y1_validate, upsampledMLR_validate))
print("Multinomial Log Regression's f1_score: ", f1_score(Y1_validate, upsampledMLR_validate, average = 'weighted'))
print("Multinomial Log Regression's recall_score: ", recall_score(Y1_validate, upsampledMLR_validate, average = 'weighted'))
print (classification_report(Y1_validate, upsampledMLR_validate))

Multinomial Log Regression's accuracy_score:  0.5362100262803782
Multinomial Log Regression's f1_score:  0.5282353337222929
Multinomial Log Regression's recall_score:  0.5362100262803782
              precision    recall  f1-score   support

           1       0.49      0.35      0.41     30502
           2       0.50      0.51      0.50     40023
           3       0.59      0.70      0.64     40204

   micro avg       0.54      0.54      0.54    110729
   macro avg       0.53      0.52      0.52    110729
weighted avg       0.53      0.54      0.53    110729



In [41]:
# Checking accuracy of test set
print("Multinomial Log Regression's accuracy_score: ", accuracy_score(Y_test, upsampledMLR_pred))
print("Multinomial Log Regression's f1_score: ", f1_score(Y_test, upsampledMLR_pred, average = 'weighted'))
print("Multinomial Log Regression's recall_score: ", recall_score(Y_test, upsampledMLR_pred, average = 'weighted'))
print (classification_report(Y_test, upsampledMLR_pred))

Multinomial Log Regression's accuracy_score:  0.44333095879164003
Multinomial Log Regression's f1_score:  0.5078452187716705
Multinomial Log Regression's recall_score:  0.44333095879164003
              precision    recall  f1-score   support

           1       0.58      0.36      0.44     70849
           2       0.63      0.50      0.56     93875
           3       0.01      0.68      0.03       922

   micro avg       0.44      0.44      0.44    165646
   macro avg       0.41      0.51      0.34    165646
weighted avg       0.61      0.44      0.51    165646



### Random Forest Classifier

In [6]:
from sklearn.ensemble import RandomForestClassifier

# train model
upsampledRFC = RandomForestClassifier(n_estimators=100).fit(X1_train, Y1_train)

#Validate
upsampledRFC_validate = upsampledRFC.predict(X1_validate)

# Predict on test set
upsampledRFC_pred = upsampledRFC.predict(X_test)

# Checking accuracy of validation set
print("Random Forest's accuracy_score: ", accuracy_score(Y1_validate, upsampledRFC_validate))
print("Random Forest's f1_score: ", f1_score(Y1_validate, upsampledRFC_validate, average = 'weighted'))
print("Random Forest's recall_score: ", recall_score(Y1_validate, upsampledRFC_validate, average = 'weighted'))
print (classification_report(Y1_validate, upsampledRFC_validate))

Random Forest's accuracy_score:  0.787174091701361
Random Forest's f1_score:  0.7853023901473754
Random Forest's recall_score:  0.787174091701361
              precision    recall  f1-score   support

           1       0.63      0.56      0.59     30502
           2       0.69      0.75      0.72     40023
           3       1.00      1.00      1.00     40204

   micro avg       0.79      0.79      0.79    110729
   macro avg       0.77      0.77      0.77    110729
weighted avg       0.79      0.79      0.79    110729



In [7]:
# Checking accuracy of test set
print("Random Forest's accuracy_score: ", accuracy_score(Y_test, upsampledRFC_pred))
print("Random Forest's f1_score: ", f1_score(Y_test, upsampledRFC_pred, average = 'weighted'))
print("Random Forest's recall_score: ", recall_score(Y_test, upsampledRFC_pred, average = 'weighted'))
print (classification_report(Y_test, upsampledRFC_pred))

Random Forest's accuracy_score:  0.6507672989387006
Random Forest's f1_score:  0.6463296796200824
Random Forest's recall_score:  0.6507672989387006
              precision    recall  f1-score   support

           1       0.61      0.55      0.58     70849
           2       0.68      0.74      0.71     93875
           3       0.41      0.04      0.08       922

   micro avg       0.65      0.65      0.65    165646
   macro avg       0.57      0.44      0.45    165646
weighted avg       0.65      0.65      0.65    165646



### Decision Tree Classifier

In [47]:
from sklearn.tree import DecisionTreeClassifier

#train the model
upsampledDTC = DecisionTreeClassifier(criterion="entropy", max_depth = 4).fit(X1_train,Y1_train)

#Validate
upsampledDTC_validate = upsampledDTC.predict(X1_validate)

#Predict on test set
upsampledDTC_pred = upsampledDTC.predict(X_test)

#Check accuracy of the validation set
print("DecisionTrees's accuracy_score: ", accuracy_score(Y1_validate, upsampledDTC_validate))
print("DecisionTrees's f1_score:" , f1_score(Y1_validate, upsampledDTC_validate, average = 'weighted'))
print("DecisionTrees's recall_score:" , recall_score(Y1_validate, upsampledDTC_validate, average = 'weighted'))
print (classification_report(Y1_validate, upsampledDTC_validate))

DecisionTrees's accuracy_score:  0.5396057040161114
DecisionTrees's f1_score: 0.5118783145707869
DecisionTrees's recall_score: 0.5396057040161114
              precision    recall  f1-score   support

           1       0.54      0.22      0.31     30502
           2       0.49      0.51      0.50     40023
           3       0.58      0.81      0.68     40204

   micro avg       0.54      0.54      0.54    110729
   macro avg       0.54      0.51      0.50    110729
weighted avg       0.53      0.54      0.51    110729



In [48]:
#Check accuracy of the test set
print("DecisionTrees's accuracy_score: ", accuracy_score(Y_test, upsampledDTC_pred))
print("DecisionTrees's f1_score:" , f1_score(Y_test, upsampledDTC_pred, average = 'weighted'))
print("DecisionTrees's recall_score:" , recall_score(Y_test, upsampledDTC_pred, average = 'weighted'))
print (classification_report(Y_test, upsampledDTC_pred))

DecisionTrees's accuracy_score:  0.38445238641440177
DecisionTrees's f1_score: 0.44084400299299115
DecisionTrees's recall_score: 0.38445238641440177
              precision    recall  f1-score   support

           1       0.58      0.22      0.32     70849
           2       0.58      0.51      0.54     93875
           3       0.01      0.78      0.03       922

   micro avg       0.38      0.38      0.38    165646
   macro avg       0.39      0.50      0.29    165646
weighted avg       0.57      0.38      0.44    165646



### K-Nearest Neighbor

In [54]:
from sklearn.neighbors import KNeighborsClassifier
k = 30
upsampledKNC = KNeighborsClassifier(n_neighbors = k).fit(X1_train,Y1_train)

# KNC_pred is predicted Y
upsampledKNC_pred = upsampledKNC.predict(X_test)

#Check accuracy of test set
print("K-Nearest Neighbor's Accuracy: ", accuracy_score(Y_test, upsampledKNC_pred))
print("K-Nearest Neighbor's f1_score: ",f1_score(Y_test, upsampledKNC_pred, average = 'weighted'))
print("K-Nearest Neighbor's recall_score: ",recall_score(Y_test, upsampledKNC_pred, average = 'weighted'))
print (classification_report(Y_test, upsampledKNC_pred))

K-Nearest Neighbor's Accuracy:  0.5463216739311544
K-Nearest Neighbor's f1_score:  0.5657277457382169
K-Nearest Neighbor's recall_score:  0.5463216739311544
              precision    recall  f1-score   support

           1       0.55      0.46      0.50     70849
           2       0.62      0.62      0.62     93875
           3       0.02      0.36      0.04       922

   micro avg       0.55      0.55      0.55    165646
   macro avg       0.40      0.48      0.39    165646
weighted avg       0.59      0.55      0.57    165646



### SGD Classifier

In [49]:
from sklearn.linear_model import SGDClassifier

SGD = SGDClassifier(loss="hinge", penalty="l2", max_iter=5).fit(X1_train, Y1_train)   
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
           early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
           l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
           n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
           random_state=None, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, warm_start=False)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=0.001,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [52]:
#Validate
upsampledSGD_validate = SGD.predict(X1_validate)

# Check the accuracy of validation set
print("SGD's Accuracy: ", accuracy_score(Y1_validate, upsampledSGD_validate))
print("SGD's f1_score: ",f1_score(Y1_validate, upsampledSGD_validate, average = 'weighted'))
print("SGD's recall_score: ",recall_score(Y1_validate, upsampledSGD_validate, average = 'weighted'))
print (classification_report(Y1_validate, upsampledSGD_validate))

SGD's Accuracy:  0.5006547516910655
SGD's f1_score:  0.46146249674470013
SGD's recall_score:  0.5006547516910655
              precision    recall  f1-score   support

           1       0.50      0.11      0.18     30502
           2       0.44      0.72      0.55     40023
           3       0.59      0.58      0.58     40204

   micro avg       0.50      0.50      0.50    110729
   macro avg       0.51      0.47      0.44    110729
weighted avg       0.52      0.50      0.46    110729



In [53]:
#Predict on test set
upsampledSGD_pred = SGD.predict(X_test)

#Check accuracy of test set
print("SGD's Accuracy: ", accuracy_score(Y_test, upsampledSGD_pred))
print("SGD's f1_score: ",f1_score(Y_test, upsampledSGD_pred, average = 'weighted'))
print("SGD's recall_score: ",recall_score(Y_test, upsampledSGD_pred, average = 'weighted'))
print (classification_report(Y_test, upsampledSGD_pred))

SGD's Accuracy:  0.4571435470823322
SGD's f1_score:  0.44622725093443916
SGD's recall_score:  0.4571435470823322
              precision    recall  f1-score   support

           1       0.60      0.11      0.19     70849
           2       0.59      0.72      0.65     93875
           3       0.01      0.55      0.03       922

   micro avg       0.46      0.46      0.46    165646
   macro avg       0.40      0.46      0.29    165646
weighted avg       0.59      0.46      0.45    165646

