In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, recall_score, classification_report, log_loss, jaccard_similarity_score
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split

In [2]:
df_train = pd.read_csv('df_train.csv', index_col = 0)
df_test = pd.read_csv('df_test.csv', index_col = 0)

In [3]:
#Test set
X_test = df_test.drop(['P_ISEV'], axis =1).values
Y_test = df_test['P_ISEV'].values
print ('Test set:', X_test.shape,  Y_test.shape)

Test set: (165646, 14) (165646,)


### Downsample the majority classes

In [4]:
from sklearn.utils import resample
X2 = df_train  # orginal train data will be used for resampling (labeled as X2)

# separate minority and majority classes
no_injury = X2[X2.P_ISEV==1]
injury = X2[X2.P_ISEV==2]
fatal = X2[X2.P_ISEV==3]

# downsample majority
no_injury_downsampled = resample(no_injury,
                                replace = False, # sample without replacement
                                n_samples = len(fatal), # match minority n
                                random_state = 16) # reproducible results

injury_downsampled = resample(injury,
                                replace = False, 
                                n_samples = len(fatal),
                                random_state = 16)

# combine minority and downsampled majority
downsampled = pd.concat([no_injury_downsampled, injury_downsampled, fatal])

# checking counts
print(downsampled.P_ISEV.value_counts())

#Prepare data
Y2 = downsampled['P_ISEV'].values
X2 = (downsampled.drop('P_ISEV', axis=1)).values


3    1890
2    1890
1    1890
Name: P_ISEV, dtype: int64


In [5]:
# Split the upsampled train data into train and validation sets
X2_train, X2_validate, Y2_train, Y2_validate = train_test_split( X2, Y2, test_size=0.2, random_state= 4) 
print ('Train set:', X2_train.shape,  Y2_train.shape)
print ('Validation set:', X2_validate.shape,  Y2_validate.shape)

Train set: (4536, 14) (4536,)
Validation set: (1134, 14) (1134,)


### Multinomial Logistic Regression

In [27]:
from sklearn import linear_model
#train model
downsampledMLR = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg').fit(X2_train, Y2_train)

#Validate
downsampledMLR_validate = downsampledMLR.predict(X2_validate)

#Predict on test set
downsampledMLR_pred = downsampledMLR.predict(X_test)

# Checking accuracy of validation set
print("Multinomial Log Regression's accuracy_score: ", accuracy_score(Y2_validate, downsampledMLR_validate))
print("Multinomial Log Regression's f1_score: ", f1_score(Y2_validate, downsampledMLR_validate, average = 'weighted'))
print("Multinomial Log Regression's recall_score: ", recall_score(Y2_validate, downsampledMLR_validate, average = 'weighted'))
print (classification_report(Y2_validate, downsampledMLR_validate))

Multinomial Log Regression's accuracy_score:  0.5194003527336861
Multinomial Log Regression's f1_score:  0.5145175780578803
Multinomial Log Regression's recall_score:  0.5194003527336861
              precision    recall  f1-score   support

           1       0.52      0.53      0.53       371
           2       0.44      0.37      0.40       364
           3       0.57      0.64      0.60       399

   micro avg       0.52      0.52      0.52      1134
   macro avg       0.51      0.52      0.51      1134
weighted avg       0.51      0.52      0.51      1134



In [28]:
# Checking accuracy of test set
print("Multinomial Log Regression's accuracy_score: ", accuracy_score(Y_test, downsampledMLR_pred))
print("Multinomial Log Regression's f1_score: ", f1_score(Y_test, downsampledMLR_pred, average = 'weighted'))
print("Multinomial Log Regression's recall_score: ", recall_score(Y_test, downsampledMLR_pred, average = 'weighted'))
print(classification_report(Y_test, downsampledMLR_pred))

Multinomial Log Regression's accuracy_score:  0.46040954807239537
Multinomial Log Regression's f1_score:  0.5171103238221793
Multinomial Log Regression's recall_score:  0.46040954807239537
              precision    recall  f1-score   support

           1       0.55      0.52      0.54     70849
           2       0.66      0.41      0.51     93875
           3       0.01      0.65      0.03       922

   micro avg       0.46      0.46      0.46    165646
   macro avg       0.41      0.53      0.36    165646
weighted avg       0.61      0.46      0.52    165646



### Random Forest Classifier

In [6]:
from sklearn.ensemble import RandomForestClassifier
# train model
downsampledRFC = RandomForestClassifier(n_estimators=100).fit(X2_train, Y2_train)

#Validate
downsampledRFC_validate = downsampledRFC.predict(X2_validate)

# Predict on test set
downsampledRFC_pred = downsampledRFC.predict(X_test)

# Checking accuracy of validation set
print("Random Forest's accuracy_score: ", accuracy_score(Y2_validate, downsampledRFC_validate))
print("Random Forest's f1_score: ", f1_score(Y2_validate, downsampledRFC_validate, average = 'weighted'))
print("Random Forest's recall_score: ", recall_score(Y2_validate, downsampledRFC_validate, average = 'weighted'))
print (classification_report(Y2_validate, downsampledRFC_validate))

Random Forest's accuracy_score:  0.591710758377425
Random Forest's f1_score:  0.5861974234059729
Random Forest's recall_score:  0.591710758377425
              precision    recall  f1-score   support

           1       0.56      0.60      0.58       371
           2       0.47      0.40      0.43       364
           3       0.71      0.76      0.73       399

   micro avg       0.59      0.59      0.59      1134
   macro avg       0.58      0.59      0.58      1134
weighted avg       0.58      0.59      0.59      1134



In [7]:
# Checking accuracy of test set
print("Random Forest's accuracy_score: ", accuracy_score(Y_test, downsampledRFC_pred))
print("Random Forest's f1_score: ", f1_score(Y_test, downsampledRFC_pred, average = 'weighted'))
print("Random Forest's recall_score: ", recall_score(Y_test, downsampledRFC_pred, average = 'weighted'))
print (classification_report(Y_test, downsampledRFC_pred))

Random Forest's accuracy_score:  0.500501068543762
Random Forest's f1_score:  0.5355010740958787
Random Forest's recall_score:  0.500501068543762
              precision    recall  f1-score   support

           1       0.55      0.62      0.58     70849
           2       0.66      0.41      0.51     93875
           3       0.02      0.73      0.05       922

   micro avg       0.50      0.50      0.50    165646
   macro avg       0.41      0.59      0.38    165646
weighted avg       0.61      0.50      0.54    165646



### Decision Tree Classifier

In [46]:
from sklearn.tree import DecisionTreeClassifier

#train the model
downsampledDTC = DecisionTreeClassifier(criterion="entropy", max_depth = 4).fit(X2_train,Y2_train)

#Validate
downsampledDTC_validate = downsampledDTC.predict(X2_validate)

#Predict on test set
downsampledDTC_pred = downsampledDTC.predict(X_test)

#Check accuracy of the validation set
print("DecisionTrees's accuracy_score: ", accuracy_score(Y2_validate, downsampledDTC_validate))
print("DecisionTrees's f1_score:" , f1_score(Y2_validate, downsampledDTC_validate, average = 'weighted'))
print("DecisionTrees's recall_score:" , recall_score(Y2_validate, downsampledDTC_validate, average = 'weighted'))
print (classification_report(Y2_validate, downsampledDTC_validate))

DecisionTrees's accuracy_score:  0.5546737213403881
DecisionTrees's f1_score: 0.53315653569139
DecisionTrees's recall_score: 0.5546737213403881
              precision    recall  f1-score   support

           1       0.47      0.74      0.57       371
           2       0.52      0.23      0.32       364
           3       0.71      0.68      0.69       399

   micro avg       0.55      0.55      0.55      1134
   macro avg       0.56      0.55      0.53      1134
weighted avg       0.57      0.55      0.53      1134



In [47]:
#Check accuracy of the test set
print("DecisionTrees's accuracy_score: ", accuracy_score(Y_test, downsampledDTC_pred))
print("DecisionTrees's f1_score:" , f1_score(Y_test, downsampledDTC_pred, average = 'weighted'))
print("DecisionTrees's recall_score:" , recall_score(Y_test, downsampledDTC_pred, average = 'weighted'))
print (classification_report(Y_test, downsampledDTC_pred))

DecisionTrees's accuracy_score:  0.44503338444634943
DecisionTrees's f1_score: 0.43891713006260685
DecisionTrees's recall_score: 0.44503338444634943
              precision    recall  f1-score   support

           1       0.49      0.75      0.59     70849
           2       0.65      0.22      0.33     93875
           3       0.02      0.64      0.04       922

   micro avg       0.45      0.45      0.45    165646
   macro avg       0.39      0.53      0.32    165646
weighted avg       0.58      0.45      0.44    165646



### K-Nearest Neighbor's

In [51]:
from sklearn.neighbors import KNeighborsClassifier
k = 30
downsampledKNC = KNeighborsClassifier(n_neighbors = k).fit(X2_train,Y2_train)

#Predict 
downsampledKNC_pred = downsampledKNC.predict(X_test)

#Check accuracy of test set
print("Test set Accuracy: ", accuracy_score(Y_test, downsampledKNC_pred))
print("Random Forest Classifier's f1_score: ",f1_score(Y_test, downsampledKNC_pred, average = 'weighted'))
print("Random Forest Classifier's recall_score: ",recall_score(Y_test, downsampledKNC_pred, average = 'weighted'))
print (classification_report(Y_test, downsampledKNC_pred))

Test set Accuracy:  0.42707339748620554
Random Forest Classifier's f1_score:  0.45611900705444536
Random Forest Classifier's recall_score:  0.42707339748620554
              precision    recall  f1-score   support

           1       0.50      0.60      0.55     70849
           2       0.60      0.29      0.39     93875
           3       0.02      0.69      0.04       922

   micro avg       0.43      0.43      0.43    165646
   macro avg       0.37      0.53      0.32    165646
weighted avg       0.55      0.43      0.46    165646



### SGD Classifier

In [48]:
from sklearn.linear_model import SGDClassifier

SGD = SGDClassifier(loss="hinge", penalty="l2", max_iter=5).fit(X2_train, Y2_train)   
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
           early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
           l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
           n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
           random_state=None, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, warm_start=False)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=0.001,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [49]:
#Validate
downsampledSGD_validate = SGD.predict(X2_validate)

# Check the accuracy of validation set
print("SGD's Accuracy: ", accuracy_score(Y2_validate, downsampledSGD_validate))
print("SGD's f1_score: ",f1_score(Y2_validate, downsampledSGD_validate, average = 'weighted'))
print("SGD's recall_score: ",recall_score(Y2_validate, downsampledSGD_validate, average = 'weighted'))
print (classification_report(Y2_validate, downsampledSGD_validate))

SGD's Accuracy:  0.445326278659612
SGD's f1_score:  0.36624465193346595
SGD's recall_score:  0.445326278659612
              precision    recall  f1-score   support

           1       0.63      0.05      0.09       371
           2       0.41      0.39      0.40       364
           3       0.45      0.87      0.60       399

   micro avg       0.45      0.45      0.45      1134
   macro avg       0.50      0.43      0.36      1134
weighted avg       0.50      0.45      0.37      1134



In [50]:
#Predict on test set
downsampledSGD_pred = SGD.predict(X_test)

#Check the accuracy of test set
print("SGD's Accuracy: ", accuracy_score(Y_test, downsampledSGD_pred))
print("SGD's f1_score: ",f1_score(Y_test, downsampledSGD_pred, average = 'weighted'))
print("SGD's recall_score: ",recall_score(Y_test, downsampledSGD_pred, average = 'weighted'))
print (classification_report(Y_test, downsampledSGD_pred))

SGD's Accuracy:  0.23245958248312668
SGD's f1_score:  0.2778371787336483
SGD's recall_score:  0.23245958248312668
              precision    recall  f1-score   support

           1       0.65      0.03      0.07     70849
           2       0.53      0.38      0.44     93875
           3       0.01      0.88      0.02       922

   micro avg       0.23      0.23      0.23    165646
   macro avg       0.40      0.43      0.17    165646
weighted avg       0.58      0.23      0.28    165646

