In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, recall_score, classification_report, log_loss, jaccard_similarity_score
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split

In [2]:
#Read train data (2015 and 2016) and test data (2017 data)
df1 = pd.read_csv('df2015_cleaned.csv', index_col = 0)
df2 = pd.read_csv('df2016_cleaned.csv', index_col = 0)
df_train = pd.concat([df1, df2])
df_test = pd.read_csv('df2017_cleaned.csv', index_col = 0)

In [3]:
# Train set
X = df_train.drop(['P_ISEV'], axis =1).values
Y = df_train['P_ISEV'].values
#Test set
X_test = df_test.drop(['P_ISEV'], axis =1).values
Y_test = df_test['P_ISEV'].values
print ('Train set:', X.shape,  Y.shape)
print ('Test set:', X_test.shape,  Y_test.shape)

Train set: (354313, 14) (354313,)
Test set: (165646, 14) (165646,)


In [4]:
# Split the train data into train and validation sets
X_train, X_validate, Y_train, Y_validate = train_test_split( X, Y, test_size=0.2, random_state= 4) 
print ('Train set:', X_train.shape,  Y_train.shape)
print ('Validation set:', X_validate.shape,  Y_validate.shape)

Train set: (283450, 14) (283450,)
Validation set: (70863, 14) (70863,)


### Multinomial Logistic Regression

In [5]:
from sklearn import linear_model

#Train the model
MLR = linear_model.LogisticRegression(multi_class='multinomial', solver='newton-cg').fit(X_train, Y_train)

#Validate
MLR_validate = MLR.predict(X_validate)

#Predict 
MLR_pred =  MLR.predict(X_test)

In [6]:
#Check accuracy of validation set
print("Multinomial Logistic regression's accuracy_score : ", accuracy_score(Y_validate, MLR_validate))
print("Multinomial Logistic regression's f1_score: ", f1_score(Y_validate, MLR_validate, average = 'weighted'))
print("Multinomial Logistic regression's recall_score: ", recall_score(Y_validate, MLR_validate, average = 'weighted'))
print (classification_report(Y_validate, MLR_validate))

Multinomial Logistic regression's accuracy_score :  0.6127739440892991
Multinomial Logistic regression's f1_score:  0.5966365101973815
Multinomial Logistic regression's recall_score:  0.6127739440892991
              precision    recall  f1-score   support

           1       0.57      0.40      0.47     30257
           2       0.63      0.78      0.70     40215
           3       0.00      0.00      0.00       391

   micro avg       0.61      0.61      0.61     70863
   macro avg       0.40      0.39      0.39     70863
weighted avg       0.60      0.61      0.60     70863



In [25]:
#Check the accuracy of test set
print("Multinomial Logistic regression's accuracy_score : ", accuracy_score(Y_test, MLR_pred))
print("Multinomial Logistic regression's f1_score: ", f1_score(Y_test, MLR_pred, average = 'weighted'))
print("Multinomial Logistic regression's recall_score: ", recall_score(Y_test, MLR_pred, average = 'weighted'))
print (classification_report(Y_test, MLR_pred))

Multinomial Logistic regression's accuracy_score :  0.6135191915289231
Multinomial Logistic regression's f1_score:  0.5976709177333176
Multinomial Logistic regression's recall_score:  0.6135191915289231
              precision    recall  f1-score   support

           1       0.58      0.41      0.48     70849
           2       0.63      0.78      0.70     93875
           3       0.00      0.00      0.00       922

   micro avg       0.61      0.61      0.61    165646
   macro avg       0.40      0.39      0.39    165646
weighted avg       0.60      0.61      0.60    165646



In [26]:
#Log loss of test set
MLR_pred_prob = MLR.predict_proba(X_test)
print("Log Loss: ", log_loss(Y_test, MLR_pred_prob))

Log Loss:  0.6767585549502647


### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# train model
RFC = RandomForestClassifier(n_estimators=1000).fit(X_train, Y_train)

#Validate
RFC_validate = RFC.predict(X_validate)

# predict on test set
RFC_pred = RFC.predict(X_test)

#Check accuracy of validation set
print("Random Forest Classifier's accuracy_score: ",accuracy_score(Y_validate, RFC_validate))
print("Random Forest Classifier's f1_score: ",f1_score(Y_validate, RFC_validate, average = 'weighted'))
print("Random Forest Classifier's recall_score: ",recall_score(Y_validate, RFC_validate, average = 'weighted'))
print (classification_report(Y_validate, RFC_validate))

In [7]:
#Check accuracy of test set
print("Random Forest Classifier's accuracy_score: ",accuracy_score(Y_test, RFC_pred))
print("Random Forest Classifier's f1_score: ",f1_score(Y_test, RFC_pred, average = 'weighted'))
print("Random Forest Classifier's recall_score: ",recall_score(Y_test, RFC_pred, average = 'weighted'))
print (classification_report(Y_test, RFC_pred))

Random Forest Classifier's accuracy_score:  0.6513951438610048
Random Forest Classifier's f1_score:  0.6471731617116983
Random Forest Classifier's recall_score:  0.6513951438610048
              precision    recall  f1-score   support

           1       0.61      0.55      0.58     70849
           2       0.68      0.73      0.70     93875
           3       0.88      0.03      0.06       922

   micro avg       0.65      0.65      0.65    165646
   macro avg       0.72      0.44      0.45    165646
weighted avg       0.65      0.65      0.65    165646



### Decision Tree Classifier

In [31]:
from sklearn.tree import DecisionTreeClassifier

#train the model
DTC = DecisionTreeClassifier(criterion="entropy", max_depth = 4).fit(X_train,Y_train)

#Validate
DTC_validate = DTC.predict(X_validate)

#predict on test set
DTC_pred = DTC.predict(X_test)

#Check accuracy of validation set
print("DecisionTrees's accuracy_score: ", accuracy_score(Y_validate, DTC_validate))
print("DecisionTrees's f1_score:" , f1_score(Y_validate, DTC_validate, average = 'weighted'))
print("DecisionTrees's recall_score:" , recall_score(Y_validate, DTC_validate, average = 'weighted'))
print (classification_report(Y_validate, DTC_validate))

DecisionTrees's accuracy_score:  0.6225392659074552
DecisionTrees's f1_score: 0.6226466304249747
DecisionTrees's recall_score: 0.6225392659074552
              precision    recall  f1-score   support

           1       0.55      0.65      0.60     30257
           2       0.69      0.61      0.65     40215
           3       0.00      0.00      0.00       391

   micro avg       0.62      0.62      0.62     70863
   macro avg       0.42      0.42      0.42     70863
weighted avg       0.63      0.62      0.62     70863



In [32]:
#Check accuracy of test set
print("DecisionTrees's accuracy_score: ", accuracy_score(Y_test, DTC_pred))
print("DecisionTrees's f1_score:" , f1_score(Y_test, DTC_pred, average = 'weighted'))
print("DecisionTrees's recall_score:" , recall_score(Y_test, DTC_pred, average = 'weighted'))
print (classification_report(Y_test, DTC_pred))

DecisionTrees's accuracy_score:  0.6229730871859266
DecisionTrees's f1_score: 0.6230149553744464
DecisionTrees's recall_score: 0.6229730871859266
              precision    recall  f1-score   support

           1       0.55      0.66      0.60     70849
           2       0.70      0.60      0.65     93875
           3       0.00      0.00      0.00       922

   micro avg       0.62      0.62      0.62    165646
   macro avg       0.42      0.42      0.42    165646
weighted avg       0.63      0.62      0.62    165646



### K-Nearest Neighbor

In [33]:
from sklearn.neighbors import KNeighborsClassifier

k = 30
KNC = KNeighborsClassifier(n_neighbors = k).fit(X_train,Y_train)

#Validate
KNC_validate = KNC.predict(X_validate)

# Predict on test set
KNC_pred = KNC.predict(X_test)

# Check the accuracy of validation set
print("K-Nearest Neighbor's Accuracy: ", accuracy_score(Y_validate, KNC_validate))
print("K-Nearest Neighbor's f1_score: ",f1_score(Y_validate, KNC_validate, average = 'weighted'))
print("K-Nearest Neighbor's recall_score: ",recall_score(Y_validate, KNC_validate, average = 'weighted'))
print (classification_report(Y_validate, KNC_validate))

K-Nearest Neighbor's Accuracy:  0.6105725131591945
K-Nearest Neighbor's f1_score:  0.6039345106887882
K-Nearest Neighbor's recall_score:  0.6105725131591945
              precision    recall  f1-score   support

           1       0.56      0.48      0.52     30257
           2       0.64      0.71      0.68     40215
           3       0.00      0.00      0.00       391

   micro avg       0.61      0.61      0.61     70863
   macro avg       0.40      0.40      0.40     70863
weighted avg       0.60      0.61      0.60     70863



In [34]:
#Check the accuracy of test set
print("K-Nearest Neighbor's Accuracy: ", accuracy_score(Y_test, KNC_pred))
print("K-Nearest Neighbor's f1_score: ",f1_score(Y_test, KNC_pred, average = 'weighted'))
print("K-Nearest Neighbor's recall_score: ",recall_score(Y_test, KNC_pred, average = 'weighted'))
print (classification_report(Y_test, KNC_pred))

K-Nearest Neighbor's Accuracy:  0.6061540876326624
K-Nearest Neighbor's f1_score:  0.5992118915313438
K-Nearest Neighbor's recall_score:  0.6061540876326624
              precision    recall  f1-score   support

           1       0.55      0.47      0.51     70849
           2       0.64      0.71      0.67     93875
           3       0.00      0.00      0.00       922

   micro avg       0.61      0.61      0.61    165646
   macro avg       0.40      0.40      0.39    165646
weighted avg       0.60      0.61      0.60    165646



In [35]:
#Check for best k
ks = [1, 3, 5, 7, 10, 20, 30]
mean_accuracy = {}

for k in ks:
    acc_score= np.zeros(3)
    
    for n in range(3): # to generate 3 random train/test split for each k
        X1_train, X1_test, Y1_train, Y1_test = train_test_split( X_train, Y_train, test_size=0.2)
        # train  and predict
        KNC = KNeighborsClassifier(n_neighbors = k).fit(X1_train,Y1_train)
        KNC_pred = KNC.predict(X1_test)
        # evaluate accuracy
        acc_score[n] = accuracy_score(Y1_test, KNC_pred)
    
    mean_accuracy[k] = round(acc_score.mean(),3)
    
    
print('Mean accuracy score for each k \n', mean_accuracy)

Mean accuracy score for each k 
 {1: 0.587, 3: 0.591, 5: 0.596, 7: 0.6, 10: 0.597, 20: 0.608, 30: 0.612}


In [36]:
import re
highest = max(mean_accuracy.values())
k = {k for k, acc in mean_accuracy.items() if acc == highest}
k =re.sub("{|}","" , str(k))
print( "The best accuracy score was ",highest, "with k =",k) 

The best accuracy score was  0.612 with k = 30


### SGD Classifier

In [37]:
from sklearn.linear_model import SGDClassifier

SGD = SGDClassifier(loss="hinge", penalty="l2", max_iter=5).fit(X_train, Y_train)   
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
           early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
           l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
           n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
           random_state=None, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, warm_start=False)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=0.001,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [38]:
#Validate
SGD_validate = SGD.predict(X_validate)

#Predict on test set
SGD_pred = SGD.predict(X_test)

In [39]:
# Check the accuracy of validation set
print("SGD's Accuracy: ", accuracy_score(Y_validate, SGD_validate))
print("SGD's f1_score: ",f1_score(Y_validate, SGD_validate, average = 'weighted'))
print("SGD's recall_score: ",recall_score(Y_validate, SGD_validate, average = 'weighted'))
print (classification_report(Y_validate, SGD_validate))

SGD's Accuracy:  0.5637215472108152
SGD's f1_score:  0.4628950644833641
SGD's recall_score:  0.5637215472108152
              precision    recall  f1-score   support

           1       0.52      0.08      0.14     30257
           2       0.57      0.93      0.71     40215
           3       0.02      0.04      0.03       391

   micro avg       0.56      0.56      0.56     70863
   macro avg       0.37      0.35      0.29     70863
weighted avg       0.55      0.56      0.46     70863



In [40]:
#Check the accuracy of test set
print("SGD's Accuracy: ", accuracy_score(Y_test, SGD_pred))
print("SGD's f1_score: ",f1_score(Y_test, SGD_pred, average = 'weighted'))
print("SGD's recall_score: ",recall_score(Y_test, SGD_pred, average = 'weighted'))
print (classification_report(Y_test, SGD_pred))

SGD's Accuracy:  0.5625852722069956
SGD's f1_score:  0.46105180594414075
SGD's recall_score:  0.5625852722069956
              precision    recall  f1-score   support

           1       0.52      0.08      0.14     70849
           2       0.57      0.93      0.71     93875
           3       0.02      0.03      0.02       922

   micro avg       0.56      0.56      0.56    165646
   macro avg       0.37      0.35      0.29    165646
weighted avg       0.55      0.56      0.46    165646



### Kernel Approximation

In [41]:
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDClassifier
rbf_feature = RBFSampler(gamma=1, random_state=1)
X_features = rbf_feature.fit_transform(X_train)
kernel = SGDClassifier(max_iter=5).fit(X_features, Y_train)   
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
       n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
       random_state=None, shuffle=True, tol=0.001, validation_fraction=0.1,
       verbose=0, warm_start=False)
kernel.score(X_features, Y_train)

0.5680119950608573

In [42]:
X_test_features = rbf_feature.fit_transform(X_test)
kernel.score(X_test_features, Y_test)

0.5667205969356338