### NOTE: Uncomment the last line of the notebook (final cell) to create the results CSV file

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, recall_score, classification_report, log_loss, jaccard_similarity_score
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split

In [2]:
# Load the cleaned yearly extracts: 2015 and 2016 make up the training pool,
# 2017 is kept aside as the test set. The first CSV column is used as the index.
df1, df2 = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('df2015_cleaned.csv', 'df2016_cleaned.csv')
)
df_test = pd.read_csv('df2017_cleaned.csv', index_col=0)

# Stack the two training years row-wise (each frame keeps its own index labels).
df_train = pd.concat([df1, df2])

In [3]:
# Training years: P_ISEV is the target, every remaining column is a feature.
Y = df_train['P_ISEV'].values
X = df_train.drop(columns=['P_ISEV']).values

# Same feature/target separation for the held-out test frame.
Y_test = df_test['P_ISEV'].values
X_test = df_test.drop(columns=['P_ISEV']).values

# Sanity check: feature-matrix and target-vector shapes for both sets.
print('Train set:', X.shape, Y.shape)
print('Test set:', X_test.shape, Y_test.shape)

Train set: (354313, 14) (354313,)
Test set: (165646, 14) (165646,)


### Split Training Data

In [21]:
# Hold out 20% of the training rows as a validation set.
# The fixed random_state makes the shuffle (and therefore the split) repeatable.
X_train, X_validate, Y_train, Y_validate = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=4,
)

# Report the resulting shapes of both partitions.
print('Train set:', X_train.shape, Y_train.shape)
print('Validation set:', X_validate.shape, Y_validate.shape)

Train set: (283450, 14) (283450,)
Validation set: (70863, 14) (70863,)


### Random Forest Classifier

In [29]:
from sklearn.ensemble import RandomForestClassifier

# Train a 100-tree random forest on the training split.
# random_state is pinned (same seed value as the train/validation split) so the
# fitted forest, its predictions and the metrics printed below are reproducible
# on a fresh kernel; without it every run produces slightly different scores.
RFC = RandomForestClassifier(n_estimators=100, random_state=4).fit(X_train, Y_train)

# Predictions on the validation split (scored below)
RFC_validate = RFC.predict(X_validate)

# Predictions on the test set (X_test)
RFC_pred = RFC.predict(X_test)

# Validation-set metrics. F1 and recall are support-weighted averages over the
# classes; support-weighted recall is numerically the same as accuracy.
print("Random Forest Classifier's accuracy_score: ",accuracy_score(Y_validate, RFC_validate))
print("Random Forest Classifier's f1_score: ",f1_score(Y_validate, RFC_validate, average = 'weighted'))
print("Random Forest Classifier's recall_score: ",recall_score(Y_validate, RFC_validate, average = 'weighted'))
# Per-class precision / recall / f1 breakdown
print (classification_report(Y_validate, RFC_validate))

Random Forest Classifier's accuracy_score:  0.6655941746750773
Random Forest Classifier's f1_score:  0.6614079466461286
Random Forest Classifier's recall_score:  0.6655941746750773
              precision    recall  f1-score   support

           1       0.63      0.57      0.60     30257
           2       0.69      0.75      0.72     40215
           3       0.50      0.02      0.04       391

   micro avg       0.67      0.67      0.67     70863
   macro avg       0.61      0.44      0.45     70863
weighted avg       0.66      0.67      0.66     70863



In [30]:
# Score the test-set predictions with the same metrics used for validation.
test_accuracy = accuracy_score(Y_test, RFC_pred)
print("Random Forest Classifier's accuracy_score: ", test_accuracy)

# Support-weighted F1 across the classes
test_f1 = f1_score(Y_test, RFC_pred, average='weighted')
print("Random Forest Classifier's f1_score: ", test_f1)

# Support-weighted recall across the classes
test_recall = recall_score(Y_test, RFC_pred, average='weighted')
print("Random Forest Classifier's recall_score: ", test_recall)

# Per-class precision / recall / f1 breakdown
test_report = classification_report(Y_test, RFC_pred)
print(test_report)

Random Forest Classifier's accuracy_score:  0.6519988409016819
Random Forest Classifier's f1_score:  0.6477064043888481
Random Forest Classifier's recall_score:  0.6519988409016819
              precision    recall  f1-score   support

           1       0.61      0.55      0.58     70849
           2       0.68      0.73      0.71     93875
           3       0.97      0.03      0.06       922

   micro avg       0.65      0.65      0.65    165646
   macro avg       0.75      0.44      0.45    165646
weighted avg       0.65      0.65      0.65    165646



### Combine Random Forest predictions with the test data and compare results

In [31]:
# Per-class predicted probabilities for every test row.
# Columns of predict_proba are ordered like RFC.classes_.
RFC_pred_proba = RFC.predict_proba(X_test)

# Derive the probability column names from the classes the model actually
# learned instead of hard-coding three labels: the names then always match the
# predict_proba column order and count. For integer classes 1, 2, 3 this yields
# 'Probability(1)', 'Probability(2)', 'Probability(3)'.
proba_columns = ['Probability({})'.format(label) for label in RFC.classes_]

# Convert actuals, predictions and probabilities to dataframes (default 0..n-1 index)
df_actual = pd.DataFrame(Y_test, columns = ['Actual P_ISEV'])
df_result_pred = pd.DataFrame(RFC_pred, columns = ['Predicted P_ISEV'])
df_result_proba = pd.DataFrame(RFC_pred_proba, columns = proba_columns)

# reset_index() moves the test frame's index into a column and gives it a
# 0..n-1 index, so the column-wise concat below lines up row by row.
df_test1 = df_test.reset_index()

# Side-by-side frame: actual, predicted, class probabilities, then the original
# test columns (which still include P_ISEV itself).
df_result = pd.concat([df_actual, df_result_pred, df_result_proba, df_test1], axis = 1)

In [None]:
# Optional export: writes the combined results frame (df_result) to
# 'df_result_2017test.csv' in the working directory. Left disabled by default.
#df_result.to_csv('df_result_2017test.csv') #uncomment to create csv file