In [1]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score, classification_report
#df to store metrics
metrics_df_valid = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-score', 'AUC-ROC'])
metrics_df_test = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-score', 'AUC-ROC'])


In [2]:
# function to scale the data
def scale(X, Y, Z):
    scaler = StandardScaler()
    X_ = scaler.fit_transform(X)
    X__ = scaler.fit_transform(Y)
    X___ = scaler.fit_transform(Z)
    return X_, X__, X___

#function to get the model parameters
def info(X, Y, model_name, list_name):    
    accuracy = accuracy_score(X, Y)
    precision = precision_score(X, Y)
    recall = recall_score(X, Y)
    f1 = f1_score(X, Y)
    auc = roc_auc_score(X, Y)
    #print
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)
    print("AUC-ROC:", auc)
    print(classification_report(X, Y))

    #adding results to the metrics_df
    list_name.loc[len(list_name)] = [model_name, accuracy, precision, recall, f1, auc]
    
    #printing confusion matrix
    confusion_mat = confusion_matrix(X, Y)
    confusion_df = pd.DataFrame(confusion_mat, index=['Actual Negative', 'Actual Positive'],
                             columns=['Predicted Negative', 'Predicted Positive'])
    print(confusion_df)

#function to train the model
def train_model(X, Y, X_test, k=1):
    svm = SVC(kernel='linear',C=k)
    svm.fit(X, Y)
    predicted = svm.predict(X_test)
    return predicted
    

In [3]:
# Read the data
train = pd.read_csv('random_forest_train.csv')
test = pd.read_csv('random_forest_test.csv')

#converting date to datetime and then converting to integer
train['Date'] = pd.to_datetime(train['Date'])
train['Date'] = train['Date'].astype('int64')

test['Date'] = pd.to_datetime(test['Date'])
test['Date'] = test['Date'].astype('int64')

#trainning data
x_train = train.drop("Target", axis=1)
y_train = train["Target"]

#test data
x_test = test.drop("Target", axis=1)
y_test = test["Target"]

#spliting data into train and validation
X_train, X_validation, y_train, y_validation = train_test_split(x_train, y_train, test_size=0.2, random_state=15)

#scaling the data
X_train_scaled, X_validation_scaled, X_test_scaled = scale(X_train, X_validation, x_test)

<h4>Random Forest</h4>

In [4]:
# Fit the data
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=100, criterion='gini', random_state=1)
classifier.fit(X_train_scaled, y_train)

In [5]:
#model prediction on validation set
predicted = classifier.predict(X_validation_scaled)
info(y_validation, predicted, 'Random Forest', metrics_df_valid)

print('\n________________________________________________________________\n')

#model prediction on test set
predicted = classifier.predict(X_test_scaled)
info(y_test, predicted, 'Random Forest', metrics_df_test)

Accuracy: 0.5200945626477541
Precision: 0.5225225225225225
Recall: 0.5446009389671361
F1-score: 0.5333333333333334
AUC-ROC: 0.5199195171026156
              precision    recall  f1-score   support

           0       0.52      0.50      0.51       210
           1       0.52      0.54      0.53       213

    accuracy                           0.52       423
   macro avg       0.52      0.52      0.52       423
weighted avg       0.52      0.52      0.52       423

                 Predicted Negative  Predicted Positive
Actual Negative                 104                 106
Actual Positive                  97                 116

________________________________________________________________

Accuracy: 0.5666666666666667
Precision: 0.6923076923076923
Recall: 0.5
F1-score: 0.5806451612903226
AUC-ROC: 0.5833333333333334
              precision    recall  f1-score   support

           0       0.47      0.67      0.55        12
           1       0.69      0.50      0.58        18

   

<h4>SVM using linear kernel</h4>

In [6]:
#training the model
#using c = 0.1 as hyperparameter
svm = SVC(kernel='linear',C=0.1)
svm.fit(X_train_scaled, y_train)

In [7]:
#model prediction on validation set
predicted = svm.predict(X_validation_scaled)
info(y_validation, predicted, 'SVM-linear', metrics_df_valid)

print('\n________________________________________________________________\n')

#model prediction on test set
predicted = svm.predict(X_test_scaled)
info(y_test, predicted, 'SVM-linear', metrics_df_test)

Accuracy: 0.5271867612293144
Precision: 0.5214521452145214
Recall: 0.7417840375586855
F1-score: 0.6124031007751939
AUC-ROC: 0.5256539235412475
              precision    recall  f1-score   support

           0       0.54      0.31      0.39       210
           1       0.52      0.74      0.61       213

    accuracy                           0.53       423
   macro avg       0.53      0.53      0.50       423
weighted avg       0.53      0.53      0.50       423

                 Predicted Negative  Predicted Positive
Actual Negative                  65                 145
Actual Positive                  55                 158

________________________________________________________________

Accuracy: 0.6666666666666666
Precision: 0.6818181818181818
Recall: 0.8333333333333334
F1-score: 0.7499999999999999
AUC-ROC: 0.625
              precision    recall  f1-score   support

           0       0.62      0.42      0.50        12
           1       0.68      0.83      0.75        18

 

<h4>DecisionTree</h4>

In [8]:
from sklearn import tree

clf = tree.DecisionTreeClassifier(random_state=42)
clf.fit(X_train_scaled, y_train)

predicted = clf.predict(X_validation_scaled)
info(y_validation, predicted, 'Decision Tree', metrics_df_valid)

predicted = clf.predict(X_test_scaled)
info(y_test, predicted, 'Decision Tree', metrics_df_test)

Accuracy: 0.5035460992907801
Precision: 0.5057915057915058
Recall: 0.6150234741784038
F1-score: 0.5550847457627119
AUC-ROC: 0.5027498323272971
              precision    recall  f1-score   support

           0       0.50      0.39      0.44       210
           1       0.51      0.62      0.56       213

    accuracy                           0.50       423
   macro avg       0.50      0.50      0.50       423
weighted avg       0.50      0.50      0.50       423

                 Predicted Negative  Predicted Positive
Actual Negative                  82                 128
Actual Positive                  82                 131
Accuracy: 0.6
Precision: 0.6363636363636364
Recall: 0.7777777777777778
F1-score: 0.7000000000000001
AUC-ROC: 0.5555555555555556
              precision    recall  f1-score   support

           0       0.50      0.33      0.40        12
           1       0.64      0.78      0.70        18

    accuracy                           0.60        30
   macro avg    

<h2>now we use other models</h2>

<h4>Quadratic Discriminant Analysis</h4>

In [9]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train_scaled, y_train)

predicted = qda.predict(X_validation_scaled)
info(y_validation, predicted, 'QDA', metrics_df_valid)

predicted = qda.predict(X_test_scaled)
info(y_test, predicted, 'QDA', metrics_df_test)

Accuracy: 0.5437352245862884
Precision: 0.5588235294117647
Recall: 0.4460093896713615
F1-score: 0.49608355091383816
AUC-ROC: 0.5444332662642521
              precision    recall  f1-score   support

           0       0.53      0.64      0.58       210
           1       0.56      0.45      0.50       213

    accuracy                           0.54       423
   macro avg       0.55      0.54      0.54       423
weighted avg       0.55      0.54      0.54       423

                 Predicted Negative  Predicted Positive
Actual Negative                 135                  75
Actual Positive                 118                  95
Accuracy: 0.36666666666666664
Precision: 0.4
Recall: 0.1111111111111111
F1-score: 0.1739130434782609
AUC-ROC: 0.4305555555555556
              precision    recall  f1-score   support

           0       0.36      0.75      0.49        12
           1       0.40      0.11      0.17        18

    accuracy                           0.37        30
   macro avg  

<h5>Logistic Regression model</h5>

In [10]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)

predicted = logreg.predict(X_validation_scaled)
info(y_validation, predicted, 'LogisticRegression', metrics_df_valid)

predicted = logreg.predict(X_test_scaled)
info(y_test, predicted, 'LogisticRegression', metrics_df_test)


Accuracy: 0.4728132387706856
Precision: 0.48188405797101447
Recall: 0.6244131455399061
F1-score: 0.5439672801635992
AUC-ROC: 0.4717303822937625
              precision    recall  f1-score   support

           0       0.46      0.32      0.38       210
           1       0.48      0.62      0.54       213

    accuracy                           0.47       423
   macro avg       0.47      0.47      0.46       423
weighted avg       0.47      0.47      0.46       423

                 Predicted Negative  Predicted Positive
Actual Negative                  67                 143
Actual Positive                  80                 133
Accuracy: 0.6333333333333333
Precision: 0.6666666666666666
Recall: 0.7777777777777778
F1-score: 0.717948717948718
AUC-ROC: 0.5972222222222221
              precision    recall  f1-score   support

           0       0.56      0.42      0.48        12
           1       0.67      0.78      0.72        18

    accuracy                           0.63        30
 

<h4>Adaboost model</h4>

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

base_classifier = LogisticRegression()
adaboost = AdaBoostClassifier(base_estimator=base_classifier)
adaboost.fit(X_train_scaled, y_train)

predicted = adaboost.predict(X_validation_scaled)
info(y_validation, predicted, 'AdaBoost', metrics_df_valid)

print('\n________________________________________________________________\n')

predicted = adaboost.predict(X_test_scaled)
info(y_test, predicted, 'AdaBoost', metrics_df_test)



Accuracy: 0.491725768321513
Precision: 0.4963768115942029
Recall: 0.6431924882629108
F1-score: 0.5603271983640081
AUC-ROC: 0.49064386317907444
              precision    recall  f1-score   support

           0       0.48      0.34      0.40       210
           1       0.50      0.64      0.56       213

    accuracy                           0.49       423
   macro avg       0.49      0.49      0.48       423
weighted avg       0.49      0.49      0.48       423

                 Predicted Negative  Predicted Positive
Actual Negative                  71                 139
Actual Positive                  76                 137

________________________________________________________________

Accuracy: 0.5666666666666667
Precision: 0.631578947368421
Recall: 0.6666666666666666
F1-score: 0.6486486486486486
AUC-ROC: 0.5416666666666666
              precision    recall  f1-score   support

           0       0.45      0.42      0.43        12
           1       0.63      0.67      0.65 

<h3>models metrics for validation data</h3>

In [12]:
metrics_df_valid

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score,AUC-ROC
0,Random Forest,0.520095,0.522523,0.544601,0.533333,0.51992
1,SVM-linear,0.527187,0.521452,0.741784,0.612403,0.525654
2,Decision Tree,0.503546,0.505792,0.615023,0.555085,0.50275
3,QDA,0.543735,0.558824,0.446009,0.496084,0.544433
4,LogisticRegression,0.472813,0.481884,0.624413,0.543967,0.47173
5,AdaBoost,0.491726,0.496377,0.643192,0.560327,0.490644


<h3>models metrics for test data</h3>

In [13]:
metrics_df_test

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score,AUC-ROC
0,Random Forest,0.566667,0.692308,0.5,0.580645,0.583333
1,SVM-linear,0.666667,0.681818,0.833333,0.75,0.625
2,Decision Tree,0.6,0.636364,0.777778,0.7,0.555556
3,QDA,0.366667,0.4,0.111111,0.173913,0.430556
4,LogisticRegression,0.633333,0.666667,0.777778,0.717949,0.597222
5,AdaBoost,0.566667,0.631579,0.666667,0.648649,0.541667
