In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

In [2]:
data = pd.read_csv('image_bins_lungs2.csv')
data.shape

(2200, 10)

In [3]:
data.head()

Unnamed: 0,filename,Bins0,Bins1,Bins2,Bins3,Bins4,Bins5,Bins6,Bins7,Class
0,transformed_image_covid_1.png,4722,15567,4,7683,12061,1,8864,16634,0
1,transformed_image_covid_2.png,6556,13701,25,9956,9437,0,12114,13747,0
2,transformed_image_covid_3.png,10512,12249,1,11502,7743,2,9619,13908,0
3,transformed_image_covid_4.png,7987,11854,2,10419,11895,9,11931,11439,0
4,transformed_image_covid_5.png,7761,14159,4,10898,10560,9,9153,12992,0


In [4]:
data = data.drop(['filename'], axis=1)
data.head()

Unnamed: 0,Bins0,Bins1,Bins2,Bins3,Bins4,Bins5,Bins6,Bins7,Class
0,4722,15567,4,7683,12061,1,8864,16634,0
1,6556,13701,25,9956,9437,0,12114,13747,0
2,10512,12249,1,11502,7743,2,9619,13908,0
3,7987,11854,2,10419,11895,9,11931,11439,0
4,7761,14159,4,10898,10560,9,9153,12992,0


In [5]:
import numpy as np
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)

clean_dataset(data)

Unnamed: 0,Bins0,Bins1,Bins2,Bins3,Bins4,Bins5,Bins6,Bins7,Class
0,4722.0,15567.0,4.0,7683.0,12061.0,1.0,8864.0,16634.0,0.0
1,6556.0,13701.0,25.0,9956.0,9437.0,0.0,12114.0,13747.0,0.0
2,10512.0,12249.0,1.0,11502.0,7743.0,2.0,9619.0,13908.0,0.0
3,7987.0,11854.0,2.0,10419.0,11895.0,9.0,11931.0,11439.0,0.0
4,7761.0,14159.0,4.0,10898.0,10560.0,9.0,9153.0,12992.0,0.0
...,...,...,...,...,...,...,...,...,...
2195,7794.0,14496.0,0.0,9859.0,11028.0,0.0,9656.0,12703.0,1.0
2196,7098.0,9460.0,0.0,11614.0,15969.0,0.0,6950.0,14445.0,1.0
2197,6380.0,10094.0,1.0,11283.0,13771.0,0.0,7565.0,16442.0,1.0
2198,6929.0,13336.0,1.0,10413.0,11515.0,0.0,9202.0,14140.0,1.0


In [6]:
# separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['Class'], axis=1),
    data['Class'],
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((1540, 8), (660, 8))

In [7]:
# linear models benefit from feature scaling

scaler = StandardScaler()
scaler.fit(X_train)

StandardScaler()

In [8]:
# here I will do the model fitting and feature selection
# altogether in one line of code

# first I specify the Logistic Regression model, and I
# make sure I select the Lasso (l1) penalty.

# Then I use the selectFromModel class from sklearn, which
# will select the features which coefficients are non-zero

sel_ = SelectFromModel(
    LogisticRegression(C=0.1, penalty='l1', solver='liblinear', random_state=10))

sel_.fit(scaler.transform(X_train), y_train)

SelectFromModel(estimator=LogisticRegression(C=0.1, penalty='l1',
                                             random_state=10,
                                             solver='liblinear'))

In [9]:
# this command let's me visualise the index of the
# features that were selected

sel_.get_support()

array([ True,  True,  True,  True,  True,  True, False,  True])

In [10]:
# Now I make a list with the selected features
selected_feat = X_train.columns[(sel_.get_support())]

print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(
    np.sum(sel_.estimator_.coef_ == 0)))

total features: 8
selected features: 7
features with coefficients shrank to zero: 1


In [11]:
# the number of features which coefficient was shrank to zero:
np.sum(sel_.estimator_.coef_ == 0)

1

In [12]:
# we can identify the removed features like this:

removed_feats = X_train.columns[(sel_.estimator_.coef_ == 0).ravel().tolist()]
removed_feats

Index(['Bins6'], dtype='object')

In [13]:
# we can then remove the features from the training and testing set
# like this:

X_train_selected = sel_.transform(X_train)
X_test_selected = sel_.transform(X_test)

X_train_selected.shape, X_test_selected.shape

((1540, 7), (660, 7))

In [14]:
# create a function to build random forests and
# compare its performance in train and test sets
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, mean_squared_error
import scikitplot as skplt
import matplotlib.pyplot as plt

def run_randomForests(X_train, X_test, y_train, y_test):
    
    rf = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train, y_train)
    
    print('Train set')
    pred = rf.predict_proba(X_train)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = rf.predict_proba(X_test)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))
    y_pred = rf.predict(X_test_selected)
    
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    
    print('Metrics:')
    print('Accuracy: {0:.3f}'.format(metrics.accuracy_score(y_test, y_pred)))
    print('F1 Score: {0:.3f}'.format(metrics.f1_score(y_test, y_pred)))
    print('Precision: {0:.3f}'.format(metrics.precision_score(y_test, y_pred)))
    print('Recall: {0:.3f}'.format(metrics.recall_score(y_test, y_pred)))
    
    print('After Cross Validation:')
    accuracies = cross_val_score(estimator = rf, X = X_train, y = y_train, cv = 10)
    print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

In [15]:
run_randomForests(X_train_selected, X_test_selected, y_train, y_test)

Train set
Random Forests roc-auc: 0.9525135942897237
Test set
Random Forests roc-auc: 0.930846863062004
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.86      0.86       332
           1       0.86      0.86      0.86       328

    accuracy                           0.86       660
   macro avg       0.86      0.86      0.86       660
weighted avg       0.86      0.86      0.86       660

Confusion Matrix:
[[286  46]
 [ 47 281]]
Metrics:
Accuracy: 0.859
F1 Score: 0.858
Precision: 0.859
Recall: 0.857
After Cross Validation:
Accuracy: 86.17 %
Standard Deviation: 1.05 %


In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

def run_logistic(X_train, X_test, y_train, y_test):
    
    scaler = StandardScaler().fit(X_train)
    
    # function to train and test the performance of logistic regression
    logit = LogisticRegression(penalty='l1', random_state=44, max_iter=1000, solver='liblinear')
    logit.fit(X_train, y_train)
    
    print('Train set')
    pred = logit.predict_proba(scaler.transform(X_train))
    print('Logistic Regression roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = logit.predict_proba(scaler.transform(X_test))
    print('Logistic Regression roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))
    
    y_pred = logit.predict(X_test)
    
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    
    print('Metrics:')
    print('Accuracy: {0:.3f}'.format(metrics.accuracy_score(y_test, y_pred)))
    print('F1 Score: {0:.3f}'.format(metrics.f1_score(y_test, y_pred)))
    print('Precision: {0:.3f}'.format(metrics.precision_score(y_test, y_pred)))
    print('Recall: {0:.3f}'.format(metrics.recall_score(y_test, y_pred)))
    
    print('After Cross Validation:')
    accuracies = cross_val_score(estimator = logit, X = X_train, y = y_train, cv = 10)
    print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

In [17]:
run_logistic(X_train_selected, X_test_selected, y_train, y_test)

Train set
Logistic Regression roc-auc: 0.8433350874352332
Test set
Logistic Regression roc-auc: 0.8369177931237144
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.77      0.81       332
           1       0.79      0.87      0.83       328

    accuracy                           0.82       660
   macro avg       0.82      0.82      0.82       660
weighted avg       0.82      0.82      0.82       660

Confusion Matrix:
[[256  76]
 [ 42 286]]
Metrics:
Accuracy: 0.821
F1 Score: 0.829
Precision: 0.790
Recall: 0.872
After Cross Validation:
Accuracy: 80.65 %
Standard Deviation: 1.48 %


In [18]:
# create a function to build logistic regression
# and compare its performance in train and test sets
from sklearn.svm import SVC

def run_kernel_SVM(X_train, X_test, y_train, y_test):
    
    scaler = StandardScaler().fit(X_train)
    
    # function to train and test the performance of logistic regression
    classifier = SVC(kernel = 'rbf', random_state = 0, probability=True)
    classifier.fit(X_train, y_train)
    
    print('Train set')
    pred = classifier.predict_proba(scaler.transform(X_train))
    print('Kernel SVM roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = classifier.predict_proba(scaler.transform(X_test))
    print('Kernel SVM roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))
    
    y_pred = classifier.predict(X_test)
    
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    
    print('Metrics:')
    print('Accuracy: {0:.3f}'.format(metrics.accuracy_score(y_test, y_pred)))
    print('F1 Score: {0:.3f}'.format(metrics.f1_score(y_test, y_pred)))
    print('Precision: {0:.3f}'.format(metrics.precision_score(y_test, y_pred)))
    print('Recall: {0:.3f}'.format(metrics.recall_score(y_test, y_pred)))
    
    print('After Cross Validation:')
    accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
    print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

In [19]:
run_kernel_SVM(X_train_selected, X_test_selected, y_train, y_test)

Train set
Kernel SVM roc-auc: 0.34711989961139894
Test set
Kernel SVM roc-auc: 0.33740449603291217
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.79      0.82       332
           1       0.80      0.87      0.83       328

    accuracy                           0.83       660
   macro avg       0.83      0.83      0.83       660
weighted avg       0.83      0.83      0.83       660

Confusion Matrix:
[[262  70]
 [ 43 285]]
Metrics:
Accuracy: 0.829
F1 Score: 0.835
Precision: 0.803
Recall: 0.869
After Cross Validation:
Accuracy: 82.60 %
Standard Deviation: 2.72 %


In [20]:
# create a function to build logistic regression
# and compare its performance in train and test sets
from sklearn.neighbors import KNeighborsClassifier

def run_knn(X_train, X_test, y_train, y_test):
    
    scaler = StandardScaler().fit(X_train)
    
    # function to train and test the performance of logistic regression
    classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
    classifier.fit(X_train, y_train)
    
    print('Train set')
    pred = classifier.predict_proba(scaler.transform(X_train))
    print('KNN roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = classifier.predict_proba(scaler.transform(X_test))
    print('KNN roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))
    
    y_pred = classifier.predict(X_test)
    
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    
    print('Metrics:')
    print('Accuracy: {0:.3f}'.format(metrics.accuracy_score(y_test, y_pred)))
    print('F1 Score: {0:.3f}'.format(metrics.f1_score(y_test, y_pred)))
    print('Precision: {0:.3f}'.format(metrics.precision_score(y_test, y_pred)))
    print('Recall: {0:.3f}'.format(metrics.recall_score(y_test, y_pred)))
    
    print('After Cross Validation:')
    accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
    print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

In [21]:
run_knn(X_train_selected, X_test_selected, y_train, y_test)

Train set
KNN roc-auc: 0.5
Test set
KNN roc-auc: 0.5
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.82      0.85       332
           1       0.83      0.89      0.86       328

    accuracy                           0.85       660
   macro avg       0.86      0.85      0.85       660
weighted avg       0.86      0.85      0.85       660

Confusion Matrix:
[[271  61]
 [ 35 293]]
Metrics:
Accuracy: 0.855
F1 Score: 0.859
Precision: 0.828
Recall: 0.893
After Cross Validation:
Accuracy: 85.91 %
Standard Deviation: 2.13 %


In [22]:
from sklearn.tree import DecisionTreeClassifier


def run_decision_tree(X_train, X_test, y_train, y_test):

    # function to train and test the performance of logistic regression
    classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
    classifier.fit(X_train, y_train)
    
    print('Train set')
    pred = classifier.predict_proba(X_train)
    print('Decision Tree roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = classifier.predict_proba(X_test)
    print('Decision Tree roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))
    
    y_pred = classifier.predict(X_test)
    
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    
    print('Metrics:')
    print('Accuracy: {0:.3f}'.format(metrics.accuracy_score(y_test, y_pred)))
    print('F1 Score: {0:.3f}'.format(metrics.f1_score(y_test, y_pred)))
    print('Precision: {0:.3f}'.format(metrics.precision_score(y_test, y_pred)))
    print('Recall: {0:.3f}'.format(metrics.recall_score(y_test, y_pred)))
    
    print('After Cross Validation:')
    accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
    print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

In [23]:
run_decision_tree(X_train_selected, X_test_selected, y_train, y_test)

Train set
Decision Tree roc-auc: 1.0
Test set
Decision Tree roc-auc: 0.8347965030855128
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.84      0.84       332
           1       0.84      0.83      0.83       328

    accuracy                           0.83       660
   macro avg       0.83      0.83      0.83       660
weighted avg       0.83      0.83      0.83       660

Confusion Matrix:
[[280  52]
 [ 57 271]]
Metrics:
Accuracy: 0.835
F1 Score: 0.833
Precision: 0.839
Recall: 0.826
After Cross Validation:
Accuracy: 84.29 %
Standard Deviation: 1.56 %


In [24]:
# create a function to build logistic regression
# and compare its performance in train and test sets
from sklearn.svm import SVC

def run_linear_SVM(X_train, X_test, y_train, y_test):
    
    scaler = StandardScaler().fit(X_train)
    
    # function to train and test the performance of logistic regression
    classifier = SVC(kernel = 'linear', random_state = 0, probability=True)
    classifier.fit(X_train, y_train)
    
    print('Train set')
    pred = classifier.predict_proba(scaler.transform(X_train))
    print('Kernel SVM roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = classifier.predict_proba(scaler.transform(X_test))
    print('Kernel SVM roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))
    
    y_pred = classifier.predict(X_test)
    
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    
    print('Metrics:')
    print('Accuracy: {0:.3f}'.format(metrics.accuracy_score(y_test, y_pred)))
    print('F1 Score: {0:.3f}'.format(metrics.f1_score(y_test, y_pred)))
    print('Precision: {0:.3f}'.format(metrics.precision_score(y_test, y_pred)))
    print('Recall: {0:.3f}'.format(metrics.recall_score(y_test, y_pred)))
    
    print('After Cross Validation:')
    accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
    print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

In [25]:
run_linear_SVM(X_train_selected, X_test_selected, y_train, y_test)

Train set
Kernel SVM roc-auc: 0.8437904792746115
Test set
Kernel SVM roc-auc: 0.838322803408757
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.76      0.81       332
           1       0.78      0.88      0.83       328

    accuracy                           0.82       660
   macro avg       0.82      0.82      0.82       660
weighted avg       0.82      0.82      0.82       660

Confusion Matrix:
[[252  80]
 [ 40 288]]
Metrics:
Accuracy: 0.818
F1 Score: 0.828
Precision: 0.783
Recall: 0.878
After Cross Validation:
Accuracy: 81.17 %
Standard Deviation: 1.69 %


In [26]:
# create a function to build logistic regression
# and compare its performance in train and test sets
from sklearn.naive_bayes import GaussianNB

def run_naive_bayes(X_train, X_test, y_train, y_test):
    
    scaler = StandardScaler().fit(X_train)
    
    # function to train and test the performance of logistic regression
    classifier = GaussianNB()
    classifier.fit(X_train, y_train)
    
    print('Train set')
    pred = classifier.predict_proba(scaler.transform(X_train))
    print('Kernel SVM roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = classifier.predict_proba(scaler.transform(X_test))
    print('Kernel SVM roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))
    
    y_pred = classifier.predict(X_test)
    
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    
    print('Metrics:')
    print('Accuracy: {0:.3f}'.format(metrics.accuracy_score(y_test, y_pred)))
    print('F1 Score: {0:.3f}'.format(metrics.f1_score(y_test, y_pred)))
    print('Precision: {0:.3f}'.format(metrics.precision_score(y_test, y_pred)))
    print('Recall: {0:.3f}'.format(metrics.recall_score(y_test, y_pred)))
    
    print('After Cross Validation:')
    accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
    print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

In [27]:
run_naive_bayes(X_train_selected, X_test_selected, y_train, y_test)

Train set
Kernel SVM roc-auc: 0.25784454609240065
Test set
Kernel SVM roc-auc: 0.2487419188950926
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.53      0.67       332
           1       0.67      0.95      0.78       328

    accuracy                           0.74       660
   macro avg       0.79      0.74      0.73       660
weighted avg       0.79      0.74      0.72       660

Confusion Matrix:
[[176 156]
 [ 18 310]]
Metrics:
Accuracy: 0.736
F1 Score: 0.781
Precision: 0.665
Recall: 0.945
After Cross Validation:
Accuracy: 71.62 %
Standard Deviation: 1.51 %


In [30]:
# For comparison, I will fit a logistic regression with a
# Ridge regularisation, and evaluate the coefficients

l1_logit = LogisticRegression(C=0.5, penalty='l2', max_iter=300, random_state=10)
l1_logit.fit(scaler.transform(X_train), y_train)

# I count the number of coefficients with zero values
# and it is zero, as expected
np.sum(l1_logit.coef_ == 0)

0

In [32]:
# here I will do the model fitting and feature selection
# altogether in one line of code

# first I specify the Logistic Regression model, and I
# make sure I select the Lasso (l1) penalty.

# Then I use the selectFromModel class from sklearn, which
# will select the features which coefficients are non-zero

sel_ = SelectFromModel(
    LogisticRegression(C=0.5, penalty='l2', solver='liblinear', random_state=10))

sel_.fit(scaler.transform(X_train), y_train)

SelectFromModel(estimator=LogisticRegression(C=0.5, random_state=10,
                                             solver='liblinear'))

In [33]:
# this command let's me visualise the index of the
# features that were selected

sel_.get_support()

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False,  True,  True, False, False, False, False, False,
       False,  True,  True, False, False, False, False, False, False,
        True, False, False, False, False, False,  True, False, False,
        True, False, False, False, False, False, False,  True,  True,
       False, False, False, False, False, False,  True,  True, False,
       False, False, False, False, False,  True])

In [34]:
# Now I make a list with the selected features
selected_feat = X_train.columns[(sel_.get_support())]

print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(
    np.sum(sel_.estimator_.coef_ == 0)))

total features: 96
selected features: 13
features with coefficients shrank to zero: 0


In [35]:
# we can identify the removed features like this:

removed_feats = X_train.columns[(sel_.estimator_.coef_ == 0).ravel().tolist()]
removed_feats

Index([], dtype='object')

In [36]:
# we can then remove the features from the training and testing set
# like this:

X_train_ridge = sel_.transform(X_train)
X_test_ridge = sel_.transform(X_test)

X_train_ridge.shape, X_test_ridge.shape

((16799, 13), (7200, 13))

In [39]:
run_randomForests(X_train_ridge, X_test_ridge, y_train, y_test)

Train set
Random Forests roc-auc: 0.999772997745144
Test set
Random Forests roc-auc: 0.9998485143018937
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.98      0.93      3559
           1       0.98      0.87      0.92      3641

    accuracy                           0.93      7200
   macro avg       0.93      0.93      0.93      7200
weighted avg       0.93      0.93      0.93      7200

Confusion Matrix:
[[3488   71]
 [ 456 3185]]
Metrics:
Accuracy: 0.927
F1 Score: 0.924
Precision: 0.978
Recall: 0.875
After Cross Validation:
Accuracy: 99.57 %
Standard Deviation: 0.23 %


In [40]:
run_logistic(X_train_ridge, X_test_ridge, y_train, y_test)

Train set
Logistic Regression roc-auc: 0.999753252872149
Test set
Logistic Regression roc-auc: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3559
           1       1.00      1.00      1.00      3641

    accuracy                           1.00      7200
   macro avg       1.00      1.00      1.00      7200
weighted avg       1.00      1.00      1.00      7200

Confusion Matrix:
[[3558    1]
 [   0 3641]]
Metrics:
Accuracy: 1.000
F1 Score: 1.000
Precision: 1.000
Recall: 1.000
After Cross Validation:
Accuracy: 100.00 %
Standard Deviation: 0.00 %


In [41]:
run_kernel_SVM(X_train_ridge, X_test_ridge, y_train, y_test)

Train set
Kernel SVM roc-auc: 0.9986423946944832
Test set
Kernel SVM roc-auc: 0.998696050004634
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.96      3559
           1       0.97      0.94      0.96      3641

    accuracy                           0.96      7200
   macro avg       0.96      0.96      0.96      7200
weighted avg       0.96      0.96      0.96      7200

Confusion Matrix:
[[3467   92]
 [ 222 3419]]
Metrics:
Accuracy: 0.956
F1 Score: 0.956
Precision: 0.974
Recall: 0.939
After Cross Validation:
Accuracy: 95.77 %
Standard Deviation: 0.47 %


In [42]:
run_knn(X_train_ridge, X_test_ridge, y_train, y_test)

Train set
KNN roc-auc: 0.510707022371097
Test set
KNN roc-auc: 0.5102941207111817
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      3559
           1       0.98      0.99      0.99      3641

    accuracy                           0.99      7200
   macro avg       0.99      0.99      0.99      7200
weighted avg       0.99      0.99      0.99      7200

Confusion Matrix:
[[3490   69]
 [  32 3609]]
Metrics:
Accuracy: 0.986
F1 Score: 0.986
Precision: 0.981
Recall: 0.991
After Cross Validation:
Accuracy: 98.63 %
Standard Deviation: 0.31 %


In [43]:
run_decision_tree(X_train_ridge, X_test_ridge, y_train, y_test)

Train set
Decision Tree roc-auc: 1.0
Test set
Decision Tree roc-auc: 0.9983236251553924
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3559
           1       1.00      1.00      1.00      3641

    accuracy                           1.00      7200
   macro avg       1.00      1.00      1.00      7200
weighted avg       1.00      1.00      1.00      7200

Confusion Matrix:
[[3550    9]
 [   3 3638]]
Metrics:
Accuracy: 0.998
F1 Score: 0.998
Precision: 0.998
Recall: 0.999
After Cross Validation:
Accuracy: 99.84 %
Standard Deviation: 0.10 %


In [44]:
run_linear_SVM(X_train_ridge, X_test_ridge, y_train, y_test)

Train set
Kernel SVM roc-auc: 0.9979560158503278
Test set
Kernel SVM roc-auc: 0.997918480012724
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3559
           1       1.00      1.00      1.00      3641

    accuracy                           1.00      7200
   macro avg       1.00      1.00      1.00      7200
weighted avg       1.00      1.00      1.00      7200

Confusion Matrix:
[[3558    1]
 [   0 3641]]
Metrics:
Accuracy: 1.000
F1 Score: 1.000
Precision: 1.000
Recall: 1.000
After Cross Validation:
Accuracy: 100.00 %
Standard Deviation: 0.00 %


In [45]:
run_naive_bayes(X_train_ridge, X_test_ridge, y_train, y_test)

Train set
Kernel SVM roc-auc: 0.34678172035816884
Test set
Kernel SVM roc-auc: 0.34544318595645007
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      3559
           1       0.95      0.91      0.93      3641

    accuracy                           0.93      7200
   macro avg       0.93      0.93      0.93      7200
weighted avg       0.93      0.93      0.93      7200

Confusion Matrix:
[[3387  172]
 [ 337 3304]]
Metrics:
Accuracy: 0.929
F1 Score: 0.928
Precision: 0.951
Recall: 0.907
After Cross Validation:
Accuracy: 93.33 %
Standard Deviation: 0.50 %
