In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

In [2]:
data = pd.read_csv('image_bins_stats.csv')
data.shape

(24000, 98)

In [3]:
data.head()

Unnamed: 0,filename,rmean_bins0,rmean_bins1,rmean_bins2,rmean_bins3,rmean_bins4,rmean_bins5,rmean_bins6,rmean_bins7,rstd_bins0,...,bskew_bins7,bkurto_bins0,bkurto_bins1,bkurto_bins2,bkurto_bins3,bkurto_bins4,bkurto_bins5,bkurto_bins6,bkurto_bins7,class
0,clean_p_1.jpg,1.485688,116.416667,0.0,0.0,0.0,128.758621,0.0,159.770015,11.472993,...,7.358843,47.62159,3.092351,0.0,0.0,0.0,8.421707,0.0,9.181035,1
1,clean_p_2.jpg,0.191129,91.0,0.0,0.0,0.0,122.485714,0.0,149.839854,0.62628,...,8.43008,7.154429,0.840896,0.0,0.0,0.0,15.029039,0.0,10.51699,1
2,clean_p_3.jpg,1.218065,115.0,0.0,0.0,121.730769,135.517857,0.0,154.189458,10.132966,...,7.949709,43.39424,0.420448,0.0,0.0,7.701832,13.599319,0.0,10.354453,1
3,clean_p_4.jpg,0.148524,98.0,0.0,0.0,0.0,129.906667,0.0,157.583812,0.57329,...,8.987692,6.601182,2.619225,0.0,0.0,0.0,12.78728,0.0,10.943418,1
4,clean_p_5.jpg,0.183128,0.0,0.0,0.0,0.0,0.0,0.0,158.600042,0.602004,...,7.204324,6.86972,0.0,0.0,0.0,0.0,0.0,0.0,8.924785,1


In [4]:
data = data.drop(['filename'], axis=1)
data.head()

Unnamed: 0,rmean_bins0,rmean_bins1,rmean_bins2,rmean_bins3,rmean_bins4,rmean_bins5,rmean_bins6,rmean_bins7,rstd_bins0,rstd_bins1,...,bskew_bins7,bkurto_bins0,bkurto_bins1,bkurto_bins2,bkurto_bins3,bkurto_bins4,bkurto_bins5,bkurto_bins6,bkurto_bins7,class
0,1.485688,116.416667,0.0,0.0,0.0,128.758621,0.0,159.770015,11.472993,1.943118,...,7.358843,47.62159,3.092351,0.0,0.0,0.0,8.421707,0.0,9.181035,1
1,0.191129,91.0,0.0,0.0,0.0,122.485714,0.0,149.839854,0.62628,0.0,...,8.43008,7.154429,0.840896,0.0,0.0,0.0,15.029039,0.0,10.51699,1
2,1.218065,115.0,0.0,0.0,121.730769,135.517857,0.0,154.189458,10.132966,0.0,...,7.949709,43.39424,0.420448,0.0,0.0,7.701832,13.599319,0.0,10.354453,1
3,0.148524,98.0,0.0,0.0,0.0,129.906667,0.0,157.583812,0.57329,2.0,...,8.987692,6.601182,2.619225,0.0,0.0,0.0,12.78728,0.0,10.943418,1
4,0.183128,0.0,0.0,0.0,0.0,0.0,0.0,158.600042,0.602004,0.0,...,7.204324,6.86972,0.0,0.0,0.0,0.0,0.0,0.0,8.924785,1


In [5]:
import numpy as np
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)

clean_dataset(data)

Unnamed: 0,rmean_bins0,rmean_bins1,rmean_bins2,rmean_bins3,rmean_bins4,rmean_bins5,rmean_bins6,rmean_bins7,rstd_bins0,rstd_bins1,...,bskew_bins7,bkurto_bins0,bkurto_bins1,bkurto_bins2,bkurto_bins3,bkurto_bins4,bkurto_bins5,bkurto_bins6,bkurto_bins7,class
0,1.485688,116.416667,0.0,0.0,0.000000,128.758621,0.000000,159.770015,11.472993,1.943118,...,7.358843,47.621590,3.092351,0.0,0.0,0.000000,8.421707,0.000000,9.181035,1.0
1,0.191129,91.000000,0.0,0.0,0.000000,122.485714,0.000000,149.839854,0.626280,0.000000,...,8.430080,7.154429,0.840896,0.0,0.0,0.000000,15.029039,0.000000,10.516990,1.0
2,1.218065,115.000000,0.0,0.0,121.730769,135.517857,0.000000,154.189458,10.132966,0.000000,...,7.949709,43.394240,0.420448,0.0,0.0,7.701832,13.599319,0.000000,10.354453,1.0
3,0.148524,98.000000,0.0,0.0,0.000000,129.906667,0.000000,157.583812,0.573290,2.000000,...,8.987692,6.601182,2.619225,0.0,0.0,0.000000,12.787280,0.000000,10.943418,1.0
4,0.183128,0.000000,0.0,0.0,0.000000,0.000000,0.000000,158.600042,0.602004,0.000000,...,7.204324,6.869720,0.000000,0.0,0.0,0.000000,0.000000,0.000000,8.924785,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23994,0.167535,0.000000,0.0,0.0,0.000000,0.000000,0.000000,145.580835,0.587364,0.000000,...,6.817875,7.277478,0.000000,0.0,0.0,0.000000,0.000000,0.000000,11.111529,0.0
23995,0.182460,0.000000,0.0,0.0,0.000000,0.000000,0.000000,142.290042,0.590877,0.000000,...,5.262551,8.170191,0.000000,0.0,0.0,0.000000,0.000000,0.000000,10.111068,0.0
23996,0.222222,0.000000,0.0,0.0,0.000000,0.000000,0.000000,145.978772,0.689536,0.000000,...,5.407731,7.859719,0.000000,0.0,0.0,0.000000,0.000000,0.000000,9.734824,0.0
23997,0.217425,0.000000,0.0,0.0,0.000000,0.000000,0.000000,145.915418,0.665380,0.000000,...,5.073114,7.887594,0.000000,0.0,0.0,0.000000,0.000000,0.000000,9.672132,0.0


In [6]:
# separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['class'], axis=1),
    data['class'],
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((16799, 96), (7200, 96))

In [7]:
# linear models benefit from feature scaling

scaler = StandardScaler()
scaler.fit(X_train)

StandardScaler()

In [8]:
# here I will do the model fitting and feature selection
# altogether in one line of code

# first I specify the Logistic Regression model, and I
# make sure I select the Lasso (l1) penalty.

# Then I use the selectFromModel class from sklearn, which
# will select the features which coefficients are non-zero

sel_ = SelectFromModel(
    LogisticRegression(C=0.1, penalty='l1', solver='liblinear', random_state=10))

sel_.fit(scaler.transform(X_train), y_train)

SelectFromModel(estimator=LogisticRegression(C=0.1, penalty='l1',
                                             random_state=10,
                                             solver='liblinear'))

In [9]:
# this command let's me visualise the index of the
# features that were selected

sel_.get_support()

array([False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
        True, False,  True,  True, False, False, False, False,  True,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False,  True,
       False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False,  True])

In [10]:
# Now I make a list with the selected features
selected_feat = X_train.columns[(sel_.get_support())]

print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(
    np.sum(sel_.estimator_.coef_ == 0)))

total features: 96
selected features: 13
features with coefficients shrank to zero: 83


In [11]:
# the number of features which coefficient was shrank to zero:
np.sum(sel_.estimator_.coef_ == 0)

83

In [12]:
# we can identify the removed features like this:

removed_feats = X_train.columns[(sel_.estimator_.coef_ == 0).ravel().tolist()]
removed_feats

Index(['rmean_bins0', 'rmean_bins1', 'rmean_bins2', 'rmean_bins3',
       'rmean_bins4', 'rmean_bins6', 'rmean_bins7', 'rstd_bins0', 'rstd_bins1',
       'rstd_bins2', 'rstd_bins3', 'rstd_bins4', 'rstd_bins5', 'rstd_bins6',
       'rstd_bins7', 'rskew_bins0', 'rskew_bins1', 'rskew_bins2',
       'rskew_bins3', 'rskew_bins4', 'rskew_bins5', 'rskew_bins6',
       'rkurto_bins0', 'rkurto_bins1', 'rkurto_bins2', 'rkurto_bins3',
       'rkurto_bins4', 'rkurto_bins5', 'rkurto_bins6', 'rkurto_bins7',
       'gmean_bins0', 'gmean_bins1', 'gmean_bins2', 'gmean_bins3',
       'gmean_bins4', 'gmean_bins6', 'gmean_bins7', 'gstd_bins0', 'gstd_bins1',
       'gstd_bins2', 'gstd_bins3', 'gstd_bins4', 'gstd_bins6', 'gskew_bins1',
       'gskew_bins2', 'gskew_bins3', 'gskew_bins4', 'gskew_bins6',
       'gkurto_bins0', 'gkurto_bins1', 'gkurto_bins2', 'gkurto_bins3',
       'gkurto_bins4', 'gkurto_bins5', 'gkurto_bins6', 'gkurto_bins7',
       'bmean_bins0', 'bmean_bins1', 'bmean_bins2', 'bmean_bins3',


In [13]:
# we can then remove the features from the training and testing set
# like this:

X_train_selected = sel_.transform(X_train)
X_test_selected = sel_.transform(X_test)

X_train_selected.shape, X_test_selected.shape

((16799, 13), (7200, 13))

In [16]:
# create a function to build random forests and
# compare its performance in train and test sets
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, mean_squared_error
import scikitplot as skplt
import matplotlib.pyplot as plt

def run_randomForests(X_train, X_test, y_train, y_test):
    
    rf = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train, y_train)
    
    print('Train set')
    pred = rf.predict_proba(X_train)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = rf.predict_proba(X_test)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))
    y_pred = rf.predict(X_test_selected)
    
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    
    print('Metrics:')
    print('Accuracy: {0:.3f}'.format(metrics.accuracy_score(y_test, y_pred)))
    print('F1 Score: {0:.3f}'.format(metrics.f1_score(y_test, y_pred)))
    print('Precision: {0:.3f}'.format(metrics.precision_score(y_test, y_pred)))
    print('Recall: {0:.3f}'.format(metrics.recall_score(y_test, y_pred)))
    
    print('After Cross Validation:')
    accuracies = cross_val_score(estimator = rf, X = X_train, y = y_train, cv = 10)
    print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

In [17]:
run_randomForests(X_train_selected, X_test_selected, y_train, y_test)

Train set
Random Forests roc-auc: 0.9996115093474184
Test set
Random Forests roc-auc: 0.9995160637733953
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      3559
           1       0.98      1.00      0.99      3641

    accuracy                           0.99      7200
   macro avg       0.99      0.99      0.99      7200
weighted avg       0.99      0.99      0.99      7200

Confusion Matrix:
[[3491   68]
 [  16 3625]]
Metrics:
Accuracy: 0.988
F1 Score: 0.989
Precision: 0.982
Recall: 0.996
After Cross Validation:
Accuracy: 99.12 %
Standard Deviation: 0.27 %


In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

def run_logistic(X_train, X_test, y_train, y_test):
    
    scaler = StandardScaler().fit(X_train)
    
    # function to train and test the performance of logistic regression
    logit = LogisticRegression(penalty='l1', random_state=44, max_iter=1000, solver='liblinear')
    logit.fit(X_train, y_train)
    
    print('Train set')
    pred = logit.predict_proba(scaler.transform(X_train))
    print('Logistic Regression roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = logit.predict_proba(scaler.transform(X_test))
    print('Logistic Regression roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))
    
    y_pred = logit.predict(X_test)
    
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    
    print('Metrics:')
    print('Accuracy: {0:.3f}'.format(metrics.accuracy_score(y_test, y_pred)))
    print('F1 Score: {0:.3f}'.format(metrics.f1_score(y_test, y_pred)))
    print('Precision: {0:.3f}'.format(metrics.precision_score(y_test, y_pred)))
    print('Recall: {0:.3f}'.format(metrics.recall_score(y_test, y_pred)))
    
    print('After Cross Validation:')
    accuracies = cross_val_score(estimator = logit, X = X_train, y = y_train, cv = 10)
    print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

In [19]:
run_logistic(X_train_selected, X_test_selected, y_train, y_test)

Train set
Logistic Regression roc-auc: 0.9995560025831342
Test set
Logistic Regression roc-auc: 0.9994040122025087
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3559
           1       1.00      1.00      1.00      3641

    accuracy                           1.00      7200
   macro avg       1.00      1.00      1.00      7200
weighted avg       1.00      1.00      1.00      7200

Confusion Matrix:
[[3557    2]
 [   0 3641]]
Metrics:
Accuracy: 1.000
F1 Score: 1.000
Precision: 0.999
Recall: 1.000
After Cross Validation:
Accuracy: 99.99 %
Standard Deviation: 0.02 %


In [20]:
# create a function to build logistic regression
# and compare its performance in train and test sets
from sklearn.svm import SVC

def run_kernel_SVM(X_train, X_test, y_train, y_test):
    
    scaler = StandardScaler().fit(X_train)
    
    # function to train and test the performance of logistic regression
    classifier = SVC(kernel = 'rbf', random_state = 0, probability=True)
    classifier.fit(X_train, y_train)
    
    print('Train set')
    pred = classifier.predict_proba(scaler.transform(X_train))
    print('Kernel SVM roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = classifier.predict_proba(scaler.transform(X_test))
    print('Kernel SVM roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))
    
    y_pred = classifier.predict(X_test)
    
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    
    print('Metrics:')
    print('Accuracy: {0:.3f}'.format(metrics.accuracy_score(y_test, y_pred)))
    print('F1 Score: {0:.3f}'.format(metrics.f1_score(y_test, y_pred)))
    print('Precision: {0:.3f}'.format(metrics.precision_score(y_test, y_pred)))
    print('Recall: {0:.3f}'.format(metrics.recall_score(y_test, y_pred)))
    
    print('After Cross Validation:')
    accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
    print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

In [21]:
run_kernel_SVM(X_train_selected, X_test_selected, y_train, y_test)

Train set
Kernel SVM roc-auc: 0.9970259090153984
Test set
Kernel SVM roc-auc: 0.9966295782655141
Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      3559
           1       1.00      0.93      0.96      3641

    accuracy                           0.96      7200
   macro avg       0.96      0.96      0.96      7200
weighted avg       0.96      0.96      0.96      7200

Confusion Matrix:
[[3546   13]
 [ 262 3379]]
Metrics:
Accuracy: 0.962
F1 Score: 0.961
Precision: 0.996
Recall: 0.928
After Cross Validation:
Accuracy: 96.32 %
Standard Deviation: 0.61 %


In [22]:
# create a function to build logistic regression
# and compare its performance in train and test sets
from sklearn.neighbors import KNeighborsClassifier

def run_knn(X_train, X_test, y_train, y_test):
    
    scaler = StandardScaler().fit(X_train)
    
    # function to train and test the performance of logistic regression
    classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
    classifier.fit(X_train, y_train)
    
    print('Train set')
    pred = classifier.predict_proba(scaler.transform(X_train))
    print('KNN roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = classifier.predict_proba(scaler.transform(X_test))
    print('KNN roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))
    
    y_pred = classifier.predict(X_test)
    
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    
    print('Metrics:')
    print('Accuracy: {0:.3f}'.format(metrics.accuracy_score(y_test, y_pred)))
    print('F1 Score: {0:.3f}'.format(metrics.f1_score(y_test, y_pred)))
    print('Precision: {0:.3f}'.format(metrics.precision_score(y_test, y_pred)))
    print('Recall: {0:.3f}'.format(metrics.recall_score(y_test, y_pred)))
    
    print('After Cross Validation:')
    accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
    print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

In [23]:
run_knn(X_train_selected, X_test_selected, y_train, y_test)

Train set
KNN roc-auc: 0.5
Test set
KNN roc-auc: 0.5
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97      3559
           1       0.99      0.96      0.97      3641

    accuracy                           0.97      7200
   macro avg       0.97      0.97      0.97      7200
weighted avg       0.97      0.97      0.97      7200

Confusion Matrix:
[[3527   32]
 [ 163 3478]]
Metrics:
Accuracy: 0.973
F1 Score: 0.973
Precision: 0.991
Recall: 0.955
After Cross Validation:
Accuracy: 97.58 %
Standard Deviation: 0.34 %


In [24]:
from sklearn.tree import DecisionTreeClassifier


def run_decision_tree(X_train, X_test, y_train, y_test):

    # function to train and test the performance of logistic regression
    classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
    classifier.fit(X_train, y_train)
    
    print('Train set')
    pred = classifier.predict_proba(X_train)
    print('Decision Tree roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = classifier.predict_proba(X_test)
    print('Decision Tree roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))
    
    y_pred = classifier.predict(X_test)
    
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    
    print('Metrics:')
    print('Accuracy: {0:.3f}'.format(metrics.accuracy_score(y_test, y_pred)))
    print('F1 Score: {0:.3f}'.format(metrics.f1_score(y_test, y_pred)))
    print('Precision: {0:.3f}'.format(metrics.precision_score(y_test, y_pred)))
    print('Recall: {0:.3f}'.format(metrics.recall_score(y_test, y_pred)))
    
    print('After Cross Validation:')
    accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
    print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

In [25]:
run_decision_tree(X_train_selected, X_test_selected, y_train, y_test)

Train set
Decision Tree roc-auc: 1.0
Test set
Decision Tree roc-auc: 0.9933198125466736
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3559
           1       0.99      0.99      0.99      3641

    accuracy                           0.99      7200
   macro avg       0.99      0.99      0.99      7200
weighted avg       0.99      0.99      0.99      7200

Confusion Matrix:
[[3531   28]
 [  20 3621]]
Metrics:
Accuracy: 0.993
F1 Score: 0.993
Precision: 0.992
Recall: 0.995
After Cross Validation:
Accuracy: 99.38 %
Standard Deviation: 0.15 %


In [26]:
# create a function to build logistic regression
# and compare its performance in train and test sets
from sklearn.svm import SVC

def run_linear_SVM(X_train, X_test, y_train, y_test):
    
    scaler = StandardScaler().fit(X_train)
    
    # function to train and test the performance of logistic regression
    classifier = SVC(kernel = 'linear', random_state = 0, probability=True)
    classifier.fit(X_train, y_train)
    
    print('Train set')
    pred = classifier.predict_proba(scaler.transform(X_train))
    print('Kernel SVM roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = classifier.predict_proba(scaler.transform(X_test))
    print('Kernel SVM roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))
    
    y_pred = classifier.predict(X_test)
    
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    
    print('Metrics:')
    print('Accuracy: {0:.3f}'.format(metrics.accuracy_score(y_test, y_pred)))
    print('F1 Score: {0:.3f}'.format(metrics.f1_score(y_test, y_pred)))
    print('Precision: {0:.3f}'.format(metrics.precision_score(y_test, y_pred)))
    print('Recall: {0:.3f}'.format(metrics.recall_score(y_test, y_pred)))
    
    print('After Cross Validation:')
    accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
    print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

In [27]:
run_linear_SVM(X_train_selected, X_test_selected, y_train, y_test)

Train set
Kernel SVM roc-auc: 0.9872949183812436
Test set
Kernel SVM roc-auc: 0.988150855060753
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3559
           1       1.00      1.00      1.00      3641

    accuracy                           1.00      7200
   macro avg       1.00      1.00      1.00      7200
weighted avg       1.00      1.00      1.00      7200

Confusion Matrix:
[[3559    0]
 [   0 3641]]
Metrics:
Accuracy: 1.000
F1 Score: 1.000
Precision: 1.000
Recall: 1.000
After Cross Validation:
Accuracy: 100.00 %
Standard Deviation: 0.00 %


In [28]:
# create a function to build logistic regression
# and compare its performance in train and test sets
from sklearn.naive_bayes import GaussianNB

def run_naive_bayes(X_train, X_test, y_train, y_test):
    
    scaler = StandardScaler().fit(X_train)
    
    # function to train and test the performance of logistic regression
    classifier = GaussianNB()
    classifier.fit(X_train, y_train)
    
    print('Train set')
    pred = classifier.predict_proba(scaler.transform(X_train))
    print('Kernel SVM roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = classifier.predict_proba(scaler.transform(X_test))
    print('Kernel SVM roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))
    
    y_pred = classifier.predict(X_test)
    
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    
    print('Metrics:')
    print('Accuracy: {0:.3f}'.format(metrics.accuracy_score(y_test, y_pred)))
    print('F1 Score: {0:.3f}'.format(metrics.f1_score(y_test, y_pred)))
    print('Precision: {0:.3f}'.format(metrics.precision_score(y_test, y_pred)))
    print('Recall: {0:.3f}'.format(metrics.recall_score(y_test, y_pred)))
    
    print('After Cross Validation:')
    accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
    print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

In [29]:
run_naive_bayes(X_train_selected, X_test_selected, y_train, y_test)

Train set
Kernel SVM roc-auc: 0.8121673066859287
Test set
Kernel SVM roc-auc: 0.8134402309435351
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      3559
           1       0.96      0.94      0.95      3641

    accuracy                           0.95      7200
   macro avg       0.95      0.95      0.95      7200
weighted avg       0.95      0.95      0.95      7200

Confusion Matrix:
[[3433  126]
 [ 227 3414]]
Metrics:
Accuracy: 0.951
F1 Score: 0.951
Precision: 0.964
Recall: 0.938
After Cross Validation:
Accuracy: 95.09 %
Standard Deviation: 0.61 %


In [30]:
# For comparison, I will fit a logistic regression with a
# Ridge regularisation, and evaluate the coefficients

l1_logit = LogisticRegression(C=0.5, penalty='l2', max_iter=300, random_state=10)
l1_logit.fit(scaler.transform(X_train), y_train)

# I count the number of coefficients with zero values
# and it is zero, as expected
np.sum(l1_logit.coef_ == 0)

0

In [32]:
# here I will do the model fitting and feature selection
# altogether in one line of code

# first I specify the Logistic Regression model, and I
# make sure I select the Lasso (l1) penalty.

# Then I use the selectFromModel class from sklearn, which
# will select the features which coefficients are non-zero

sel_ = SelectFromModel(
    LogisticRegression(C=0.5, penalty='l2', solver='liblinear', random_state=10))

sel_.fit(scaler.transform(X_train), y_train)

SelectFromModel(estimator=LogisticRegression(C=0.5, random_state=10,
                                             solver='liblinear'))

In [33]:
# this command let's me visualise the index of the
# features that were selected

sel_.get_support()

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False,  True,  True, False, False, False, False, False,
       False,  True,  True, False, False, False, False, False, False,
        True, False, False, False, False, False,  True, False, False,
        True, False, False, False, False, False, False,  True,  True,
       False, False, False, False, False, False,  True,  True, False,
       False, False, False, False, False,  True])

In [34]:
# Now I make a list with the selected features
selected_feat = X_train.columns[(sel_.get_support())]

print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(
    np.sum(sel_.estimator_.coef_ == 0)))

total features: 96
selected features: 13
features with coefficients shrank to zero: 0


In [35]:
# we can identify the removed features like this:

removed_feats = X_train.columns[(sel_.estimator_.coef_ == 0).ravel().tolist()]
removed_feats

Index([], dtype='object')

In [36]:
# we can then remove the features from the training and testing set
# like this:

X_train_ridge = sel_.transform(X_train)
X_test_ridge = sel_.transform(X_test)

X_train_ridge.shape, X_test_ridge.shape

((16799, 13), (7200, 13))

In [39]:
run_randomForests(X_train_ridge, X_test_ridge, y_train, y_test)

Train set
Random Forests roc-auc: 0.999772997745144
Test set
Random Forests roc-auc: 0.9998485143018937
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.98      0.93      3559
           1       0.98      0.87      0.92      3641

    accuracy                           0.93      7200
   macro avg       0.93      0.93      0.93      7200
weighted avg       0.93      0.93      0.93      7200

Confusion Matrix:
[[3488   71]
 [ 456 3185]]
Metrics:
Accuracy: 0.927
F1 Score: 0.924
Precision: 0.978
Recall: 0.875
After Cross Validation:
Accuracy: 99.57 %
Standard Deviation: 0.23 %


In [40]:
run_logistic(X_train_ridge, X_test_ridge, y_train, y_test)

Train set
Logistic Regression roc-auc: 0.999753252872149
Test set
Logistic Regression roc-auc: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3559
           1       1.00      1.00      1.00      3641

    accuracy                           1.00      7200
   macro avg       1.00      1.00      1.00      7200
weighted avg       1.00      1.00      1.00      7200

Confusion Matrix:
[[3558    1]
 [   0 3641]]
Metrics:
Accuracy: 1.000
F1 Score: 1.000
Precision: 1.000
Recall: 1.000
After Cross Validation:
Accuracy: 100.00 %
Standard Deviation: 0.00 %


In [41]:
run_kernel_SVM(X_train_ridge, X_test_ridge, y_train, y_test)

Train set
Kernel SVM roc-auc: 0.9986423946944832
Test set
Kernel SVM roc-auc: 0.998696050004634
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.96      3559
           1       0.97      0.94      0.96      3641

    accuracy                           0.96      7200
   macro avg       0.96      0.96      0.96      7200
weighted avg       0.96      0.96      0.96      7200

Confusion Matrix:
[[3467   92]
 [ 222 3419]]
Metrics:
Accuracy: 0.956
F1 Score: 0.956
Precision: 0.974
Recall: 0.939
After Cross Validation:
Accuracy: 95.77 %
Standard Deviation: 0.47 %


In [42]:
run_knn(X_train_ridge, X_test_ridge, y_train, y_test)

Train set
KNN roc-auc: 0.510707022371097
Test set
KNN roc-auc: 0.5102941207111817
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      3559
           1       0.98      0.99      0.99      3641

    accuracy                           0.99      7200
   macro avg       0.99      0.99      0.99      7200
weighted avg       0.99      0.99      0.99      7200

Confusion Matrix:
[[3490   69]
 [  32 3609]]
Metrics:
Accuracy: 0.986
F1 Score: 0.986
Precision: 0.981
Recall: 0.991
After Cross Validation:
Accuracy: 98.63 %
Standard Deviation: 0.31 %


In [43]:
run_decision_tree(X_train_ridge, X_test_ridge, y_train, y_test)

Train set
Decision Tree roc-auc: 1.0
Test set
Decision Tree roc-auc: 0.9983236251553924
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3559
           1       1.00      1.00      1.00      3641

    accuracy                           1.00      7200
   macro avg       1.00      1.00      1.00      7200
weighted avg       1.00      1.00      1.00      7200

Confusion Matrix:
[[3550    9]
 [   3 3638]]
Metrics:
Accuracy: 0.998
F1 Score: 0.998
Precision: 0.998
Recall: 0.999
After Cross Validation:
Accuracy: 99.84 %
Standard Deviation: 0.10 %


In [44]:
run_linear_SVM(X_train_ridge, X_test_ridge, y_train, y_test)

Train set
Kernel SVM roc-auc: 0.9979560158503278
Test set
Kernel SVM roc-auc: 0.997918480012724
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3559
           1       1.00      1.00      1.00      3641

    accuracy                           1.00      7200
   macro avg       1.00      1.00      1.00      7200
weighted avg       1.00      1.00      1.00      7200

Confusion Matrix:
[[3558    1]
 [   0 3641]]
Metrics:
Accuracy: 1.000
F1 Score: 1.000
Precision: 1.000
Recall: 1.000
After Cross Validation:
Accuracy: 100.00 %
Standard Deviation: 0.00 %


In [45]:
run_naive_bayes(X_train_ridge, X_test_ridge, y_train, y_test)

Train set
Kernel SVM roc-auc: 0.34678172035816884
Test set
Kernel SVM roc-auc: 0.34544318595645007
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      3559
           1       0.95      0.91      0.93      3641

    accuracy                           0.93      7200
   macro avg       0.93      0.93      0.93      7200
weighted avg       0.93      0.93      0.93      7200

Confusion Matrix:
[[3387  172]
 [ 337 3304]]
Metrics:
Accuracy: 0.929
F1 Score: 0.928
Precision: 0.951
Recall: 0.907
After Cross Validation:
Accuracy: 93.33 %
Standard Deviation: 0.50 %
