In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold, f_classif, SelectKBest

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score

In [2]:
data = pd.read_csv('image_bins_stats.csv')
data.shape

(24000, 98)

In [3]:
data.head()

Unnamed: 0,filename,rmean_bins0,rmean_bins1,rmean_bins2,rmean_bins3,rmean_bins4,rmean_bins5,rmean_bins6,rmean_bins7,rstd_bins0,...,bskew_bins7,bkurto_bins0,bkurto_bins1,bkurto_bins2,bkurto_bins3,bkurto_bins4,bkurto_bins5,bkurto_bins6,bkurto_bins7,class
0,clean_p_1.jpg,1.485688,116.416667,0.0,0.0,0.0,128.758621,0.0,159.770015,11.472993,...,7.358843,47.62159,3.092351,0.0,0.0,0.0,8.421707,0.0,9.181035,1
1,clean_p_2.jpg,0.191129,91.0,0.0,0.0,0.0,122.485714,0.0,149.839854,0.62628,...,8.43008,7.154429,0.840896,0.0,0.0,0.0,15.029039,0.0,10.51699,1
2,clean_p_3.jpg,1.218065,115.0,0.0,0.0,121.730769,135.517857,0.0,154.189458,10.132966,...,7.949709,43.39424,0.420448,0.0,0.0,7.701832,13.599319,0.0,10.354453,1
3,clean_p_4.jpg,0.148524,98.0,0.0,0.0,0.0,129.906667,0.0,157.583812,0.57329,...,8.987692,6.601182,2.619225,0.0,0.0,0.0,12.78728,0.0,10.943418,1
4,clean_p_5.jpg,0.183128,0.0,0.0,0.0,0.0,0.0,0.0,158.600042,0.602004,...,7.204324,6.86972,0.0,0.0,0.0,0.0,0.0,0.0,8.924785,1


In [4]:
data = data.drop(['filename'], axis=1)
data.head()

Unnamed: 0,rmean_bins0,rmean_bins1,rmean_bins2,rmean_bins3,rmean_bins4,rmean_bins5,rmean_bins6,rmean_bins7,rstd_bins0,rstd_bins1,...,bskew_bins7,bkurto_bins0,bkurto_bins1,bkurto_bins2,bkurto_bins3,bkurto_bins4,bkurto_bins5,bkurto_bins6,bkurto_bins7,class
0,1.485688,116.416667,0.0,0.0,0.0,128.758621,0.0,159.770015,11.472993,1.943118,...,7.358843,47.62159,3.092351,0.0,0.0,0.0,8.421707,0.0,9.181035,1
1,0.191129,91.0,0.0,0.0,0.0,122.485714,0.0,149.839854,0.62628,0.0,...,8.43008,7.154429,0.840896,0.0,0.0,0.0,15.029039,0.0,10.51699,1
2,1.218065,115.0,0.0,0.0,121.730769,135.517857,0.0,154.189458,10.132966,0.0,...,7.949709,43.39424,0.420448,0.0,0.0,7.701832,13.599319,0.0,10.354453,1
3,0.148524,98.0,0.0,0.0,0.0,129.906667,0.0,157.583812,0.57329,2.0,...,8.987692,6.601182,2.619225,0.0,0.0,0.0,12.78728,0.0,10.943418,1
4,0.183128,0.0,0.0,0.0,0.0,0.0,0.0,158.600042,0.602004,0.0,...,7.204324,6.86972,0.0,0.0,0.0,0.0,0.0,0.0,8.924785,1


In [5]:
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)

clean_dataset(data)

Unnamed: 0,rmean_bins0,rmean_bins1,rmean_bins2,rmean_bins3,rmean_bins4,rmean_bins5,rmean_bins6,rmean_bins7,rstd_bins0,rstd_bins1,...,bskew_bins7,bkurto_bins0,bkurto_bins1,bkurto_bins2,bkurto_bins3,bkurto_bins4,bkurto_bins5,bkurto_bins6,bkurto_bins7,class
0,1.485688,116.416667,0.0,0.0,0.000000,128.758621,0.000000,159.770015,11.472993,1.943118,...,7.358843,47.621590,3.092351,0.0,0.0,0.000000,8.421707,0.000000,9.181035,1.0
1,0.191129,91.000000,0.0,0.0,0.000000,122.485714,0.000000,149.839854,0.626280,0.000000,...,8.430080,7.154429,0.840896,0.0,0.0,0.000000,15.029039,0.000000,10.516990,1.0
2,1.218065,115.000000,0.0,0.0,121.730769,135.517857,0.000000,154.189458,10.132966,0.000000,...,7.949709,43.394240,0.420448,0.0,0.0,7.701832,13.599319,0.000000,10.354453,1.0
3,0.148524,98.000000,0.0,0.0,0.000000,129.906667,0.000000,157.583812,0.573290,2.000000,...,8.987692,6.601182,2.619225,0.0,0.0,0.000000,12.787280,0.000000,10.943418,1.0
4,0.183128,0.000000,0.0,0.0,0.000000,0.000000,0.000000,158.600042,0.602004,0.000000,...,7.204324,6.869720,0.000000,0.0,0.0,0.000000,0.000000,0.000000,8.924785,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23994,0.167535,0.000000,0.0,0.0,0.000000,0.000000,0.000000,145.580835,0.587364,0.000000,...,6.817875,7.277478,0.000000,0.0,0.0,0.000000,0.000000,0.000000,11.111529,0.0
23995,0.182460,0.000000,0.0,0.0,0.000000,0.000000,0.000000,142.290042,0.590877,0.000000,...,5.262551,8.170191,0.000000,0.0,0.0,0.000000,0.000000,0.000000,10.111068,0.0
23996,0.222222,0.000000,0.0,0.0,0.000000,0.000000,0.000000,145.978772,0.689536,0.000000,...,5.407731,7.859719,0.000000,0.0,0.0,0.000000,0.000000,0.000000,9.734824,0.0
23997,0.217425,0.000000,0.0,0.0,0.000000,0.000000,0.000000,145.915418,0.665380,0.000000,...,5.073114,7.887594,0.000000,0.0,0.0,0.000000,0.000000,0.000000,9.672132,0.0


In [6]:
# separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['class'], axis=1),
    data['class'],
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((16799, 96), (7200, 96))

In [7]:
X_train

Unnamed: 0,rmean_bins0,rmean_bins1,rmean_bins2,rmean_bins3,rmean_bins4,rmean_bins5,rmean_bins6,rmean_bins7,rstd_bins0,rstd_bins1,...,bskew_bins6,bskew_bins7,bkurto_bins0,bkurto_bins1,bkurto_bins2,bkurto_bins3,bkurto_bins4,bkurto_bins5,bkurto_bins6,bkurto_bins7
15382,0.642674,0.000000,0.0,0.0,0.000000,0.000000,0.0,180.492704,1.398502,0.000000,...,0.0,7.233725,1.438841,0.000000,0.000000,0.0,0.000000,0.000000,0.0,12.239542
6786,0.499830,0.000000,0.0,0.0,0.000000,167.798913,0.0,183.028467,1.066107,0.000000,...,0.0,5.489400,1.556517,0.000000,0.000000,0.0,0.000000,4.494643,0.0,6.639987
16217,0.887297,0.000000,0.0,0.0,0.000000,0.000000,0.0,173.211795,1.848103,0.000000,...,0.0,11.129454,1.682182,0.000000,0.000000,0.0,0.000000,0.000000,0.0,17.794553
1047,0.782657,102.000000,101.0,0.0,0.000000,120.445183,104.0,144.223099,3.948451,0.838919,...,0.0,14.591023,21.604183,4.765680,0.420448,0.0,0.000000,12.471256,0.0,16.088283
6173,0.434416,0.000000,0.0,0.0,0.000000,167.104167,0.0,186.553922,1.029542,0.000000,...,0.0,6.289114,1.464988,0.000000,0.000000,0.0,0.000000,5.050620,0.0,7.963030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13123,0.580405,0.000000,0.0,0.0,0.000000,0.000000,0.0,154.206890,1.275322,0.000000,...,0.0,15.224769,6.592747,0.000000,0.000000,0.0,0.000000,0.000000,0.0,20.038586
19648,0.495913,0.000000,0.0,0.0,0.000000,0.000000,0.0,187.846749,1.143590,0.000000,...,0.0,7.137580,1.480392,0.000000,0.000000,0.0,0.000000,0.000000,0.0,12.405009
9845,1.401092,110.896552,0.0,0.0,116.866667,125.394161,0.0,149.918135,10.539109,2.355900,...,0.0,9.183102,43.518837,6.420400,0.000000,0.0,1.808437,11.588348,0.0,11.857062
10799,0.272432,0.000000,0.0,0.0,0.000000,132.720280,0.0,158.943398,0.772915,0.000000,...,0.0,10.071514,6.627811,0.000000,0.000000,0.0,0.000000,4.098590,0.0,12.355050


In [8]:
X_test

Unnamed: 0,rmean_bins0,rmean_bins1,rmean_bins2,rmean_bins3,rmean_bins4,rmean_bins5,rmean_bins6,rmean_bins7,rstd_bins0,rstd_bins1,...,bskew_bins6,bskew_bins7,bkurto_bins0,bkurto_bins1,bkurto_bins2,bkurto_bins3,bkurto_bins4,bkurto_bins5,bkurto_bins6,bkurto_bins7
5118,0.687943,0.000000,0.0,0.0,0.000000,0.000000,0.0,171.546210,1.607495,0.000000,...,0.0,12.683617,2.380153,0.000000,0.0,0.0,0.000000,0.000000,0.0,15.627832
10283,0.300245,102.666667,0.0,0.0,0.000000,120.374269,127.0,149.654260,2.505910,0.346944,...,0.0,12.604009,20.749182,1.192292,0.0,0.0,0.000000,14.662717,0.0,16.205029
6208,0.301754,0.000000,0.0,0.0,0.000000,158.352381,0.0,184.260842,0.828748,0.000000,...,0.0,7.358766,1.414617,0.000000,0.0,0.0,0.000000,5.024962,0.0,8.991926
3361,1.699742,98.807692,0.0,0.0,0.000000,119.115942,0.0,143.099246,11.242337,2.569207,...,0.0,8.709662,46.133754,4.357952,0.0,0.0,0.000000,12.654396,0.0,11.013844
7067,0.691308,0.000000,0.0,0.0,155.333333,167.072815,0.0,181.798058,1.450263,0.000000,...,0.0,9.183701,1.934695,0.000000,0.0,0.0,1.742281,10.050546,0.0,10.575285
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17047,0.439822,0.000000,0.0,0.0,0.000000,0.000000,0.0,180.908768,1.066909,0.000000,...,0.0,8.094289,2.084459,0.000000,0.0,0.0,0.000000,0.000000,0.0,13.360762
11170,0.206696,0.000000,0.0,0.0,118.363636,132.035533,0.0,151.538431,0.665276,0.000000,...,0.0,6.948614,5.681294,0.000000,0.0,0.0,2.288082,8.089594,0.0,7.973840
3518,0.550650,99.764706,0.0,0.0,0.000000,118.115385,0.0,140.164166,5.077873,1.798328,...,0.0,7.836443,31.417834,3.161896,0.0,0.0,0.000000,13.009942,0.0,10.270765
19023,0.582020,0.000000,0.0,0.0,0.000000,0.000000,0.0,177.482669,1.221176,0.000000,...,0.0,5.025964,1.759347,0.000000,0.0,0.0,0.000000,0.000000,0.0,7.958754


In [9]:
# I keep a copy of the dataset with all the variables
# to compare the performance of machine learning models
# at the end of the notebook

X_train_original = X_train.copy()
X_test_original = X_test.copy()

In [10]:
# find and remove correlated features
def correlation(dataset, threshold):
    
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            # we are interested in absolute coeff value
            if abs(corr_matrix.iloc[i, j]) > threshold:
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    
    return col_corr


corr_features = correlation(X_train, 0.8)
print('correlated features: ', len(set(corr_features)))

correlated features:  72


In [11]:
# remove correlated features
X_train.drop(labels=corr_features, axis=1, inplace=True)
X_test.drop(labels=corr_features, axis=1, inplace=True)

X_train.shape, X_test.shape

((16799, 24), (7200, 24))

In [12]:
# keep a copy of the dataset at  this stage
X_train_corr = X_train.copy()
X_test_corr = X_test.copy()

In [13]:
sel_ = SelectKBest(f_classif, k=20).fit(X_train, y_train)

# capture selected feature names
features_to_keep = X_train.columns[sel_.get_support()]

# select features
X_train_anova = sel_.transform(X_train)
X_test_anova = sel_.transform(X_test)

# numpy array to dataframe
X_train_anova = pd.DataFrame(X_train_anova)
X_train_anova.columns = features_to_keep

X_test_anova = pd.DataFrame(X_test_anova)
X_test_anova.columns = features_to_keep

X_train_anova.shape, X_test_anova.shape

((16799, 20), (7200, 20))

In [14]:
# create a function to build random forests and
# compare its performance in train and test sets
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
import scikitplot as skplt
import matplotlib.pyplot as plt

def run_randomForests(X_train, X_test, y_train, y_test):
    
    rf = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train, y_train)
    
    print('Train set')
    pred = rf.predict_proba(X_train)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = rf.predict_proba(X_test)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))
    
    y_pred = rf.predict(X_test)
    
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    
    print('Metrics:')
    print('Accuracy: {0:.3f}'.format(metrics.accuracy_score(y_test, y_pred)))
    print('F1 Score: {0:.3f}'.format(metrics.f1_score(y_test, y_pred)))
    print('Precision: {0:.3f}'.format(metrics.precision_score(y_test, y_pred)))
    print('Recall: {0:.3f}'.format(metrics.recall_score(y_test, y_pred)))
    
    print('After Cross Validation:')
    accuracies = cross_val_score(estimator = rf, X = X_train, y = y_train, cv = 10)
    print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

In [15]:
# original
run_randomForests(X_train_original,
                  X_test_original,
                  y_train, y_test)

Train set
Random Forests roc-auc: 0.9986183833413939
Test set
Random Forests roc-auc: 0.998528358500821
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96      3559
           1       0.98      0.94      0.96      3641

    accuracy                           0.96      7200
   macro avg       0.96      0.96      0.96      7200
weighted avg       0.96      0.96      0.96      7200

Confusion Matrix:
[[3501   58]
 [ 211 3430]]
Metrics:
Accuracy: 0.963
F1 Score: 0.962
Precision: 0.983
Recall: 0.942
After Cross Validation:
Accuracy: 96.58 %
Standard Deviation: 0.55 %


In [16]:
# filter methods - correlation
run_randomForests(X_train_corr,
                  X_test_corr,
                  y_train, y_test)

Train set
Random Forests roc-auc: 0.9712432437948937
Test set
Random Forests roc-auc: 0.9709026301945491
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      3559
           1       0.97      0.93      0.95      3641

    accuracy                           0.95      7200
   macro avg       0.95      0.95      0.95      7200
weighted avg       0.95      0.95      0.95      7200

Confusion Matrix:
[[3466   93]
 [ 243 3398]]
Metrics:
Accuracy: 0.953
F1 Score: 0.953
Precision: 0.973
Recall: 0.933
After Cross Validation:
Accuracy: 95.32 %
Standard Deviation: 0.65 %


In [17]:
# filter methods - univariate roc-auc
run_randomForests(X_train_anova,
                  X_test_anova,
                  y_train, y_test)

Train set
Random Forests roc-auc: 0.9721790132836361
Test set
Random Forests roc-auc: 0.9717366504096713
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      3559
           1       0.97      0.93      0.95      3641

    accuracy                           0.95      7200
   macro avg       0.95      0.95      0.95      7200
weighted avg       0.95      0.95      0.95      7200

Confusion Matrix:
[[3468   91]
 [ 244 3397]]
Metrics:
Accuracy: 0.953
F1 Score: 0.953
Precision: 0.974
Recall: 0.933
After Cross Validation:
Accuracy: 95.35 %
Standard Deviation: 0.64 %


In [18]:
# create a function to build logistic regression
# and compare its performance in train and test sets

def run_logistic(X_train, X_test, y_train, y_test):
    
    scaler = StandardScaler().fit(X_train)
    
    # function to train and test the performance of logistic regression
    logit = LogisticRegression(penalty='l1', random_state=44, max_iter=1000, solver='liblinear')
    logit.fit(X_train, y_train)
    
    print('Train set')
    pred = logit.predict_proba(scaler.transform(X_train))
    print('Logistic Regression roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = logit.predict_proba(scaler.transform(X_test))
    print('Logistic Regression roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))
    
    y_pred = logit.predict(X_test)
    
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    
    print('Metrics:')
    print('Accuracy: {0:.3f}'.format(metrics.accuracy_score(y_test, y_pred)))
    print('F1 Score: {0:.3f}'.format(metrics.f1_score(y_test, y_pred)))
    print('Precision: {0:.3f}'.format(metrics.precision_score(y_test, y_pred)))
    print('Recall: {0:.3f}'.format(metrics.recall_score(y_test, y_pred)))
    
    print('After Cross Validation:')
    accuracies = cross_val_score(estimator = logit, X = X_train, y = y_train, cv = 10)
    print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

In [19]:
# original
run_logistic(X_train_original,
             X_test_original,
             y_train, y_test)

Train set
Logistic Regression roc-auc: 0.9999998299077704
Test set
Logistic Regression roc-auc: 0.9997183276627162
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3559
           1       1.00      1.00      1.00      3641

    accuracy                           1.00      7200
   macro avg       1.00      1.00      1.00      7200
weighted avg       1.00      1.00      1.00      7200

Confusion Matrix:
[[3556    3]
 [   0 3641]]
Metrics:
Accuracy: 1.000
F1 Score: 1.000
Precision: 0.999
Recall: 1.000
After Cross Validation:
Accuracy: 99.99 %
Standard Deviation: 0.02 %


In [20]:
# filter methods - correlation

run_logistic(X_train_corr,
             X_test_corr,
             y_train, y_test)

Train set
Logistic Regression roc-auc: 0.48687518745581143
Test set
Logistic Regression roc-auc: 0.49460512586547684
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      3559
           1       0.97      0.93      0.95      3641

    accuracy                           0.95      7200
   macro avg       0.95      0.95      0.95      7200
weighted avg       0.95      0.95      0.95      7200

Confusion Matrix:
[[3462   97]
 [ 241 3400]]
Metrics:
Accuracy: 0.953
F1 Score: 0.953
Precision: 0.972
Recall: 0.934
After Cross Validation:
Accuracy: 95.19 %
Standard Deviation: 0.62 %


In [21]:
# filter methods - univariate anova

run_logistic(X_train_anova,
             X_test_anova,
             y_train, y_test)

Train set
Logistic Regression roc-auc: 0.49073044123625303
Test set
Logistic Regression roc-auc: 0.5013979050832132
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      3559
           1       0.97      0.93      0.95      3641

    accuracy                           0.95      7200
   macro avg       0.95      0.95      0.95      7200
weighted avg       0.95      0.95      0.95      7200

Confusion Matrix:
[[3459  100]
 [ 243 3398]]
Metrics:
Accuracy: 0.952
F1 Score: 0.952
Precision: 0.971
Recall: 0.933
After Cross Validation:
Accuracy: 95.22 %
Standard Deviation: 0.60 %


In [22]:
# create a function to build logistic regression
# and compare its performance in train and test sets
from sklearn.svm import SVC

def run_kernel_SVM(X_train, X_test, y_train, y_test):
    
    scaler = StandardScaler().fit(X_train)
    
    # function to train and test the performance of logistic regression
    classifier = SVC(kernel = 'rbf', random_state = 0, probability=True)
    classifier.fit(X_train, y_train)
    
    print('Train set')
    pred = classifier.predict_proba(scaler.transform(X_train))
    print('Kernel SVM roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = classifier.predict_proba(scaler.transform(X_test))
    print('Kernel SVM roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))
    
    y_pred = classifier.predict(X_test)
    
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    
    print('Metrics:')
    print('Accuracy: {0:.3f}'.format(metrics.accuracy_score(y_test, y_pred)))
    print('F1 Score: {0:.3f}'.format(metrics.f1_score(y_test, y_pred)))
    print('Precision: {0:.3f}'.format(metrics.precision_score(y_test, y_pred)))
    print('Recall: {0:.3f}'.format(metrics.recall_score(y_test, y_pred)))
    
    print('After Cross Validation:')
    accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
    print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

In [23]:
# original
run_kernel_SVM(X_train_original,
             X_test_original,
             y_train, y_test)

Train set
Kernel SVM roc-auc: 0.9960229318344052
Test set
Kernel SVM roc-auc: 0.9955505031169553
Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      3559
           1       1.00      0.93      0.96      3641

    accuracy                           0.96      7200
   macro avg       0.96      0.96      0.96      7200
weighted avg       0.96      0.96      0.96      7200

Confusion Matrix:
[[3544   15]
 [ 255 3386]]
Metrics:
Accuracy: 0.963
F1 Score: 0.962
Precision: 0.996
Recall: 0.930
After Cross Validation:
Accuracy: 96.42 %
Standard Deviation: 0.58 %


In [24]:
# filter methods - correlation

run_kernel_SVM(X_train_corr,
             X_test_corr,
             y_train, y_test)

Train set
Kernel SVM roc-auc: 0.9645389876904255
Test set
Kernel SVM roc-auc: 0.9634505061960582
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.95      3559
           1       0.97      0.93      0.95      3641

    accuracy                           0.95      7200
   macro avg       0.95      0.95      0.95      7200
weighted avg       0.95      0.95      0.95      7200

Confusion Matrix:
[[3463   96]
 [ 238 3403]]
Metrics:
Accuracy: 0.954
F1 Score: 0.953
Precision: 0.973
Recall: 0.935
After Cross Validation:
Accuracy: 95.34 %
Standard Deviation: 0.65 %


In [25]:
# filter methods - univariate anova

run_kernel_SVM(X_train_anova,
             X_test_anova,
             y_train, y_test)

Train set
Kernel SVM roc-auc: 0.9634873357830394
Test set
Kernel SVM roc-auc: 0.9623356239339378
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.95      3559
           1       0.97      0.94      0.95      3641

    accuracy                           0.95      7200
   macro avg       0.96      0.95      0.95      7200
weighted avg       0.96      0.95      0.95      7200

Confusion Matrix:
[[3465   94]
 [ 233 3408]]
Metrics:
Accuracy: 0.955
F1 Score: 0.954
Precision: 0.973
Recall: 0.936
After Cross Validation:
Accuracy: 95.43 %
Standard Deviation: 0.67 %


In [26]:
# create a function to build logistic regression
# and compare its performance in train and test sets
from sklearn.neighbors import KNeighborsClassifier

def run_knn(X_train, X_test, y_train, y_test):
    
    scaler = StandardScaler().fit(X_train)
    
    # function to train and test the performance of logistic regression
    classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
    classifier.fit(X_train, y_train)
    
    print('Train set')
    pred = classifier.predict_proba(scaler.transform(X_train))
    print('Kernel SVM roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = classifier.predict_proba(scaler.transform(X_test))
    print('Kernel SVM roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))
    
    y_pred = classifier.predict(X_test)
    
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    
    print('Metrics:')
    print('Accuracy: {0:.3f}'.format(metrics.accuracy_score(y_test, y_pred)))
    print('F1 Score: {0:.3f}'.format(metrics.f1_score(y_test, y_pred)))
    print('Precision: {0:.3f}'.format(metrics.precision_score(y_test, y_pred)))
    print('Recall: {0:.3f}'.format(metrics.recall_score(y_test, y_pred)))
    
    print('After Cross Validation:')
    accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
    print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

In [27]:
# original
run_knn(X_train_original,
             X_test_original,
             y_train, y_test)

Train set
Kernel SVM roc-auc: 0.5208680132490507
Test set
Kernel SVM roc-auc: 0.5139464077092099
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      3559
           1       0.99      0.95      0.97      3641

    accuracy                           0.97      7200
   macro avg       0.97      0.97      0.97      7200
weighted avg       0.97      0.97      0.97      7200

Confusion Matrix:
[[3522   37]
 [ 169 3472]]
Metrics:
Accuracy: 0.971
F1 Score: 0.971
Precision: 0.989
Recall: 0.954
After Cross Validation:
Accuracy: 97.29 %
Standard Deviation: 0.39 %


In [28]:
# filter methods - correlation

run_knn(X_train_corr,
             X_test_corr,
             y_train, y_test)

Train set
Kernel SVM roc-auc: 0.5
Test set
Kernel SVM roc-auc: 0.5
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.96      3559
           1       0.97      0.94      0.96      3641

    accuracy                           0.96      7200
   macro avg       0.96      0.96      0.96      7200
weighted avg       0.96      0.96      0.96      7200

Confusion Matrix:
[[3466   93]
 [ 203 3438]]
Metrics:
Accuracy: 0.959
F1 Score: 0.959
Precision: 0.974
Recall: 0.944
After Cross Validation:
Accuracy: 95.77 %
Standard Deviation: 0.50 %


In [29]:
# filter methods - univariate anova

run_knn(X_train_anova,
             X_test_anova,
             y_train, y_test)

Train set
Kernel SVM roc-auc: 0.5
Test set
Kernel SVM roc-auc: 0.5
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.96      3559
           1       0.97      0.94      0.96      3641

    accuracy                           0.96      7200
   macro avg       0.96      0.96      0.96      7200
weighted avg       0.96      0.96      0.96      7200

Confusion Matrix:
[[3466   93]
 [ 220 3421]]
Metrics:
Accuracy: 0.957
F1 Score: 0.956
Precision: 0.974
Recall: 0.940
After Cross Validation:
Accuracy: 95.48 %
Standard Deviation: 0.66 %


In [30]:
# create a function to build logistic regression
# and compare its performance in train and test sets
from sklearn.naive_bayes import GaussianNB

def run_naive_bayes(X_train, X_test, y_train, y_test):
    
    scaler = StandardScaler().fit(X_train)
    
    # function to train and test the performance of logistic regression
    classifier = GaussianNB()
    classifier.fit(X_train, y_train)
    
    print('Train set')
    pred = classifier.predict_proba(scaler.transform(X_train))
    print('Kernel SVM roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = classifier.predict_proba(scaler.transform(X_test))
    print('Kernel SVM roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))
    
    y_pred = classifier.predict(X_test)
    
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    
    print('Metrics:')
    print('Accuracy: {0:.3f}'.format(metrics.accuracy_score(y_test, y_pred)))
    print('F1 Score: {0:.3f}'.format(metrics.f1_score(y_test, y_pred)))
    print('Precision: {0:.3f}'.format(metrics.precision_score(y_test, y_pred)))
    print('Recall: {0:.3f}'.format(metrics.recall_score(y_test, y_pred)))
    
    print('After Cross Validation:')
    accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
    print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

In [31]:
# original
run_naive_bayes(X_train_original,
             X_test_original,
             y_train, y_test)

Train set
Kernel SVM roc-auc: 0.8542856012958759
Test set
Kernel SVM roc-auc: 0.8513625108318449
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      3559
           1       0.97      0.90      0.93      3641

    accuracy                           0.94      7200
   macro avg       0.94      0.94      0.94      7200
weighted avg       0.94      0.94      0.94      7200

Confusion Matrix:
[[3443  116]
 [ 349 3292]]
Metrics:
Accuracy: 0.935
F1 Score: 0.934
Precision: 0.966
Recall: 0.904
After Cross Validation:
Accuracy: 93.58 %
Standard Deviation: 0.56 %


In [32]:
# filter methods - correlation

run_naive_bayes(X_train_corr,
             X_test_corr,
             y_train, y_test)

Train set
Kernel SVM roc-auc: 0.6534249913111219
Test set
Kernel SVM roc-auc: 0.6429623317654087
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.95      0.88      3559
           1       0.94      0.81      0.87      3641

    accuracy                           0.88      7200
   macro avg       0.88      0.88      0.88      7200
weighted avg       0.89      0.88      0.88      7200

Confusion Matrix:
[[3376  183]
 [ 703 2938]]
Metrics:
Accuracy: 0.877
F1 Score: 0.869
Precision: 0.941
Recall: 0.807
After Cross Validation:
Accuracy: 88.18 %
Standard Deviation: 0.72 %


In [33]:
# filter methods - univariate anova

run_naive_bayes(X_train_anova,
             X_test_anova,
             y_train, y_test)

Train set
Kernel SVM roc-auc: 0.6531536658560827
Test set
Kernel SVM roc-auc: 0.6435440044345259
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.94      0.90      3559
           1       0.94      0.85      0.89      3641

    accuracy                           0.90      7200
   macro avg       0.90      0.90      0.90      7200
weighted avg       0.90      0.90      0.90      7200

Confusion Matrix:
[[3345  214]
 [ 535 3106]]
Metrics:
Accuracy: 0.896
F1 Score: 0.892
Precision: 0.936
Recall: 0.853
After Cross Validation:
Accuracy: 90.04 %
Standard Deviation: 0.70 %
