In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

from feature_engine.selection import (
    DropConstantFeatures,
    DropDuplicateFeatures,
    SmartCorrelatedSelection,
)

In [2]:
# load the feature table: a 'filename' column, numeric per-bin statistics
# and a 'class' label (see the preview in the next cell)
data = pd.read_csv(filepath_or_buffer='image_bins_stats.csv')
data.shape

(24000, 98)

In [3]:
# preview the first five rows to check column names and typical values
data.head(n=5)

Unnamed: 0,filename,rmean_bins0,rmean_bins1,rmean_bins2,rmean_bins3,rmean_bins4,rmean_bins5,rmean_bins6,rmean_bins7,rstd_bins0,...,bskew_bins7,bkurto_bins0,bkurto_bins1,bkurto_bins2,bkurto_bins3,bkurto_bins4,bkurto_bins5,bkurto_bins6,bkurto_bins7,class
0,clean_p_1.jpg,1.485688,116.416667,0.0,0.0,0.0,128.758621,0.0,159.770015,11.472993,...,7.358843,47.62159,3.092351,0.0,0.0,0.0,8.421707,0.0,9.181035,1
1,clean_p_2.jpg,0.191129,91.0,0.0,0.0,0.0,122.485714,0.0,149.839854,0.62628,...,8.43008,7.154429,0.840896,0.0,0.0,0.0,15.029039,0.0,10.51699,1
2,clean_p_3.jpg,1.218065,115.0,0.0,0.0,121.730769,135.517857,0.0,154.189458,10.132966,...,7.949709,43.39424,0.420448,0.0,0.0,7.701832,13.599319,0.0,10.354453,1
3,clean_p_4.jpg,0.148524,98.0,0.0,0.0,0.0,129.906667,0.0,157.583812,0.57329,...,8.987692,6.601182,2.619225,0.0,0.0,0.0,12.78728,0.0,10.943418,1
4,clean_p_5.jpg,0.183128,0.0,0.0,0.0,0.0,0.0,0.0,158.600042,0.602004,...,7.204324,6.86972,0.0,0.0,0.0,0.0,0.0,0.0,8.924785,1


In [4]:
# 'filename' holds strings (e.g. 'clean_p_1.jpg'), not a numeric feature, so
# remove it before modelling. errors='ignore' keeps the cell re-runnable:
# executing it a second time no longer raises KeyError once the column is gone.
data = data.drop(columns=['filename'], errors='ignore')
data.head()

Unnamed: 0,rmean_bins0,rmean_bins1,rmean_bins2,rmean_bins3,rmean_bins4,rmean_bins5,rmean_bins6,rmean_bins7,rstd_bins0,rstd_bins1,...,bskew_bins7,bkurto_bins0,bkurto_bins1,bkurto_bins2,bkurto_bins3,bkurto_bins4,bkurto_bins5,bkurto_bins6,bkurto_bins7,class
0,1.485688,116.416667,0.0,0.0,0.0,128.758621,0.0,159.770015,11.472993,1.943118,...,7.358843,47.62159,3.092351,0.0,0.0,0.0,8.421707,0.0,9.181035,1
1,0.191129,91.0,0.0,0.0,0.0,122.485714,0.0,149.839854,0.62628,0.0,...,8.43008,7.154429,0.840896,0.0,0.0,0.0,15.029039,0.0,10.51699,1
2,1.218065,115.0,0.0,0.0,121.730769,135.517857,0.0,154.189458,10.132966,0.0,...,7.949709,43.39424,0.420448,0.0,0.0,7.701832,13.599319,0.0,10.354453,1
3,0.148524,98.0,0.0,0.0,0.0,129.906667,0.0,157.583812,0.57329,2.0,...,8.987692,6.601182,2.619225,0.0,0.0,0.0,12.78728,0.0,10.943418,1
4,0.183128,0.0,0.0,0.0,0.0,0.0,0.0,158.600042,0.602004,0.0,...,7.204324,6.86972,0.0,0.0,0.0,0.0,0.0,0.0,8.924785,1


In [5]:
import numpy as np
def clean_dataset(df):
    """Return the rows of ``df`` free of NaN / +inf / -inf, cast to float64.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean. Every column must be castable to float64.

    Returns
    -------
    pd.DataFrame
        A new float64 frame containing only the rows in which no value is
        NaN, +inf or -inf. The original index labels are preserved.

    Raises
    ------
    AssertionError
        If ``df`` is not a ``pd.DataFrame``.

    Notes
    -----
    Side effect (kept for backward compatibility): rows containing NaN are
    also dropped from the caller's ``df`` in place. Rows containing +/-inf
    are removed only from the returned frame, so callers that need them
    gone must use the return value.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    # axis must be passed by keyword: pandas 2.x rejects the positional
    # form `.any(1)` with a TypeError (it was deprecated in pandas 1.x).
    has_bad_value = df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return df[~has_bad_value].astype(np.float64)

# clean_dataset returns the cleaned frame; only its NaN-drop happens in place.
# Rebind `data` so the +/-inf filter and the float64 cast actually reach the
# cells below instead of being thrown away with the cell output.
data = clean_dataset(data)
data

Unnamed: 0,rmean_bins0,rmean_bins1,rmean_bins2,rmean_bins3,rmean_bins4,rmean_bins5,rmean_bins6,rmean_bins7,rstd_bins0,rstd_bins1,...,bskew_bins7,bkurto_bins0,bkurto_bins1,bkurto_bins2,bkurto_bins3,bkurto_bins4,bkurto_bins5,bkurto_bins6,bkurto_bins7,class
0,1.485688,116.416667,0.0,0.0,0.000000,128.758621,0.000000,159.770015,11.472993,1.943118,...,7.358843,47.621590,3.092351,0.0,0.0,0.000000,8.421707,0.000000,9.181035,1.0
1,0.191129,91.000000,0.0,0.0,0.000000,122.485714,0.000000,149.839854,0.626280,0.000000,...,8.430080,7.154429,0.840896,0.0,0.0,0.000000,15.029039,0.000000,10.516990,1.0
2,1.218065,115.000000,0.0,0.0,121.730769,135.517857,0.000000,154.189458,10.132966,0.000000,...,7.949709,43.394240,0.420448,0.0,0.0,7.701832,13.599319,0.000000,10.354453,1.0
3,0.148524,98.000000,0.0,0.0,0.000000,129.906667,0.000000,157.583812,0.573290,2.000000,...,8.987692,6.601182,2.619225,0.0,0.0,0.000000,12.787280,0.000000,10.943418,1.0
4,0.183128,0.000000,0.0,0.0,0.000000,0.000000,0.000000,158.600042,0.602004,0.000000,...,7.204324,6.869720,0.000000,0.0,0.0,0.000000,0.000000,0.000000,8.924785,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23994,0.167535,0.000000,0.0,0.0,0.000000,0.000000,0.000000,145.580835,0.587364,0.000000,...,6.817875,7.277478,0.000000,0.0,0.0,0.000000,0.000000,0.000000,11.111529,0.0
23995,0.182460,0.000000,0.0,0.0,0.000000,0.000000,0.000000,142.290042,0.590877,0.000000,...,5.262551,8.170191,0.000000,0.0,0.0,0.000000,0.000000,0.000000,10.111068,0.0
23996,0.222222,0.000000,0.0,0.0,0.000000,0.000000,0.000000,145.978772,0.689536,0.000000,...,5.407731,7.859719,0.000000,0.0,0.0,0.000000,0.000000,0.000000,9.734824,0.0
23997,0.217425,0.000000,0.0,0.0,0.000000,0.000000,0.000000,145.915418,0.665380,0.000000,...,5.073114,7.887594,0.000000,0.0,0.0,0.000000,0.000000,0.000000,9.672132,0.0


In [6]:
# hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['class']),   # predictors: everything except the label
    data.loc[:, 'class'],           # target label
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

((16799, 96), (7200, 96))

In [7]:
# chain the three feature-selection filters so they are fitted one after another

pipe = Pipeline(steps=[
    # 1) constant / quasi-constant columns (threshold tol=0.998)
    ('constant', DropConstantFeatures(tol=0.998)),
    # 2) columns that duplicate another column
    ('duplicated', DropDuplicateFeatures()),
    # 3) correlated groups: selection_method='variance' picks the survivor
    #    (see feature_engine docs for the exact rule and default threshold)
    ('correlation', SmartCorrelatedSelection(selection_method='variance')),
])

# the selectors are fitted on the training split only
pipe.fit(X_train)

Pipeline(steps=[('constant',
                 DropConstantFeatures(tol=0.998,
                                      variables=['rmean_bins0', 'rmean_bins1',
                                                 'rmean_bins2', 'rmean_bins3',
                                                 'rmean_bins4', 'rmean_bins5',
                                                 'rmean_bins6', 'rmean_bins7',
                                                 'rstd_bins0', 'rstd_bins1',
                                                 'rstd_bins2', 'rstd_bins3',
                                                 'rstd_bins4', 'rstd_bins5',
                                                 'rstd_bins6', 'rstd_bins7',
                                                 'rskew_bins0', 'rskew_bins1',
                                                 'rskew_bins2', 'rskew_bins3',
                                                 'rskew_b...
                                                     'rmean_bins5',
           

In [8]:
# apply the fitted selectors to both splits, rebinding X_train / X_test.
# NOTE: because the names are rebound, re-running this cell without first
# re-running the split cell feeds already-reduced frames back into the pipeline.

X_train, X_test = (pipe.transform(split) for split in (X_train, X_test))

X_train.shape, X_test.shape

((16799, 27), (7200, 27))

In [9]:
# inspect the training frame after feature selection (27 columns remain,
# per the shapes printed by the previous cell)
X_train

Unnamed: 0,rmean_bins1,rmean_bins2,rmean_bins3,rmean_bins6,rmean_bins7,rstd_bins0,rkurto_bins7,gmean_bins7,gkurto_bins1,gkurto_bins2,...,bmean_bins3,bmean_bins4,bmean_bins5,bmean_bins6,bmean_bins7,bkurto_bins0,bkurto_bins1,bkurto_bins2,bkurto_bins3,bkurto_bins7
15382,0.000000,0.0,0.0,0.0,180.492704,1.398502,7.438554,160.785296,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,171.012826,1.438841,0.000000,0.000000,0.0,12.239542
6786,0.000000,0.0,0.0,0.0,183.028467,1.066107,4.825459,173.427452,0.000000,0.000000,...,0.0,0.000000,154.559783,0.000000,175.295818,1.556517,0.000000,0.000000,0.0,6.639987
16217,0.000000,0.0,0.0,0.0,173.211795,1.848103,11.300903,143.832677,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,140.621193,1.682182,0.000000,0.000000,0.0,17.794553
1047,102.000000,101.0,0.0,104.0,144.223099,3.948451,14.908189,106.128845,13.004431,2.943137,...,0.0,0.000000,151.305648,114.000000,169.636332,21.604183,4.765680,0.420448,0.0,16.088283
6173,0.000000,0.0,0.0,0.0,186.553922,1.029542,5.148988,176.440176,0.000000,0.000000,...,0.0,0.000000,144.802083,0.000000,176.439318,1.464988,0.000000,0.000000,0.0,7.963030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13123,0.000000,0.0,0.0,0.0,154.206890,1.275322,10.266958,134.699035,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,179.443758,6.592747,0.000000,0.000000,0.0,20.038586
19648,0.000000,0.0,0.0,0.0,187.846749,1.143590,10.013961,171.844091,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,158.234181,1.480392,0.000000,0.000000,0.0,12.405009
9845,110.896552,0.0,0.0,0.0,149.918135,10.539109,14.073236,116.629596,16.116232,0.000000,...,139.0,131.775281,162.987768,124.636364,184.045613,43.518837,6.420400,0.000000,0.0,11.857062
10799,0.000000,0.0,0.0,0.0,158.943398,0.772915,6.241189,144.441515,0.000000,0.000000,...,0.0,0.000000,174.736842,0.000000,206.847955,6.627811,0.000000,0.000000,0.0,12.355050


In [10]:
# helper: fit a logistic regression and report ROC-AUC on the train and test splits

def run_logistic(X_train, X_test, y_train, y_test):
    """Fit a logistic regression on the training split and print ROC-AUC scores.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for the training and test splits.
    y_train, y_test : array-like
        Matching target labels.

    Returns
    -------
    None
        The scores are printed, train split first, then test split.
    """
    model = LogisticRegression(random_state=0)
    model.fit(X_train, y_train)

    for heading, features, target in (
        ('Train set', X_train, y_train),
        ('Test set', X_test, y_test),
    ):
        print(heading)
        # column 1 of predict_proba is the score passed to roc_auc_score
        scores = model.predict_proba(features)[:, 1]
        print('Logistic Regression roc-auc: {}'.format(roc_auc_score(target, scores)))

In [11]:
# evaluate the features kept by the filter pipeline: standardise them with
# statistics learned on the training split only, then score the logistic model
scaler = StandardScaler()
scaler.fit(X_train)

run_logistic(
    X_train=scaler.transform(X_train),
    X_test=scaler.transform(X_test),
    y_train=y_train,
    y_test=y_test,
)

Train set
Logistic Regression roc-auc: 0.9770977474686023
Test set
Logistic Regression roc-auc: 0.9742945053289706
