In [1]:
import pandas as pd
import numpy as np

In [2]:
data_1 = pd.read_csv('data/data_numeric.csv')
data_2 = pd.read_csv('data/data_categorical.csv')
data_2 = data_2.drop(columns='class')

In [3]:
credit = data_1.merge(data_2, how='inner', left_index=True, right_index=True)
credit.shape

(1000, 27)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import matplotlib.pyplot as plt

In [5]:
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold

In [6]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

In [7]:
from sklearn.pipeline import Pipeline

## Scaling datasets and oversampling train datasets

In [8]:
from imblearn.over_sampling import RandomOverSampler

In [9]:
y = credit['class']
x = credit.drop(columns='class')

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1, stratify=y)

In [10]:
x_train_stand, x_test_stand = x_train.copy(), x_test.copy()

# standardised x data
stand = StandardScaler().fit(x_train)
x_train_stand = stand.transform(x_train)
x_test_stand = stand.transform(x_test)

# noralised x data
norm = MinMaxScaler().fit(x_train)
x_train_norm = norm.transform(x_train)
x_test_norm = norm.transform(x_test)

# oversample the xtrain and ytrain dataset to increase the minority class by half
oversample = RandomOverSampler(sampling_strategy=0.5, random_state=1)
x_train_os, y_train_os = oversample.fit_resample(x_train, y_train)

oversample = RandomOverSampler(sampling_strategy=0.5, random_state=1)
x_train_stand_os, y_train_stand_os = oversample.fit_resample(x_train_stand, y_train)

oversample = RandomOverSampler(sampling_strategy=0.5, random_state=1)
x_train_norm_os, y_train_norm_os = oversample.fit_resample(x_train_norm, y_train)

print(y_train_os.value_counts())

0    560
1    280
Name: class, dtype: int64


In [11]:
# all oversampled y train data is the same...whatever
sum(y_train_os == y_train_stand_os)

840

In [11]:
# packing it up
Xtrain = [x_train_os, x_train_stand_os, x_train_norm_os]
Xtest = [x_test, x_test_stand, x_test_norm]

# Testing models

We are going to create a function that does a gridsearch w/ cross validation and of:

- Type of scaling
- Scoring metric of gridsearch
- Scoring metric of performance

In [12]:
metrics = [accuracy_score, recall_score, roc_auc_score]
metric_names = ['accuracy', 'recall', 'roc_auc']
scaling_names = ['Original','Normalized','Standardized']

multilevelindex = []
for i in metric_names:
    for j in metric_names:
        multilevelindex.append((i, j))
        
multilevelindex = pd.MultiIndex.from_tuples(multilevelindex, names=['CV metric', 'metric'])

In [40]:
def gridsearch(classif):
    
    np.random.seed(1)
    scores_df = pd.DataFrame(index=multilevelindex)
    # loop over scaling
    for i in range(len(Xtrain)):
        train = Xtrain[i]
        test = Xtest[i]
        results = []
        # loop over cv scoring metric
        for m in metrics:
            # define cv evaluation method
            fs = SelectKBest(score_func=mutual_info_classif)
            cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

            # define the pipeline to evaluate
            pipeline = Pipeline(steps=[('anova', fs), ('lr', classif)])

            # define grid
            grid = dict()
            grid['anova__k'] = [i for i in range(1,20)] # this number cannot go higher

            # define the grid search
            search = GridSearchCV(estimator=pipeline, 
                                  param_grid=[grid], 
                                  scoring=metric_names[i], # you have to use the string or gridsearch will not work
                                  n_jobs=-1, 
                                  cv=cv)

            search.fit(train, y_train_os)
            print(search.best_params_)
            y_hat = search.predict(test)
            # loop over evaluation metric
            for n in metrics:
                score = n(y_test, y_hat)
                results.append(score)
        scores_df[scaling_names[i]] = pd.Series(results, index=multilevelindex)
    return scores_df



In [41]:
gridsearch(LogisticRegression(solver='liblinear',random_state=1))

{'anova__k': 19}
{'anova__k': 16}
{'anova__k': 17}
{'anova__k': 15}
{'anova__k': 19}
{'anova__k': 18}
{'anova__k': 18}
{'anova__k': 18}
{'anova__k': 17}


Unnamed: 0_level_0,Unnamed: 1_level_0,Original,Normalized,Standardized
CV metric,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
accuracy,accuracy,0.77,0.785,0.78
accuracy,recall,0.65,0.616667,0.566667
accuracy,roc_auc,0.735714,0.736905,0.719048
recall,accuracy,0.805,0.775,0.775
recall,recall,0.6,0.583333,0.566667
recall,roc_auc,0.746429,0.720238,0.715476
roc_auc,accuracy,0.765,0.805,0.79
roc_auc,recall,0.583333,0.616667,0.583333
roc_auc,roc_auc,0.713095,0.75119,0.730952


In [22]:
gridsearch(RandomForestClassifier(random_state=1))

{'anova__k': 19}
{'anova__k': 18}
{'anova__k': 17}
{'anova__k': 17}
{'anova__k': 16}
{'anova__k': 11}
{'anova__k': 19}
{'anova__k': 17}
{'anova__k': 19}


Unnamed: 0_level_0,Unnamed: 1_level_0,Original,Normalized,Standardized
CV metric,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
accuracy,accuracy,0.785,0.795,0.815
accuracy,recall,0.616667,0.65,0.6
accuracy,roc_auc,0.736905,0.753571,0.753571
recall,accuracy,0.78,0.815,0.785
recall,recall,0.583333,0.633333,0.65
recall,roc_auc,0.72381,0.763095,0.746429
roc_auc,accuracy,0.75,0.765,0.805
roc_auc,recall,0.533333,0.5,0.6
roc_auc,roc_auc,0.688095,0.689286,0.746429


In [23]:
gridsearch(SGDClassifier(random_state=1))

{'anova__k': 19}
{'anova__k': 6}
{'anova__k': 4}
{'anova__k': 18}
{'anova__k': 13}
{'anova__k': 16}
{'anova__k': 19}
{'anova__k': 13}
{'anova__k': 16}


Unnamed: 0_level_0,Unnamed: 1_level_0,Original,Normalized,Standardized
CV metric,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
accuracy,accuracy,0.7,0.73,0.735
accuracy,recall,0.0,0.75,0.716667
accuracy,roc_auc,0.5,0.735714,0.729762
recall,accuracy,0.7,0.68,0.715
recall,recall,0.0,0.566667,0.183333
recall,roc_auc,0.5,0.647619,0.563095
roc_auc,accuracy,0.7,0.695,0.76
roc_auc,recall,0.0,0.5,0.816667
roc_auc,roc_auc,0.5,0.639286,0.77619


In [24]:
gridsearch(GradientBoostingClassifier(random_state=1))

{'anova__k': 19}
{'anova__k': 16}
{'anova__k': 19}
{'anova__k': 17}
{'anova__k': 17}
{'anova__k': 14}
{'anova__k': 19}
{'anova__k': 19}
{'anova__k': 17}


Unnamed: 0_level_0,Unnamed: 1_level_0,Original,Normalized,Standardized
CV metric,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
accuracy,accuracy,0.79,0.81,0.79
accuracy,recall,0.6,0.616667,0.6
accuracy,roc_auc,0.735714,0.754762,0.735714
recall,accuracy,0.795,0.795,0.785
recall,recall,0.6,0.616667,0.533333
recall,roc_auc,0.739286,0.744048,0.713095
roc_auc,accuracy,0.79,0.81,0.775
roc_auc,recall,0.6,0.616667,0.6
roc_auc,roc_auc,0.735714,0.754762,0.725


In [27]:
gridsearch(AdaBoostClassifier(random_state=1))

{'anova__k': 13}
{'anova__k': 17}
{'anova__k': 18}
{'anova__k': 18}
{'anova__k': 19}
{'anova__k': 14}
{'anova__k': 11}
{'anova__k': 18}
{'anova__k': 18}


Unnamed: 0_level_0,Unnamed: 1_level_0,Original,Normalized,Standardized
CV metric,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
accuracy,accuracy,0.78,0.8,0.77
accuracy,recall,0.6,0.65,0.55
accuracy,roc_auc,0.728571,0.757143,0.707143
recall,accuracy,0.765,0.8,0.77
recall,recall,0.6,0.666667,0.6
recall,roc_auc,0.717857,0.761905,0.721429
roc_auc,accuracy,0.77,0.745,0.79
roc_auc,recall,0.566667,0.5,0.6
roc_auc,roc_auc,0.711905,0.675,0.735714


In [42]:
gridsearch(AdaBoostClassifier(base_estimator=LogisticRegression(solver='liblinear',random_state=1), random_state=1))

{'anova__k': 19}
{'anova__k': 14}
{'anova__k': 17}
{'anova__k': 19}
{'anova__k': 18}
{'anova__k': 19}
{'anova__k': 19}
{'anova__k': 19}
{'anova__k': 19}


Unnamed: 0_level_0,Unnamed: 1_level_0,Original,Normalized,Standardized
CV metric,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
accuracy,accuracy,0.775,0.78,0.78
accuracy,recall,0.616667,0.616667,0.516667
accuracy,roc_auc,0.729762,0.733333,0.704762
recall,accuracy,0.76,0.785,0.79
recall,recall,0.5,0.616667,0.566667
recall,roc_auc,0.685714,0.736905,0.72619
roc_auc,accuracy,0.765,0.795,0.795
roc_auc,recall,0.55,0.6,0.583333
roc_auc,roc_auc,0.703571,0.739286,0.734524
