In [1]:
import pandas as pd
import numpy as np

In [2]:
data_1 = pd.read_csv('data/data_numeric.csv')
data_2 = pd.read_csv('data/data_categorical.csv')
data_2 = data_2.drop(columns='class')

In [3]:
credit = data_1.merge(data_2, how='inner', left_index=True, right_index=True)
credit.shape

(1000, 27)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import matplotlib.pyplot as plt

In [5]:
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold

In [25]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [7]:
from sklearn.pipeline import Pipeline

## Scaling datasets and oversampling train datasets

The positive classes only comprise 30% of the data. Becuase I want the model to accurately detect the postive (bad client) classes accurately as **the cost of a false negative is high to the business**, I will oversample the positive classes to 50% of the data from 30% to influence the model to predict positive classes better.

The recall metric is the metric I will pay attention to, and the model must meet a minumum standard of performance in recall.


DEFINITION OF RECALL: What proportion of actual positives was identified correctly?

In [8]:
from imblearn.over_sampling import RandomOverSampler

In [9]:
y = credit['class']
x = credit.drop(columns='class')

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1, stratify=y)

In [10]:
x_train_stand, x_test_stand = x_train.copy(), x_test.copy()

# standardised x data
stand = StandardScaler().fit(x_train)
x_train_stand = stand.transform(x_train)
x_test_stand = stand.transform(x_test)

# noralised x data
norm = MinMaxScaler().fit(x_train)
x_train_norm = norm.transform(x_train)
x_test_norm = norm.transform(x_test)

# oversample the xtrain and ytrain dataset to increase the minority class by half
oversample = RandomOverSampler(sampling_strategy=0.5, random_state=1)
x_train_os, y_train_os = oversample.fit_resample(x_train, y_train)

oversample = RandomOverSampler(sampling_strategy=0.5, random_state=1)
x_train_stand_os, y_train_stand_os = oversample.fit_resample(x_train_stand, y_train)

oversample = RandomOverSampler(sampling_strategy=0.5, random_state=1)
x_train_norm_os, y_train_norm_os = oversample.fit_resample(x_train_norm, y_train)

print(y_train_os.value_counts())

0    560
1    280
Name: class, dtype: int64


In [11]:
# all oversampled y train data is the same...whatever
sum(y_train_os == y_train_stand_os)

840

In [12]:
# packing it up
Xtrain = [x_train_os, x_train_stand_os, x_train_norm_os]
Xtest = [x_test, x_test_stand, x_test_norm]

# Testing models

We are going to create a function that does a gridsearch w/ cross validation and of:

- A given input estimator
- Type of scaling on the training and testing data
- Scoring metric of gridsearch
- Scoring metric of performance

In [13]:
metrics = [accuracy_score, recall_score, roc_auc_score]
metric_names = ['accuracy', 'recall', 'roc_auc']
scaling_names = ['Original','Normalized','Standardized']

multilevelindex = []
for i in metric_names:
    for j in metric_names:
        multilevelindex.append((i, j))
        
multilevelindex = pd.MultiIndex.from_tuples(multilevelindex, names=['CV metric', 'metric'])

In [14]:
def gridsearch(classif):
    
    np.random.seed(1)
    scores_df = pd.DataFrame(index=multilevelindex)
    # loop over scaling
    for i in range(len(Xtrain)):
        train = Xtrain[i]
        test = Xtest[i]
        results = []
        # loop over cv scoring metric
        for m in metrics:
            # define cv evaluation method
            fs = SelectKBest(score_func=mutual_info_classif)
            cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

            # define the pipeline to evaluate
            pipeline = Pipeline(steps=[('anova', fs), ('lr', classif)])

            # define grid
            grid = dict()
            grid['anova__k'] = [i for i in range(1,25)]

            # define the grid search
            search = GridSearchCV(estimator=pipeline, 
                                  param_grid=[grid], 
                                  scoring=metric_names[i], # you have to use the string or gridsearch will not work
                                  n_jobs=-1, 
                                  cv=cv)

            search.fit(train, y_train_os)
            print(search.best_params_)
            
            y_hat = search.predict(test)
            # loop over evaluation metric
            for n in metrics:
                score = n(y_test, y_hat)
                results.append(score)
        scores_df[scaling_names[i]] = pd.Series(results, index=multilevelindex)
    return scores_df



In [24]:
gridsearch(LogisticRegression(solver='liblinear',random_state=1))

{'anova__k': 20}
{'anova__k': 19}
{'anova__k': 23}
{'anova__k': 23}
{'anova__k': 18}
{'anova__k': 19}
{'anova__k': 20}
{'anova__k': 24}
{'anova__k': 23}


Unnamed: 0_level_0,Unnamed: 1_level_0,Original,Normalized,Standardized
CV metric,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
accuracy,accuracy,0.78,0.79,0.76
accuracy,recall,0.666667,0.616667,0.566667
accuracy,roc_auc,0.747619,0.740476,0.704762
recall,accuracy,0.81,0.775,0.78
recall,recall,0.616667,0.583333,0.583333
recall,roc_auc,0.754762,0.720238,0.72381
roc_auc,accuracy,0.77,0.805,0.775
roc_auc,recall,0.55,0.616667,0.566667
roc_auc,roc_auc,0.707143,0.75119,0.715476


In [15]:
gridsearch(RandomForestClassifier(random_state=1))

{'anova__k': 17}
{'anova__k': 21}
{'anova__k': 21}
{'anova__k': 18}
{'anova__k': 15}
{'anova__k': 17}
{'anova__k': 22}
{'anova__k': 19}
{'anova__k': 23}


Unnamed: 0_level_0,Unnamed: 1_level_0,Original,Normalized,Standardized
CV metric,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
accuracy,accuracy,0.785,0.81,0.785
accuracy,recall,0.6,0.633333,0.566667
accuracy,roc_auc,0.732143,0.759524,0.722619
recall,accuracy,0.79,0.81,0.79
recall,recall,0.5,0.583333,0.6
recall,roc_auc,0.707143,0.745238,0.735714
roc_auc,accuracy,0.775,0.76,0.8
roc_auc,recall,0.583333,0.483333,0.566667
roc_auc,roc_auc,0.720238,0.680952,0.733333


In [16]:
gridsearch(SGDClassifier(random_state=1))

{'anova__k': 2}
{'anova__k': 5}
{'anova__k': 2}
{'anova__k': 21}
{'anova__k': 19}
{'anova__k': 20}
{'anova__k': 18}
{'anova__k': 20}
{'anova__k': 21}


Unnamed: 0_level_0,Unnamed: 1_level_0,Original,Normalized,Standardized
CV metric,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
accuracy,accuracy,0.7,0.71,0.705
accuracy,recall,0.0,0.55,0.1
accuracy,roc_auc,0.5,0.664286,0.532143
recall,accuracy,0.7,0.67,0.805
recall,recall,0.0,0.433333,0.55
recall,roc_auc,0.5,0.602381,0.732143
roc_auc,accuracy,0.71,0.655,0.75
roc_auc,recall,0.833333,0.45,0.3
roc_auc,roc_auc,0.745238,0.596429,0.621429


In [18]:
gridsearch(GradientBoostingClassifier(random_state=1))

{'anova__k': 23}
{'anova__k': 24}
{'anova__k': 22}
{'anova__k': 20}
{'anova__k': 20}
{'anova__k': 20}
{'anova__k': 23}
{'anova__k': 24}
{'anova__k': 20}


Unnamed: 0_level_0,Unnamed: 1_level_0,Original,Normalized,Standardized
CV metric,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
accuracy,accuracy,0.78,0.805,0.775
accuracy,recall,0.583333,0.616667,0.6
accuracy,roc_auc,0.72381,0.75119,0.725
recall,accuracy,0.785,0.795,0.79
recall,recall,0.616667,0.6,0.566667
recall,roc_auc,0.736905,0.739286,0.72619
roc_auc,accuracy,0.77,0.78,0.81
roc_auc,recall,0.55,0.6,0.633333
roc_auc,roc_auc,0.707143,0.728571,0.759524


In [25]:
gridsearch(AdaBoostClassifier(random_state=1)) # default est is a decision tree

{'anova__k': 21}
{'anova__k': 13}
{'anova__k': 18}
{'anova__k': 18}
{'anova__k': 21}
{'anova__k': 17}
{'anova__k': 22}
{'anova__k': 24}
{'anova__k': 21}


Unnamed: 0_level_0,Unnamed: 1_level_0,Original,Normalized,Standardized
CV metric,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
accuracy,accuracy,0.8,0.8,0.785
accuracy,recall,0.65,0.65,0.616667
accuracy,roc_auc,0.757143,0.757143,0.736905
recall,accuracy,0.8,0.8,0.795
recall,recall,0.633333,0.683333,0.6
recall,roc_auc,0.752381,0.766667,0.739286
roc_auc,accuracy,0.77,0.78,0.775
roc_auc,recall,0.566667,0.566667,0.6
roc_auc,roc_auc,0.711905,0.719048,0.725


In [20]:
gridsearch(AdaBoostClassifier(base_estimator=LogisticRegression(solver='liblinear',random_state=1), random_state=1))

{'anova__k': 22}
{'anova__k': 19}
{'anova__k': 23}
{'anova__k': 21}
{'anova__k': 23}
{'anova__k': 18}
{'anova__k': 20}
{'anova__k': 24}
{'anova__k': 23}


Unnamed: 0_level_0,Unnamed: 1_level_0,Original,Normalized,Standardized
CV metric,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
accuracy,accuracy,0.765,0.785,0.775
accuracy,recall,0.616667,0.6,0.5
accuracy,roc_auc,0.722619,0.732143,0.696429
recall,accuracy,0.79,0.8,0.775
recall,recall,0.533333,0.633333,0.533333
recall,roc_auc,0.716667,0.752381,0.705952
roc_auc,accuracy,0.755,0.795,0.78
roc_auc,recall,0.55,0.6,0.533333
roc_auc,roc_auc,0.696429,0.739286,0.709524


In [30]:
gridsearch(GaussianProcessClassifier(random_state=1))

{'anova__k': 22}
{'anova__k': 18}
{'anova__k': 23}
{'anova__k': 17}
{'anova__k': 24}
{'anova__k': 22}
{'anova__k': 19}
{'anova__k': 16}
{'anova__k': 20}


Unnamed: 0_level_0,Unnamed: 1_level_0,Original,Normalized,Standardized
CV metric,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
accuracy,accuracy,0.65,0.72,0.78
accuracy,recall,0.216667,0.516667,0.533333
accuracy,roc_auc,0.52619,0.661905,0.709524
recall,accuracy,0.62,0.73,0.78
recall,recall,0.2,0.55,0.566667
recall,roc_auc,0.5,0.678571,0.719048
roc_auc,accuracy,0.59,0.69,0.78
roc_auc,recall,0.233333,0.45,0.533333
roc_auc,roc_auc,0.488095,0.621429,0.709524


In [31]:
gridsearch(SVC(random_state=1))

{'anova__k': 10}
{'anova__k': 11}
{'anova__k': 11}
{'anova__k': 21}
{'anova__k': 18}
{'anova__k': 21}
{'anova__k': 24}
{'anova__k': 24}
{'anova__k': 23}


Unnamed: 0_level_0,Unnamed: 1_level_0,Original,Normalized,Standardized
CV metric,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
accuracy,accuracy,0.71,0.76,0.78
accuracy,recall,0.133333,0.5,0.566667
accuracy,roc_auc,0.545238,0.685714,0.719048
recall,accuracy,0.71,0.755,0.775
recall,recall,0.133333,0.5,0.533333
recall,roc_auc,0.545238,0.682143,0.705952
roc_auc,accuracy,0.71,0.775,0.78
roc_auc,recall,0.133333,0.483333,0.533333
roc_auc,roc_auc,0.545238,0.691667,0.709524


In [27]:
gridsearch(XGBClassifier(random_state=1))

{'anova__k': 19}
{'anova__k': 20}
{'anova__k': 23}
{'anova__k': 22}
{'anova__k': 16}
{'anova__k': 16}
{'anova__k': 19}
{'anova__k': 22}
{'anova__k': 24}


Unnamed: 0_level_0,Unnamed: 1_level_0,Original,Normalized,Standardized
CV metric,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
accuracy,accuracy,0.785,0.78,0.75
accuracy,recall,0.633333,0.65,0.6
accuracy,roc_auc,0.741667,0.742857,0.707143
recall,accuracy,0.75,0.76,0.765
recall,recall,0.616667,0.616667,0.6
recall,roc_auc,0.711905,0.719048,0.717857
roc_auc,accuracy,0.76,0.755,0.745
roc_auc,recall,0.566667,0.633333,0.566667
roc_auc,roc_auc,0.704762,0.720238,0.694048


In [19]:
gridsearch(AdaBoostClassifier(random_state=2)) 

{'anova__k': 22}
{'anova__k': 24}
{'anova__k': 24}
{'anova__k': 23}
{'anova__k': 22}
{'anova__k': 24}
{'anova__k': 20}
{'anova__k': 20}
{'anova__k': 19}


Unnamed: 0_level_0,Unnamed: 1_level_0,Original,Normalized,Standardized
CV metric,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
accuracy,accuracy,0.795,0.79,0.775
accuracy,recall,0.616667,0.633333,0.583333
accuracy,roc_auc,0.744048,0.745238,0.720238
recall,accuracy,0.78,0.795,0.78
recall,recall,0.65,0.65,0.616667
recall,roc_auc,0.742857,0.753571,0.733333
roc_auc,accuracy,0.77,0.78,0.81
roc_auc,recall,0.566667,0.583333,0.666667
roc_auc,roc_auc,0.711905,0.72381,0.769048
