In [21]:
import pandas as pd
import numpy as np

## Adding heuristics

The heuristic was: 

21% of 180 clients who are credit_history 'critical/other existing credit' and savings_status: '<100' defaulted

In [22]:
# read in new data, and create heuristic columns
data = pd.read_csv('credit.csv')
data['class'] = 0
data.loc[((data.credit_history=='\'critical/other existing credit\'')
          &(data.savings_status == '\'<100\'')), 'class'] = 1

x_heuristic = data.iloc[:,:-1]
y_heuristic = data.iloc[:,-1]
print(y_heuristic.sum())

from sklearn.model_selection import train_test_split

_, _, y_train_aug, _ = train_test_split(x_heuristic, y_heuristic, test_size=0.2, random_state=1, stratify=y_heuristic)
print(y_train_aug.sum())

180
144


In [23]:
data_1 = pd.read_csv('data/data_numeric.csv')
data_2 = pd.read_csv('data/data_categorical.csv')
data_2 = data_2.drop(columns='class')

In [24]:
credit = data_1.merge(data_2, how='inner', left_index=True, right_index=True)
credit.shape

(1000, 27)

In [25]:

from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import matplotlib.pyplot as plt

In [26]:
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold

In [27]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

In [28]:
from sklearn.pipeline import Pipeline

## Scaling datasets and oversampling train datasets

In [29]:
from imblearn.over_sampling import RandomOverSampler

In [30]:
y = credit['class']
x = credit.drop(columns='class')

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1, stratify=y)

In [31]:
x_train_stand, x_test_stand = x_train.copy(), x_test.copy()

# standardised x data
# change train back to df
stand = StandardScaler().fit(x_train)
x_train_stand = stand.transform(x_train)
x_test_stand = stand.transform(x_test)
x_train_stand = pd.DataFrame(x_train_stand, columns=x_train.columns)

# noralised x data
# change train back to df
norm = MinMaxScaler().fit(x_train)
x_train_norm = norm.transform(x_train)
x_test_norm = norm.transform(x_test)
x_train_norm = pd.DataFrame(x_train_norm, columns=x_train.columns)

# oversample the xtrain and ytrain dataset to increase the minority class by half
oversample = RandomOverSampler(sampling_strategy=0.5, random_state=1)
x_train_os, y_train_os = oversample.fit_resample(x_train, y_train)

oversample = RandomOverSampler(sampling_strategy=0.5, random_state=1)
x_train_stand_os, y_train_stand_os = oversample.fit_resample(x_train_stand, y_train)

oversample = RandomOverSampler(sampling_strategy=0.5, random_state=1)
x_train_norm_os, y_train_norm_os = oversample.fit_resample(x_train_norm, y_train)

print(y_train_os.value_counts())

0    560
1    280
Name: class, dtype: int64


In [32]:
# all oversampled y train data is the same...whatever
sum(y_train_os == y_train_stand_os)

840

In [33]:
# adding y_train_aug to y_train_os and adding x_train to x_train_os
# the first 840 samples are normal samples
# the last 800 samples are heuristic samples

y_train_os_aug = pd.concat([y_train_os, y_train_aug], axis=0)
print(y_train_os_aug.shape)

x_train_os_aug = pd.concat([x_train_os, x_train], axis=0)
print(x_train_os_aug.shape)

x_train_stand_os_aug = pd.concat([x_train_stand_os, x_train_stand], axis=0)
print(x_train_stand_os_aug.shape)

x_train_norm_os_aug = pd.concat([x_train_norm_os, x_train_norm], axis=0)
print(x_train_norm_os_aug.shape)

# packing it up
Xtrain = [x_train_os_aug, x_train_stand_os_aug, x_train_norm_os_aug]
Xtest = [x_test, x_test_stand, x_test_norm]

(1640,)
(1640, 26)
(1640, 26)
(1640, 26)


In [34]:
# weights for heuristic
weights = [1]*840 + [0.1]*800
print(len(weights))

1640


In [35]:
metrics = [accuracy_score, recall_score, roc_auc_score]
metric_names = ['accuracy', 'recall', 'roc_auc']
scaling_names = ['Original','Normalized','Standardized']

multilevelindex = []
for i in metric_names:
    for j in metric_names:
        multilevelindex.append((i, j))
        
multilevelindex = pd.MultiIndex.from_tuples(multilevelindex, names=['CV metric', 'metric'])

In [36]:
scores_df = pd.DataFrame(index=multilevelindex)

# loop over scaling
for i in range(len(Xtrain)):
    train = Xtrain[i]
    test = Xtest[i]
    results = []
    # loop over cv scoring metric
    for m in metrics:
        # define cv evaluation method
        fs = SelectKBest(score_func=mutual_info_classif)
        classif = LogisticRegression(random_state=1, class_weight=weights)
        cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

        # define the pipeline to evaluate
        pipeline = Pipeline(steps=[('anova', fs), ('lr', classif)])
        
        # define grid
        grid = dict()
        grid['anova__k'] = [i for i in range(1,20)] # this number cannot go higher

        # define the grid search
        search = GridSearchCV(estimator=pipeline, 
                              param_grid=[grid], 
                              scoring=metric_names[i], # you have to use the string or gridsearch will not work
                              n_jobs=-1, 
                              cv=cv)
        
        search.fit(train, y_train_os_aug)
        print(search.best_params_)
        y_hat = search.best_estimator_.predict(test)
        # loop over evaluation metric
        for n in metrics:
            score = n(y_test, y_hat)
            results.append(score)
    scores_df[scaling_names[i]] = pd.Series(results, index=multilevelindex)
scores_df

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'anova__k': 18}
{'anova__k': 17}
{'anova__k': 7}
{'anova__k': 19}




{'anova__k': 19}




{'anova__k': 19}




{'anova__k': 18}




{'anova__k': 18}




{'anova__k': 18}




Unnamed: 0_level_0,Unnamed: 1_level_0,Original,Normalized,Standardized
CV metric,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
accuracy,accuracy,0.715,0.74,0.705
accuracy,recall,0.116667,0.2,0.033333
accuracy,roc_auc,0.544048,0.585714,0.513095
recall,accuracy,0.71,0.745,0.73
recall,recall,0.05,0.216667,0.166667
recall,roc_auc,0.521429,0.594048,0.569048
roc_auc,accuracy,0.7,0.715,0.73
roc_auc,recall,0.0,0.133333,0.183333
roc_auc,roc_auc,0.5,0.54881,0.57381
