In [9]:
import imblearn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

data_all = pd.read_csv('../data/data.csv')

train, test = train_test_split(data_all, test_size=0.2, random_state=42)

X_train_imbalanced = train.drop(['CREDIT_SCORE','DEFAULT'], axis=1)
y_train_imbalanced = train['DEFAULT']

X_test = test.drop(['CREDIT_SCORE','DEFAULT'], axis=1)
y_test = test['DEFAULT']



In [10]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)

X_train, y_train = ros.fit_resample(X_train_imbalanced, y_train_imbalanced)

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin

class OutliersReplacer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        for column in X.columns:
            if column == 'SAVINGS':
                X.loc[X[column] > 2500000, column] = 2500000
            elif column == 'DEBT':
                X.loc[X[column] > 4000000, column] = 4000000
            elif column == 'T_CLOTHING_12':
                X.loc[X[column] > 32000, column] = 32000
            elif column == 'T_CLOTHING_6':
                X.loc[X[column] > 25000, column] = 25000
            elif column == 'T_HEALTH_12':
                X.loc[X[column] > 25000, column] = 25000
            elif column == 'T_HEALTH_6':
                X.loc[X[column] > 18000, column] = 18000
            elif column == 'T_TRAVEL_12':
                X.loc[X[column] > 150000, column] = 150000
            elif column == 'T_TRAVEL_6':
                X.loc[X[column] > 110000, column] = 110000
        return X

In [12]:
class DropColumns(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        potentialColumnsToDrop = ['T_EDUCATION_12', 'T_FINES_12', 'T_GAMBLING_12', 'T_HOUSING_12', 'T_TAX_12', 'T_TRAVEL_12', 'T_EDUCATION_6','T_ENTERTAINMENT_6','T_GAMBLING_6','T_GROCERIES_6','T_HOUSING_6','T_EXPENDITURE_12', 'T_EXPENDITURE_6','R_GROCERIES_DEBT', 'INCOME', 'T_UTILITIES_6', 'R_EDUCATION_DEBT', 'T_UTILITIES_12', 'R_CLOTHING_DEBT',
        'CAT_DEPENDENTS', 'R_ENTERTAINMENT_SAVINGS', 'R_FINES_INCOME',
        'R_FINES_SAVINGS', 'R_FINES_DEBT', 'R_GROCERIES_SAVINGS',
        'CAT_SAVINGS_ACCOUNT', 'R_HOUSING_INCOME', 'R_TAX_INCOME',
        'R_TAX_SAVINGS', 'R_TRAVEL_DEBT', 'R_UTILITIES_DEBT', 'CAT_GAMBLING',
        'CAT_DEBT', 'CAT_MORTGAGE', 'SAVINGS', 'R_UTILITIES_SAVINGS', 'R_EDUCATION', 'R_FINES', 'R_GAMBLING', 'R_HOUSING', 'R_GROCERIES_INCOME', 'T_ENTERTAINMENT_12', 'R_ENTERTAINMENT',
       'R_TRAVEL_SAVINGS', 'R_GAMBLING_SAVINGS', 'T_CLOTHING_6', 'CUST_ID']
        for column in potentialColumnsToDrop:
            if column in X.columns:
                X.drop(column, axis=1, inplace=True)
        return X

In [14]:
#best parameters :
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

for n_est in range(13, 24, 1):
    for depth in range(2, 4, 1):
        for lr in [0.1, 0.2, 0.3]:
            pipeline = Pipeline([
                ('outliers_replacer', OutliersReplacer()),
                ('drop_columns', DropColumns()),
                ('classifier', GradientBoostingClassifier(n_estimators=n_est, max_depth=depth, learning_rate=lr))
            ])
            kfold = KFold(n_splits=10, random_state=42, shuffle=True)
            score_acc = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='accuracy')
            score_recall = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='recall')
            score_precision = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='precision')
            score_f1 = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='f1')
            print(f"n_estimators: {n_est}, max_depth: {depth}, learning_rate: {lr}, accuracy: {score_acc.mean()}, recall: {score_recall.mean()}, precision: {score_precision.mean()}, f1: {score_f1.mean()}")


n_estimators: 13, max_depth: 2, learning_rate: 0.1, accuracy: 0.6623148590539895, recall: 0.6673893768944317, precision: 0.6605438048359396, f1: 0.6598495442740866
n_estimators: 13, max_depth: 2, learning_rate: 0.2, accuracy: 0.6830745341614908, recall: 0.6740928628405071, precision: 0.6803635308656124, f1: 0.676500026888873
n_estimators: 13, max_depth: 2, learning_rate: 0.3, accuracy: 0.7092928810320115, recall: 0.7424532271149789, precision: 0.6949834546154713, f1: 0.7119856380886086
n_estimators: 13, max_depth: 3, learning_rate: 0.1, accuracy: 0.7201385570950788, recall: 0.7208109563543718, precision: 0.7219235187867337, f1: 0.7196555429252351
n_estimators: 13, max_depth: 3, learning_rate: 0.2, accuracy: 0.7321070234113712, recall: 0.7733311970540355, precision: 0.7169265217230618, f1: 0.7429839201710143
n_estimators: 13, max_depth: 3, learning_rate: 0.3, accuracy: 0.7549569995222168, recall: 0.7768623418894985, precision: 0.734905190509343, f1: 0.7540701083038609
n_estimators: 14, 

In [16]:
#best parameters :
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

for n_est in range(24, 30, 1):
    for depth in range(2, 4, 1):
        for lr in [0.1, 0.2, 0.3, 0.4]:
            pipeline = Pipeline([
                ('outliers_replacer', OutliersReplacer()),
                ('drop_columns', DropColumns()),
                ('classifier', GradientBoostingClassifier(n_estimators=n_est, max_depth=depth, learning_rate=lr))
            ])
            kfold = KFold(n_splits=10, random_state=42, shuffle=True)
            score_acc = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='accuracy')
            score_recall = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='recall')
            score_precision = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='precision')
            score_f1 = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='f1')
            print(f"n_estimators: {n_est}, max_depth: {depth}, learning_rate: {lr}, accuracy: {score_acc.mean()}, recall: {score_recall.mean()}, precision: {score_precision.mean()}, f1: {score_f1.mean()}")


n_estimators: 24, max_depth: 2, learning_rate: 0.1, accuracy: 0.6787386526516961, recall: 0.682356766727443, precision: 0.6781522198089944, f1: 0.6755349940903919
n_estimators: 24, max_depth: 2, learning_rate: 0.2, accuracy: 0.711371237458194, recall: 0.7456497894670813, precision: 0.7081265810336485, f1: 0.7158172481785078
n_estimators: 24, max_depth: 2, learning_rate: 0.3, accuracy: 0.758265647396082, recall: 0.7734494811632129, precision: 0.7426642198381328, f1: 0.7630697068364846
n_estimators: 24, max_depth: 2, learning_rate: 0.4, accuracy: 0.74848303870043, recall: 0.7757595178350785, precision: 0.7367386408633907, f1: 0.7523524281824202
n_estimators: 24, max_depth: 3, learning_rate: 0.1, accuracy: 0.7505852842809364, recall: 0.7851806366212355, precision: 0.7312547500180012, f1: 0.7506584639934842
n_estimators: 24, max_depth: 3, learning_rate: 0.2, accuracy: 0.773471094123268, recall: 0.8197013869603135, precision: 0.7424256418267913, f1: 0.7781706950326708
n_estimators: 24, max_

KeyboardInterrupt: 

In [17]:
#best parameters :
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

pipeline = Pipeline([
        ('outliers_replacer', OutliersReplacer()),
        ('drop_columns', DropColumns()),
        ('classifier', GradientBoostingClassifier(n_estimators=27, max_depth=3, learning_rate=0.4))
    ])


#validation 
kfold = KFold(n_splits=10, random_state=42, shuffle=True)
score_acc = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='accuracy')
score_recall = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='recall')
score_precision = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='precision')
score_f1 = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='f1')
print(f"accuracy: {score_acc.mean()}, recall: {score_recall.mean()}, precision: {score_precision.mean()}, f1: {score_f1.mean()}")


accuracy: 0.8105112279025322, recall: 0.858609212239851, precision: 0.7796499186631607, f1: 0.815952473093291


In [18]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


0.675
[[87 27]
 [25 21]]
              precision    recall  f1-score   support

           0       0.78      0.76      0.77       114
           1       0.44      0.46      0.45        46

    accuracy                           0.68       160
   macro avg       0.61      0.61      0.61       160
weighted avg       0.68      0.68      0.68       160

