In [5]:
import imblearn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

data_all = pd.read_csv('../data/data.csv')

train, test = train_test_split(data_all, test_size=0.2, random_state=42)

X_train_imbalanced = train.drop(['CREDIT_SCORE','DEFAULT', 'CUST_ID', 'CAT_GAMBLING'], axis=1)
y_train_imbalanced = train['DEFAULT']

X_test = test.drop(['CREDIT_SCORE','DEFAULT', 'CUST_ID', 'CAT_GAMBLING'], axis=1)
y_test = test['DEFAULT']



In [6]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)

X_train, y_train = smote.fit_resample(X_train_imbalanced, y_train_imbalanced)

print(X_train.shape, y_train.shape)

print(y_train.value_counts())

(918, 83) (918,)
0    459
1    459
Name: DEFAULT, dtype: int64


In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

class OutliersReplacer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        for column in X.columns:
            if column == 'SAVINGS':
                X.loc[X[column] > 2500000, column] = 2500000
            elif column == 'DEBT':
                X.loc[X[column] > 4000000, column] = 4000000
            elif column == 'T_CLOTHING_12':
                X.loc[X[column] > 32000, column] = 32000
            elif column == 'T_CLOTHING_6':
                X.loc[X[column] > 25000, column] = 25000
            elif column == 'T_HEALTH_12':
                X.loc[X[column] > 25000, column] = 25000
            elif column == 'T_HEALTH_6':
                X.loc[X[column] > 18000, column] = 18000
            elif column == 'T_TRAVEL_12':
                X.loc[X[column] > 150000, column] = 150000
            elif column == 'T_TRAVEL_6':
                X.loc[X[column] > 110000, column] = 110000
        return X

In [8]:
class DropColumns(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        potentialColumnsToDrop = ['T_EDUCATION_12', 'T_FINES_12', 'T_GAMBLING_12', 'T_HOUSING_12', 'T_TAX_12', 'T_TRAVEL_12', 'T_EDUCATION_6','T_ENTERTAINMENT_6','T_GAMBLING_6','T_GROCERIES_6','T_HOUSING_6','T_EXPENDITURE_12', 'T_EXPENDITURE_6','R_GROCERIES_DEBT', 'INCOME', 'T_UTILITIES_6', 'R_EDUCATION_DEBT', 'T_UTILITIES_12', 'R_CLOTHING_DEBT',
        'CAT_DEPENDENTS', 'R_ENTERTAINMENT_SAVINGS', 'R_FINES_INCOME',
        'R_FINES_SAVINGS', 'R_FINES_DEBT', 'R_GROCERIES_SAVINGS',
        'CAT_SAVINGS_ACCOUNT', 'R_HOUSING_INCOME', 'R_TAX_INCOME',
        'R_TAX_SAVINGS', 'R_TRAVEL_DEBT', 'R_UTILITIES_DEBT', 'CAT_GAMBLING',
        'CAT_DEBT', 'CAT_MORTGAGE', 'SAVINGS', 'R_UTILITIES_SAVINGS', 'R_EDUCATION', 'R_FINES', 'R_GAMBLING', 'R_HOUSING', 'R_GROCERIES_INCOME', 'T_ENTERTAINMENT_12', 'R_ENTERTAINMENT',
       'R_TRAVEL_SAVINGS', 'R_GAMBLING_SAVINGS', 'T_CLOTHING_6', 'CUST_ID']
        for column in potentialColumnsToDrop:
            if column in X.columns:
                X.drop(column, axis=1, inplace=True)
        return X

In [9]:
#best parameters :
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

for n_est in range(13, 24, 1):
    for depth in range(2, 4, 1):
        for lr in [0.1, 0.2, 0.3]:
            pipeline = Pipeline([
                ('outliers_replacer', OutliersReplacer()),
                ('drop_columns', DropColumns()),
                ('classifier', GradientBoostingClassifier(n_estimators=n_est, max_depth=depth, learning_rate=lr))
            ])
            kfold = KFold(n_splits=10, random_state=42, shuffle=True)
            score_acc = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='accuracy')
            score_recall = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='recall')
            score_precision = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='precision')
            score_f1 = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='f1')
            print(f"n_estimators: {n_est}, max_depth: {depth}, learning_rate: {lr}, accuracy: {score_acc.mean()}, recall: {score_recall.mean()}, precision: {score_precision.mean()}, f1: {score_f1.mean()}")


n_estimators: 13, max_depth: 2, learning_rate: 0.1, accuracy: 0.6526516961299571, recall: 0.7848213783516428, precision: 0.6225928399086295, f1: 0.692125651725886
n_estimators: 13, max_depth: 2, learning_rate: 0.2, accuracy: 0.6820473005255614, recall: 0.7485651403848881, precision: 0.6613200916686641, f1: 0.7004303112588595
n_estimators: 13, max_depth: 2, learning_rate: 0.3, accuracy: 0.7005375059722885, recall: 0.7509082479737517, precision: 0.6827896708390089, f1: 0.7130288170891472
n_estimators: 13, max_depth: 3, learning_rate: 0.1, accuracy: 0.6940277114190158, recall: 0.7654801435409514, precision: 0.6729200151226755, f1: 0.7122533862653597
n_estimators: 13, max_depth: 3, learning_rate: 0.2, accuracy: 0.7212732919254659, recall: 0.7596683327891992, precision: 0.7090190236301239, f1: 0.7319153143080754
n_estimators: 13, max_depth: 3, learning_rate: 0.3, accuracy: 0.7147873865265169, recall: 0.726735947674978, precision: 0.6984245470167192, f1: 0.7142697470481187
n_estimators: 14, 

In [10]:
#best parameters :
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

for n_est in range(24, 30, 1):
    for depth in range(2, 4, 1):
        for lr in [0.1, 0.2, 0.3, 0.4]:
            pipeline = Pipeline([
                ('outliers_replacer', OutliersReplacer()),
                ('drop_columns', DropColumns()),
                ('classifier', GradientBoostingClassifier(n_estimators=n_est, max_depth=depth, learning_rate=lr))
            ])
            kfold = KFold(n_splits=10, random_state=42, shuffle=True)
            score_acc = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='accuracy')
            score_recall = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='recall')
            score_precision = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='precision')
            score_f1 = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='f1')
            print(f"n_estimators: {n_est}, max_depth: {depth}, learning_rate: {lr}, accuracy: {score_acc.mean()}, recall: {score_recall.mean()}, precision: {score_precision.mean()}, f1: {score_f1.mean()}")


n_estimators: 24, max_depth: 2, learning_rate: 0.1, accuracy: 0.6853201146679407, recall: 0.7520248769011991, precision: 0.667071599838028, f1: 0.7036589730042057
n_estimators: 24, max_depth: 2, learning_rate: 0.2, accuracy: 0.7147396082178691, recall: 0.7553478866830486, precision: 0.6985592825611402, f1: 0.7241465222242598
n_estimators: 24, max_depth: 2, learning_rate: 0.3, accuracy: 0.7266961299569996, recall: 0.7628388108785586, precision: 0.7117611125239582, f1: 0.7369938628981444
n_estimators: 24, max_depth: 2, learning_rate: 0.4, accuracy: 0.7179765886287626, recall: 0.7387573108943445, precision: 0.7084429317060265, f1: 0.7217766704384905
n_estimators: 24, max_depth: 3, learning_rate: 0.1, accuracy: 0.7158385093167702, recall: 0.7809631007000404, precision: 0.6957016517978023, f1: 0.7338191097513596
n_estimators: 24, max_depth: 3, learning_rate: 0.2, accuracy: 0.7353917821309126, recall: 0.7576514259639301, precision: 0.7278447505720769, f1: 0.7385653061126101
n_estimators: 24,

In [13]:
#best parameters :
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

for n_est in range(30, 40, 2):
    for depth in range(2, 4, 1):
        for lr in [0.1, 0.2, 0.3, 0.4]:
            pipeline = Pipeline([
                ('outliers_replacer', OutliersReplacer()),
                ('drop_columns', DropColumns()),
                ('classifier', GradientBoostingClassifier(n_estimators=n_est, max_depth=depth, learning_rate=lr))
            ])
            kfold = KFold(n_splits=10, random_state=42, shuffle=True)
            score_acc = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='accuracy')
            score_recall = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='recall')
            score_precision = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='precision')
            score_f1 = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='f1')
            print(f"n_estimators: {n_est}, max_depth: {depth}, learning_rate: {lr}, accuracy: {score_acc.mean()}, recall: {score_recall.mean()}, precision: {score_precision.mean()}, f1: {score_f1.mean()}")


n_estimators: 30, max_depth: 2, learning_rate: 0.1, accuracy: 0.6951027233635929, recall: 0.7569013935524269, precision: 0.6777158284898223, f1: 0.7122503248344889
n_estimators: 30, max_depth: 2, learning_rate: 0.2, accuracy: 0.7191352126134735, recall: 0.7633127236287122, precision: 0.707701974778924, f1: 0.7293488176470485
n_estimators: 30, max_depth: 2, learning_rate: 0.3, accuracy: 0.7353559483994266, recall: 0.7645986575260916, precision: 0.7256448998140883, f1: 0.742517461094029
n_estimators: 30, max_depth: 2, learning_rate: 0.4, accuracy: 0.7223005255613951, recall: 0.7503297032140125, precision: 0.7117051769831273, f1: 0.7277146278285905
n_estimators: 30, max_depth: 3, learning_rate: 0.1, accuracy: 0.7212732919254659, recall: 0.7711415285439736, precision: 0.7057605189199874, f1: 0.733749888821664
n_estimators: 30, max_depth: 3, learning_rate: 0.2, accuracy: 0.749569995222169, recall: 0.760345541199307, precision: 0.7399474926128192, f1: 0.7497536402467582
n_estimators: 30, max

KeyboardInterrupt: 

In [16]:
#best parameters :
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

for n_est in range(38, 60, 2):
    for depth in range(2, 4, 1):
        for lr in [0.1, 0.2, 0.3, 0.4]:
            pipeline = Pipeline([
                ('outliers_replacer', OutliersReplacer()),
                ('drop_columns', DropColumns()),
                ('classifier', GradientBoostingClassifier(n_estimators=n_est, max_depth=depth, learning_rate=lr))
            ])
            kfold = KFold(n_splits=10, random_state=42, shuffle=True)
            score_acc = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='accuracy')
            score_recall = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='recall')
            score_precision = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='precision')
            score_f1 = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='f1')
            print(f"n_estimators: {n_est}, max_depth: {depth}, learning_rate: {lr}, accuracy: {score_acc.mean()}, recall: {score_recall.mean()}, precision: {score_precision.mean()}, f1: {score_f1.mean()}")


n_estimators: 38, max_depth: 2, learning_rate: 0.1, accuracy: 0.7082059245102723, recall: 0.7605191246591647, precision: 0.694185810628863, f1: 0.7218302261038758
n_estimators: 38, max_depth: 2, learning_rate: 0.2, accuracy: 0.7256450071667463, recall: 0.7555227554073495, precision: 0.7148314744251276, f1: 0.7313595142647644
n_estimators: 38, max_depth: 2, learning_rate: 0.3, accuracy: 0.7266483516483516, recall: 0.7475112615591797, precision: 0.7185204136412108, f1: 0.7349041778919171
n_estimators: 38, max_depth: 2, learning_rate: 0.4, accuracy: 0.7354276158623985, recall: 0.7410143209392404, precision: 0.7296215004723148, f1: 0.7328314094839553
n_estimators: 38, max_depth: 3, learning_rate: 0.1, accuracy: 0.7288819875776398, recall: 0.7688580424112398, precision: 0.7139431213139299, f1: 0.7373014987919391
n_estimators: 38, max_depth: 3, learning_rate: 0.2, accuracy: 0.7484710941232681, recall: 0.748652505875721, precision: 0.7466567813037678, f1: 0.7454854666842612
n_estimators: 38, 

KeyboardInterrupt: 

In [20]:
#best parameters :
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

pipeline = Pipeline([
        ('outliers_replacer', OutliersReplacer()),
        ('drop_columns', DropColumns()),
        ('classifier', GradientBoostingClassifier(n_estimators=36, max_depth=3, learning_rate=0.2))
    ])


#validation 
kfold = KFold(n_splits=10, random_state=42, shuffle=True)
score_acc = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='accuracy')
score_recall = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='recall')
score_precision = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='precision')
score_f1 = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='f1')
print(f"accuracy: {score_acc.mean()}, recall: {score_recall.mean()}, precision: {score_precision.mean()}, f1: {score_f1.mean()}")


accuracy: 0.7517439082656473, recall: 0.7619084972134825, precision: 0.7467211612184828, f1: 0.749992870082142


In [21]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


0.69375
[[93 21]
 [28 18]]
              precision    recall  f1-score   support

           0       0.77      0.82      0.79       114
           1       0.46      0.39      0.42        46

    accuracy                           0.69       160
   macro avg       0.62      0.60      0.61       160
weighted avg       0.68      0.69      0.69       160

