# 1. Setting up the notebook

In [26]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold, cross_validate
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from xgboost import XGBClassifier
from sklearn.metrics import recall_score, fbeta_score, roc_auc_score, make_scorer

In [31]:
df_train = pd.read_csv("../data/train.csv")

y_train = df_train[["attrition_flag"]]
x_train = df_train.drop("attrition_flag", axis=1)

x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8101 entries, 0 to 8100
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   dependent_count           8101 non-null   int64  
 1   total_relationship_count  8101 non-null   int64  
 2   months_inactive_12_mon    8101 non-null   int64  
 3   contacts_count_12_mon     8101 non-null   int64  
 4   credit_limit              8101 non-null   float64
 5   total_revolving_bal       8101 non-null   int64  
 6   total_amt_change_q4_q1    8101 non-null   float64
 7   total_trans_amt           8101 non-null   float64
 8   total_count_change_q4_q1  8101 non-null   float64
 9   avg_utilization_ratio     8101 non-null   float64
dtypes: float64(5), int64(5)
memory usage: 633.0 KB


# 2. Running base model

In [28]:
def cv_evaluate_model(xgb):
    scale_features = x_train.columns
    oversampler = SMOTE(random_state=2021)

    scaler = ColumnTransformer(transformers=[ ('scaler', MinMaxScaler(), scale_features) ])

    pipeline = Pipeline(steps = [['scaler', scaler],
                                 ['smote', oversampler],
                                 ['classifier', xgb]
                                ])

    stratified_kfold = StratifiedKFold(shuffle=True, n_splits=5, random_state=2021)

    scoring = {"recall": 'recall',
               "fbeta_2": make_scorer(fbeta_score, beta=2),
               "roc_auc": make_scorer(roc_auc_score),
              }

    scores = cross_validate(xgb, x_train, y_train.values.ravel(), cv=stratified_kfold, scoring=scoring)
    recall = [ val for val in scores['test_recall'] ]
    fbeta_2 = [ val for val in scores['test_fbeta_2'] ]
    auc = [ val for val in scores['test_roc_auc'] ]
    recall.append( sum(recall) / len(recall) )
    fbeta_2.append( sum(fbeta_2) / len(fbeta_2) )
    auc.append( sum(auc) / len(auc) )
    score_df = pd.DataFrame(data=[recall, fbeta_2, auc], columns=['Fold 1','Fold 2','Fold 3', 'Fold 4', 'Fold 5', 'Average'],
                            index=['Recall','Fbeta2','AUC'])
    display(score_df)

In [29]:
xgb = XGBClassifier(eval_metric="logloss", use_label_encoder=False, random_state=2021)
cv_evaluate_model(xgb)

Unnamed: 0,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Average
Recall,0.785441,0.842308,0.761538,0.8,0.877395,0.813336
Fbeta2,0.807087,0.849496,0.781991,0.815686,0.875382,0.825928
AUC,0.885,0.910124,0.870475,0.890074,0.92582,0.896299


# 3. Hyperparameter tuning with GridSearchCV

## 3.1 First Grid Search

In [55]:
# https://blog.dataiku.com/narrowing-the-search-which-hyperparameters-really-matter
# Creating parameter grid to search
learning_rates = [0.2, 0.3, 0.4, 0.5]
max_depths = [5, 10, 15, 20]
gammas = [0.2, 0.5, 0.7, 1.0]
subsamples = [0.5, 0.6, 0.8, 0.9]

params_grid = {
                'classifier__learning_rate': learning_rates,
                'classifier__max_depth': max_depths,
                'classifier__gamma': gammas,
                'classifier__subsample': subsamples
              }

total_combi = 1
for param, value in params_grid.items():
    print(param, value)
    total_combi *= len(value)

print('-----------------')
print('Total combinations:', total_combi)

classifier__learning_rate [0.2, 0.3, 0.4, 0.5]
classifier__max_depth [5, 10, 15, 20]
classifier__gamma [0.2, 0.5, 0.7, 1.0]
classifier__subsample [0.5, 0.6, 0.8, 0.9]
-----------------
Total combinations: 256


In [56]:
scale_features = x_train.columns
oversampler = SMOTE(random_state=2021)
scaler = ColumnTransformer(transformers=[ ('scaler', MinMaxScaler(), scale_features) ])
xgb = XGBClassifier(eval_metric="logloss", use_label_encoder=False, random_state=2021)

pipeline = Pipeline(steps = [['scaler', scaler],
                             ['smote', oversampler],
                             ['classifier', xgb]])

stratified_kfold = StratifiedKFold(shuffle=True, n_splits=5, random_state=2021)

xgb_gridsearch = GridSearchCV(
                                estimator = pipeline,
                                param_grid = params_grid,
                                scoring = 'recall',
                                cv = stratified_kfold,
                                refit = True,
                                n_jobs = -1
                             )

xgb_gridsearch.fit(x_train, y_train.values.ravel())

print(xgb_gridsearch.best_params_)

{'classifier__gamma': 1.0, 'classifier__learning_rate': 0.3, 'classifier__max_depth': 5, 'classifier__subsample': 0.9}


## 3.2 Second Grid Search

#### Best Params of First GridSearch
{'classifier__gamma': 1.0, 'classifier__learning_rate': 0.3, 
'classifier__max_depth': 5, 'classifier__subsample': 0.9}

In [60]:
learning_rates = [0.25, 0.28, 0.3, 0.32, 0.35]
max_depths = [3, 4, 5]
gammas = [1]
subsamples = [0.85, 0.9, 0.95]


params_grid = {
                'classifier__learning_rate': learning_rates,
                'classifier__max_depth': max_depths,
                'classifier__gamma': gammas,
                'classifier__subsample': subsamples
              }

scale_features = x_train.columns
oversampler = SMOTE(random_state=2021)
scaler = ColumnTransformer(transformers=[ ('scaler', MinMaxScaler(), scale_features) ])
xgb = XGBClassifier(eval_metric="logloss", use_label_encoder=False, random_state=2021)

pipeline = Pipeline(steps = [['scaler', scaler],
                             ['smote', oversampler],
                             ['classifier', xgb]])

stratified_kfold = StratifiedKFold(shuffle=True, n_splits=5, random_state=2021)

xgb_gridsearch = GridSearchCV(
                                estimator = pipeline,
                                param_grid = params_grid,
                                scoring = 'recall',
                                cv = stratified_kfold,
                                refit = True,
                                n_jobs = -1
                             )

xgb_gridsearch.fit(x_train, y_train.values.ravel())

print(xgb_gridsearch.best_params_)

{'classifier__gamma': 1, 'classifier__learning_rate': 0.25, 'classifier__max_depth': 3, 'classifier__subsample': 0.9}


## 3.3 Third Grid Search

#### Best params from Second Grid Search:
{'classifier__gamma': 1, 'classifier__learning_rate': 0.25, 'classifier__max_depth': 3, 'classifier__subsample': 0.9}

In [63]:
learning_rates = [0.22, 0.23, 0.24, 0.25, 0.26]
max_depths = [3]
gamma = [1]
subsamples = [0.87, 0.9, 0.92]

params_grid = {
                'classifier__learning_rate': learning_rates,
                'classifier__max_depth': max_depths,
                'classifier__gamma': gammas,
                'classifier__subsample': subsamples
              }

scale_features = x_train.columns
oversampler = SMOTE(random_state=2021)
scaler = ColumnTransformer(transformers=[ ('scaler', MinMaxScaler(), scale_features) ])
xgb = XGBClassifier(eval_metric="logloss", use_label_encoder=False, random_state=2021)

pipeline = Pipeline(steps = [['scaler', scaler],
                             ['smote', oversampler],
                             ['classifier', xgb]])

stratified_kfold = StratifiedKFold(shuffle=True, n_splits=5, random_state=2021)

xgb_gridsearch = GridSearchCV(
                                estimator = pipeline,
                                param_grid = params_grid,
                                scoring = 'recall',
                                cv = stratified_kfold,
                                refit = True,
                                n_jobs = -1
                             )

xgb_gridsearch.fit(x_train, y_train.values.ravel())

print(xgb_gridsearch.best_params_)

{'classifier__gamma': 1, 'classifier__learning_rate': 0.25, 'classifier__max_depth': 3, 'classifier__subsample': 0.9}


# 4. Evaluation on test

#### Best params from First Grid Search:
{'classifier__gamma': 1.0, 'classifier__learning_rate': 0.3, 'classifier__max_depth': 5, 'classifier__subsample': 0.9}
#### Best params from Second & Third Grid Search:
{'classifier__gamma': 1, 'classifier__learning_rate': 0.25, 'classifier__max_depth': 3, 'classifier__subsample': 0.9}

In [33]:
df_test = pd.read_csv("../data/test.csv")
y_test = df_test[["attrition_flag"]]
x_test = df_test.drop("attrition_flag", axis=1)

scale_features = x_train.columns
oversampler = SMOTE(random_state=2021)
scaler = ColumnTransformer(transformers=[ ('scaler', MinMaxScaler(), scale_features) ])
xgb = XGBClassifier(learning_rate=0.25,
                    max_depth=3,
                    gamma=1,
                    subsample=0.9,
                    eval_metric="logloss", 
                    use_label_encoder=False, 
                    random_state=2021)

pipeline = Pipeline(steps = [['scaler', scaler],
                             ['smote', oversampler],
                             ['classifier', xgb]])

pipeline.fit(x_train, y_train.values.ravel() )
y_pred = pipeline.predict(x_test)

print("-------------------------TEST SCORES-----------------------  ")
print(f"Recall: {recall_score(y_test, y_pred)}  ")
print(f"Fbeta2: {fbeta_score(y_test, y_pred, beta=2)}  ")
print(f"AUC Score: {roc_auc_score(y_test, y_pred)}  ")

-------------------------TEST SCORES-----------------------  
Recall: 0.8738461538461538  
Fbeta2: 0.8647990255785628  
AUC Score: 0.9198742820965042  
