# Model: XGBoost
---

# 1. Setting up the notebook

In [10]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold, cross_validate
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from xgboost import XGBClassifier
from sklearn.metrics import recall_score, fbeta_score, roc_auc_score, make_scorer

In [11]:
df_train = pd.read_csv("../Data/train.csv")
y_train = df_train[["attrition_flag"]]
x_train = df_train.drop("attrition_flag", axis=1)

# 2. Running base model

In [4]:
def cv_evaluate_model(xgb):
    scale_features = x_train.drop("gender", axis=1).columns
    scaler = ColumnTransformer(transformers=[ ('scaler', MinMaxScaler(), scale_features) ], remainder='passthrough')
    pipeline = Pipeline(steps = [['scaler', scaler ],
                                 ['smote', SMOTE(random_state=2021)],
                                 ['classifier', xgb]
                                ])

    stratified_kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)

    scoring = {"accuracy": "accuracy",
               "recall": 'recall',
               "precision": "precision",
               "fbeta_2": make_scorer(fbeta_score, beta=2),
               "roc_auc": make_scorer(roc_auc_score),
              }

    scores = cross_validate(pipeline, x_train, y_train.values.ravel(), cv=stratified_kfold,
                           scoring = scoring)

    accuracy = [ val for val in scores['test_accuracy'] ]
    recall = [ val for val in scores['test_recall'] ]
    precision = [ val for val in scores['test_precision'] ]
    fbeta_2 = [ val for val in scores['test_fbeta_2'] ]
    auc = [ val for val in scores['test_roc_auc'] ]

    accuracy.append( sum(accuracy) / len(accuracy) )
    recall.append( sum(recall) / len(recall) )
    precision.append( sum(precision) / len(precision) )
    fbeta_2.append( sum(fbeta_2) / len(fbeta_2) )
    auc.append( sum(auc) / len(auc) )

    score_df = pd.DataFrame(data=[accuracy, recall, precision, fbeta_2, auc], columns=['Fold 1','Fold 2','Fold 3', 'Average'],
                                index=['Accuracy', 'Recall', 'Precision', 'Fbeta2', 'AUC'])
    return score_df

In [5]:
xgb_base = XGBClassifier(eval_metric="logloss", 
                    use_label_encoder=False, 
                    random_state=2021)

base_model = cv_evaluate_model(xgb_base)
display(base_model)

Unnamed: 0,Fold 1,Fold 2,Fold 3,Average
Accuracy,0.95187,0.957778,0.954815,0.954821
Recall,0.832168,0.852804,0.871795,0.852255
Precision,0.860241,0.877404,0.848073,0.861906
Fbeta2,0.837635,0.857613,0.866945,0.854064
AUC,0.90332,0.915178,0.921146,0.913215


# 3. Hyperparameter tuning with GridSearchCV

## 3.1 First Grid Search

In [6]:
learning_rates = [0.2, 0.3, 0.4, 0.5]
max_depths = [5, 10, 15, 20]
gammas = [0.2, 0.5, 0.7, 1.0]
subsamples = [0.5, 0.6, 0.8, 0.9]

params_grid = {
                'classifier__learning_rate': learning_rates,
                'classifier__max_depth': max_depths,
                'classifier__gamma': gammas,
                'classifier__subsample': subsamples
              }

total_combi = 1
for param, value in params_grid.items():
    print(param, value)
    total_combi *= len(value)

print('-----------------')
print('Total combinations:', total_combi)

classifier__learning_rate [0.2, 0.3, 0.4, 0.5]
classifier__max_depth [5, 10, 15, 20]
classifier__gamma [0.2, 0.5, 0.7, 1.0]
classifier__subsample [0.5, 0.6, 0.8, 0.9]
-----------------
Total combinations: 256


In [None]:
# scale_features = x_train.drop("gender", axis=1).columns
scale_features = x_train.columns

xgb = XGBClassifier(eval_metric="logloss", use_label_encoder=False, random_state=2021)
scaler = ColumnTransformer(transformers=[ ('scaler', MinMaxScaler(), scale_features) ], remainder='passthrough')

pipeline = Pipeline(steps = [['scaler', scaler],
                             ['smote', SMOTE(random_state=2021)],
                             ['classifier', xgb]])

stratified_kfold = StratifiedKFold(shuffle=True, n_splits=5, random_state=2021)

xgb_gridsearch = GridSearchCV(
                                estimator = pipeline,
                                param_grid = params_grid,
                                scoring = 'recall',
                                cv = stratified_kfold,
                                refit = True,
                                n_jobs = -1
                             )

xgb_gridsearch.fit(x_train, y_train.values.ravel())

print(xgb_gridsearch.best_params_)


{'classifier__gamma': 1.0, 'classifier__learning_rate': 0.3, 'classifier__max_depth': 5, 'classifier__subsample': 0.9}

## 3.2 Second Grid Search

#### Best Params of First Grid Search
{'classifier__gamma': 1.0, 'classifier__learning_rate': 0.3, 'classifier__max_depth': 5, 'classifier__subsample': 0.9}

In [9]:
learning_rates = [0.25, 0.27, 0.3, 0.32, 0.35]
max_depths = [3, 5, 7]
gammas = [0.8, 0.9, 1]
subsamples = [0.85, 0.9, 0.95]

params_grid = {
                'classifier__learning_rate': learning_rates,
                'classifier__max_depth': max_depths,
                'classifier__gamma': gammas,
                'classifier__subsample': subsamples
              }

total_combi = 1
for param, value in params_grid.items():
    print(param, value)
    total_combi *= len(value)

print('-----------------')
print('Total combinations:', total_combi)

classifier__learning_rate [0.25, 0.27, 0.3, 0.32, 0.35]
classifier__max_depth [3, 5, 7]
classifier__gamma [0.8, 0.9, 1]
classifier__subsample [0.85, 0.9, 0.95]
-----------------
Total combinations: 135


In [None]:
scale_features = x_train.drop("gender", axis=1).columns
xgb = XGBClassifier(eval_metric="logloss", use_label_encoder=False, random_state=2021)
scaler = ColumnTransformer(transformers=[ ('scaler', MinMaxScaler(), scale_features) ], remainder='passthrough')

pipeline = Pipeline(steps = [['scaler', scaler],
                             ['smote', SMOTE(random_state=2021)],
                             ['classifier', xgb]])

stratified_kfold = StratifiedKFold(shuffle=True, n_splits=5, random_state=2021)

xgb_gridsearch = GridSearchCV(
                                estimator = pipeline,
                                param_grid = params_grid,
                                scoring = 'recall',
                                cv = stratified_kfold,
                                refit = True,
                                n_jobs = -1
                             )

xgb_gridsearch.fit(x_train, y_train.values.ravel())

print(xgb_gridsearch.best_params_)

{'classifier__gamma': 1, 'classifier__learning_rate': 0.25, 'classifier__max_depth': 3, 'classifier__subsample': 0.9}

## 3.3 Third Grid Search

#### Best params from Second Grid Search:
{'classifier__gamma': 1, 'classifier__learning_rate': 0.25, 'classifier__max_depth': 3, 'classifier__subsample': 0.9}

In [8]:
learning_rates = [0.24, 0.25, 0.26]
max_depths = [3, 4, 5]
gamma = [1]
subsamples = [0.87, 0.9, 0.92]

params_grid = {
                'classifier__learning_rate': learning_rates,
                'classifier__max_depth': max_depths,
                'classifier__gamma': gammas,
                'classifier__subsample': subsamples
              }

total_combi = 1
for param, value in params_grid.items():
    print(param, value)
    total_combi *= len(value)

print('-----------------')
print('Total combinations:', total_combi)

classifier__learning_rate [0.24, 0.25, 0.26]
classifier__max_depth [3, 4, 5]
classifier__gamma [0.8, 0.9, 1]
classifier__subsample [0.87, 0.9, 0.92]
-----------------
Total combinations: 81


In [None]:

scale_features = x_train.columns
oversampler = SMOTE(random_state=2021)
scaler = ColumnTransformer(transformers=[ ('scaler', MinMaxScaler(), scale_features) ])
xgb = XGBClassifier(eval_metric="logloss", use_label_encoder=False, random_state=2021)

pipeline = Pipeline(steps = [['scaler', scaler],
                             ['smote', oversampler],
                             ['classifier', xgb]])

stratified_kfold = StratifiedKFold(shuffle=True, n_splits=5, random_state=2021)

xgb_gridsearch = GridSearchCV(
                                estimator = pipeline,
                                param_grid = params_grid,
                                scoring = 'recall',
                                cv = stratified_kfold,
                                refit = True,
                                n_jobs = -1
                             )

xgb_gridsearch.fit(x_train, y_train.values.ravel())

print(xgb_gridsearch.best_params_)

{'classifier__gamma': 1, 'classifier__learning_rate': 0.25, 'classifier__max_depth': 3, 'classifier__subsample': 0.9}

# 4. Conclusion

#### Best params from First Grid Search:
{'classifier__gamma': 1.0, 'classifier__learning_rate': 0.3, 'classifier__max_depth': 5, 'classifier__subsample': 0.9}
#### Best params from Second Grid Search:
{'classifier__gamma': 1.0, 'classifier__learning_rate': 0.25, 'classifier__max_depth': 3, 'classifier__subsample': 0.9}
#### Best params from Third Grid Search:
{'classifier__gamma': 1.0, 'classifier__learning_rate': 0.25, 'classifier__max_depth': 3, 'classifier__subsample': 0.9}