# XGBoost

## Imports

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold, cross_validate, GridSearchCV, train_test_split
from sklearn.metrics import balanced_accuracy_score
from time import time
import xgboost as xgb

## Data

In [20]:
client_attrition = pd.read_csv('../data/preprocessed/client_attrition_train.csv', sep=";")
client_attrition_test = pd.read_csv('../data/preprocessed/client_attrition_test.csv', sep=";")
X = client_attrition.drop("account_status",axis=1)
y = client_attrition["account_status"]
print(X.shape)

(10127, 37)


## Default XGBoost

In [6]:
# One time split before cross validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12345,stratify=y)

In [9]:
start = time()
model = xgb.XGBClassifier()
model.fit(X_train,y_train)
print(f"Training time: {time()-start} seconds")

Training time: 0.6646804809570312 seconds


In [37]:
# prediction and evaluation
y_pred = model.predict(X_test)
print(y_pred)
print(balanced_accuracy_score(y_test,y_pred))

[0 1 0 ... 0 0 0]
0.9222448333559445


## Grid search
Using repeated Stratified KFold Cross Valdidation

In [12]:
# Grid search results to df
def gs_to_df(grid_search):
    df = pd.DataFrame()
    for param in param_grid.keys():
        df[param] = 0
    df["mean_train_score"] = df["std_train_score"] = df["mean_test_score"] = df["std_test_score"] = 0
    for i, params in enumerate(grid_search.cv_results_['params']):
        new_row = params | {"mean_train_score": grid_search.cv_results_['mean_train_score'][i],
                            "std_train_score": grid_search.cv_results_['std_train_score'][i],
                            "mean_test_score": grid_search.cv_results_['mean_test_score'][i],
                            "std_test_score": grid_search.cv_results_['std_test_score'][i]}
        df.loc[len(df)] = new_row

    return df
    

In [31]:
frac = np.count_nonzero(y_train)/(y_train.size)
start_time = time()
# Grid 
param_grid = {'objective': ['binary:logistic'],'scale_pos_weight': [frac], 'eta':[0.01, 0.3, 1],'max_depth': [4,6,8], 'lambda': [0.1, 1, 10] ,'verbosity': [2]}
# Cross Validation
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=2,random_state=12345)
# Grid Search
grid_search = GridSearchCV(estimator=xgb.XGBClassifier(), param_grid=param_grid, scoring="balanced_accuracy", 
                            n_jobs=12, cv=rskf, verbose=3, return_train_score=True)
grid_search.fit(X,y)
print("--- %s seconds ---" % (time() - start_time))

Fitting 20 folds for each of 27 candidates, totalling 540 fits
[14:41:58] INFO: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-07593ffd91cd9da33-1\xgboost\xgboost-ci-windows\src\tree\updater_prune.cc:98: tree pruning end, 66 extra nodes, 0 pruned nodes, max_depth=6
[14:41:58] INFO: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-07593ffd91cd9da33-1\xgboost\xgboost-ci-windows\src\tree\updater_prune.cc:98: tree pruning end, 56 extra nodes, 0 pruned nodes, max_depth=6
[14:41:58] INFO: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-07593ffd91cd9da33-1\xgboost\xgboost-ci-windows\src\tree\updater_prune.cc:98: tree pruning end, 98 extra nodes, 0 pruned nodes, max_depth=6
[14:41:58] INFO: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-07593ffd91cd9da33-1\xgboost\xgboost-ci-windows\src\tree\updater_prune.cc:98: tree pruning end, 62 extra nodes, 0 pruned nodes, max_depth=6
[14:41:58] INFO: C:\buildkite-agent\build

In [32]:
df = gs_to_df(grid_search=grid_search)
df.sort_values(ascending=False,by="mean_test_score").to_csv('./results/xgb00_metrics_2_10_cv.csv', index=False,sep=';')
df.sort_values(ascending=False,by="mean_test_score")

Unnamed: 0,objective,scale_pos_weight,eta,max_depth,lambda,verbosity,mean_train_score,std_train_score,mean_test_score,std_test_score
19,binary:logistic,0.160721,1.0,6,0.1,2,1.0,0.0,0.904949,0.012727
21,binary:logistic,0.160721,1.0,4,1.0,2,0.995663,0.001042,0.903581,0.012438
22,binary:logistic,0.160721,1.0,6,1.0,2,1.0,0.0,0.903046,0.01172
10,binary:logistic,0.160721,0.3,6,0.1,2,0.987246,0.001859,0.902776,0.011558
18,binary:logistic,0.160721,1.0,4,0.1,2,0.998924,0.000685,0.901699,0.01402
23,binary:logistic,0.160721,1.0,8,1.0,2,1.0,0.0,0.901201,0.014959
26,binary:logistic,0.160721,1.0,8,10.0,2,0.990422,0.00112,0.900939,0.01583
11,binary:logistic,0.160721,0.3,8,0.1,2,0.997951,0.000756,0.900201,0.016149
20,binary:logistic,0.160721,1.0,8,0.1,2,1.0,0.0,0.898932,0.011401
25,binary:logistic,0.160721,1.0,6,10.0,2,0.984822,0.001401,0.898713,0.015574


We can notice overfitting let's try to avoid it
Interestingly adding scale_pos_weight argument decreaases the score, no matter if it is the 19% or 16%

In [30]:
print("train: ", np.count_nonzero(y_train)/(y_train.size - np.count_nonzero(y_train)))
print("train: ", np.count_nonzero(y_train)/(y_train.size))

train:  0.19149874981614942
train:  0.1607208986544871


In [26]:
frac = np.count_nonzero(y_train)/(y_train.size - np.count_nonzero(y_train))
start_time = time()
# Grid 
param_grid = {'objective': ['binary:logistic'],'scale_pos_weight': [frac] ,'eta':[0.01, 0.3, 1],'max_depth': [2,4,6], 'lambda': [0.1, 1, 10], 'min_child_weight':[1,10,100] ,'verbosity': [2]}
# Cross Validation
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=2,random_state=12345)
# Grid Search
grid_search = GridSearchCV(estimator=xgb.XGBClassifier(), param_grid=param_grid, scoring="balanced_accuracy", 
                            n_jobs=12, cv=rskf, verbose=3, return_train_score=True)
grid_search.fit(X,y)
print("--- %s seconds ---" % (time() - start_time))

Fitting 20 folds for each of 81 candidates, totalling 1620 fits
[14:27:21] INFO: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-07593ffd91cd9da33-1\xgboost\xgboost-ci-windows\src\tree\updater_prune.cc:98: tree pruning end, 24 extra nodes, 0 pruned nodes, max_depth=4
[14:27:21] INFO: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-07593ffd91cd9da33-1\xgboost\xgboost-ci-windows\src\tree\updater_prune.cc:98: tree pruning end, 28 extra nodes, 0 pruned nodes, max_depth=4
[14:27:21] INFO: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-07593ffd91cd9da33-1\xgboost\xgboost-ci-windows\src\tree\updater_prune.cc:98: tree pruning end, 28 extra nodes, 0 pruned nodes, max_depth=4
[14:27:21] INFO: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-07593ffd91cd9da33-1\xgboost\xgboost-ci-windows\src\tree\updater_prune.cc:98: tree pruning end, 24 extra nodes, 0 pruned nodes, max_depth=4
[14:27:21] INFO: C:\buildkite-agent\buil

In [27]:
df = gs_to_df(grid_search=grid_search)
df.sort_values(ascending=False,by="mean_test_score").to_csv('./results/xgb1_metrics_2_10_cv.csv', index=False,sep=';')
df.sort_values(ascending=False,by="mean_test_score")

Unnamed: 0,objective,scale_pos_weight,eta,max_depth,lambda,min_child_weight,verbosity,mean_train_score,std_train_score,mean_test_score,std_test_score
66,binary:logistic,0.191499,1.00,4,1.0,1,2,0.997746,0.000833,0.904465,0.015790
57,binary:logistic,0.191499,1.00,4,0.1,1,2,0.999505,0.000396,0.904403,0.013560
33,binary:logistic,0.191499,0.30,6,0.1,1,2,0.992966,0.001471,0.903696,0.013278
69,binary:logistic,0.191499,1.00,6,1.0,1,2,1.000000,0.000000,0.902165,0.014254
78,binary:logistic,0.191499,1.00,6,10.0,1,2,0.991686,0.001069,0.901581,0.014325
...,...,...,...,...,...,...,...,...,...,...,...
8,binary:logistic,0.191499,0.01,6,0.1,100,2,0.500000,0.000000,0.500000,0.000000
5,binary:logistic,0.191499,0.01,4,0.1,100,2,0.500000,0.000000,0.500000,0.000000
23,binary:logistic,0.191499,0.01,4,10.0,100,2,0.500000,0.000000,0.500000,0.000000
2,binary:logistic,0.191499,0.01,2,0.1,100,2,0.500000,0.000000,0.500000,0.000000


Combining conclusions from above we revert the pos_weight parameter and we set some parameters to fight back the overfitting

In [33]:
start_time = time()
# Grid 
param_grid = {'objective': ['binary:logistic'], 'colsample_bytree': [0.5, 0.75, 1], 'gamma': [0.01, 0.1, 1], 'min_child_weight':[1,10],
                'eta':[0.1, 1, 10],'max_depth': [2,4,6], 'lambda': [0.1, 1, 10] ,'verbosity': [2]}
# Cross Validation
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=2,random_state=12345)
# Grid Search
grid_search = GridSearchCV(estimator=xgb.XGBClassifier(), param_grid=param_grid, scoring="balanced_accuracy", 
                            n_jobs=12, cv=rskf, verbose=3, return_train_score=True)
grid_search.fit(X,y)
print("--- %s seconds ---" % (time() - start_time))

Fitting 20 folds for each of 486 candidates, totalling 9720 fits
[20:29:59] INFO: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-07593ffd91cd9da33-1\xgboost\xgboost-ci-windows\src\tree\updater_prune.cc:98: tree pruning end, 76 extra nodes, 0 pruned nodes, max_depth=6
[20:29:59] INFO: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-07593ffd91cd9da33-1\xgboost\xgboost-ci-windows\src\tree\updater_prune.cc:98: tree pruning end, 72 extra nodes, 0 pruned nodes, max_depth=6
[20:29:59] INFO: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-07593ffd91cd9da33-1\xgboost\xgboost-ci-windows\src\tree\updater_prune.cc:98: tree pruning end, 92 extra nodes, 0 pruned nodes, max_depth=6
[20:29:59] INFO: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-07593ffd91cd9da33-1\xgboost\xgboost-ci-windows\src\tree\updater_prune.cc:98: tree pruning end, 90 extra nodes, 0 pruned nodes, max_depth=6
[20:29:59] INFO: C:\buildkite-agent\bui

In [34]:
df = gs_to_df(grid_search=grid_search)
df.sort_values(ascending=False,by="mean_test_score").to_csv('./results/xgb2_metrics_2_10_cv.csv', index=False,sep=';')
df.sort_values(ascending=False,by="mean_test_score")

Unnamed: 0,objective,colsample_bytree,gamma,min_child_weight,eta,max_depth,lambda,verbosity,mean_train_score,std_train_score,mean_test_score,std_test_score
394,binary:logistic,1.00,0.01,1,1.0,6,10.0,2,1.000000,0.000000,0.924831,0.013445
408,binary:logistic,1.00,0.10,1,1.0,2,10.0,2,0.960292,0.002436,0.923720,0.013283
392,binary:logistic,1.00,0.01,1,1.0,4,10.0,2,0.999934,0.000106,0.923589,0.017579
402,binary:logistic,1.00,0.10,1,1.0,2,1.0,2,0.972272,0.001968,0.923517,0.011886
184,binary:logistic,0.75,0.10,1,0.1,6,0.1,2,0.987423,0.001811,0.923472,0.012889
...,...,...,...,...,...,...,...,...,...,...,...,...
275,binary:logistic,0.75,0.01,10,10.0,6,0.1,2,0.448037,0.108579,0.440323,0.110915
311,binary:logistic,0.75,1.00,10,10.0,6,0.1,2,0.447948,0.108468,0.440169,0.110741
314,binary:logistic,0.75,1.00,1,10.0,4,1.0,2,0.434375,0.093362,0.437251,0.088378
278,binary:logistic,0.75,0.01,1,10.0,4,1.0,2,0.434263,0.093294,0.436360,0.088207


In [35]:
start_time = time()
# Grid 
param_grid = {'objective': ['binary:logistic'], 'colsample_bytree': [0.5, 0.75, 1], 'gamma': [0.01, 0.1, 1], 'min_child_weight':[5],
                'eta':[0.1, 1, 10],'max_depth': [2,4,6], 'lambda': [0.1, 1, 10] ,'verbosity': [2]}
# Cross Validation
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=2,random_state=12345)
# Grid Search
grid_search = GridSearchCV(estimator=xgb.XGBClassifier(), param_grid=param_grid, scoring="balanced_accuracy", 
                            n_jobs=12, cv=rskf, verbose=3, return_train_score=True)
grid_search.fit(X,y)
print("--- %s seconds ---" % (time() - start_time))

Fitting 20 folds for each of 243 candidates, totalling 4860 fits
[22:30:38] INFO: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-07593ffd91cd9da33-1\xgboost\xgboost-ci-windows\src\tree\updater_prune.cc:98: tree pruning end, 96 extra nodes, 8 pruned nodes, max_depth=6
[22:30:38] INFO: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-07593ffd91cd9da33-1\xgboost\xgboost-ci-windows\src\tree\updater_prune.cc:98: tree pruning end, 92 extra nodes, 8 pruned nodes, max_depth=6
[22:30:38] INFO: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-07593ffd91cd9da33-1\xgboost\xgboost-ci-windows\src\tree\updater_prune.cc:98: tree pruning end, 96 extra nodes, 4 pruned nodes, max_depth=6
[22:30:38] INFO: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-07593ffd91cd9da33-1\xgboost\xgboost-ci-windows\src\tree\updater_prune.cc:98: tree pruning end, 102 extra nodes, 4 pruned nodes, max_depth=6
[22:30:38] INFO: C:\buildkite-agent\bu

In [36]:
df = gs_to_df(grid_search=grid_search)
df.sort_values(ascending=False,by="mean_test_score").to_csv('./results/xgb3_metrics_2_10_cv.csv', index=False,sep=';')
df.sort_values(ascending=False,by="mean_test_score")

Unnamed: 0,objective,colsample_bytree,gamma,min_child_weight,eta,max_depth,lambda,verbosity,mean_train_score,std_train_score,mean_test_score,std_test_score
182,binary:logistic,1.00,1.00,5,0.1,6,0.1,2,0.972100,0.001836,0.922489,0.012369
201,binary:logistic,1.00,0.10,5,1.0,2,1.0,2,0.967937,0.001835,0.921301,0.014191
192,binary:logistic,1.00,0.01,5,1.0,2,1.0,2,0.967937,0.001835,0.921301,0.014191
11,binary:logistic,0.50,0.10,5,0.1,6,0.1,2,0.970045,0.001430,0.920934,0.013815
101,binary:logistic,0.75,1.00,5,0.1,6,0.1,2,0.971913,0.001429,0.920920,0.014660
...,...,...,...,...,...,...,...,...,...,...,...,...
239,binary:logistic,1.00,1.00,5,10.0,6,1.0,2,0.452950,0.101816,0.451161,0.103600
236,binary:logistic,1.00,1.00,5,10.0,6,0.1,2,0.451876,0.118996,0.451028,0.116527
148,binary:logistic,0.75,0.10,5,10.0,4,1.0,2,0.450540,0.066453,0.447341,0.072572
157,binary:logistic,0.75,1.00,5,10.0,4,1.0,2,0.450554,0.066476,0.447341,0.072572


In [44]:
results2 = pd.read_csv('./results/xgb2_metrics_2_10_cv.csv', delimiter=';')
results3 = pd.read_csv('./results/xgb3_metrics_2_10_cv.csv', delimiter=';')
results = pd.concat([results2, results3], axis=0, ignore_index=True).sort_values(ascending=False,by="mean_test_score")
results.head(100)

Unnamed: 0,objective,colsample_bytree,gamma,min_child_weight,eta,max_depth,lambda,verbosity,mean_train_score,std_train_score,mean_test_score,std_test_score
0,binary:logistic,1.0,0.01,1,1.0,6,10.0,2,1.0,0.0,0.924831,0.013445
1,binary:logistic,1.0,0.1,1,1.0,2,10.0,2,0.960292,0.002436,0.92372,0.013283
2,binary:logistic,1.0,0.01,1,1.0,4,10.0,2,0.999934,0.000106,0.923589,0.017579
3,binary:logistic,1.0,0.1,1,1.0,2,1.0,2,0.972272,0.001968,0.923517,0.011886
4,binary:logistic,0.75,0.1,1,0.1,6,0.1,2,0.987423,0.001811,0.923472,0.012889
5,binary:logistic,1.0,0.01,1,1.0,4,1.0,2,1.0,0.0,0.923374,0.015687
6,binary:logistic,0.75,1.0,1,0.1,6,0.1,2,0.98536,0.001539,0.923232,0.013489
7,binary:logistic,1.0,0.01,1,1.0,2,1.0,2,0.972167,0.00187,0.923213,0.013315
8,binary:logistic,0.75,0.01,1,1.0,4,10.0,2,0.999587,0.000377,0.92318,0.01409
9,binary:logistic,0.5,0.1,1,1.0,4,10.0,2,0.999202,0.000489,0.923011,0.010706


## Seeing the overfitting issue we choose the second model as the final one
Very simmilar mean test score but lower std and lower potential of overfitting train score 0.96 insted of almost 1

Additionally second best model

In [45]:
# intervals
means = [0.924831, 0.923720, 0.922489, 0.921301]
stds = [0.013445, 0.012283, 0.012369, 0.014191]
for i in range(4):
    print(means[i]-stds[i], means[i], means[i]+stds[i])

0.9113859999999999 0.924831 0.938276
0.9114369999999999 0.92372 0.936003
0.91012 0.922489 0.934858
0.9071100000000001 0.921301 0.935492
