# Optimize XGBoost

In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import math

# own modules
import eda_methods as eda

# visualization
import seaborn as sns
sns.set(style="white")  
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
from pandas.plotting import scatter_matrix

# warnings handler
import warnings
warnings.filterwarnings("ignore")

# Machine Learning Libraries
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import fbeta_score, accuracy_score, f1_score, recall_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer 
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

#Pipeline
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from imblearn.under_sampling import RandomUnderSampler

random_state=101

## Loading data & set up

In [2]:
# new feature dataframe
df_importance = pd.read_csv('data/df_clean_engineered_all.csv')

# split label
y = df_importance['churn']

# drop obvious drops
df_importance = df_importance.drop(['churn','plz_3','abo_registrierung_min','nl_registrierung_min','ort'], axis = 1)

# get dummies
df_importance = pd.get_dummies(df_importance, columns = ['kanal', 'objekt_name', 'aboform_name', 'zahlung_rhythmus_name','zahlung_weg_name', 'plz_1', 'plz_2', 'land_iso_code', 'anrede','titel'], drop_first = True)

In [3]:
# defined list of important features
important_features_combined_dropping = ['zahlung_weg_name_Rechnung',
                                        'zahlung_rhythmus_name_halbjährlich',
                                        'rechnungsmonat',
                                        'received_anzahl_6m',
                                        'openedanzahl_6m',
                                        'objekt_name_ZEIT Digital',
                                        'nl_zeitbrief',
                                        'nl_aktivitaet',
                                        'liefer_beginn_evt',
                                        'cnt_umwandlungsstatus2_dkey',
                                        'clickrate_3m',
                                        'anrede_Frau',
                                        'aboform_name_Geschenkabo',
                                        'unsubscribed_anzahl_1m',
                                        'studentenabo',
                                        'received_anzahl_bestandskunden_6m',
                                        'openrate_produktnews_3m',
                                        'opened_anzahl_bestandskunden_6m',
                                        'objekt_name_DIE ZEIT - CHRIST & WELT',
                                        'nl_zeitshop',
                                        'nl_opt_in_sum',
                                        'nl_opened_1m',
                                        'kanal_andere',
                                        'kanal_B2B',
                                        'clicked_anzahl_6m',
                                        'che_reg',
                                        'MONTH_DELTA_nl_min',
                                        'zon_zp_red',
                                        'zahlung_rhythmus_name_vierteljährlich',
                                        'unsubscribed_anzahl_hamburg_1m',
                                        'unsubscribed_anzahl_6m',
                                        'sum_zon',
                                        'sum_reg',
                                        'shop_kauf',
                                        'plz_2_10',
                                        'plz_1_7',
                                        'plz_1_1',
                                        'openrate_zeitbrief_3m',
                                        'openrate_produktnews_1m',
                                        'openrate_3m',
                                        'openrate_1m',
                                        'nl_unsubscribed_6m',
                                        'nl_fdz_organisch',
                                        'metropole',
                                        'cnt_abo_magazin',
                                        'cnt_abo_diezeit_digital',
                                        'cnt_abo',
                                        'clicked_anzahl_bestandskunden_3m',
                                        'aboform_name_Probeabo',
                                        'aboform_name_Negative Option',
                                        'MONTH_DELTA_abo_min']

len(important_features_combined_dropping)

51

In [4]:
# choose important features
print(df_importance.shape)
X = df_importance[important_features_combined_dropping]
print(X.shape)


(184660, 307)
(184660, 51)


In [5]:
def train_predict(modelname, y_train, y_test, predictions_train, predictions_test):
    '''
    inputs:
       - learner: the learning algorithm to be trained and predicted on
       -
       - y_train: income training set
       -
       - y_test: income testing set
    '''
    results = {}
    # model name
    results['model'] = modelname
    # accuracy
    results['acc_train'] = accuracy_score(y_train,predictions_train)
    results['acc_test'] = accuracy_score(y_test,predictions_test)
    # F-score
    #results[‘f_train’] = fbeta_score(y_train,predictions_train,0.5)
    #results[‘f_test’] = fbeta_score(y_test,predictions_test,0.5)
    # F1-score
    results['f1_train'] = f1_score(y_train,predictions_train)
    results['f1_test'] = f1_score(y_test,predictions_test)
    # Recall
    results['recall_train'] = recall_score(y_train,predictions_train)
    results['recall_test'] = recall_score(y_test,predictions_test)
    # Precision
    results['precision_train'] = precision_score(y_train,predictions_train)
    results['precision_test'] = precision_score(y_test,predictions_test)
    # Return the results
    return results

## RandomSearchCV

In [23]:
def pipeline_optimization(X,y,balance=None):
    
    # devide features
    categoric_features = list(X.columns[X.dtypes==object])
    numeric_features = list(X.columns[X.dtypes != object])

    # split train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state,stratify=y)
    
    if balance == 'over':
        # define oversampling strategy
        print('Oversampling')
        oversample = RandomOverSampler(sampling_strategy='minority')
        X_train, y_train = oversample.fit_resample(X_train, y_train)

    if balance == 'under':
        print('Undersampling')
        # define undersample strategy
        undersample = RandomUnderSampler(sampling_strategy='majority')
        X_train, y_train = undersample.fit_resample(X_train, y_train)
        
    # Hyperparameter grid
    param_XGBoost = {
        # tree param
        'XGBoost__eta': [0.01, 0.2],                           # 0.001 to 0.2
        'XGBoost__max_depth': [8, 9],                          # around 6 higher depth - overfitting
        'XGBoost__min_child_weight': list(range(0,13,1)),      # min 0 no limit
        'XGBoost__gamma': list(range(0, 5, 1)),                # the more conservative the algorithm will be
        'XGBoost__subsample': list(range(0,9,2)),              # 3 out of 3 times 1       
        # performance param
        'XGBoost__sampling_method': ['uniform','gradient_based'],
        'XGBoost__tree_method': ['approx','hist']              # 3 out of 4 times 'hist'
    }
        
    models={
        'XGBoost' : XGBClassifier(random_state=random_state, n_jobs=-1, n_estimators=100)
        }  
    
    # create preprocessors
    numeric_transformer = Pipeline(steps=[
            ('imputer_num', SimpleImputer(strategy='median')),
            ('scaler', MinMaxScaler())
    ])

    categorical_transformer = Pipeline(steps=[
            ('imputer_cat', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categoric_features)
    ])

    model_results = pd.DataFrame(columns=['model','acc_train','acc_test','f1_train','f1_test',
                                          'recall_train','recall_test','precision_train','precision_test'])
    
    # process pipeline for every model
    for model in models.items():
        
        print(model[0])
        pipe = Pipeline(steps=[('preprocessor', preprocessor), 
                               (model[0], model[1])
                               ])
        
        grid_XGBoost = RandomizedSearchCV(pipe, param_XGBoost, cv=5, scoring='recall', 
                           verbose=5, n_jobs=-1, n_iter = 100)
        # fit model
        grid_XGBoost.fit(X_train, y_train)
        
        # Show best parameters
        print('Best score:{:.2f}'.format(grid_XGBoost.best_score_))
        print('Best parameters:{}'.format(grid_XGBoost.best_params_))
        
        # Save best model as best_model
        best_model = grid_XGBoost.best_estimator_
        y_train_pred = grid_XGBoost.predict(X_train)
        y_test_pred = grid_XGBoost.predict(X_test)
        
        results = train_predict(model[0],y_train, y_test, y_train_pred, y_test_pred)        
        model_results = pd.concat([model_results, pd.DataFrame(results,index=[0])])

        print("\nConfusion matrix on test")
        print(confusion_matrix(y_test, y_test_pred))
        print("\n")
        
    return model_results

### First shot

In [48]:
RandomizedSearch_XGBoost = pipeline_optimization(X,y,balance='under')
RandomizedSearch_XGBoost

Undersampling
XGBoost
Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 21.2min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 30.5min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 33.9min finished


Parameters: { validate_parameter } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Best score:
0.71
Best parameters:
{'XGBoost__validate_parameter': False, 'XGBoost__tree_method': 'hist', 'XGBoost__subsample': 1, 'XGBoost__sampling_method': 'uniform', 'XGBoost__min_child_weight': 12, 'XGBoost__max_depth': 9, 'XGBoost__gamma': 5, 'XGBoost__eta': 0.1}

Confusion matrix on test
[[24407  7763]
 [ 3938 10057]]




Unnamed: 0,model,acc_train,acc_test,f1_train,f1_test,recall_train,recall_test,precision_train,precision_test
0,XGBoost,0.79607,0.74654,0.792195,0.632218,0.777421,0.718614,0.807541,0.564366


### Second shot (higher fits)

In [51]:
RandomizedSearch_XGBoost = pipeline_optimization(X,y,balance='under')
RandomizedSearch_XGBoost

Undersampling
XGBoost
Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 19.8min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 29.2min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 41.7min
[Parallel(n_jobs=-1)]: Done 866 tasks      | elapsed: 59.8min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 66.8min finished


Parameters: { validate_parameter } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Best score:0.73
Best parameters:{'XGBoost__validate_parameter': True, 'XGBoost__tree_method': 'approx', 'XGBoost__subsample': 1, 'XGBoost__sampling_method': 'gradient_based', 'XGBoost__min_child_weight': 4, 'XGBoost__max_depth': 9, 'XGBoost__gamma': 0, 'XGBoost__eta': 1}

Confusion matrix on test
[[23387  8783]
 [ 3657 10338]]




Unnamed: 0,model,acc_train,acc_test,f1_train,f1_test,recall_train,recall_test,precision_train,precision_test
0,XGBoost,0.910313,0.730532,0.910366,0.624351,0.910897,0.738692,0.909835,0.540662


### Third shot

In [16]:
RandomizedSearch_XGBoost = pipeline_optimization(X,y,balance='under')
RandomizedSearch_XGBoost

Undersampling
XGBoost
Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   33.1s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   39.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   39.7s finished


Best score:0.72
Best parameters:{'XGBoost__tree_method': 'hist', 'XGBoost__subsample': 1, 'XGBoost__sampling_method': 'gradient_based', 'XGBoost__min_child_weight': 11, 'XGBoost__max_depth': 9, 'XGBoost__gamma': 2, 'XGBoost__eta': 0.2}

Confusion matrix on test
[[23834  8336]
 [ 3916 10079]]




Unnamed: 0,model,acc_train,acc_test,f1_train,f1_test,recall_train,recall_test,precision_train,precision_test
0,XGBoost,0.816339,0.734604,0.814532,0.621969,0.806598,0.720186,0.822625,0.547326


### Conclusion
It was possible to improve the recall from min 0.718 to max 0.738 and the F1 score was pushed to 0.638 (worst: 0.624). So both scores we are targeting in this model are imprved. Let´s find the perfect parameter with a GridSearchCV and use the best param list to define it.

- Best parameters:{'XGBoost__validate_parameter': False, 'XGBoost__tree_method': 'hist', 'XGBoost__subsample': 1, 'XGBoost__sampling_method': 'uniform', 'XGBoost__min_child_weight': 12, 'XGBoost__max_depth': 9, 'XGBoost__gamma': 5, 'XGBoost__eta': 0.1}

- Best parameters:{'XGBoost__validate_parameter': True, 'XGBoost__tree_method': 'approx', 'XGBoost__subsample': 1, 'XGBoost__sampling_method': 'gradient_based', 'XGBoost__min_child_weight': 4, 'XGBoost__max_depth': 9, 'XGBoost__gamma': 0, 'XGBoost__eta': 1}

- Best parameters:{'XGBoost__tree_method': 'hist', 'XGBoost__subsample': 1, 'XGBoost__sampling_method': 'gradient_based', 'XGBoost__min_child_weight': 11, 'XGBoost__max_depth': 9, 'XGBoost__gamma': 2, 'XGBoost__eta': 0.2}


# GridSearchCV

In [21]:
def pipeline_optimization(X,y,balance=None):
    
    # devide features
    categoric_features = list(X.columns[X.dtypes==object])
    numeric_features = list(X.columns[X.dtypes != object])

    # split train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state,stratify=y)
    
    if balance == 'over':
        # define oversampling strategy
        print('Oversampling')
        oversample = RandomOverSampler(sampling_strategy='minority')
        X_train, y_train = oversample.fit_resample(X_train, y_train)

    if balance == 'under':
        print('Undersampling')
        # define undersample strategy
        undersample = RandomUnderSampler(sampling_strategy='majority')
        X_train, y_train = undersample.fit_resample(X_train, y_train)

    # Hyperparameter grid
    param_XGBoost = {
        # tree param
        'XGBoost__eta': [0.1,0.2],
        'XGBoost__max_depth': [8,9],                           # around 6 higher depth - overfitting
        'XGBoost__min_child_weight': list(range(4,10,2)),      # min 0 no limit
        'XGBoost__gamma': list(range(0,3,1)),                  # the more conservative the algorithm will be
        'XGBoost__subsample': [1],                             # 3 out of 3 times 1       
        # performance param
        'XGBoost__validate_parameter': [True],
        'XGBoost__sampling_method': ['gradient_based'],
        'XGBoost__tree_method': ['hist']                       # 3 out of 4 times 'hist'
    }
        
    models={
        'XGBoost' : XGBClassifier(random_state=random_state, n_jobs=-1, n_estimators=100)
        }  
    
    # create preprocessors
    numeric_transformer = Pipeline(steps=[
            ('imputer_num', SimpleImputer(strategy='median')),
            ('scaler', MinMaxScaler())
    ])

    categorical_transformer = Pipeline(steps=[
            ('imputer_cat', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categoric_features)
    ])

    model_results = pd.DataFrame(columns=['model','acc_train','acc_test','f1_train','f1_test',
                                          'recall_train','recall_test','precision_train','precision_test'])
    
    # process pipeline for every model
    for model in models.items():
        
        print(model[0])
        pipe = Pipeline(steps=[('preprocessor', preprocessor), 
                               (model[0], model[1])
                               ])
        
        grid_XGBoost = GridSearchCV(pipe, param_XGBoost, cv=5, scoring='recall', 
                           verbose=5, n_jobs=-1)
        # fit model
        grid_XGBoost.fit(X_train, y_train)
        
        # Show best parameters
        print("\n")
        print('Best score: {:.2f}'.format(grid_XGBoost.best_score_))
        print('Best parameters: {}'.format(grid_XGBoost.best_params_))
        
        # Save best model as best_model
        best_model = grid_XGBoost.best_estimator_
        y_train_pred = grid_XGBoost.predict(X_train)
        y_test_pred = grid_XGBoost.predict(X_test)
        
        results = train_predict(model[0],y_train, y_test, y_train_pred, y_test_pred)        
        model_results = pd.concat([model_results, pd.DataFrame(results,index=[0])])

        print("\nConfusion matrix on test")
        print(confusion_matrix(y_test, y_test_pred))
        print("\n")
        
    return model_results

In [22]:
GridSearch_XGBoost = pipeline_optimization(X,y,balance='under')
GridSearch_XGBoost

Undersampling
XGBoost
Fitting 5 folds for each of 270 candidates, totalling 1350 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 17.5min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 27.9min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 39.8min
[Parallel(n_jobs=-1)]: Done 866 tasks      | elapsed: 54.2min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed: 70.2min
[Parallel(n_jobs=-1)]: Done 1350 out of 1350 | elapsed: 83.4min finished


Parameters: { validate_parameter } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




Best score: 0.73
Best parameters: {'XGBoost__eta': 0.2, 'XGBoost__gamma': 0, 'XGBoost__max_depth': 9, 'XGBoost__min_child_weight': 4, 'XGBoost__sampling_method': 'gradient_based', 'XGBoost__subsample': 1, 'XGBoost__tree_method': 'hist', 'XGBoost__validate_parameter': True}

Confusion matrix on test
[[23984  8186]
 [ 3809 10186]]




Unnamed: 0,model,acc_train,acc_test,f1_train,f1_test,recall_train,recall_test,precision_train,precision_test
0,XGBoost,0.844504,0.740171,0.843394,0.629406,0.837418,0.727831,0.849456,0.554431
