# Optimize XGBoost

In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import math

# own modules
import eda_methods as eda

# visualization
import seaborn as sns
sns.set(style="white")  
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
from pandas.plotting import scatter_matrix

# warnings handler
import warnings
warnings.filterwarnings("ignore")

# Machine Learning Libraries
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import fbeta_score, accuracy_score, f1_score, recall_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer 
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

#Pipeline
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from imblearn.under_sampling import RandomUnderSampler

random_state=101

## Loading data & set up

In [2]:
# new feature dataframe
df_importance = pd.read_csv('data/df_clean_engineered_all.csv')

# split label
y = df_importance['churn']

# drop obvious drops
df_importance = df_importance.drop(['churn','plz_3','abo_registrierung_min','nl_registrierung_min','ort'], axis = 1)

# get dummies
df_importance = pd.get_dummies(df_importance, columns = ['kanal', 'objekt_name', 'aboform_name', 'zahlung_rhythmus_name','zahlung_weg_name', 'plz_1', 'plz_2', 'land_iso_code', 'anrede','titel'], drop_first = True)

In [3]:
# defined list of important features
important_features_combined_dropping = ['zahlung_weg_name_Rechnung',
                                        'zahlung_rhythmus_name_halbjährlich',
                                        'rechnungsmonat',
                                        'received_anzahl_6m',
                                        'openedanzahl_6m',
                                        'objekt_name_ZEIT Digital',
                                        'nl_zeitbrief',
                                        'nl_aktivitaet',
                                        'liefer_beginn_evt',
                                        'cnt_umwandlungsstatus2_dkey',
                                        'clickrate_3m',
                                        'anrede_Frau',
                                        'aboform_name_Geschenkabo',
                                        'unsubscribed_anzahl_1m',
                                        'studentenabo',
                                        'received_anzahl_bestandskunden_6m',
                                        'openrate_produktnews_3m',
                                        'opened_anzahl_bestandskunden_6m',
                                        'objekt_name_DIE ZEIT - CHRIST & WELT',
                                        'nl_zeitshop',
                                        'nl_opt_in_sum',
                                        'nl_opened_1m',
                                        'kanal_andere',
                                        'kanal_B2B',
                                        'clicked_anzahl_6m',
                                        'che_reg',
                                        'MONTH_DELTA_nl_min',
                                        'zon_zp_red',
                                        'zahlung_rhythmus_name_vierteljährlich',
                                        'unsubscribed_anzahl_hamburg_1m',
                                        'unsubscribed_anzahl_6m',
                                        'sum_zon',
                                        'sum_reg',
                                        'shop_kauf',
                                        'plz_2_10',
                                        'plz_1_7',
                                        'plz_1_1',
                                        'openrate_zeitbrief_3m',
                                        'openrate_produktnews_1m',
                                        'openrate_3m',
                                        'openrate_1m',
                                        'nl_unsubscribed_6m',
                                        'nl_fdz_organisch',
                                        'metropole',
                                        'cnt_abo_magazin',
                                        'cnt_abo_diezeit_digital',
                                        'cnt_abo',
                                        'clicked_anzahl_bestandskunden_3m',
                                        'aboform_name_Probeabo',
                                        'aboform_name_Negative Option',
                                        'MONTH_DELTA_abo_min']

len(important_features_combined_dropping)

51

In [4]:
# choose important features
print(df_importance.shape)
X = df_importance[important_features_combined_dropping]
print(X.shape)


(184660, 307)
(184660, 51)


In [5]:
def train_predict(modelname, y_train, y_test, predictions_train, predictions_test):
    '''
    inputs:
       - learner: the learning algorithm to be trained and predicted on
       -
       - y_train: income training set
       -
       - y_test: income testing set
    '''
    results = {}
    # model name
    results['model'] = modelname
    # accuracy
    results['acc_train'] = accuracy_score(y_train,predictions_train)
    results['acc_test'] = accuracy_score(y_test,predictions_test)
    # F-score
    #results[‘f_train’] = fbeta_score(y_train,predictions_train,0.5)
    #results[‘f_test’] = fbeta_score(y_test,predictions_test,0.5)
    # F1-score
    results['f1_train'] = f1_score(y_train,predictions_train)
    results['f1_test'] = f1_score(y_test,predictions_test)
    # Recall
    results['recall_train'] = recall_score(y_train,predictions_train)
    results['recall_test'] = recall_score(y_test,predictions_test)
    # Precision
    results['precision_train'] = precision_score(y_train,predictions_train)
    results['precision_test'] = precision_score(y_test,predictions_test)
    # Return the results
    return results

## RandomSearchCV

In [8]:
def pipeline_optimization(X,y,balance=None):
    
    # devide features
    categoric_features = list(X.columns[X.dtypes==object])
    numeric_features = list(X.columns[X.dtypes != object])

    # split train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state,stratify=y)
    
    if balance == 'over':
        # define oversampling strategy
        print('Oversampling')
        oversample = RandomOverSampler(sampling_strategy='minority')
        X_train, y_train = oversample.fit_resample(X_train, y_train)

    if balance == 'under':
        print('Undersampling')
        # define undersample strategy
        undersample = RandomUnderSampler(sampling_strategy='majority')
        X_train, y_train = undersample.fit_resample(X_train, y_train)
        
    # Hyperparameter grid
    param_SVC = {
        'SVC__C': [1, 10, 100, 1000],
        'SVC__kernel': ['linear', 'rbf', 'poly'],
        'SVC__gamma': [1/(X.shape[1]), 1, 0.1, 0.01, 0.001, 0.0001]
    }
        
    models={
        'SVC' : SVC(random_state=random_state)
        }  
    
    # create preprocessors
    numeric_transformer = Pipeline(steps=[
            ('imputer_num', SimpleImputer(strategy='median')),
            ('scaler', MinMaxScaler())
    ])

    categorical_transformer = Pipeline(steps=[
            ('imputer_cat', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categoric_features)
    ])

    model_results = pd.DataFrame(columns=['model','acc_train','acc_test','f1_train','f1_test',
                                          'recall_train','recall_test','precision_train','precision_test'])
    
    # process pipeline for every model
    for model in models.items():
        
        print(model[0])
        pipe = Pipeline(steps=[('preprocessor', preprocessor), 
                               (model[0], model[1])
                               ])
        
        grid_SVC = RandomizedSearchCV(pipe, param_SVC, cv=3, scoring='precision', 
                           verbose=5, n_jobs=-1)
        # fit model
        grid_SVC.fit(X_train, y_train)
        
        # Show best parameters
        print('Best score:{:.2f}'.format(grid_SVC.best_score_))
        print('Best parameters:{}'.format(grid_SVC.best_params_))
        
        # Save best model as best_model
        best_model = grid_SVC.best_estimator_
        y_train_pred = grid_SVC.predict(X_train)
        y_test_pred = grid_SVC.predict(X_test)
        
        results = train_predict(model[0],y_train, y_test, y_train_pred, y_test_pred)        
        model_results = pd.concat([model_results, pd.DataFrame(results,index=[0])])

        print("\nConfusion matrix on test")
        print(confusion_matrix(y_test, y_test_pred))
        print("\n")
        
    return model_results

In [9]:
RandomizedSearch_SVC = pipeline_optimization(X,y,balance='under')
RandomizedSearch_SVC

Undersampling
SVC
Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed: 12.0min
[Parallel(n_jobs=-1)]: Done  22 out of  30 | elapsed: 162.1min remaining: 59.0min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 919.1min finished


Best score:0.77
Best parameters:{'SVC__kernel': 'poly', 'SVC__gamma': 0.0001, 'SVC__C': 10}

Confusion matrix on test
[[30937  1233]
 [12150  1845]]




Unnamed: 0,model,acc_train,acc_test,f1_train,f1_test,recall_train,recall_test,precision_train,precision_test
0,SVC,0.54691,0.710105,0.224975,0.216131,0.131523,0.131833,0.777199,0.599415
