# Train an XGBoost binary classifier

Given XGBoost's popularity and success on Kaggle I wanted to try it out. This workbook contains 2 sections:

* Create the datasets needed to train, validate, and test the model.
        Import the features and labels files, combine and split them into 80:20 train:test sets
* Tune some hyper-parameters to check if the model can be improved.
        Use a random search with cross-validation to determine whether we can improve on the base model.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 75 # default for me was 75
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, GridSearchCV #cross_validate
import xgboost as xgb
from xgboost import plot_importance
from sklearn.metrics import SCORERS, accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix
#from sklearn.model_selection import validation_curve

#conda install -c conda-forge shap
import shap
#Load JS visualisation code to Notebook
shap.initjs()

seed = 207

import pickle
from datetime import datetime

## 1. Create the data sets

In [None]:
# Name of the label column (in the labels file) that the model is trained to predict
labelToEval = 'cityScore'

# All of the data is generated by me, so a decent hold-out set has to be carved
# out of it for the eventual testing.
featuresDataFilename = 'ModelInput/features.csv'
labelsDataFilename = 'ModelInput/labels.csv'

# Load every feature row and, from the labels file, only the join key plus the chosen label
featuresDf = pd.read_csv(featuresDataFilename)
print('Features: {}'.format(featuresDf.shape))
labelsDf = pd.read_csv(labelsDataFilename)[['cityId', labelToEval]]
print('  Labels: {}'.format(labelsDf.shape))

# Merge the features and labels so the various sets can be extracted;
# the inner join keeps only cities present in both files.
dataDf = pd.merge(featuresDf, labelsDf, on='cityId', how='inner')
print('Combined: {}'.format(dataDf.shape))

# Keep only feature columns: the upper bound len(featuresDf.columns) stops before the
# label column appended by the merge, and get_dummies one-hot encodes any categoricals.
# NOTE(review): the slice starts at positional index 2, so the first TWO columns are
# dropped. The original comment said "Start at 2nd column, i.e. exclude cityId" --
# confirm that column 0 is an index/id column and not a real feature.
allX = pd.get_dummies(dataDf.iloc[:, 2:len(featuresDf.columns)])
print('\n   All X: {}'.format(allX.shape))
allY = dataDf[labelToEval]
print('   All y: {}'.format(allY.shape))

# Create the 80:20 train:test split.
# The split is stratified on the label so that the (imbalanced) class proportions are
# the same in the training and the test set.
# train will be split further using the StratifiedKFold random search below for parameter Opt...
trainXouter, testX, trainYouter, testY = train_test_split(
    allX, allY, test_size=0.20, random_state=seed, stratify=allY)

print('\n  testX: {}'.format(testX.shape))
print('  testY: {}'.format(testY.shape))
print('\n trainXouter: {}'.format(trainXouter.shape))
print(' trainYouter: {}'.format(trainYouter.shape))

# DMatrix versions of the sets (the format XGBoost prefers); kept for reference only
#testDM = xgb.DMatrix(testX, testY)
#trainOuterDM = xgb.DMatrix(trainXouter, trainYouter)
#allDM = xgb.DMatrix(allX, allY)

## Create the model and evaluate it

In [None]:
# Only the top quantile is labelled "good", so the classes are imbalanced and the
# positive-class weight has to be adjusted (a ratio of 1 creates poor models for me).
# Estimate the negative:positive ratio from the training labels.
labelCounts = trainYouter.groupby(trainYouter).count()
ratio = float(labelCounts[0] / labelCounts[1])
print('Est ratio: {:.3f}'.format(ratio))

# With a top-25% cut the ratio for the "real" situation is 3, so it is hardcoded here
# (this deliberately overrides the estimate printed above).
ratio = 3

## Use a random search to see if we can improve the baseline model

In [None]:
# Base classifier; scale_pos_weight (set from `ratio`) compensates for the class imbalance.
clf_xgb = xgb.XGBClassifier(objective='binary:logistic', scale_pos_weight=ratio) #, seed=seed)

# Search space for the random search.
# NOTE: scipy's stats.uniform(loc, scale) samples from [loc, loc + scale],
# it is NOT uniform(low, high).
param_dist = {
    'n_estimators': stats.randint(10, 300),
    'learning_rate': [0.05],
    'max_depth': [1, 2, 3, 4],
    'min_child_weight': [1, 2, 3, 4],
    # Was uniform(0.0, 0.3), i.e. 0.00 - 0.30, which allowed degenerate near-zero
    # row sampling and did not match the documented range.
    'subsample': stats.uniform(0.1, 0.2),          # range is 0.10 - 0.30
    'colsample_bytree': stats.uniform(0.7, 0.3),   # range is 0.70 - 1.00
#    'colsample_bylevel': stats.uniform(0.1, 0.85),  # range is 0.10 - 0.95
}

# NOTE(review): `iid` and the fit-time `eval_metric` / `early_stopping_rounds` arguments
# used below have been deprecated/removed in newer scikit-learn / xgboost releases --
# verify against the installed versions before upgrading.
clf = RandomizedSearchCV(clf_xgb, param_distributions = param_dist, n_iter = 20,
                         scoring='balanced_accuracy', error_score = 0, verbose = 3, n_jobs = 2,
                         cv=10, iid=False, refit='balanced_accuracy') #, random_state=seed)

# scoring='average_precision' which is meant to be the 'aucpr' equivalent
# Using it gives great recall, but also horrible precision!
# neg_log_loss, balanced_accuracy, f1, neg_log_loss
numFolds = 10
skf = StratifiedKFold(n_splits=numFolds) #, random_state=seed)

# One entry per outer fold: the best estimator found plus its evaluation metrics
results = []
for foldIdx, (train_index, val_index) in enumerate(skf.split(trainXouter, trainYouter)):
    print('\nEntering loop number {} of {}\n'.format(foldIdx + 1, numFolds))
    X_train, X_val = trainXouter.iloc[train_index], trainXouter.iloc[val_index]
    y_train, y_val = trainYouter.iloc[train_index], trainYouter.iloc[val_index]

    # Run the full random search on this fold's training part; the validation part is
    # only used as the early-stopping evaluation set.
    clf.fit(X_train, y_train,
            eval_set = [(X_train, y_train), (X_val, y_val)],
            eval_metric= 'auc',
            verbose=30,
            early_stopping_rounds=30,
           )

    # 'eval_metric': ['logloss', 'error', 'auc', 'aucpr'],

    # Create the evaluation metrics for choosing a model.
    # NOTE(review): the metrics are computed on the hold-out test set and are later used
    # to pick a model, so the test set no longer gives an unbiased estimate of the chosen
    # model's performance -- consider scoring on a separate validation set instead.
    estimator = clf.best_estimator_
    y_pred = estimator.predict(testX)
    recall = recall_score(testY, y_pred)

    # Human-readable confusion matrix (with totals) kept for later display
    ct = pd.crosstab(
        pd.Series(testY.values, name='Actual'),
        pd.Series(y_pred, name='Predicted'),
        margins=True
    )

    # Take FP / FN from a confusion matrix with explicit labels: positional lookups in
    # the crosstab would pick up the 'All' margin whenever only one class is predicted.
    _, falsePositives, falseNegatives, _ = confusion_matrix(testY, y_pred, labels=[0, 1]).ravel()
    misclassification = round((falseNegatives + falsePositives) / len(testX), 4)

    # Add to results (estNumber is the 0-based index of the outer fold)
    results.append({
        'estNumber': foldIdx,
        'bestEstimator': estimator,
        'recall': recall,
        'misclassification': misclassification,
        'confusionMatrix': ct
    })

print("\n\nAll done!")

# Each clf.fit call above fits cv=10 folds for each of the n_iter=20 candidates

In [None]:
# Rank the per-fold models: highest recall first, ties broken by the lowest misclassification
rankingCols = ['recall', 'misclassification']
df = pd.DataFrame(results).sort_values(by=rankingCols, ascending=[False, True])
print(df[['estNumber'] + rankingCols])

We are looking for a combination of high recall and low misclassification. Look at the top results above and choose 2 for further checking.

In [None]:
# The two candidate models (identified by estNumber) to compare side by side
chosenEstNumber1 = 4
chosenEstNumber2 = 9

# Confusion matrices stored with each candidate
cm1 = df.loc[df['estNumber'] == chosenEstNumber1, 'confusionMatrix'].iloc[0]
cm2 = df.loc[df['estNumber'] == chosenEstNumber2, 'confusionMatrix'].iloc[0]

# Print recall, misclassification and the confusion matrix for each candidate
for estNumber, cm in ((chosenEstNumber1, cm1), (chosenEstNumber2, cm2)):
    selected = df[df['estNumber'] == estNumber]
    print('\nEstimator {}:'.format(estNumber))
    print('')
    print('           Recall: {:.3f}'.format(selected['recall'].iloc[0]))
    print('Misclassification: {:.3f}'.format(selected['misclassification'].iloc[0]))
    print('')
    print(cm)

Pick one estimator (model), print out the key parameters and persist to disk using pickle.

In [None]:
# Select the final model by its estNumber and list all of its parameters
chosenEstNumber = 4
clf = df.loc[df['estNumber'] == chosenEstNumber, 'bestEstimator'].iloc[0]
# One parameter per line, name and value right-aligned in fixed-width columns
for paramName, paramValue in clf.get_params().items():
    print('{:>20}: {:>10}'.format(paramName, str(paramValue)))

In [None]:
# Timestamped file name (YYYYMMDD_HHMMSS.xgbmodel) so each saved model gets its own file
filename = '{:%Y%m%d_%H%M%S}.xgbmodel'.format(datetime.now())
# Write through a context manager so the file is flushed and closed even if pickling
# fails (the handle was previously never closed).
with open('./Models/{}'.format(filename), 'wb') as modelFile:
    pickle.dump(clf, modelFile)
print(filename)