### Global hyperparameters

In [1]:
SEED = 12345 # global random seed for better reproducibility (NumPy and XGBoost)
N_MODELS = 20 # number of random grid search candidates to train; should be less than 100

### Python imports and inits

In [2]:
# suppress pandas FutureWarning messages (e.g. the deprecation of `append`)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np   # array, vector, matrix calculations
import pandas as pd  # DataFrame handling
import xgboost as xgb 
import matplotlib.pyplot as plt # general plotting
pd.options.display.max_columns = 999 # enable display of all columns in notebook

# for grid search custom functions
import itertools
import json

# set numpy random seed
np.random.seed(SEED)

### Importing dataset 

In [3]:
# read the raw dataset from a CSV file located relative to the working directory
data = pd.read_csv('credit_line_increase.csv')
# preview the first rows to check the column layout
data.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,RACE,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DELINQ_NEXT
0,1,20000,2,1.0,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2.0,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,3.0,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,4.0,2,1,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,3.0,2,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0


### Assign target and inputs for models
Note that demographic features are not used as model inputs.

In [4]:
id_col = 'ID'  # identifier column; never used as a model input
groups = ['SEX', 'RACE', 'EDUCATION', 'MARRIAGE', 'AGE']  # demographic columns; never used as model inputs
target = 'DELINQ_NEXT'  # column the models are trained to predict

In [5]:
# re-seed here so the partition does not depend on any random draws made in earlier cells
np.random.seed(SEED)

# probability that a row is assigned to train; rows are drawn independently,
# so the resulting sizes are only approximately in this ratio
split_train_test = 2/3

split = np.random.rand(len(data)) < split_train_test
train = data[split].copy()
test = data[~split].copy()  # temporary hold-out, divided again below

# probability that a held-out row is assigned to validation rather than test
split_test_valid = 1/2

# NOTE: `split` is reused; from here on it is the validation mask over the hold-out rows
split = np.random.rand(len(test)) < split_test_valid
valid = test[split].copy()
test = test[~split].copy()

# drop the reference to the full frame; re-running this cell requires re-running the load cell first
del data

print(f"Train/Validation/Test sizes: {len(train)}/{len(valid)}/{len(test)}")

Train/Validation/Test sizes: 19919/5045/5036


In [6]:
target = 'DELINQ_NEXT'
demographic_cols = ['SEX', 'RACE','EDUCATION', 'MARRIAGE', 'AGE']

# every column that is neither demographic, the row ID, nor the target is a model input;
# iterating over train.columns keeps the original column order
non_feature_cols = set(demographic_cols) | {'ID', target}
features = [name for name in train.columns if name not in non_feature_cols]

print('target =', target)
print('predictors =', features)

target = DELINQ_NEXT
predictors = ['LIMIT_BAL', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']


In [7]:
# Wrap each pandas split in an XGBoost DMatrix: feature columns as data, target column as label
training_frame, validation_frame, test_frame = (
    xgb.DMatrix(frame[features], label=frame[target])
    for frame in (train, valid, test)
)

# Training XGBoost model

#### Utility Functions For Training 
Using random grid search to find the best hyperparameter values

In [8]:
def _train(_dtrain, _dvalid, _mono_constraints=None, _xgb_params=None, _ntree=None,
          _early_stopping_rounds=None, _verbose=None, _seed=None, _logger=None):

    """ Wrapper for XGBoost train method.

    The caller's ``_xgb_params`` dict is never modified: a private copy is
    extended with the monotonicity constraints and the evaluation metric and
    that copy is what is printed and handed to ``xgb.train``.

    :param _dtrain: Training data, passed to ``xgb.train`` as-is (an ``xgb.DMatrix`` in this notebook).
    :param _dvalid: Validation data, monitored under the name ``'eval'``.
    :param _mono_constraints: User-supplied monotonicity constraints; skipped when None.
    :param _xgb_params: XGBoost hyperparameters; None is treated as an empty dict.
    :param _ntree: Number of boosting rounds, forwarded unchanged (no default is applied here).
    :param _early_stopping_rounds: XGBoost early stopping rounds, forwarded unchanged.
    :param _verbose: Forwarded unchanged as ``verbose_eval``.
    :param _seed: Random seed; used only when ``_xgb_params`` has no ``'seed'`` entry.
    :param _logger: Unused; kept so existing callers keep working.
    :return: Trained XGBoost model.

    """

    # copy first: the original wrote into the caller's dict (and, for the metric,
    # into a module-level global), which leaked state between calls
    params = dict(_xgb_params) if _xgb_params is not None else {}

    if _mono_constraints is not None:
        params['monotone_constraints'] = _mono_constraints

    # an explicit 'seed' in the hyperparameters wins over _seed
    if _seed is not None:
        params.setdefault('seed', int(_seed))

    # must train on AUC
    params['eval_metric'] = 'auc'

    print('Training with parameters:')
    print(json.dumps(params, indent=2))

    # validation set goes last in the watchlist
    watchlist = [(_dtrain, 'train'), (_dvalid, 'eval')]

    # train
    model = xgb.train(params,
                      _dtrain,
                      _ntree,
                      early_stopping_rounds=_early_stopping_rounds,
                      evals=watchlist,
                      verbose_eval=_verbose)

    return model


def random_grid_train(_dtrain, _dvalid, _mono_constraints=None, _xgb_params=None, 
                      _cv_params=None, _n_models=None, _ntree=None, 
                      _early_stopping_rounds=None, _verbose=None,
                      _seed=None):
    
    """ Performs a random grid search over _n_models and _cv_params.

    Each run merges one randomly drawn combination from ``_cv_params`` on top
    of a copy of ``_xgb_params`` and trains a model with ``_train``. The
    caller's ``_xgb_params`` dict is left untouched.

    :param _dtrain: Training data, forwarded to ``_train`` (an ``xgb.DMatrix`` in this notebook).
    :param _dvalid: Validation data, forwarded to ``_train``.
    :param _mono_constraints: User-supplied monotonicity constraints.
    :param _xgb_params: XGBoost hyperparameters shared by every run; None is treated as empty.
    :param _cv_params: Dictionary of lists of potential XGBoost parameters over which to search.
    :param _n_models: Number of random models to evaluate.
    :param _ntree: Number of boosting rounds, forwarded unchanged.
    :param _early_stopping_rounds: XGBoost early stopping rounds, forwarded unchanged.
    :param _verbose: Whether to display training iterations, forwarded unchanged.
    :param _seed: Random seed, forwarded to ``_train``.
    :return: tuple of (best candidate model from random grid search,
             dict mapping run index to ``{'model': ..., 'score': ...}``)

    """

    print('Starting random grid search over %d models.' % int(_n_models))

    # cartesian product of _cv_params
    keys, values = zip(*_cv_params.items())
    experiments = [dict(zip(keys, v)) for v in itertools.product(*values)]

    # select randomly from cartesian product space; sampling is with replacement,
    # so the same combination can be drawn more than once
    selected_experiments = np.random.choice(len(experiments), _n_models)

    # private copy of the shared params so the caller's dict is never mutated
    base_params = dict(_xgb_params) if _xgb_params is not None else {}

    # the "higher score wins" comparison below is only meaningful for AUC,
    # so pin the metric here instead of relying on the callee
    base_params['eval_metric'] = 'auc'

    # -inf guarantees the first trained model becomes the initial best candidate
    best_candidate = None
    best_score = float('-inf')

    # full dict of grid candidates
    candidates = {}
    
    # grid search loop
    for i, exp in enumerate(selected_experiments):

        # shared params overridden by the current grid run's params
        run_params = {**base_params, **experiments[exp]}

        print('Grid search run %d/%d.' % (int(i + 1), int(_n_models)))

        # train on current params (the original passed the global xgb_params here)
        candidate = _train(_dtrain, _dvalid, _mono_constraints=_mono_constraints, _ntree=_ntree,
                           _xgb_params=run_params, _early_stopping_rounds=_early_stopping_rounds,
                           _verbose=_verbose, _seed=_seed)

        candidates[i] = {'model': candidate,  'score': candidate.best_score}
        
        if candidate.best_score > best_score:
            best_candidate = candidate
            best_score = candidate.best_score
            print('Grid search new best score discovered at iteration %d/%d: %.4f.' %
                     (int(i + 1), int(_n_models), candidate.best_score))

        print()
        print('----------- ------------')
        print()
    
    return best_candidate, candidates


#### Train random grid search 

In [9]:
# Baseline booster settings. The six keys that also appear in gs_params are
# replaced by sampled values on every grid search run.
xgb_params = dict(booster='gbtree',
                  colsample_bytree=0.6,
                  eta=0.001,
                  max_depth=5,
                  objective='binary:logistic',
                  reg_alpha=0.005,
                  reg_lambda=0.005,
                  seed=SEED,
                  subsample=0.6)

# candidate values for each tunable hyperparameter
gs_params = dict(colsample_bytree=[0.3, 0.5, 0.7],
                 eta=[0.005, 0.05, 0.3],
                 max_depth=[3, 5, 7],
                 reg_alpha=[0.0005, 0.005, 0.05],
                 reg_lambda=[0.0005, 0.005, 0.05],
                 subsample=[0.3, 0.5, 0.7])

# grid search prelims: thread count, and the training mean of y as the base score
train_mean_y = float(train[target].mean())
xgb_params.update(nthread=16, base_score=train_mean_y)

# monotonicity constraint per feature:
# +1 increasing relationship with the target
#  0 no constraint
# -1 decreasing relationship with the target
mono_constraints = {'LIMIT_BAL': -1,
                    **{f'PAY_{month}': 1 for month in (0, 2, 3, 4, 5, 6)},
                    **{f'BILL_AMT{month}': -1 for month in range(1, 7)},
                    **{f'PAY_AMT{month}': -1 for month in range(1, 7)}}

# search budget and training controls
n_gs_models, ntree, early_stopping_rounds, verbose = N_MODELS, 1000, 50, False

# train
search_kwargs = dict(_mono_constraints=mono_constraints,
                     _xgb_params=xgb_params,
                     _cv_params=gs_params,
                     _n_models=n_gs_models,
                     _ntree=ntree,
                     _early_stopping_rounds=early_stopping_rounds,
                     _verbose=verbose,
                     _seed=SEED)
best_xgb, grid = random_grid_train(training_frame, validation_frame, **search_kwargs)


Starting random grid search over 20 models.
Grid search run 1/20.
Training with parameters:
{
  "booster": "gbtree",
  "colsample_bytree": 0.7,
  "eta": 0.05,
  "max_depth": 5,
  "objective": "binary:logistic",
  "reg_alpha": 0.005,
  "reg_lambda": 0.0005,
  "seed": 12345,
  "subsample": 0.3,
  "nthread": 16,
  "base_score": 0.22029218334253728,
  "monotone_constraints": {
    "LIMIT_BAL": -1,
    "PAY_0": 1,
    "PAY_2": 1,
    "PAY_3": 1,
    "PAY_4": 1,
    "PAY_5": 1,
    "PAY_6": 1,
    "BILL_AMT1": -1,
    "BILL_AMT2": -1,
    "BILL_AMT3": -1,
    "BILL_AMT4": -1,
    "BILL_AMT5": -1,
    "BILL_AMT6": -1,
    "PAY_AMT1": -1,
    "PAY_AMT2": -1,
    "PAY_AMT3": -1,
    "PAY_AMT4": -1,
    "PAY_AMT5": -1,
    "PAY_AMT6": -1
  },
  "eval_metric": "auc"
}
Grid search new best score discovered at iteration 1/20: 0.7781.

----------- ------------

Grid search run 2/20.
Training with parameters:
{
  "booster": "gbtree",
  "colsample_bytree": 0.7,
  "eta": 0.005,
  "max_depth": 7,
  "obj


----------- ------------

Grid search run 12/20.
Training with parameters:
{
  "booster": "gbtree",
  "colsample_bytree": 0.5,
  "eta": 0.3,
  "max_depth": 3,
  "objective": "binary:logistic",
  "reg_alpha": 0.0005,
  "reg_lambda": 0.0005,
  "seed": 12345,
  "subsample": 0.3,
  "nthread": 16,
  "base_score": 0.22029218334253728,
  "monotone_constraints": {
    "LIMIT_BAL": -1,
    "PAY_0": 1,
    "PAY_2": 1,
    "PAY_3": 1,
    "PAY_4": 1,
    "PAY_5": 1,
    "PAY_6": 1,
    "BILL_AMT1": -1,
    "BILL_AMT2": -1,
    "BILL_AMT3": -1,
    "BILL_AMT4": -1,
    "BILL_AMT5": -1,
    "BILL_AMT6": -1,
    "PAY_AMT1": -1,
    "PAY_AMT2": -1,
    "PAY_AMT3": -1,
    "PAY_AMT4": -1,
    "PAY_AMT5": -1,
    "PAY_AMT6": -1
  },
  "eval_metric": "auc"
}

----------- ------------

Grid search run 13/20.
Training with parameters:
{
  "booster": "gbtree",
  "colsample_bytree": 0.7,
  "eta": 0.05,
  "max_depth": 5,
  "objective": "binary:logistic",
  "reg_alpha": 0.0005,
  "reg_lambda": 0.0005,
  "see

## Overall Rank after Grid Search based on AUC

In [10]:
# one row per grid run, best validation score first; after reset_index the
# 'index' column holds each run's key in `grid`
models_rank = (
    pd.DataFrame.from_dict(grid, orient='index')
    .sort_values(by='score', ascending=False)
    .reset_index()
)
models_rank

Unnamed: 0,index,model,score
0,12,<xgboost.core.Booster object at 0x7f4bfd785c70>,0.778798
1,2,<xgboost.core.Booster object at 0x7f4bfd770460>,0.778757
2,3,<xgboost.core.Booster object at 0x7f4bfd7709a0>,0.778585
3,13,<xgboost.core.Booster object at 0x7f4bfd785610>,0.778562
4,1,<xgboost.core.Booster object at 0x7f4bfe804df0>,0.778538
5,10,<xgboost.core.Booster object at 0x7f4bfd78c0a0>,0.778391
6,0,<xgboost.core.Booster object at 0x7f4bfe804b50>,0.778114
7,8,<xgboost.core.Booster object at 0x7f4bfd785580>,0.777971
8,18,<xgboost.core.Booster object at 0x7f4bfd770df0>,0.777885
9,17,<xgboost.core.Booster object at 0x7f4bfd7811f0>,0.777241


#### Best model grid search params

In [17]:
# the first row of models_rank is the top-scoring run; its 'index' column is
# that run's key in the `grid` dictionary (12 in the run shown here)
best_key = models_rank.loc[0, 'index']
json.loads(grid[best_key]['model'].save_config())

{'learner': {'generic_param': {'fail_on_invalid_gpu_id': '0',
   'gpu_id': '-1',
   'n_jobs': '16',
   'nthread': '16',
   'random_state': '12345',
   'seed': '12345',
   'seed_per_iteration': '0',
   'validate_parameters': '1'},
  'gradient_booster': {'gbtree_train_param': {'num_parallel_tree': '1',
    'predictor': 'auto',
    'process_type': 'default',
    'tree_method': 'exact',
    'updater': 'grow_colmaker,prune',
    'updater_seq': 'grow_colmaker,prune'},
   'name': 'gbtree',
   'specified_updater': False,
   'updater': {'grow_colmaker': {'colmaker_train_param': {'opt_dense_col': '1'},
     'train_param': {'alpha': '0.000500000024',
      'cache_opt': '1',
      'colsample_bylevel': '1',
      'colsample_bynode': '1',
      'colsample_bytree': '0.699999988',
      'default_direction': 'learn',
      'eta': '0.0500000007',
      'gamma': '0',
      'grow_policy': 'depthwise',
      'interaction_constraints': '',
      'lambda': '0.000500000024',
      'learning_rate': '0.05000000