# Gradient boosting

### Imports

In [None]:
#libraries
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
import urllib as url
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

#custom
import utils
import plots
import transformers as tran

#name of the target column: dropped from the feature frames and used as y in the cells below
LABEL = "1Y_default"
#seed passed to random.seed and to the benchmark model's random_state
SEED = 42

%matplotlib inline  
%load_ext autoreload
%autoreload 2

#seeds Python's built-in random module only; the benchmark model receives SEED via random_state
random.seed(SEED)

### Load data

In [None]:
#read the train and test sets from csvs
#na_filter=False switches off pandas' missing-value detection while parsing
train_df, test_df = (
    pd.read_csv(csv_path, na_filter=False)
    for csv_path in ("train_full.csv", "test_full.csv")
)

In [None]:
#sanity checks: shapes of both sets first
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

#then the missing-value report of each set, train before test
print("Train:")
utils.check_missing(train_df)
print("Test:")
utils.check_missing(test_df)

In [None]:
#numerically encoded categoricals: cast to the pandas "category" dtype in both sets
#each column is listed exactly once ("NrOfDependants" used to appear twice)
#NOTE(review): "MonthlyPayment" looks like a continuous amount - confirm it is meant to be categorical
to_categorical = ["Education", "EmploymentStatus", "Gender", "HomeOwnershipType", "LanguageCode", "MaritalStatus",
                  "NewCreditCustomer", "OccupationArea", "UseOfLoan", "VerificationType", "Country", "NrOfDependants",
                  "WorkExperience", "EmploymentDurationCurrentEmployer", "MonthlyPayment"]
#apply the same cast to train and test so both frames keep matching dtypes
for feature in to_categorical:
    train_df[feature] = train_df[feature].astype("category")
    test_df[feature] = test_df[feature].astype("category")

### First simple benchmark

In [None]:
#model: XGBoost baseline with 100 trees and the notebook-wide seed
gb_ben = XGBClassifier(n_estimators=100, random_state=SEED)
print(gb_ben)

#separate the features from the label column
train_features = train_df.drop([LABEL], axis=1)
test_features = test_df.drop([LABEL], axis=1)
y_train, y_test = train_df[LABEL], test_df[LABEL]

#encode the features (tran.OHE_transformer, presumably one-hot encoding - confirm in transformers)
#the transformer is fitted on train only and then reused unchanged on test
ohe_rf = tran.OHE_transformer()
X_train = ohe_rf.fit_transform(train_features)
X_test = ohe_rf.transform(test_features)

#fit on the underlying arrays
gb_ben.fit(X_train.values, y_train.values)

#predict: keep the second column of predict_proba (class index 1)
train_preds_proba, test_preds_proba = (
    gb_ben.predict_proba(features.values)[:, 1] for features in (X_train, X_test)
)

#results
utils.print_results(train_preds_proba, y_train, test_preds_proba, y_test)
plots.plot_ROC_curve(gb_ben, X_test, y_test)
imps = plots.FeaturesImportanceTree(gb_ben, X_train.columns, figsize=(11, 25), ret_idx=True)

## Hyperparameter tuning

In [None]:
#rough grid: coarse sweep over tree count, learning rate, tree depth and row subsampling
param_grid = dict(
    n_estimators=np.arange(80, 500, 20),
    learning_rate=[0.05, 0.1, 0.2, 0.3],
    max_depth=[2, 3, 4, 5, 6],
    subsample=[0.5, 0.7, 1],
)

#exhaustive search over the grid, scored with utils.GINI_SCORER
#NOTE(review): unlike the benchmark, no random_state is passed here - confirm whether SEED should be used
gb = XGBClassifier()
gs = GridSearchCV(estimator=gb, param_grid=param_grid, scoring=utils.GINI_SCORER, verbose=2)
gs.fit(X_train.values, y_train.values)

### Tuned model

In [None]:
#best estimator found by the rough grid search
gb = gs.best_estimator_
print(gb)

#predict: keep the second column of predict_proba (class index 1)
train_preds_proba, test_preds_proba = (
    gb.predict_proba(features.values)[:, 1] for features in (X_train, X_test)
)

#results
utils.print_results(train_preds_proba, y_train, test_preds_proba, y_test)
plots.plot_ROC_curve(gb, X_test, y_test)
plots.FeaturesImportanceTree(gb, X_train.columns, figsize=(11, 25))

### Fine tuned model

In [None]:
#finer grid around best parameters found so far
param_grid_fine = dict(
    n_estimators=np.arange(180, 310, 10),
    learning_rate=[0.025, 0.05, 0.075],
    #don't want the trees to be much deeper than generally recommended
    max_depth=[5, 6],
    subsample=[0.7],
)

#same search setup as the rough grid: scored with utils.GINI_SCORER
#NOTE(review): unlike the benchmark, no random_state is passed here - confirm whether SEED should be used
gb_fine = XGBClassifier()
gs_fine = GridSearchCV(estimator=gb_fine, param_grid=param_grid_fine, scoring=utils.GINI_SCORER, verbose=2)
gs_fine.fit(X_train.values, y_train.values)

In [None]:
#best estimator found by the fine grid search
gb_fine = gs_fine.best_estimator_
print(gb_fine)

#predict: keep the second column of predict_proba (class index 1)
train_preds_proba, test_preds_proba = (
    gb_fine.predict_proba(features.values)[:, 1] for features in (X_train, X_test)
)

#results
utils.print_results(train_preds_proba, y_train, test_preds_proba, y_test)
plots.plot_ROC_curve(gb_fine, X_test, y_test)
plots.FeaturesImportanceTree(gb_fine, X_train.columns, figsize=(11, 25))

### Model comparison

In [None]:
#refit tuned
#best models found in previous iterations, rebuilt from hard-coded hyperparameters
#so this cell does not require re-running the grid searches
#the deprecated aliases are left out: nthread and seed are superseded by n_jobs and
#random_state (both set below), silent is superseded by verbosity; missing=None was a no-op
shared_params = dict(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                     colsample_bytree=1, gamma=0, max_delta_step=0,
                     max_depth=6, min_child_weight=1, n_jobs=1,
                     objective='binary:logistic', random_state=0,
                     reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                     subsample=0.7)

#the two models differ only in learning rate and number of trees
gb = XGBClassifier(learning_rate=0.05, n_estimators=220, **shared_params)
gb_fine = XGBClassifier(learning_rate=0.025, n_estimators=300, **shared_params)

gb.fit(X_train.values, y_train.values)
gb_fine.fit(X_train.values, y_train.values)

#compare benchmark, tuned and fine tuned models on the test set
models = [gb_ben, gb, gb_fine]
names = ["GB benchmark", "GB tuned", "GB fine tuned"]

plots.plot_ROC_multiple(y_test, X_test, names, models)