# AdaBoost

### Imports

In [None]:
#libraries
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
import urllib as url
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

#custom
import utils
import config
import plots
import transformers as tran

# Name of the target column in the train/test frames.
LABEL = "1Y_default"
# Seed used for RNG seeding and for estimator random_state.
SEED = 42

# Render matplotlib figures inline in the notebook output.
%matplotlib inline  
# Load the autoreload extension and reload all modules before each execution,
# so edits to the local helper modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

# Seed both Python's and NumPy's global RNGs. scikit-learn estimators created
# without an explicit random_state draw from NumPy's global generator, which
# random.seed() alone does not control.
random.seed(SEED)
np.random.seed(SEED)

### Load data

In [None]:
# Read the train and test sets from CSV files in the working directory.
# na_filter=False turns off pandas' missing-value detection, so empty fields
# are kept as-is instead of being converted to NaN.
train_df, test_df = (
    pd.read_csv(csv_name, na_filter=False)
    for csv_name in ("train_full.csv", "test_full.csv")
)

In [None]:
# Sanity checks: report the dimensions of both frames first ...
print("Train shape: %s" % (train_df.shape,))
print("Test shape: %s" % (test_df.shape,))

# ... then run the project's missing-value check on each of them.
print("Train:")
utils.check_missing(train_df)

print("Test:")
utils.check_missing(test_df)

In [None]:
#numerically encoded categoricals
# Columns cast to pandas' "category" dtype in both frames.
# Each column is listed exactly once ("NrOfDependants" used to appear twice,
# which only repeated the same cast).
# NOTE(review): "MonthlyPayment" looks like a continuous amount rather than a
# category - confirm it really belongs in this list.
to_categorical = [
    "Education", "EmploymentStatus", "Gender", "HomeOwnershipType",
    "LanguageCode", "MaritalStatus", "NewCreditCustomer", "OccupationArea",
    "UseOfLoan", "VerificationType", "Country", "NrOfDependants",
    "WorkExperience", "EmploymentDurationCurrentEmployer", "MonthlyPayment",
]
# NOTE(review): categories are inferred separately for each frame, so train and
# test may end up with different category sets; presumably the downstream
# encoder copes with that - verify.
for feature in to_categorical:
    train_df[feature] = train_df[feature].astype("category")
    test_df[feature] = test_df[feature].astype("category")

### First simple benchmark

In [None]:
# Benchmark model: AdaBoost with 100 estimators and a fixed seed.
ab_ben = AdaBoostClassifier(n_estimators=100, random_state=SEED)
print(ab_ben)

# Targets are the LABEL column of each frame.
y_train = train_df[LABEL]
y_test = test_df[LABEL]

# Features are every other column, passed through the project's OHE transformer.
# The transformer is fitted on the training frame only and then applied to the
# test frame.
ohe_rf = tran.OHE_transformer()
X_train = ohe_rf.fit_transform(train_df.drop([LABEL], axis=1))
X_test = ohe_rf.transform(test_df.drop([LABEL], axis=1))

# Fit on the training split.
ab_ben.fit(X_train, y_train)

# Keep column 1 of predict_proba for each split
# (presumably the positive/default class - confirm against ab_ben.classes_).
train_preds_proba, test_preds_proba = (
    ab_ben.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

# Report results, draw the test ROC curve and the feature-importance plot.
utils.print_results(train_preds_proba, y_train, test_preds_proba, y_test)
plots.plot_ROC_curve(ab_ben, X_test, y_test)
imps = plots.FeaturesImportanceTree(
    ab_ben, X_train.columns, figsize=(11, 25), ret_idx=True
)

## Hyperparameter tuning

In [None]:
#rough grid
# Coarse search: n_estimators from 60 to 500 in steps of 20, crossed with four
# learning rates (92 candidate settings in total).
param_grid = {"n_estimators": np.arange(60, 520, 20),
              "learning_rate": [0.2, 0.5, 0.7, 1.0]}

# Pass the shared seed to the base estimator so every candidate is fitted with
# the same random_state and the search gives the same result on each run.
ab = AdaBoostClassifier(random_state=SEED)
# Candidates are ranked with the project's scorer, utils.GINI_SCORER.
gs = GridSearchCV(ab, param_grid, scoring=utils.GINI_SCORER, verbose=2)
gs.fit(X_train, y_train)

### Tuned model

In [None]:
# Take the best estimator found by the rough grid search.
ab = gs.best_estimator_
print(ab)

# Keep column 1 of predict_proba for each split
# (presumably the positive/default class - confirm against ab.classes_).
train_preds_proba, test_preds_proba = (
    ab.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

# Report results, draw the test ROC curve and the feature-importance plot.
utils.print_results(train_preds_proba, y_train, test_preds_proba, y_test)
plots.plot_ROC_curve(ab, X_test, y_test)
plots.FeaturesImportanceTree(ab, X_train.columns, figsize=(11, 25))

### Fine tuned model

In [None]:
#finer grid around best parameters found so far
# n_estimators from 470 to 590 in steps of 10, crossed with five learning rates
# between 0.1 and 0.3 (65 candidate settings in total).
param_grid_fine = {"n_estimators": np.arange(470, 600, 10),
                   "learning_rate": [0.1, 0.15, 0.2, 0.25, 0.30]}

# Pass the shared seed to the base estimator so every candidate is fitted with
# the same random_state and the search gives the same result on each run.
ab_fine = AdaBoostClassifier(random_state=SEED)
# Candidates are ranked with the project's scorer, utils.GINI_SCORER.
gs_fine = GridSearchCV(ab_fine, param_grid_fine, scoring=utils.GINI_SCORER, verbose=2)
gs_fine.fit(X_train, y_train)

In [None]:
# Take the best estimator found by the fine grid search.
ab_fine = gs_fine.best_estimator_
print(ab_fine)

# Keep column 1 of predict_proba for each split
# (presumably the positive/default class - confirm against ab_fine.classes_).
train_preds_proba, test_preds_proba = (
    ab_fine.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

# Report results, draw the test ROC curve and the feature-importance plot.
utils.print_results(train_preds_proba, y_train, test_preds_proba, y_test)
plots.plot_ROC_curve(ab_fine, X_test, y_test)
plots.FeaturesImportanceTree(ab_fine, X_train.columns, figsize=(11, 25))

### Model comparison

In [None]:
# Pair each display name with its fitted model so the two lists cannot drift
# out of alignment, then split them back into the parallel lists that the
# plotting helper takes.
compared = [("AB benchmark", ab_ben), ("AB fine", ab_fine)]
names = [display_name for display_name, _ in compared]
models = [fitted_model for _, fitted_model in compared]

# Draw the ROC curves of all compared models on the test split.
plots.plot_ROC_multiple(y_test, X_test, names, models)