# Extremely randomized trees

### Imports

In [None]:
#libraries
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
import urllib as url
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV

#custom
import utils
import config
import plots
import transformers as tran

LABEL = "1Y_default"
SEED = 42

%matplotlib inline  
%load_ext autoreload
%autoreload 2

random.seed(SEED)

### Load data

In [None]:
# Load the train and test sets from CSV. na_filter=False turns off pandas'
# missing-value detection, so fields are not converted to NaN on read.
train_df, test_df = (
    pd.read_csv(csv_path, na_filter=False)
    for csv_path in ("train_full.csv", "test_full.csv")
)

In [None]:
# Sanity checks: print the shape of both frames first, then run the
# missing-value check on each of them.
labelled_frames = (("Train", train_df), ("Test", test_df))
for set_name, frame in labelled_frames:
    print(f"{set_name} shape: {frame.shape}")
for set_name, frame in labelled_frames:
    print(f"{set_name}:")
    utils.check_missing(frame)

In [None]:
# Categorical features that are stored as numeric codes; they are cast to the
# pandas "category" dtype in both frames. Each column is listed exactly once
# (the list previously contained "NrOfDependants" twice).
to_categorical = [
    "Education", "EmploymentStatus", "Gender", "HomeOwnershipType",
    "LanguageCode", "MaritalStatus", "NewCreditCustomer", "OccupationArea",
    "UseOfLoan", "VerificationType", "Country", "NrOfDependants",
    "WorkExperience", "EmploymentDurationCurrentEmployer", "MonthlyPayment",
]
for frame in (train_df, test_df):
    for feature in to_categorical:
        frame[feature] = frame[feature].astype("category")

### First simple benchmark

In [None]:
# Benchmark: ExtraTrees with 100 trees and a fixed seed, all other settings
# left at the library defaults.
et_ben = ExtraTreesClassifier(random_state=SEED, n_estimators=100)
print(et_ben)

# Encode the features with the project's OHE_transformer: fit_transform is
# called on the training frame, transform on the test frame. The label column
# is split off as the target.
ohe_rf = tran.OHE_transformer()
train_features = train_df.drop([LABEL], axis=1)
test_features = test_df.drop([LABEL], axis=1)
X_train, y_train = ohe_rf.fit_transform(train_features), train_df[LABEL]
X_test, y_test = ohe_rf.transform(test_features), test_df[LABEL]

# Fit on the training set.
et_ben.fit(X_train, y_train)

# Keep column 1 of predict_proba (the second class in et_ben.classes_).
train_preds_proba = et_ben.predict_proba(X_train)[:, 1]
test_preds_proba = et_ben.predict_proba(X_test)[:, 1]

# Report results, then plot the ROC curve and the feature importances.
utils.print_results(train_preds_proba, y_train, test_preds_proba, y_test)
plots.plot_ROC_curve(et_ben, X_test, y_test)
imps = plots.FeaturesImportanceTree(
    et_ben, X_train.columns, figsize=(11, 25), ret_idx=True
)

## Hyperparameter tuning

In [None]:
# Rough grid: number of trees from 20 to 300 in steps of 20, crossed with the
# two max_features heuristics.
param_grid = {"n_estimators": np.arange(20, 320, 20),
              "max_features": ["sqrt", "log2"]}

# Fix the estimator's seed so that score differences between grid points come
# from the hyper-parameters and the search is reproducible across runs.
et = ExtraTreesClassifier(random_state=SEED)
gs = GridSearchCV(et, param_grid, scoring=utils.GINI_SCORER, verbose=2)
gs.fit(X_train, y_train)

### Tuned model

In [None]:
# The settings below are hard-coded instead of being read from
# gs.best_estimator_ (presumably copied from an earlier grid-search run --
# confirm), so this cell does not require the search above to have been run.
# min_impurity_split is deliberately not passed: it was deprecated in
# scikit-learn 0.19 and removed in 1.0, where passing it raises a TypeError.
# random_state=SEED keeps the fit reproducible.
et = ExtraTreesClassifier(
    bootstrap=False, class_weight=None, criterion='gini',
    max_depth=None, max_features='log2', max_leaf_nodes=None,
    min_samples_leaf=1, min_samples_split=2,
    min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
    oob_score=False, random_state=SEED, verbose=0, warm_start=False,
)

et.fit(X_train, y_train)

# Keep column 1 of predict_proba (the second class in et.classes_).
train_preds_proba = et.predict_proba(X_train)[:, 1]
test_preds_proba = et.predict_proba(X_test)[:, 1]

# Report results, then plot the ROC curve and the feature importances.
utils.print_results(train_preds_proba, y_train, test_preds_proba, y_test)
plots.plot_ROC_curve(et, X_test, y_test)
plots.FeaturesImportanceTree(et, X_train.columns, figsize=(11, 25))

### Tune depth

In [None]:
# Depth "grid": only max_depth is searched (None = no depth limit); every
# other setting is held fixed on the base estimator below.
param_grid = {"max_depth": [5, 10, 12, 15, 18, 20, None]}

# min_impurity_split is deliberately not passed: it was deprecated in
# scikit-learn 0.19 and removed in 1.0, where passing it raises a TypeError.
# random_state=SEED makes the candidates comparable and the search
# reproducible. The max_depth given here is overridden by the grid.
et_depth = ExtraTreesClassifier(
    bootstrap=False, class_weight=None, criterion='gini',
    max_depth=None, max_features='log2', max_leaf_nodes=None,
    min_samples_leaf=1, min_samples_split=2,
    min_weight_fraction_leaf=0.0, n_estimators=280, n_jobs=1,
    oob_score=False, random_state=SEED, verbose=0, warm_start=False,
)

gs = GridSearchCV(et_depth, param_grid, scoring=utils.GINI_SCORER, verbose=2)
gs.fit(X_train, y_train)

In [None]:
# The depth search's GridSearchCV is built with the default refit=True, so
# best_estimator_ is already fitted on the full (X_train, y_train); fitting
# it a second time would only repeat the work.
et_depth = gs.best_estimator_
print(et_depth)

# Keep column 1 of predict_proba (the second class in et_depth.classes_).
train_preds_proba = et_depth.predict_proba(X_train)[:, 1]
test_preds_proba = et_depth.predict_proba(X_test)[:, 1]

# Report results, then plot the ROC curve and the feature importances.
utils.print_results(train_preds_proba, y_train, test_preds_proba, y_test)
plots.plot_ROC_curve(et_depth, X_test, y_test)
plots.FeaturesImportanceTree(et_depth, X_train.columns, figsize=(11, 25))

### Model comparison

In [None]:
# Pair each fitted model with its display name, then plot all ROC curves on
# the test set in a single figure.
labelled_models = [("ET benchmark", et_ben), ("ET tuned", et)]
names = [label for label, _ in labelled_models]
models = [model for _, model in labelled_models]

plots.plot_ROC_multiple(y_test, X_test, names, models)