### In this notebook:
I train an XGBoost model with optimized hyperparameters (see notebook `xgb_random_search`) on the whole feature matrix and make predictions on the test dataset. The feature matrix contains only the variables kept after feature selection (see notebook `feature_selection`).

### About project:
The data is from the Kaggle competition [Home Credit Default Risk](https://www.kaggle.com/c/home-credit-default-risk). 

I implement an automated feature engineering approach with an open-source library [Featuretools](https://www.featuretools.com/). 


In [10]:
import xgboost as xgb 

import numpy as np
import pandas as pd
import time
import pickle

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
import gc

In [62]:
from sklearn.model_selection import train_test_split

In [49]:
# Import data

In [None]:
# Load the first three chunks of the selected-feature matrix.
dtrain1, dtrain2, dtrain3 = map(
    pd.read_csv,
    (
        '../data/feature_matrix1_selected.csv',
        '../data/feature_matrix2_selected.csv',
        '../data/feature_matrix3_selected.csv',
    ),
)

In [17]:
# The fourth chunk is split on TARGET: rows with a value are used for training.
feature_matrix4_selected = pd.read_csv('../data/feature_matrix4_selected.csv')

dtrain4 = feature_matrix4_selected.loc[feature_matrix4_selected['TARGET'].notna()]

In [None]:
# Rows whose TARGET is missing form the test set.
# The TARGET *column* is removed explicitly: a bare `drop('TARGET')` defaults to
# axis=0 and looks for a row label instead. The result is reassigned rather than
# mutated in place, so this cell is idempotent and never modifies a slice of
# `feature_matrix4_selected`.
dtest = (
    feature_matrix4_selected
    .loc[feature_matrix4_selected['TARGET'].isnull()]
    .drop(columns='TARGET')
)

In [43]:
# Stack the four training chunks row-wise into one training frame.
dtrain = pd.concat(
    [
        dtrain1,
        dtrain2,
        dtrain3,
        dtrain4,
    ],
    axis=0,
)

In [44]:
# Sanity check: (rows, columns) of the combined training frame.
dtrain.shape

(134611, 1038)

In [48]:
# Sanity check: (rows, columns) of the test frame.
dtest.shape

(48744, 1037)

In [60]:
# Columns that must not be fed to the model: the row identifier and the label.
id_col = 'SK_ID_CURR'
target_col = 'TARGET'

# Every remaining column is a model input; the original column order is kept.
predictors = [
    col
    for col in dtrain.columns
    if col != id_col and col != target_col
]

In [56]:
# Load the hyperparameter dict saved as a pickle.
# NOTE: pickle.load executes arbitrary code from the file -- only load files you produced yourself.
with open('../data/best_params.pkl', 'rb') as params_file:
    best_params = pickle.load(params_file)

In [58]:
# Show the hyperparameters loaded from disk.
best_params

{'subsample': 0.89,
 'reg_lambda': 0.12244897959183673,
 'reg_alpha': 0.1020408163265306,
 'n_estimators': 700,
 'min_child_weight': 3,
 'max_depth': 5,
 'learning_rate': 0.027271356526649162,
 'gamma': 0.32,
 'colsample_bytree': 0.72,
 'colsample_bylevel': 0.6799999999999999}

In [None]:
# Replace the long float values from the search with rounded ones.
best_params.update({
    'reg_lambda': 0.12,
    'reg_alpha': 0.1,
    'learning_rate': 0.027,
    'colsample_bylevel': 0.68,
})

#### Model

In [65]:
# Build the classifier from the hyperparameter dict.
# NOTE(review): `silent` has been superseded by `verbosity` in newer xgboost releases -- confirm against the installed version.
clf = xgb.XGBClassifier(
    **best_params,
    silent=False,
)

In [75]:
# Fit on the full training frame. The eval set is the training data itself,
# so the AUC logged per boosting round is a training score, not a validation score.
clf.fit(
    dtrain[predictors],
    dtrain[target_col],
    eval_set=[(dtrain[predictors], dtrain[target_col])],
    eval_metric='auc',
    verbose=True,
)

In [None]:
# Persist the fitted classifier next to the other project data.
# The path needs the slash after `..` (the original '..data/...' pointed at a
# non-existent directory), and the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open('../data/clf.pickle.dat', 'wb') as model_file:
    pickle.dump(clf, model_file)

#### Evaluation

In [11]:
from sklearn.metrics import roc_auc_score

In [None]:
# Retrieve the evaluation history recorded for the eval_set during fitting.
evals_result = clf.evals_result()

In [None]:
# Display the recorded evaluation history.
# NOTE(review): this prints the full history; consider showing only the final values.
evals_result

In [None]:
# Train evaluation: predicted class probabilities for the training rows.
# NOTE(review): predict_proba conventionally returns one column per class -- confirm the shape before scoring.
train_pred = clf.predict_proba(dtrain[predictors])

In [None]:
# Training ROC AUC. For a binary target, roc_auc_score expects a 1-D score per
# sample, so only the positive-class column (index 1) of the predict_proba
# output is passed; the full two-column array is rejected.
roc_auc_score(dtrain[target_col], train_pred[:, 1])

In [None]:
# Predict on the test rows using exactly the columns the model was fitted on.
# `dtest` still carries the SK_ID_CURR identifier, which is not a predictor and
# would make the feature set differ from the training one.
# The full predict_proba output is kept; presumably column 1 is the
# positive-class probability -- verify against clf.classes_.
predictions = clf.predict_proba(dtest[predictors])