# Prudential Life Insurance Assessment

## Part 3 - XGBoost Baseline Model
___

### Import dependencies and data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
from xgboost import XGBClassifier, plot_importance

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

import warnings

# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated and fails on recent IPython releases.
from IPython.display import display, HTML

# Inject CSS that sets the `.container` element to 80% width
# (presumably the classic-notebook page container -- verify in the target front end).
display(HTML("<style>.container { width:80% !important; }</style>"))

# Remove pandas' truncation limits so every column and row is rendered.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Blanket filter: silences ALL warnings for the rest of the session,
# including deprecation and convergence warnings from the modelling libraries.
warnings.filterwarnings("ignore")

In [2]:
# Read the processed train and test CSV files into DataFrames
main_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/processed/train_processed.csv',
                     'data/processed/test_processed.csv')
)

### Data Preparation

In [3]:
# Split the training frame into the feature matrix (X) and the target column (y)
X = main_df.drop(columns='Response')
y = main_df['Response']

# No hold-out split is made here: cross-validation is run on the entire train set instead
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Wrap the features and labels in XGBoost's native DMatrix structure.
# `xgb` is the `import xgboost as xgb` alias from the imports cell; without it
# this line raises NameError on a fresh kernel.
d_train = xgb.DMatrix(X, label=y)

<IPython.core.display.Javascript object>

### Run XGBoost Classification

In [4]:
# Candidate hyper-parameter values for the XGBoost search
xgb_params = dict(
    max_depth=[3, 5, 6, 10, 15, 20],             # maximum depth of a tree
    learning_rate=[0.01, 0.1, 0.2, 0.3],         # step-size shrinkage applied at each update to limit overfitting
    subsample=np.arange(0.5, 1.0, 0.1),          # fraction of training rows sampled per tree (0.5 .. 0.9)
    colsample_bytree=np.arange(0.4, 1.0, 0.1),   # fraction of features sampled per tree (0.4 .. 0.9)
    colsample_bylevel=np.arange(0.4, 1.0, 0.1),  # fraction of features sampled per tree level (0.4 .. 0.9)
    n_estimators=[100, 500, 1000],               # number of boosting rounds
)

In [9]:
# Create the RandomizedSearchCV instance:
#   - samples 20 parameter combinations from `xgb_params`
#   - scores each with 5-fold cross-validation on macro-averaged F1
#   - random_state fixes which combinations are sampled, so a re-run reproduces the search
# NOTE(review): objective/num_class assume an 8-class target -- confirm the label
# encoding in `y` matches what the installed XGBoost version expects.
xgb_grid = RandomizedSearchCV(
    estimator=XGBClassifier(objective='multi:softmax', num_class=8),
    param_distributions=xgb_params,
    n_iter=20,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,       # use all available cores
    verbose=2,
    random_state=42,
)

In [None]:
# Run the XGBoost randomized hyper-parameter search (not an exhaustive grid search),
# cross-validating on the full X / y training data.
xgb_grid.fit(X, y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
