# End-to-End AutoML for Life Insurance Assessment

## Part 2 - XGBoost Baseline Model
___

### Import dependencies and data

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
from xgboost import XGBClassifier, plot_importance

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

# Widen the notebook container so wide DataFrames are easier to read.
# display/HTML come from the public IPython.display module; importing them
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

# Show every column and every row when rendering DataFrames
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Silence all warnings for cleaner cell output
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the processed train and test CSV files into DataFrames
main_df, test_df = map(pd.read_csv, ('./data/processed/train.csv',
                                     './data/processed/test.csv'))

In [None]:
# Preview the first rows of the processed training data
main_df.head()

Unnamed: 0,Product_Info_1,Product_Info_2,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Ins_Age,Ht,Wt,BMI,Employment_Info_1,Employment_Info_2,Employment_Info_3,Employment_Info_4,Employment_Info_5,Employment_Info_6,InsuredInfo_1,InsuredInfo_2,InsuredInfo_3,InsuredInfo_4,InsuredInfo_5,InsuredInfo_6,InsuredInfo_7,Insurance_History_1,Insurance_History_2,Insurance_History_3,Insurance_History_4,Insurance_History_5,Insurance_History_7,Insurance_History_8,Insurance_History_9,Family_Hist_1,Family_Hist_2,Family_Hist_4,Medical_History_1,Medical_History_2,Medical_History_3,Medical_History_4,Medical_History_5,Medical_History_6,Medical_History_7,Medical_History_8,Medical_History_9,Medical_History_11,Medical_History_12,Medical_History_13,Medical_History_14,Medical_History_16,Medical_History_17,Medical_History_18,Medical_History_19,Medical_History_20,Medical_History_21,Medical_History_22,Medical_History_23,Medical_History_25,Medical_History_26,Medical_History_27,Medical_History_28,Medical_History_29,Medical_History_30,Medical_History_31,Medical_History_33,Medical_History_34,Medical_History_35,Medical_History_36,Medical_History_37,Medical_History_38,Medical_History_39,Medical_History_40,Medical_History_41,Medical_Keyword_1,Medical_Keyword_2,Medical_Keyword_3,Medical_Keyword_4,Medical_Keyword_5,Medical_Keyword_6,Medical_Keyword_7,Medical_Keyword_8,Medical_Keyword_9,Medical_Keyword_10,Medical_Keyword_11,Medical_Keyword_12,Medical_Keyword_13,Medical_Keyword_14,Medical_Keyword_15,Medical_Keyword_16,Medical_Keyword_17,Medical_Keyword_18,Medical_Keyword_19,Medical_Keyword_20,Medical_Keyword_21,Medical_Keyword_22,Medical_Keyword_23,Medical_Keyword_24,Medical_Keyword_25,Medical_Keyword_26,Medical_Keyword_27,Medical_Keyword_28,Medical_Keyword_29,Medical_Keyword_30,Medical_Keyword_31,Medical_Keyword_32,Medical_Keyword_33,Medical_Keyword_34,Medical_Keyword_35,Medical_Keyword_36,Medical_Keyword_37,Medical_Keyword_38,Medical_Keyword_39,Medical_Keyword_40,Medical_Keywo
rd_41,Medical_Keyword_42,Medical_Keyword_43,Medical_Keyword_44,Medical_Keyword_45,Medical_Keyword_46,Medical_Keyword_47,Medical_Keyword_48,Response,Product_Info_2_Alpha,Product_Info_2_Num,Med_Keywords_Count
0,1,16,10,0.076923,2,1,1,0.641791,0.581818,0.148536,0.323008,0.028,12,1,0.0,3,,1,2,6,3,1,2,1,1,1,3,1,0.000667,1,1,2,2,,,4.0,112,2,1,1,3,2,2,1,3,2,3,3,3,3,1,1,2,1,2,3,1,3,3,1,3,2,3,1,3,1,2,2,1,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,3,3,0
1,1,0,26,0.076923,2,3,1,0.059701,0.6,0.131799,0.272288,0.0,1,3,0.0,2,0.0018,1,2,6,3,1,2,1,2,1,3,1,0.000133,1,3,2,2,0.188406,0.084507,5.0,412,2,1,1,3,2,2,1,3,2,3,3,1,3,1,1,2,1,2,3,1,3,3,1,3,2,3,3,1,1,2,2,1,3,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,1,0
2,1,18,26,0.076923,2,3,1,0.029851,0.745455,0.288703,0.42878,0.03,9,1,0.0,2,0.03,1,2,8,3,1,1,1,2,1,1,3,,3,2,3,3,0.304348,0.225352,10.0,3,2,2,1,3,2,2,2,3,2,3,3,1,3,1,1,2,1,2,3,2,2,3,1,3,2,3,3,3,1,3,2,1,3,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,4,1,0
3,1,17,10,0.487179,2,3,1,0.164179,0.672727,0.205021,0.352438,0.042,9,1,0.0,3,0.2,2,2,8,3,1,2,1,2,1,1,3,,3,2,3,3,0.42029,0.352113,0.0,350,2,2,1,3,2,2,2,3,2,3,3,1,3,1,1,2,2,2,3,1,3,3,1,3,2,3,3,3,1,2,2,1,3,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,3,4,1
4,1,15,26,0.230769,2,3,1,0.41791,0.654545,0.23431,0.424046,0.027,9,1,0.0,2,0.05,1,2,6,3,1,2,1,2,1,1,3,,3,2,3,2,0.463768,0.408451,,162,2,2,1,3,2,2,2,3,2,3,3,1,3,1,1,2,1,2,3,2,2,3,1,3,2,3,3,3,1,3,2,1,3,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,3,2,0


### Data Preparation

In [None]:
# Count rows per Response value (dropna=False also counts missing labels)
main_df['Response'].value_counts(dropna=False)

8    19489
6    11233
7     8027
2     6552
1     6207
5     5432
4     1428
3     1013
Name: Response, dtype: int64

In [None]:
# Separate the target column from the feature columns
y = main_df['Response']
X = main_df.drop('Response', axis=1)

# No hold-out split is made here: cross-validation runs on the entire train set instead
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optional conversion to the native XGBoost data structure (unused)
# d_train = xgb.DMatrix(X, y)

### Run XGBoost Classification

In [None]:
# Set parameter grid (More extensive)
# Candidate values are written out explicitly: building float grids with
# np.arange(start, stop, 0.1) accumulates rounding error (e.g. 0.8999999999999999
# instead of 0.9), which then shows up in the search logs and in best_params_.
xgb_params = {'max_depth': [3, 5, 6, 10, 12, 14], # Maximum depth of a tree
              'learning_rate': [0.01, 0.1, 0.2, 0.3], # Step size shrinkage used in updates to prevent overfitting
              'subsample': [0.5, 0.6, 0.7, 0.8, 0.9], # Fraction of training rows sampled for each tree
              'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9], # Fraction of features sampled for each tree
              'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9], # Fraction of features sampled for each tree level
              'n_estimators': [100, 200, 300], # Number of boosting rounds
              'gamma': [0.0, 0.1, 0.2]} # Minimum loss reduction required to make a further split

# Set parameter grid (Less extensive)
# xgb_params = {
#     "learning_rate": [0.1, 0.01],
#     "colsample_bytree": [0.6, 0.8, 1.0],
#     "subsample": [0.6, 0.8, 1.0],
#     "max_depth": [2, 3, 4],
#     "n_estimators": [100, 200, 300, 400],
#     "reg_lambda": [1, 1.5, 2],
#     "gamma": [0, 0.1, 0.3],
# }

In [None]:
# Create RandomizedSearchCV instance
# - Response has 8 distinct classes, so a multi-class objective is declared
#   instead of the misleading 'binary:logistic'.
# - random_state on the search itself fixes which 20 parameter combinations
#   are sampled, so re-running the notebook evaluates the same candidates.
# NOTE(review): tree_method="gpu_hist" needs a CUDA-enabled XGBoost build;
# confirm the spelling is still accepted by the installed XGBoost version.
xgb_grid = RandomizedSearchCV(estimator=XGBClassifier(objective='multi:softprob',
                                                      tree_method="gpu_hist", # Use GPU
                                                      random_state=42),
                              param_distributions=xgb_params,
                              cv=5,
                              verbose=2,
                              n_iter=20,
                              scoring='f1_macro',
                              random_state=42)

In [None]:
# Run the randomized hyperparameter search (with cross-validation) on the full training data
# NOTE(review): Response labels run from 1 to 8; newer XGBoost releases reportedly
# expect class labels starting at 0 - confirm against the installed version before re-running.
xgb_grid.fit(X, y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] subsample=0.8999999999999999, n_estimators=100, max_depth=12, learning_rate=0.3, gamma=0.0, colsample_bytree=0.8999999999999999, colsample_bylevel=0.7999999999999999 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  subsample=0.8999999999999999, n_estimators=100, max_depth=12, learning_rate=0.3, gamma=0.0, colsample_bytree=0.8999999999999999, colsample_bylevel=0.7999999999999999, total= 1.1min
[CV] subsample=0.8999999999999999, n_estimators=100, max_depth=12, learning_rate=0.3, gamma=0.0, colsample_bytree=0.8999999999999999, colsample_bylevel=0.7999999999999999 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.1min remaining:    0.0s


[CV]  subsample=0.8999999999999999, n_estimators=100, max_depth=12, learning_rate=0.3, gamma=0.0, colsample_bytree=0.8999999999999999, colsample_bylevel=0.7999999999999999, total= 1.1min
[CV] subsample=0.8999999999999999, n_estimators=100, max_depth=12, learning_rate=0.3, gamma=0.0, colsample_bytree=0.8999999999999999, colsample_bylevel=0.7999999999999999 
[CV]  subsample=0.8999999999999999, n_estimators=100, max_depth=12, learning_rate=0.3, gamma=0.0, colsample_bytree=0.8999999999999999, colsample_bylevel=0.7999999999999999, total= 1.1min
[CV] subsample=0.8999999999999999, n_estimators=100, max_depth=12, learning_rate=0.3, gamma=0.0, colsample_bytree=0.8999999999999999, colsample_bylevel=0.7999999999999999 
[CV]  subsample=0.8999999999999999, n_estimators=100, max_depth=12, learning_rate=0.3, gamma=0.0, colsample_bytree=0.8999999999999999, colsample_bylevel=0.7999999999999999, total= 1.1min
[CV] subsample=0.8999999999999999, n_estimators=100, max_depth=12, learning_rate=0.3, gamma=0.0

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 115.2min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=1, nthread=None, num_class=8,
                                           objective='multi:softmax',
                                           random_state=42, reg_alpha=0,
                                           reg_...
                   param_distributions={'colsample_bylevel': array([0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                                        'colsample_bytree': array([0.4, 0.5, 

In [None]:
# Parameter combination that achieved the best cross-validated score in the randomized search
xgb_grid.best_params_

{'colsample_bylevel': 0.8999999999999999,
 'colsample_bytree': 0.7999999999999999,
 'gamma': 0.0,
 'learning_rate': 0.1,
 'max_depth': 6,
 'n_estimators': 300,
 'subsample': 0.5}

In [None]:
# Get the best XGBoost model found by the search (based on best parameters);
# it is used for the test-set predictions in the next section
xgb_best = xgb_grid.best_estimator_

### Predict on Test Set

In [None]:
# Drop the Id column from the test data, then predict with the best model
X_test = test_df.drop(columns=["Id"])
preds = xgb_best.predict(X_test)

In [None]:
# Build the submission table: one row per test Id with its predicted Response
submission_columns = {'Id': test_df['Id'].values, 'Response': preds}
submission = pd.DataFrame(submission_columns)
submission.head()

Unnamed: 0,Id,Response
0,1,2
1,3,8
2,4,6
3,9,8
4,12,4


In [None]:
# Save as CSV for submission online on Kaggle.
# The output folder is created first so a fresh checkout without a
# submissions/ directory does not fail on write.
submission_path = Path('submissions/prudential_xgboost.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)

##### Final Score: 0.56279

___
### References
- https://medium.com/analytics-vidhya/using-gpu-to-boost-xgboost-training-time-533a114164d7
- https://xgboost.readthedocs.io/en/release_0.81/gpu/index.html