# End-to-End AutoML for Life Insurance Assessment

## Part 2 - XGBoost Baseline Model
___

### Import dependencies and data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
from xgboost import XGBClassifier, plot_importance

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Import processed datasets
main_df = pd.read_csv('./data/processed/train.csv')
test_df = pd.read_csv('./data/processed/test.csv')

In [3]:
main_df.head()

Unnamed: 0,Gender,Age,Driving_License,Previously_Insured,Annual_Premium,Vintage,Response,Region_Code_1,Region_Code_10,Region_Code_11,Region_Code_12,Region_Code_13,Region_Code_14,Region_Code_15,Region_Code_16,Region_Code_17,Region_Code_18,Region_Code_19,Region_Code_2,Region_Code_20,Region_Code_21,Region_Code_22,Region_Code_23,Region_Code_24,Region_Code_25,Region_Code_26,Region_Code_27,Region_Code_28,Region_Code_29,Region_Code_3,Region_Code_30,Region_Code_31,Region_Code_32,Region_Code_33,Region_Code_34,Region_Code_35,Region_Code_36,Region_Code_37,Region_Code_38,Region_Code_39,Region_Code_4,Region_Code_40,Region_Code_41,Region_Code_42,Region_Code_43,Region_Code_44,Region_Code_45,Region_Code_46,Region_Code_47,Region_Code_48,Region_Code_49,Region_Code_5,Region_Code_50,Region_Code_51,Region_Code_52,Region_Code_6,Region_Code_7,Region_Code_8,Region_Code_9,Vehicle_Age_lt_1Y,Vehicle_Age_gt_2Y,Vehicle_Damage_Yes,Policy_Sales_Channel_10,Policy_Sales_Channel_100,Policy_Sales_Channel_101,Policy_Sales_Channel_102,Policy_Sales_Channel_103,Policy_Sales_Channel_104,Policy_Sales_Channel_105,Policy_Sales_Channel_106,Policy_Sales_Channel_107,Policy_Sales_Channel_108,Policy_Sales_Channel_109,Policy_Sales_Channel_11,Policy_Sales_Channel_110,Policy_Sales_Channel_111,Policy_Sales_Channel_112,Policy_Sales_Channel_113,Policy_Sales_Channel_114,Policy_Sales_Channel_115,Policy_Sales_Channel_116,Policy_Sales_Channel_117,Policy_Sales_Channel_118,Policy_Sales_Channel_119,Policy_Sales_Channel_12,Policy_Sales_Channel_120,Policy_Sales_Channel_121,Policy_Sales_Channel_122,Policy_Sales_Channel_123,Policy_Sales_Channel_124,Policy_Sales_Channel_125,Policy_Sales_Channel_126,Policy_Sales_Channel_127,Policy_Sales_Channel_128,Policy_Sales_Channel_129,Policy_Sales_Channel_13,Policy_Sales_Channel_130,Policy_Sales_Channel_131,Policy_Sales_Channel_132,Policy_Sales_Channel_133,Policy_Sales_Channel_134,Policy_Sales_Channel_135,Policy_Sales_Channel_136,Policy_Sales_Channel_137,Policy_Sales_Channel_138,Policy_Sales_Channel_139,Policy_Sales_Channel_14,Policy_Sales_Channel_140,Policy_Sales_Channel_143,Policy_Sales_Channel_144,Policy_Sales_Channel_145,Policy_Sales_Channel_146,Policy_Sales_Channel_147,Policy_Sales_Channel_148,Policy_Sales_Channel_149,Policy_Sales_Channel_15,Policy_Sales_Channel_150,Policy_Sales_Channel_151,Policy_Sales_Channel_152,Policy_Sales_Channel_153,Policy_Sales_Channel_154,Policy_Sales_Channel_155,Policy_Sales_Channel_156,Policy_Sales_Channel_157,Policy_Sales_Channel_158,Policy_Sales_Channel_159,Policy_Sales_Channel_16,Policy_Sales_Channel_160,Policy_Sales_Channel_163,Policy_Sales_Channel_17,Policy_Sales_Channel_18,Policy_Sales_Channel_19,Policy_Sales_Channel_2,Policy_Sales_Channel_20,Policy_Sales_Channel_21,Policy_Sales_Channel_22,Policy_Sales_Channel_23,Policy_Sales_Channel_24,Policy_Sales_Channel_25,Policy_Sales_Channel_26,Policy_Sales_Channel_27,Policy_Sales_Channel_28,Policy_Sales_Channel_29,Policy_Sales_Channel_3,Policy_Sales_Channel_30,Policy_Sales_Channel_31,Policy_Sales_Channel_32,Policy_Sales_Channel_33,Policy_Sales_Channel_34,Policy_Sales_Channel_35,Policy_Sales_Channel_36,Policy_Sales_Channel_37,Policy_Sales_Channel_38,Policy_Sales_Channel_39,Policy_Sales_Channel_4,Policy_Sales_Channel_40,Policy_Sales_Channel_41,Policy_Sales_Channel_42,Policy_Sales_Channel_43,Policy_Sales_Channel_44,Policy_Sales_Channel_45,Policy_Sales_Channel_46,Policy_Sales_Channel_47,Policy_Sales_Channel_48,Policy_Sales_Channel_49,Policy_Sales_Channel_50,Policy_Sales_Channel_51,Policy_Sales_Channel_52,Policy_Sales_Channel_53,Policy_Sales_Channel_54,Policy_Sales_Channel_55,Policy_Sales_Channel_56,Policy_Sales_Channel_57,Policy_Sales_Channel_58,Policy_Sales_Channel_59,Policy_Sales_Channel_6,Policy_Sales_Channel_60,Policy_Sales_Channel_61,Policy_Sales_Channel_62,Policy_Sales_Channel_63,Policy_Sales_Channel_64,Policy_Sales_Channel_65,Policy_Sales_Channel_66,Policy_Sales_Channel_67,Policy_Sales_Channel_68,Policy_Sales_Channel_69,Policy_Sales_Channel_7,Policy_Sales_Channel_70,Policy_Sales_Channel_71,Policy_Sales_Channel_73,Policy_Sales_Channel_74,Policy_Sales_Channel_75,Policy_Sales_Channel_76,Policy_Sales_Channel_78,Policy_Sales_Channel_79,Policy_Sales_Channel_8,Policy_Sales_Channel_80,Policy_Sales_Channel_81,Policy_Sales_Channel_82,Policy_Sales_Channel_83,Policy_Sales_Channel_84,Policy_Sales_Channel_86,Policy_Sales_Channel_87,Policy_Sales_Channel_88,Policy_Sales_Channel_89,Policy_Sales_Channel_9,Policy_Sales_Channel_90,Policy_Sales_Channel_91,Policy_Sales_Channel_92,Policy_Sales_Channel_93,Policy_Sales_Channel_94,Policy_Sales_Channel_95,Policy_Sales_Channel_96,Policy_Sales_Channel_97,Policy_Sales_Channel_98,Policy_Sales_Channel_99
0,0,0.369231,1,0,0.070366,0.716263,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0.861538,1,0,0.057496,0.598616,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0.415385,1,0,0.066347,0.058824,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0.015385,1,1,0.048348,0.66782,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0.138462,1,1,0.046259,0.100346,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Data Preparation

In [5]:
target_value_counts = main_df['Response'].value_counts(dropna=False)
target_value_counts

0    334399
1     46710
Name: Response, dtype: int64

In [9]:
# Adjust scale_pos_weight for controlling balance of positive and negative weights ( useful for unbalanced classes)
# https://machinelearningmastery.com/xgboost-for-imbalanced-classification/
scale_pos_weight = round(target_value_counts[0] / target_value_counts[1])
scale_pos_weight

7

In [None]:
# Set X and y datasets
X = main_df.drop(columns='Response')
y = main_df['Response']

# Perform train test split (Not doing as we will be using cross-validation on entire train set)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert data to XGB data structure
# d_train = xgb.DMatrix(X, y)

### Run XGBoost Classification

In [None]:
# Set parameter grid (More extensive)
xgb_params = {'max_depth': [3, 5, 6, 10, 12, 14], # Maximum depth of a tree
              'learning_rate': [0.01, 0.1, 0.2, 0.3], # Step size shrinkage used in update to prevents overfitting
              'subsample': np.arange(0.5, 1.0, 0.1),
              'colsample_bytree': np.arange(0.4, 1.0, 0.1), # Number of features supplied to a tree
              'colsample_bylevel': np.arange(0.4, 1.0, 0.1),
              'n_estimators': np.arange(100, 400, 100),
              'gamma': np.arange(0, 0.3, 0.1),
              ''}

# Set parameter grid (Less extensive)
# xgb_params = {
#     "learning_rate": [0.1, 0.01],
#     "colsample_bytree": [0.6, 0.8, 1.0],
#     "subsample": [0.6, 0.8, 1.0],
#     "max_depth": [2, 3, 4],
#     "n_estimators": [100, 200, 300, 400],
#     "reg_lambda": [1, 1.5, 2],
#     "gamma": [0, 0.1, 0.3],
# }

In [None]:
# Create RandomizedSearchCV instance
xgb_grid = RandomizedSearchCV(estimator=XGBClassifier(objective='multi:softmax', 
                                                      num_class=main_df['Response'].nunique(),
                                                      tree_method="gpu_hist", # Use GPU
                                                      random_state=42,
                                                      scale_pos_weight=scale_pos_weight), 
                              param_distributions=xgb_params, 
                              cv=5, 
                              verbose=2,
                              n_iter=20,
                              scoring='f1_macro') 

In [None]:
# Run XGBoost grid search
xgb_grid.fit(X, y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] subsample=0.8999999999999999, n_estimators=100, max_depth=12, learning_rate=0.3, gamma=0.0, colsample_bytree=0.8999999999999999, colsample_bylevel=0.7999999999999999 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  subsample=0.8999999999999999, n_estimators=100, max_depth=12, learning_rate=0.3, gamma=0.0, colsample_bytree=0.8999999999999999, colsample_bylevel=0.7999999999999999, total= 1.1min
[CV] subsample=0.8999999999999999, n_estimators=100, max_depth=12, learning_rate=0.3, gamma=0.0, colsample_bytree=0.8999999999999999, colsample_bylevel=0.7999999999999999 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.1min remaining:    0.0s


[CV]  subsample=0.8999999999999999, n_estimators=100, max_depth=12, learning_rate=0.3, gamma=0.0, colsample_bytree=0.8999999999999999, colsample_bylevel=0.7999999999999999, total= 1.1min
[CV] subsample=0.8999999999999999, n_estimators=100, max_depth=12, learning_rate=0.3, gamma=0.0, colsample_bytree=0.8999999999999999, colsample_bylevel=0.7999999999999999 
[CV]  subsample=0.8999999999999999, n_estimators=100, max_depth=12, learning_rate=0.3, gamma=0.0, colsample_bytree=0.8999999999999999, colsample_bylevel=0.7999999999999999, total= 1.1min
[CV] subsample=0.8999999999999999, n_estimators=100, max_depth=12, learning_rate=0.3, gamma=0.0, colsample_bytree=0.8999999999999999, colsample_bylevel=0.7999999999999999 
[CV]  subsample=0.8999999999999999, n_estimators=100, max_depth=12, learning_rate=0.3, gamma=0.0, colsample_bytree=0.8999999999999999, colsample_bylevel=0.7999999999999999, total= 1.1min
[CV] subsample=0.8999999999999999, n_estimators=100, max_depth=12, learning_rate=0.3, gamma=0.0

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 115.2min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=1, nthread=None, num_class=8,
                                           objective='multi:softmax',
                                           random_state=42, reg_alpha=0,
                                           reg_...
                   param_distributions={'colsample_bylevel': array([0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                                        'colsample_bytree': array([0.4, 0.5, 

In [None]:
# Best parameters from RandomizedSearchCV
xgb_grid.best_params_

{'colsample_bylevel': 0.8999999999999999,
 'colsample_bytree': 0.7999999999999999,
 'gamma': 0.0,
 'learning_rate': 0.1,
 'max_depth': 6,
 'n_estimators': 300,
 'subsample': 0.5}

In [None]:
# Get best XGBoost model (based on best parameters) and predict on test set
xgb_best = xgb_grid.best_estimator_

### Predict on Test Set

In [None]:
# Make predictions on test dataset
X_test = test_df.drop(columns='Response')
y_test = test_df['Response']
y_pred = xgb_best.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

##### Final Score: 0.56279

___
### References
- https://medium.com/analytics-vidhya/using-gpu-to-boost-xgboost-training-time-533a114164d7
- https://xgboost.readthedocs.io/en/release_0.81/gpu/index.html