# XG boost (eXtreme gradient boosting) Regressor Algorithm - insurance_charge_prediction
# using Grid Search CV 
## MODEL CREATION PHASE

## read the dataset

In [1]:
import pandas as pd 
dataset = pd.read_csv("insurance_pre.csv")
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


## convert categorical data into numerical data

In [2]:
dataset = pd.get_dummies(dataset,drop_first=True,dtype=int)
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


## split Input and Output

In [3]:
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [4]:
independent = dataset[['age', 'bmi', 'children',  'sex_male', 'smoker_yes' ]]
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [5]:
dependent = dataset[['charges']]
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


## split train and test set 


In [6]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(independent, dependent, test_size=0.20, random_state=0)

In [7]:
x_test

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
578,52,30.200,1,1,0
610,47,29.370,1,0,0
569,48,40.565,2,1,1
1034,61,38.380,0,1,0
198,51,18.050,0,0,0
...,...,...,...,...,...
1084,62,30.495,2,0,0
726,41,28.405,1,1,0
1132,57,40.280,0,1,0
725,30,39.050,3,0,1


In [8]:
y_test

Unnamed: 0,charges
578,9724.53000
610,8547.69130
569,45702.02235
1034,12950.07120
198,9644.25250
...,...
1084,15019.76005
726,6664.68595
1132,20709.02034
725,40932.42950


## model creation 

In [9]:
import xgboost as xgb 

In [10]:
# """ 
# hyper parameters :  n_estimators, learning_rate, max_depth, subsample, colsample_bytree
# """

# regressor = xgb.XGBRegressor(
#     objective='reg:squarederror',
#     eval_metric='rmse',
#     random_state=0,
#     n_estimators=60,
#     learning_rate=0.1,
#     max_depth=3,
#     subsample=0.9,
#     colsample_bytree=0.8
# )
# regressor.fit(x_train, y_train)


from sklearn.model_selection import GridSearchCV

param_grid = {
  "n_estimators":[50,100,150,200],
    "learning_rate":[1.0,0.5,0.1,0.01,0.03,0.05],
    "subsample":[0.5,0.6,0.7,0.9],
    "colsample_bytree":[0.5,0.6,0.7,0.9]
}

model=xgb.XGBRegressor()

grid_model = GridSearchCV(model, param_grid, refit=True, verbose=3, n_jobs=-1)

grid_model.fit(x_train,y_train)

Fitting 5 folds for each of 384 candidates, totalling 1920 fits


## Grid Search CV results

In [11]:
grid_result = grid_model.cv_results_
grid_result

{'mean_fit_time': array([0.07504916, 0.07343559, 0.08249364, 0.07589903, 0.12282414,
        0.13136587, 0.12048659, 0.12197967, 0.19600124, 0.17079625,
        0.17252202, 0.22029467, 0.45155396, 0.29682407, 0.33666945,
        0.33268404, 0.08817348, 0.10334845, 0.09503369, 0.23533573,
        0.16617665, 0.20912271, 0.2267602 , 0.1860981 , 0.34812551,
        0.23182631, 0.30792184, 0.2030941 , 0.30795803, 0.38232522,
        0.31910744, 0.35833874, 0.11770015, 0.10808568, 0.1034761 ,
        0.11744065, 0.16575522, 0.23601174, 0.16759281, 0.18212905,
        0.28397069, 0.27546668, 0.26701145, 0.24088531, 0.2811296 ,
        0.30059233, 0.30227876, 0.28714061, 0.09207807, 0.09975791,
        0.09524055, 0.09124107, 0.16013179, 0.51211305, 0.18614097,
        0.16453042, 0.25517726, 0.41398716, 0.37186136, 0.36443281,
        0.51925073, 0.61066699, 0.29840703, 0.43090792, 0.0981154 ,
        0.08894448, 0.09620652, 0.10941911, 0.19763126, 0.16926141,
        0.15821552, 0.15674443,

In [12]:
table = pd.DataFrame.from_dict(grid_result)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_learning_rate,param_n_estimators,param_subsample,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.075049,0.006150,0.011106,0.000772,0.5,1.00,50,0.5,"{'colsample_bytree': 0.5, 'learning_rate': 1.0...",0.311775,0.102994,0.095541,0.411411,0.393558,0.263056,0.137909,377
1,0.073436,0.003944,0.012208,0.001254,0.5,1.00,50,0.6,"{'colsample_bytree': 0.5, 'learning_rate': 1.0...",0.522880,0.367664,0.235337,0.487982,0.453828,0.413538,0.102938,363
2,0.082494,0.007282,0.013194,0.002367,0.5,1.00,50,0.7,"{'colsample_bytree': 0.5, 'learning_rate': 1.0...",0.505397,0.366355,0.422187,0.433173,0.517324,0.448887,0.055943,351
3,0.075899,0.001936,0.013763,0.002965,0.5,1.00,50,0.9,"{'colsample_bytree': 0.5, 'learning_rate': 1.0...",0.557011,0.539469,0.488406,0.573008,0.540988,0.539776,0.028432,330
4,0.122824,0.009979,0.011784,0.000728,0.5,1.00,100,0.5,"{'colsample_bytree': 0.5, 'learning_rate': 1.0...",0.200974,0.090469,0.154276,0.435050,0.273289,0.230812,0.118289,382
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,0.302176,0.023510,0.015031,0.002655,0.9,0.05,150,0.9,"{'colsample_bytree': 0.9, 'learning_rate': 0.0...",0.851339,0.828379,0.810648,0.845398,0.780665,0.823286,0.025605,22
380,0.365333,0.017098,0.015666,0.001699,0.9,0.05,200,0.5,"{'colsample_bytree': 0.9, 'learning_rate': 0.0...",0.847008,0.816910,0.804603,0.848291,0.768460,0.817054,0.029629,26
381,0.392582,0.021700,0.017973,0.003305,0.9,0.05,200,0.6,"{'colsample_bytree': 0.9, 'learning_rate': 0.0...",0.842684,0.818227,0.807444,0.843452,0.771788,0.816719,0.026442,27
382,0.368410,0.010499,0.014438,0.000850,0.9,0.05,200,0.7,"{'colsample_bytree': 0.9, 'learning_rate': 0.0...",0.839184,0.822784,0.805868,0.844705,0.769174,0.816343,0.027211,28


In [13]:
grid_model.best_params_

{'colsample_bytree': 0.9,
 'learning_rate': 0.03,
 'n_estimators': 150,
 'subsample': 0.5}

In [14]:
grid_model.best_estimator_

## Model Accuracy

In [21]:
"""  
parameter                                                                       Accuracy
{'colsample_bytree': 0.9,
 'learning_rate': 0.03,
 'n_estimators': 150,
 'subsample': 0.5}                                                         - 0.8320932865142823 
"""

grid_model.best_score_

np.float64(0.8320932865142823)

## save the model 

In [16]:
# import pickle 
# pickle.dump(regressor, open("XGBoost_finalmodel_insurance_charge_predict.sav ","wb"))

In [22]:
grid_model.predict([[ 52, 30.200, 1, 1, 0 ]])  

array([10980.673], dtype=float32)