# LightGBM (boosting) - Regressor Algorithm - insurance_charge_prediction
# using Grid Search CV 
## MODEL CREATION PHASE

## read the dataset

In [3]:
import pandas as pd 
dataset = pd.read_csv("insurance_pre.csv")
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


## convert categorical data into numerical data

In [4]:
dataset = pd.get_dummies(dataset,drop_first=True,dtype=int)
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


## split Input and Output

In [5]:
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [6]:
independent = dataset[['age', 'bmi', 'children',  'sex_male', 'smoker_yes' ]]
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [7]:
dependent = dataset[['charges']]
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


## split train and test set 


In [8]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(independent, dependent, test_size=0.20, random_state=0)

In [9]:
x_test

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
578,52,30.200,1,1,0
610,47,29.370,1,0,0
569,48,40.565,2,1,1
1034,61,38.380,0,1,0
198,51,18.050,0,0,0
...,...,...,...,...,...
1084,62,30.495,2,0,0
726,41,28.405,1,1,0
1132,57,40.280,0,1,0
725,30,39.050,3,0,1


In [10]:
y_test

Unnamed: 0,charges
578,9724.53000
610,8547.69130
569,45702.02235
1034,12950.07120
198,9644.25250
...,...
1084,15019.76005
726,6664.68595
1132,20709.02034
725,40932.42950


## model creation 

In [11]:
!pip install lightgbm



In [12]:
import lightgbm as lgb

In [13]:
""" 
classlightgbm.LGBMRegressor(*, boosting_type='gbdt', num_leaves=31, max_depth=-1, learning_rate=0.1, n_estimators=100, subsample_for_bin=200000, objective=None, class_weight=None, min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20, subsample=1.0, subsample_freq=0, colsample_bytree=1.0, reg_alpha=0.0, reg_lambda=0.0, random_state=None, n_jobs=None, importance_type='split', **kwargs)
-> max_depth (int, optional (default=-1)) – Maximum tree depth for base learners, <=0 means no limit. If setting this to a positive value, consider also changing num_leaves to <= 2^max_depth.


tuning - hyper parameters :  n_estimators, max_depth, learning_rate, num_leaves 

"""
# regressor = lgb.LGBMRegressor(boosting_type='gbdt', num_leaves=16, max_depth=4, learning_rate=0.03, n_estimators=200)
# regressor.fit(x_train, y_train)

from sklearn.model_selection import GridSearchCV

param_grid = {
  "n_estimators":[50,100,150,200],
    "learning_rate":[1.0,0.5,0.1,0.01,0.03,0.05],
   "max_depth":[-1,3,4,5,6],
    "num_leaves": [31,8,16,32,64]
}

model=lgb.LGBMRegressor()

grid_model = GridSearchCV(model, param_grid, refit=True, verbose=3, n_jobs=-1)

grid_model.fit(x_train,y_train)

Fitting 5 folds for each of 600 candidates, totalling 3000 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000270 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 313
[LightGBM] [Info] Number of data points in the train set: 1070, number of used features: 5
[LightGBM] [Info] Start training from score 13201.182046


## Grid Search CV results

In [15]:
grid_result = grid_model.cv_results_
grid_result

{'mean_fit_time': array([0.35468664, 0.07462583, 0.12328515, 0.18462462, 0.21391449,
        0.36254859, 0.11376991, 0.21573257, 0.45642433, 0.49284492,
        0.81007605, 0.36303468, 0.4296247 , 0.73926101, 1.47158127,
        0.93870301, 0.26550341, 0.36862354, 0.78580341, 1.26181197,
        0.08753471, 0.14786139, 0.30710802, 0.12774982, 0.12947941,
        0.23554173, 0.16347947, 0.26310039, 0.2339416 , 0.24724145,
        0.35679836, 0.41036873, 0.62817068, 0.57536349, 0.39124708,
        0.63065743, 0.38609719, 0.59735703, 0.47272367, 0.5250926 ,
        0.18146949, 0.0910038 , 0.15702734, 0.56972995, 0.72683067,
        0.25387626, 0.21249843, 0.41684103, 0.25318427, 0.30474048,
        0.56656637, 0.24168344, 0.47926111, 0.3891942 , 0.33173661,
        0.50345626, 0.43021126, 0.47147431, 0.45200996, 0.5010365 ,
        0.25696564, 0.63145924, 0.14314919, 0.13922672, 0.19254923,
        0.58556404, 0.31924357, 0.27810841, 0.35817366, 0.29086123,
        0.65689731, 0.19581599,

In [16]:
table = pd.DataFrame.from_dict(grid_result)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,param_num_leaves,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.354687,0.217181,0.011070,0.001529,1.00,-1,50,31,"{'learning_rate': 1.0, 'max_depth': -1, 'n_est...",0.776905,0.720644,0.745639,0.750908,0.684301,0.735679,0.031294,518
1,0.074626,0.010509,0.012800,0.006969,1.00,-1,50,8,"{'learning_rate': 1.0, 'max_depth': -1, 'n_est...",0.831273,0.805942,0.762267,0.803216,0.733861,0.787312,0.034690,436
2,0.123285,0.014033,0.009703,0.001069,1.00,-1,50,16,"{'learning_rate': 1.0, 'max_depth': -1, 'n_est...",0.791573,0.730170,0.754342,0.784710,0.715365,0.755232,0.029691,494
3,0.184625,0.022932,0.009268,0.000803,1.00,-1,50,32,"{'learning_rate': 1.0, 'max_depth': -1, 'n_est...",0.772859,0.726326,0.725808,0.769492,0.698504,0.738598,0.028460,517
4,0.213914,0.011705,0.010452,0.000583,1.00,-1,50,64,"{'learning_rate': 1.0, 'max_depth': -1, 'n_est...",0.764385,0.703368,0.722934,0.750346,0.704123,0.729031,0.024578,525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,0.488320,0.028412,0.013782,0.003153,0.05,6,200,31,"{'learning_rate': 0.05, 'max_depth': 6, 'n_est...",0.867969,0.835804,0.819302,0.856326,0.781185,0.832117,0.030474,220
596,0.231779,0.030157,0.011106,0.000732,0.05,6,200,8,"{'learning_rate': 0.05, 'max_depth': 6, 'n_est...",0.874637,0.843618,0.825905,0.862071,0.796832,0.840613,0.027423,94
597,0.377348,0.015758,0.012417,0.003060,0.05,6,200,16,"{'learning_rate': 0.05, 'max_depth': 6, 'n_est...",0.868141,0.838090,0.818099,0.857854,0.785995,0.833636,0.029353,213
598,0.476087,0.017766,0.012842,0.001115,0.05,6,200,32,"{'learning_rate': 0.05, 'max_depth': 6, 'n_est...",0.867969,0.835804,0.819302,0.856326,0.781185,0.832117,0.030474,220


In [17]:
grid_model.best_params_

{'learning_rate': 0.03, 'max_depth': 3, 'n_estimators': 150, 'num_leaves': 31}

In [18]:
grid_model.best_estimator_

## Model Accuracy

In [23]:
"""  
parameter                                                                       Accuracy
  {'learning_rate': 0.03, 'max_depth': 3, 'n_estimators': 150, 'num_leaves': 31}   -   0.847175796130581 
"""

grid_model.best_score_

np.float64(0.847175796130581)

## save the model 

In [24]:
# import pickle 
# pickle.dump(regressor, open("lightGBM_finalmodel_insurance_charge_predict.sav","wb"))

In [25]:
grid_model.predict([[ 52, 30.200, 1, 1, 0 ]])  



array([12952.89103167])