# LightGBM (boosting) - Regressor Algorithm - insurance_charge_prediction
## MODEL CREATION PHASE

## read the dataset

In [7]:
import pandas as pd 
# Load the insurance records from the CSV that sits next to the notebook;
# the bare name on the last line makes the notebook render the frame.
dataset = pd.read_csv(filepath_or_buffer="insurance_pre.csv")
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


## convert categorical data into numerical data

In [8]:
# One-hot encode the categorical columns as 0/1 integers. drop_first=True drops
# the first level of each category, leaving a single indicator column per
# two-level category.
dataset = pd.get_dummies(
    data=dataset,
    dtype=int,
    drop_first=True,
)
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


## split Input and Output

In [9]:
# Show the column labels of the encoded frame so the feature and target
# columns can be selected by name.
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [10]:
# Feature matrix: all rows, every column except the target `charges`.
independent = dataset.loc[
    :,
    ['age', 'bmi', 'children', 'sex_male', 'smoker_yes'],
]
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [11]:
# Target: kept as a one-column DataFrame (list selector), not a Series.
dependent = dataset.loc[:, ['charges']]
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


## split train and test set 


In [12]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    independent,
    dependent,
    random_state=0,
    test_size=0.20,
)

In [13]:
x_test

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
578,52,30.200,1,1,0
610,47,29.370,1,0,0
569,48,40.565,2,1,1
1034,61,38.380,0,1,0
198,51,18.050,0,0,0
...,...,...,...,...,...
1084,62,30.495,2,0,0
726,41,28.405,1,1,0
1132,57,40.280,0,1,0
725,30,39.050,3,0,1


In [14]:
y_test

Unnamed: 0,charges
578,9724.53000
610,8547.69130
569,45702.02235
1034,12950.07120
198,9644.25250
...,...
1084,15019.76005
726,6664.68595
1132,20709.02034
725,40932.42950


## model creation 

In [15]:
!pip install lightgbm



In [5]:
import lightgbm as lgb

In [102]:
""" 
class lightgbm.LGBMRegressor(*, boosting_type='gbdt', num_leaves=31, max_depth=-1, learning_rate=0.1, n_estimators=100, subsample_for_bin=200000, objective=None, class_weight=None, min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20, subsample=1.0, subsample_freq=0, colsample_bytree=1.0, reg_alpha=0.0, reg_lambda=0.0, random_state=None, n_jobs=None, importance_type='split', **kwargs)
-> max_depth (int, optional (default=-1)) – Maximum tree depth for base learners, <=0 means no limit. If setting this to a positive value, consider also changing num_leaves to <= 2^max_depth.


tuning - hyper parameters :  n_estimators, max_depth, learning_rate, num_leaves 

"""
# Tuned configuration: trees capped at depth 4 with at most 16 leaves
# (16 == 2**4, in line with the num_leaves <= 2^max_depth guidance), a small
# learning rate, and 200 boosting rounds.
regressor = lgb.LGBMRegressor(
    boosting_type='gbdt',
    n_estimators=200,
    learning_rate=0.03,
    max_depth=4,
    num_leaves=16,
)
# fit() is the last expression, so the notebook also shows the fitted estimator.
regressor.fit(x_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000086 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 313
[LightGBM] [Info] Number of data points in the train set: 1070, number of used features: 5
[LightGBM] [Info] Start training from score 13201.182046


## test the model 

In [103]:
# Predict charges for the held-out rows; the bare name displays the array.
y_predict = regressor.predict(X=x_test)
y_predict

array([12466.64032351, 10513.28829861, 45475.26890261, 13255.82370878,
       10399.11364645,  5473.06257529,  2104.3731638 , 12569.10718716,
        7528.2122343 ,  6233.36043227,  6632.2721145 , 12065.8041683 ,
        9805.43690908,  6122.9340281 , 20048.7838482 , 11802.48278291,
       13883.34237377,  5259.47676745,  7391.33645567, 35232.66938931,
       23428.91644617, 14386.68249734, 12256.13495501, 26667.02144214,
        3463.32338118,  6845.02815721,  5230.39311626,  7829.08457298,
        4336.13589115, 10135.05351779,  7589.91493821, 48463.83818704,
       15348.40343948, 12958.67993128, 16999.22216753,  4612.39569622,
       10009.38015644, 37362.98446245, 38844.6969315 ,  3289.05496287,
        5628.20713722,  4678.93278506, 20129.52343941, 46634.83133061,
       36993.58131316,  5881.06363032, 11802.48278291,  7100.09845198,
        5719.22115115, 11658.04699552,  3788.28765841,  5566.40892768,
       26700.37860224, 44660.99766535, 11166.79869457,  4778.70472413,
      

## Evaluation metrics 

In [104]:
from sklearn.metrics import r2_score
# r2_score returns the coefficient of determination (R^2) of the test-set
# predictions. It is a regression goodness-of-fit measure, not a
# classification accuracy, so the message labels it as such.
r_score = r2_score(y_test, y_predict)
print(f" THE R2 SCORE OF THE MODEL : {r_score}")

 THE ACCURACY OF THE MODEL : 0.9043329784188349


## Checking whether the model overfits or underfits

In [21]:
# Score the fitted model on both splits so the two values can be compared.
# NOTE(review): `score` is expected to be R^2 for a scikit-learn-style
# regressor - confirm against the estimator's documentation.
test_score = regressor.score(x_test, y_test)
train_score = regressor.score(x_train, y_train)

In [25]:
# Report both scores and their absolute gap. The printed text is unchanged.
print(
    "train_score: ", train_score, "\n",
    "test_score: ", test_score, "\n",
    "train and test difference: ", abs(train_score - test_score), "\n"
)

# Interpretation guide (kept as a comment so the cell does not echo a stray
# string as its Out value):
#   - small gap between train and test score  ==> the model generalises about
#     as well as it fits the training data (no sign of overfitting)
#   - train score far above test score        ==> overfitting
#   - both scores low                         ==> underfitting

train_score:  0.8763703107833862 
 test_score:  0.9024880528450012 
 train and test difference:  0.02611774206161499 



'\ntrain_score:  0.8763703107833862 \n test_score:  0.9024880528450012 \n train and test difference:  0.02611774206161499 \n\n ==> Good Model\n '

## save the model 

In [16]:
# Model persistence is currently disabled: both lines below are commented out
# and the target filename is still a blank placeholder (" "). Supply a real
# file name before re-enabling them.
# import pickle 
# pickle.dump(regressor, open(" ","wb"))

In [17]:
# Predict the charge for one hand-written record. Wrapping the row in a
# DataFrame that reuses the training columns ties each value to its feature
# name, instead of relying on the positional order of a bare nested list.
# Values follow the x_train column order: age, bmi, children, sex_male, smoker_yes.
regressor.predict(
    pd.DataFrame([[52, 30.200, 1, 1, 0]], columns=x_train.columns)
)

array([12800.898], dtype=float32)