# XGBoost (eXtreme Gradient Boosting) Regressor - insurance_charge_prediction
## MODEL CREATION PHASE

## read the dataset

In [4]:
import pandas as pd 
# Read the insurance CSV; the path is relative to the notebook's working directory.
csv_path = "insurance_pre.csv"
dataset = pd.read_csv(csv_path)
# A bare name as the last expression makes Jupyter render the frame.
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


## convert categorical data into numerical data

In [5]:
# One-hot encode the text columns of the frame.
#   drop_first=True -> drop the first level of each category, so a two-level
#                      column becomes a single indicator column.
#   dtype=int       -> emit integer indicators rather than booleans.
# The result is bound back to `dataset`, replacing the raw frame.
dataset = pd.get_dummies(
    data=dataset,
    drop_first=True,
    dtype=int,
)
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


## split Input and Output

In [6]:
# Show the column labels of the encoded frame; the input/output selections that follow use these names.
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [7]:
# Model inputs: the encoded columns listed here (the 'charges' column is not included).
feature_columns = ['age', 'bmi', 'children', 'sex_male', 'smoker_yes']
independent = dataset[feature_columns]
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [8]:
# Model output: selecting with a list keeps 'charges' as a one-column
# DataFrame rather than a Series.
target_columns = ['charges']
dependent = dataset[target_columns]
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


## split train and test set 


In [9]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. random_state=0 pins the shuffle so the
# same rows land in each partition on every run.
x_train, x_test, y_train, y_test = train_test_split(
    independent,
    dependent,
    test_size=0.20,
    random_state=0,
)

In [10]:
# Display the held-out feature rows produced by the split.
x_test

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
578,52,30.200,1,1,0
610,47,29.370,1,0,0
569,48,40.565,2,1,1
1034,61,38.380,0,1,0
198,51,18.050,0,0,0
...,...,...,...,...,...
1084,62,30.495,2,0,0
726,41,28.405,1,1,0
1132,57,40.280,0,1,0
725,30,39.050,3,0,1


In [11]:
# Display the held-out 'charges' values that the predictions are scored against.
y_test

Unnamed: 0,charges
578,9724.53000
610,8547.69130
569,45702.02235
1034,12950.07120
198,9644.25250
...,...
1084,15019.76005
726,6664.68595
1132,20709.02034
725,40932.42950


## model creation 

In [12]:
import xgboost as xgb 

In [19]:
""" 
hyper parameters :  n_estimators, learning_rate, max_depth, subsample, colsample_bytree
"""

# Collect the estimator settings in one mapping so they can be read (and tuned)
# in a single place, then unpack them into the constructor.
xgb_params = dict(
    objective='reg:squarederror',   # squared-error regression objective
    eval_metric='rmse',             # note: no eval_set is passed to fit() below
    random_state=0,                 # fixed seed for the row/column subsampling
    n_estimators=60,
    learning_rate=0.1,
    max_depth=3,
    subsample=0.9,
    colsample_bytree=0.8,
)
regressor = xgb.XGBRegressor(**xgb_params)
# fit() is the last expression, so the cell displays the fitted estimator.
regressor.fit(x_train, y_train)

## test the model 

In [14]:
# Predict charges for the held-out rows; ending the cell with the name displays the array.
y_predict = regressor.predict(x_test)
y_predict

array([12800.898 , 10628.991 , 44921.754 , 13786.937 ,  9982.266 ,
        4703.7773,  2449.7002, 12982.711 ,  8255.47  ,  7058.2183,
        6464.0312, 12282.557 ,  9507.536 ,  5611.8403, 20058.842 ,
       12537.303 , 13941.664 ,  5963.4595,  7530.7373, 34023.05  ,
       25103.217 , 14524.387 , 12056.687 , 26204.402 ,  3408.119 ,
        6293.3994,  4411.2744,  8531.8   ,  4414.552 , 10025.289 ,
        8055.7285, 47354.26  , 14876.794 , 12170.801 , 17181.523 ,
        4630.6484, 10840.079 , 37177.016 , 39206.547 ,  3425.2136,
        4800.0205,  4548.273 , 21425.174 , 46719.58  , 36173.773 ,
        5888.985 , 12537.303 ,  7298.4497,  5570.928 , 12366.084 ,
        3988.4062,  5469.3013, 26056.158 , 44655.76  , 11081.12  ,
        6078.134 ,  5656.1987, 10491.761 , 10156.093 , 15236.657 ,
        2681.5493, 45249.95  , 16833.727 , 12490.013 , 12528.246 ,
       10974.549 , 35720.582 , 40189.043 ,  4345.363 ,  9291.98  ,
       14646.157 , 12827.927 , 18483.932 , 14864.475 , 13879.4

## Evaluation metrics 

In [18]:
from sklearn.metrics import r2_score
# Score the held-out predictions with the R^2 coefficient of determination.
# NOTE(review): the printed label says "ACCURACY", but r2_score is a regression
# goodness-of-fit measure, not a classification accuracy.
# Value recorded from the saved run: 0.9024880528450012
r_score = r2_score(y_true=y_test, y_pred=y_predict)
print(f" THE ACCURACY OF THE MODEL : {r_score}")

 THE ACCURACY OF THE MODEL : 0.9024880528450012


## save the model 

In [16]:
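# Saving is disabled: the output path was left blank (" "). To enable it, pick a
# real filename and let a context manager close the file handle:
# import pickle
# with open("<model_filename>.pkl", "wb") as model_file:
#     pickle.dump(regressor, model_file)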

In [17]:
# Score one hand-entered customer.
# The model was fitted on a DataFrame with named columns, so the sample is built
# as a one-row DataFrame keyed by feature name and ordered by the training
# columns. A bare nested list would depend on the values being typed in exactly
# the training column order and would carry no feature names to check against.
new_customer = pd.DataFrame(
    [{'age': 52, 'bmi': 30.200, 'children': 1, 'sex_male': 1, 'smoker_yes': 0}],
    columns=x_train.columns,
)
regressor.predict(new_customer)

array([12800.898], dtype=float32)