# AdaBoost (Adaptive Boosting) Regressor - Insurance Charge Prediction
## MODEL CREATION PHASE

## read the dataset

In [2]:
import pandas as pd

# Load the insurance records from the CSV file in the working directory
# and show the resulting frame as the cell output.
dataset = pd.read_csv(filepath_or_buffer="insurance_pre.csv")
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


## convert categorical data into numerical data

In [3]:
# One-hot encode the categorical columns as 0/1 integers. drop_first=True drops
# the first level of each category, leaving one indicator column per binary
# feature (sex_male, smoker_yes).
dataset = pd.get_dummies(
    data=dataset,
    dtype=int,
    drop_first=True,
)
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


## Split into input (features) and output (target)

In [4]:
# Show the column labels so the feature and target names can be selected explicitly.
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [5]:
# Input features: every column of the encoded frame except the target 'charges'.
independent = dataset.loc[
    :,
    ['age', 'bmi', 'children', 'sex_male', 'smoker_yes'],
]
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [6]:
# Target: 'charges', selected with a list so the result stays a
# single-column DataFrame rather than a Series.
dependent = dataset.loc[:, ['charges']]
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


## Split into train and test sets


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle so the
# same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    independent,
    dependent,
    random_state=0,
    test_size=0.20,
)

In [8]:
# Display the held-out feature rows returned by train_test_split.
x_test

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
578,52,30.200,1,1,0
610,47,29.370,1,0,0
569,48,40.565,2,1,1
1034,61,38.380,0,1,0
198,51,18.050,0,0,0
...,...,...,...,...,...
1084,62,30.495,2,0,0
726,41,28.405,1,1,0
1132,57,40.280,0,1,0
725,30,39.050,3,0,1


In [9]:
# Display the held-out target values returned by train_test_split.
y_test

Unnamed: 0,charges
578,9724.53000
610,8547.69130
569,45702.02235
1034,12950.07120
198,9644.25250
...,...
1084,15019.76005
726,6664.68595
1132,20709.02034
725,40932.42950


## model creation 

In [10]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

# AdaBoost ensemble of 50 depth-3 regression trees with a squared loss and a
# learning rate of 0.01; random_state=0 keeps the fit reproducible.
regressor = AdaBoostRegressor(
    estimator=DecisionTreeRegressor(max_depth=3),
    n_estimators=50,
    learning_rate=0.01,
    loss='square',
    random_state=0,
)

# y_train is a single-column DataFrame (a column vector). Passing it as-is makes
# scikit-learn emit a DataConversionWarning while it flattens y internally, so
# hand over an explicit 1-D array instead. The fitted model is unchanged.
regressor.fit(x_train, y_train.to_numpy().ravel())

  y = column_or_1d(y, warn=True)


## test the model 

In [11]:
# Predict charges for the held-out feature rows and show the resulting array.
y_predict = regressor.predict(X=x_test)
y_predict

array([13394.4759742 , 11124.54188833, 45210.73709931, 14741.91152803,
       11918.23283213,  6687.59110853,  4215.21796889, 11918.23283213,
        7260.92593292,  7115.37305935,  7224.45899801, 11918.23283213,
        9456.22599561,  6687.59110853, 18677.63617783, 11918.23283213,
       14651.47058213,  6640.2903938 ,  7260.92593292, 37741.33951338,
       25579.80443068, 13535.42945224, 13394.4759742 , 25579.80443068,
        4215.21796889,  7027.8747027 ,  5329.77760526,  7328.20503909,
        5329.77760526, 10811.93531157,  7328.20503909, 45723.29722128,
       14741.91152803, 13394.4759742 , 18677.63617783,  5329.77760526,
       10955.19650732, 37741.33951338, 39362.87433486,  4215.21796889,
        6864.48548867,  5329.77760526, 18720.01195561, 45723.29722128,
       37741.33951338,  4677.52603909, 11918.23283213,  7250.45721787,
        7027.8747027 , 13394.4759742 ,  4677.52603909,  4677.52603909,
       25579.80443068, 45518.74554409, 13394.4759742 ,  5329.77760526,
      

## Evaluation metrics 

In [13]:
from sklearn.metrics import r2_score

# r2_score is the coefficient of determination (R^2) of the predictions against
# the held-out targets. It is a regression fit measure, not a classification
# accuracy, despite the label printed below.
r_score = r2_score(y_true=y_test, y_pred=y_predict)
print(f" THE ACCURACY OF THE MODEL : {r_score}")

 THE ACCURACY OF THE MODEL : 0.8905112709237017


## Check whether the model overfits or underfits

In [16]:
# Score the fitted regressor on both splits and report the absolute gap between
# the two scores; the printed label treats a gap below 5% as a good model.
train_score = regressor.score(X=x_train, y=y_train)
test_score = regressor.score(X=x_test, y=y_test)
print(
    "train_score: ",
    train_score,
    "\n",
    "test_score: ",
    test_score,
    "\n",
    "train and test difference (<5% = good model): ",
    abs(train_score - test_score),
    "\n",
)

train_score:  0.8512751714379214 
 test_score:  0.8905112709237017 
 train and test difference (<5% = good model):  0.03923609948578033 



## save the model 

In [13]:
# Model persistence is currently disabled. The output path in the call below is
# a blank placeholder (" ") and must be replaced with a real file name before
# this code is re-enabled.
# import pickle 
# pickle.dump(regressor, open(" ","wb"))

In [14]:
# Predict the charge for one hand-entered record. The values are wrapped in a
# DataFrame carrying the training column labels so the input matches the feature
# names the model was fitted with; a bare nested list has no names and makes
# scikit-learn warn about missing feature names.
# NOTE(review): these feature values equal the first x_test row (index 578), yet
# the stored output of this cell differs from y_predict[0] shown earlier. The
# saved output appears to come from a different fit - re-run the notebook from a
# fresh kernel to refresh it.
regressor.predict(
    pd.DataFrame([[52, 30.200, 1, 1, 0]], columns=x_train.columns)
)



array([13985.37855442])