# Ada boost (adaptive boosting) Regressor Algorithm - insurance_charge_prediction
# using Grid Search CV 
## MODEL CREATION PHASE

## read the dataset

In [1]:
import pandas as pd 
dataset = pd.read_csv("insurance_pre.csv")
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


## convert categorical data into numerical data

In [2]:
dataset = pd.get_dummies(dataset,drop_first=True,dtype=int)
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


## split Input and Output

In [3]:
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [4]:
independent = dataset[['age', 'bmi', 'children',  'sex_male', 'smoker_yes' ]]
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [5]:
dependent = dataset[['charges']]
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


## split train and test set 


In [6]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(independent, dependent, test_size=0.20, random_state=0)

In [7]:
x_test

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
578,52,30.200,1,1,0
610,47,29.370,1,0,0
569,48,40.565,2,1,1
1034,61,38.380,0,1,0
198,51,18.050,0,0,0
...,...,...,...,...,...
1084,62,30.495,2,0,0
726,41,28.405,1,1,0
1132,57,40.280,0,1,0
725,30,39.050,3,0,1


In [8]:
y_test

Unnamed: 0,charges
578,9724.53000
610,8547.69130
569,45702.02235
1034,12950.07120
198,9644.25250
...,...
1084,15019.76005
726,6664.68595
1132,20709.02034
725,40932.42950


## model creation 

In [9]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV

param_grid = {
  "n_estimators":[50,100,150,200],
  "learning_rate":[1.0,0.5,0.1,0.01,0.03,0.05],
    "loss":["linear","square","exponential"]
}

model=AdaBoostRegressor()

grid_model = GridSearchCV(model, param_grid, refit=True, verbose=3, n_jobs=-1)

grid_model.fit(x_train,y_train)


# from sklearn.tree import DecisionTreeRegressor
# regressor = AdaBoostRegressor(random_state=0, estimator=DecisionTreeRegressor(max_depth=3), n_estimators=50, loss='square', learning_rate=0.01)
# regressor.fit(x_train, y_train)


Fitting 5 folds for each of 72 candidates, totalling 360 fits


  y = column_or_1d(y, warn=True)


## Grid Search CV results

In [10]:

grid_result = grid_model.cv_results_
grid_result

{'mean_fit_time': array([0.09646387, 0.07625904, 0.08214192, 0.07683072, 0.41679969,
        0.64914184, 1.01124196, 1.72569838, 0.35072713, 0.67146306,
        1.00362954, 1.37039704, 0.12877164, 0.12557111, 0.12688413,
        0.10671906, 0.33981371, 0.69081612, 0.95565333, 1.34194875,
        0.36888418, 0.70545216, 0.99178953, 1.38578763, 0.34262142,
        0.4985899 , 0.4656539 , 0.48902874, 0.36825476, 0.68839154,
        1.17193084, 1.37514954, 0.37334561, 0.68887243, 1.03859978,
        1.42405696, 0.42728834, 0.74763689, 1.04696136, 1.43474107,
        0.34731207, 0.71686077, 1.10469584, 1.39094672, 0.35760446,
        0.70581532, 1.09642854, 1.41435566, 0.38539162, 0.71786938,
        1.04388862, 1.2783783 , 0.35714931, 0.71623306, 1.04622927,
        1.38876047, 0.36699028, 0.69899573, 1.04018884, 1.35626435,
        0.3593986 , 0.72205386, 0.80807886, 0.92489843, 0.36000619,
        0.70334105, 1.10358839, 1.40355721, 0.35234022, 0.7048141 ,
        1.28568683, 1.42486811]

In [11]:
table = pd.DataFrame.from_dict(grid_result)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_loss,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.096464,0.028435,0.016549,0.002469,1.00,linear,50,"{'learning_rate': 1.0, 'loss': 'linear', 'n_es...",0.844277,0.785934,0.765767,0.798469,0.797708,0.798431,0.025789,43
1,0.076259,0.015797,0.013981,0.001635,1.00,linear,100,"{'learning_rate': 1.0, 'loss': 'linear', 'n_es...",0.849043,0.764448,0.811366,0.805380,0.779910,0.802029,0.029018,41
2,0.082142,0.023533,0.014915,0.001626,1.00,linear,150,"{'learning_rate': 1.0, 'loss': 'linear', 'n_es...",0.839901,0.762102,0.811683,0.811847,0.786689,0.802444,0.026278,40
3,0.076831,0.020381,0.016643,0.003098,1.00,linear,200,"{'learning_rate': 1.0, 'loss': 'linear', 'n_es...",0.856016,0.758843,0.781417,0.825922,0.798678,0.804175,0.033952,39
4,0.416800,0.032197,0.053966,0.013692,1.00,square,50,"{'learning_rate': 1.0, 'loss': 'square', 'n_es...",0.504581,0.401810,0.447214,0.451369,0.602863,0.481568,0.068841,64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,1.403557,0.016958,0.147358,0.012065,0.05,square,200,"{'learning_rate': 0.05, 'loss': 'square', 'n_e...",0.737802,0.599169,0.650974,0.688146,0.733812,0.681981,0.052268,53
68,0.352340,0.010673,0.043741,0.003691,0.05,exponential,50,"{'learning_rate': 0.05, 'loss': 'exponential',...",0.867388,0.821930,0.818433,0.844225,0.803214,0.831038,0.022411,12
69,0.704814,0.025774,0.080359,0.023716,0.05,exponential,100,"{'learning_rate': 0.05, 'loss': 'exponential',...",0.853574,0.798709,0.805690,0.828645,0.797066,0.816737,0.021600,23
70,1.285687,0.123437,0.139442,0.027099,0.05,exponential,150,"{'learning_rate': 0.05, 'loss': 'exponential',...",0.836431,0.774495,0.783690,0.811462,0.792486,0.799713,0.022051,42


In [12]:
grid_model.best_params_

{'learning_rate': 0.01, 'loss': 'exponential', 'n_estimators': 50}

In [13]:
grid_model.best_estimator_

## Model Accuracy

In [17]:
""" 
parameter                                                                       Accuracy
{'learning_rate': 0.01, 'loss': 'exponential', 'n_estimators': 50} -       0.8371126261942514    (best score using grid CV)
"""

grid_model.best_score_

np.float64(0.8371126261942514)

## save the model 

In [15]:
# import pickle 
# pickle.dump(regressor, open(" ","wb"))

In [18]:
grid_model.predict([[ 52, 30.200, 1, 1, 0 ]])  



array([12619.19808226])