# Random Forest Regressor Algorithm -  insurance_charge_prediction
# using Grid Search CV 
## MODEL CREATION PHASE

## read the dataset

In [1]:
import pandas as pd 
dataset = pd.read_csv("insurance_pre.csv")
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


## convert categorical data into numerical data

In [2]:
dataset = pd.get_dummies(dataset,drop_first=True,dtype=int)
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


## split Input and Output

In [3]:
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [4]:
independent = dataset[['age', 'bmi', 'children',  'sex_male', 'smoker_yes' ]]
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [5]:
dependent = dataset[['charges']]
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


## split train test

In [6]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(independent, dependent, test_size=0.20, random_state=0)

## model creation 

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# parameter grid
param_grid = { 
    "criterion":["squared_error","friedman_mse","absolute_error","poisson"],
     "max_features" : ["sqrt","log2",None],
    "n_estimators": [50,100,150,200]
}
# model 
model = RandomForestRegressor()
# model creation 
grid_model = GridSearchCV(model, param_grid, refit = True, verbose=3, n_jobs=-1)
grid_model.fit(x_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


  return fit_method(estimator, *args, **kwargs)


In [8]:
# from sklearn.ensemble import RandomForestRegressor
# # criterion   {“squared_error”, “friedman_mse”, “absolute_error”, “poisson”}, default=”mse"
# regressor = RandomForestRegressor(n_estimators=100,random_state=0, criterion="poisson")
# regressor.fit(x_train, y_train)

## Grid Search CV results

In [9]:

grid_result = grid_model.cv_results_
grid_result

{'mean_fit_time': array([0.34665298, 0.67317781, 1.00562296, 1.51675525, 0.35033755,
        0.70579782, 1.06828327, 1.48922315, 0.49746084, 1.10324569,
        1.32968407, 1.65931902, 0.35039568, 0.65983596, 0.99613705,
        1.38389702, 0.34367723, 0.65284095, 0.9921731 , 1.32259908,
        0.42802429, 0.91673484, 1.27283382, 1.64721789, 0.80031142,
        1.5713964 , 2.34262638, 3.07870002, 0.77703719, 1.56864862,
        2.3369103 , 3.14787006, 1.25231524, 2.58910685, 3.78921642,
        6.17345634, 0.46974235, 0.83569865, 1.36500983, 1.71698103,
        0.57010436, 1.14851713, 1.61079116, 2.27074075, 0.75755882,
        1.54530854, 1.56596408, 2.09074531]),
 'std_fit_time': array([0.02119323, 0.02629055, 0.0239415 , 0.18558732, 0.02538866,
        0.01783486, 0.13602608, 0.13768544, 0.03756469, 0.09473788,
        0.02856457, 0.05899531, 0.02025494, 0.03143744, 0.02011497,
        0.04352697, 0.02465072, 0.01554874, 0.02831716, 0.03250893,
        0.04173739, 0.07176977, 0.037

In [10]:
table = pd.DataFrame.from_dict(grid_result)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.346653,0.021193,0.028996,0.003217,squared_error,sqrt,50,"{'criterion': 'squared_error', 'max_features':...",0.843879,0.815235,0.797809,0.826245,0.771905,0.811015,0.024633,32
1,0.673178,0.026291,0.048527,0.006072,squared_error,sqrt,100,"{'criterion': 'squared_error', 'max_features':...",0.847097,0.822168,0.800866,0.833975,0.770705,0.814962,0.026846,21
2,1.005623,0.023942,0.067895,0.005509,squared_error,sqrt,150,"{'criterion': 'squared_error', 'max_features':...",0.846966,0.825469,0.803723,0.834864,0.771518,0.816508,0.02658,8
3,1.516755,0.185587,0.106581,0.025396,squared_error,sqrt,200,"{'criterion': 'squared_error', 'max_features':...",0.843811,0.829719,0.804485,0.838552,0.77332,0.817977,0.026096,1
4,0.350338,0.025389,0.028139,0.0021,squared_error,log2,50,"{'criterion': 'squared_error', 'max_features':...",0.845079,0.815646,0.805019,0.836167,0.766381,0.813658,0.027596,29
5,0.705798,0.017835,0.055229,0.019693,squared_error,log2,100,"{'criterion': 'squared_error', 'max_features':...",0.847777,0.827558,0.802136,0.834807,0.768739,0.816203,0.028009,11
6,1.068283,0.136026,0.069145,0.005439,squared_error,log2,150,"{'criterion': 'squared_error', 'max_features':...",0.847396,0.822217,0.802022,0.836006,0.774419,0.816412,0.025877,9
7,1.489223,0.137685,0.090714,0.010832,squared_error,log2,200,"{'criterion': 'squared_error', 'max_features':...",0.846332,0.826173,0.803173,0.836461,0.772131,0.816854,0.026561,5
8,0.497461,0.037565,0.036544,0.004286,squared_error,,50,"{'criterion': 'squared_error', 'max_features':...",0.845316,0.815031,0.791309,0.808268,0.769335,0.805852,0.025268,44
9,1.103246,0.094738,0.060401,0.013101,squared_error,,100,"{'criterion': 'squared_error', 'max_features':...",0.845071,0.818561,0.796593,0.813067,0.772712,0.809201,0.023998,35


In [11]:
grid_model.best_params_

{'criterion': 'squared_error', 'max_features': 'sqrt', 'n_estimators': 200}

In [12]:
grid_model.best_estimator_

## Model Accuracy

In [13]:
"""
parameter                                                                       Accuracy
{'criterion': 'poisson', 'max_features': 'sqrt', 'n_estimators': 100} -     0.8183011956899856   (best score using grid CV)
"""

grid_model.best_score_

np.float64(0.8179774938774642)

## save the model 

In [14]:
# import pickle 
# pickle.dump(regressor, open("insurance_charge_finalModel_RF_regressor.sav","wb"))

## predict the model 

In [15]:
# insurance_charge_predict = pickle.load(open("insurance_charge_finalModel_RF_regressor.sav","rb"))
grid_model.predict([[ 52, 30.200, 1, 1, 0 ]])



array([10458.030279])