# Random Forest Regressor Algorithm -  insurance_charge_prediction
# using Grid Search CV 
## MODEL CREATION PHASE

## read the dataset

In [1]:
import pandas as pd 
dataset = pd.read_csv("insurance_pre.csv")
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


## convert categorical data into numerical data

In [2]:
dataset = pd.get_dummies(dataset,drop_first=True,dtype=int)
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


## split Input and Output

In [3]:
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [4]:
independent = dataset[['age', 'bmi', 'children',  'sex_male', 'smoker_yes' ]]
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [5]:
dependent = dataset[['charges']]
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


## split train test

In [7]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(independent, dependent, test_size=0.20, random_state=0)

## model creation 

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# parameter grid
param_grid = { 
    "criterion":["squared_error","friedman_mse","absolute_error","poisson"],
     "max_features" : ["sqrt","log2",None],
    "n_estimators": [50,100,150,200]
}
# model 
model = RandomForestRegressor()
# model creation 
grid_model = GridSearchCV(model, param_grid, refit = True, verbose=3, n_jobs=-1)
grid_model.fit(x_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


  return fit_method(estimator, *args, **kwargs)


In [None]:
# from sklearn.ensemble import RandomForestRegressor
# # criterion   {“squared_error”, “friedman_mse”, “absolute_error”, “poisson”}, default=”mse"
# regressor = RandomForestRegressor(n_estimators=100,random_state=0, criterion="poisson")
# regressor.fit(x_train, y_train)

## Grid Search CV results

In [9]:

grid_result = grid_model.cv_results_
grid_result

{'mean_fit_time': array([0.3550777 , 0.68896823, 1.01514769, 1.36752033, 0.35080323,
        0.70046935, 0.98279538, 1.31504507, 0.43130498, 0.8189537 ,
        1.24246373, 1.66339698, 0.37734785, 0.66140189, 1.02577538,
        1.36684761, 0.33622556, 0.68671455, 0.99075732, 1.34182587,
        0.40606961, 0.85326791, 1.20033722, 1.68414469, 0.97192578,
        1.88587937, 2.79956374, 5.5122838 , 3.04517426, 9.95299025,
        5.59610391, 5.94390464, 2.36583443, 4.06112595, 5.22692947,
        5.91746459, 0.37698655, 0.74581108, 1.13314605, 1.5336947 ,
        0.39949937, 0.75592537, 1.20503545, 1.58166428, 0.51741152,
        1.00037756, 1.54232383, 1.88729672]),
 'std_fit_time': array([0.02549124, 0.03040152, 0.04082093, 0.0438171 , 0.03454048,
        0.04518128, 0.02364371, 0.02296889, 0.01978901, 0.02393073,
        0.03490939, 0.03569745, 0.04514306, 0.00849032, 0.03661812,
        0.06003984, 0.00959361, 0.01417591, 0.02190969, 0.0158798 ,
        0.01860857, 0.04407071, 0.026

In [10]:
table = pd.DataFrame.from_dict(grid_result)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.355078,0.025491,0.026996,0.000858,squared_error,sqrt,50,"{'criterion': 'squared_error', 'max_features':...",0.847968,0.824836,0.796556,0.832685,0.763699,0.813149,0.029837,28
1,0.688968,0.030402,0.044947,0.000856,squared_error,sqrt,100,"{'criterion': 'squared_error', 'max_features':...",0.845006,0.825579,0.804953,0.83663,0.769251,0.816284,0.027077,18
2,1.015148,0.040821,0.063559,0.001484,squared_error,sqrt,150,"{'criterion': 'squared_error', 'max_features':...",0.847729,0.822479,0.79891,0.835909,0.769349,0.814875,0.027954,24
3,1.36752,0.043817,0.099611,0.01954,squared_error,sqrt,200,"{'criterion': 'squared_error', 'max_features':...",0.844472,0.824911,0.802631,0.842336,0.771643,0.817199,0.027286,7
4,0.350803,0.03454,0.02691,0.001024,squared_error,log2,50,"{'criterion': 'squared_error', 'max_features':...",0.845137,0.823152,0.80153,0.832589,0.759616,0.812405,0.029997,30
5,0.700469,0.045181,0.053854,0.008962,squared_error,log2,100,"{'criterion': 'squared_error', 'max_features':...",0.842532,0.824609,0.808011,0.838617,0.77047,0.816848,0.026164,13
6,0.982795,0.023644,0.067919,0.005634,squared_error,log2,150,"{'criterion': 'squared_error', 'max_features':...",0.846597,0.828475,0.801815,0.838126,0.768437,0.81669,0.028434,15
7,1.315045,0.022969,0.082114,0.002004,squared_error,log2,200,"{'criterion': 'squared_error', 'max_features':...",0.848056,0.825743,0.804226,0.837654,0.77206,0.817548,0.027014,4
8,0.431305,0.019789,0.027952,0.002893,squared_error,,50,"{'criterion': 'squared_error', 'max_features':...",0.8428,0.815398,0.791532,0.817439,0.763763,0.806186,0.026708,44
9,0.818954,0.023931,0.047053,0.004516,squared_error,,100,"{'criterion': 'squared_error', 'max_features':...",0.847376,0.824383,0.79344,0.806903,0.76913,0.808246,0.026618,37


In [11]:
grid_model.best_params_

{'criterion': 'poisson', 'max_features': 'sqrt', 'n_estimators': 100}

In [12]:
grid_model.best_estimator_

## Model Accuracy

In [16]:
"""
parameter                                                                       Accuracy
{'criterion': 'poisson', 'max_features': 'sqrt', 'n_estimators': 100} -     0.8183011956899856   (best score using grid CV)
"""

grid_model.best_score_

np.float64(0.8183011956899856)

## save the model 

In [14]:
# import pickle 
# pickle.dump(regressor, open("insurance_charge_finalModel_RF_regressor.sav","wb"))

## predict the model 

In [15]:
# insurance_charge_predict = pickle.load(open("insurance_charge_finalModel_RF_regressor.sav","rb"))
grid_model.predict([[ 52, 30.200, 1, 1, 0 ]])



array([10663.4306254])