# RandomForest with HyperTuning GridSearchCV concept:

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [12]:
data = pd.read_csv("insurance_pre.csv")
data.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.9,0,yes,16884.924
1,18,male,33.77,1,no,1725.5523
2,28,male,33.0,3,no,4449.462
3,33,male,22.705,0,no,21984.47061
4,32,male,28.88,0,no,3866.8552


In [14]:
df = pd.get_dummies(data,drop_first = True,dtype=int)

In [16]:
independent = df[['age', 'bmi', 'children','sex_male', 'smoker_yes']]

In [18]:
dependent = df[["charges"]]

In [20]:
independent.head(5)


Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.9,0,0,1
1,18,33.77,1,1,0
2,28,33.0,3,1,0
3,33,22.705,0,1,0
4,32,28.88,0,1,0


In [28]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(independent,dependent,test_size=0.30,random_state=10)

In [30]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [32]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

param_grid = {'criterion':['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
'max_features': ['sqrt','log2'],
'n_estimators':[10,50,100,250,500]}

In [34]:
grid = GridSearchCV(RandomForestRegressor(),param_grid,refit=True,verbose=3,n_jobs=-1)
grid.fit(X_train,y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


In [42]:
print(grid.best_params_)
#For deployment phase - create pkl file with best_model name
best_model = grid.best_estimator_ 

{'criterion': 'poisson', 'max_features': 'sqrt', 'n_estimators': 500}


In [44]:
re=grid.cv_results_
table = pd.DataFrame.from_dict(re)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.041382,0.015989,0.002269,0.001742,squared_error,sqrt,10,"{'criterion': 'squared_error', 'max_features':...",0.760192,0.855876,0.845472,0.778331,0.817205,0.811415,0.037117,40
1,0.266448,0.056851,0.013416,0.002432,squared_error,sqrt,50,"{'criterion': 'squared_error', 'max_features':...",0.788343,0.865792,0.872406,0.817197,0.833934,0.835534,0.031116,28
2,0.628309,0.033733,0.01733,0.010027,squared_error,sqrt,100,"{'criterion': 'squared_error', 'max_features':...",0.790187,0.868935,0.8665,0.819741,0.843996,0.837872,0.029747,17
3,1.714304,0.068688,0.058383,0.005099,squared_error,sqrt,250,"{'criterion': 'squared_error', 'max_features':...",0.790982,0.870215,0.878291,0.819909,0.837904,0.83946,0.032213,4
4,3.438357,0.061944,0.115587,0.007592,squared_error,sqrt,500,"{'criterion': 'squared_error', 'max_features':...",0.788742,0.869612,0.878032,0.819207,0.83831,0.838781,0.032798,12
5,0.072544,0.002998,0.0095,0.005038,squared_error,log2,10,"{'criterion': 'squared_error', 'max_features':...",0.762622,0.857567,0.852342,0.802561,0.839393,0.822897,0.035743,36
6,0.348358,0.009485,0.020601,0.007224,squared_error,log2,50,"{'criterion': 'squared_error', 'max_features':...",0.784348,0.866412,0.873527,0.818516,0.837931,0.836147,0.03259,25
7,0.662397,0.028168,0.025415,0.003549,squared_error,log2,100,"{'criterion': 'squared_error', 'max_features':...",0.786845,0.864535,0.875108,0.81796,0.836993,0.836288,0.031893,24
8,1.547103,0.033819,0.055772,0.004073,squared_error,log2,250,"{'criterion': 'squared_error', 'max_features':...",0.787745,0.8701,0.875889,0.820254,0.842399,0.839277,0.032612,9
9,3.317966,0.051635,0.110482,0.008408,squared_error,log2,500,"{'criterion': 'squared_error', 'max_features':...",0.787769,0.869484,0.874003,0.820714,0.842051,0.838804,0.032008,11


In [46]:
y_pred = grid.predict(X_test)

In [48]:
from sklearn.metrics import r2_score
r2score = r2_score(y_test,y_pred)
r2score

0.8370605981739486

In [50]:
age_input=float(input("Age:"))
bmi_input=float(input("BMI:"))
children_input=float(input("Children:"))
sex_male_input=int(input("Sex Male 0 or 1:"))
smoker_yes_input=int(input("Smoker Yes 0 or 1:"))


Age: 12
BMI: 21
Children: 1
Sex Male 0 or 1: 1
Smoker Yes 0 or 1: 1


In [52]:
new_data = [[age_input, bmi_input, children_input, sex_male_input, smoker_yes_input]]
# IMPORTANT: Use transform(), NOT fit_transform()
new_data_scaled = sc.transform(new_data)  
print(new_data_scaled)

[[-1.91457909 -1.58182317 -0.05768565  0.99573559  1.96207567]]


In [54]:
# Make prediction using trained model
Future_Prediction = grid.predict(new_data_scaled)

print("Future_Prediction={}".format(Future_Prediction))

Future_Prediction=[14873.02766688]


## Save the model using PKL file:

In [57]:
import pickle
filename = "rf_grid.sav"

In [63]:
#model creation
pickle.dump(best_model,open(filename,'wb'))

In [65]:
#for pre-process
pickle.dump(sc,open("preprocess_scaler.sav",'wb'))