# In ML regression, if the R-value is good, preprocessing may not be required. If the R-value is poor, we apply input and output preprocessing techniques to improve performance.

In [1]:
#importing the Libraies
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Reading the Dataset
dataset = pd.read_csv('insurance_pre.csv')

In [3]:
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [4]:
dataset=pd.get_dummies(dataset,drop_first=True)

In [5]:
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,False,True
1,18,33.770,1,1725.55230,True,False
2,28,33.000,3,4449.46200,True,False
3,33,22.705,0,21984.47061,True,False
4,32,28.880,0,3866.85520,True,False
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,True,False
1334,18,31.920,0,2205.98080,False,False
1335,18,36.850,0,1629.83350,False,False
1336,21,25.800,0,2007.94500,False,False


In [6]:
indep=dataset[['age', 'bmi', 'children','sex_male', 'smoker_yes']]
dep=dataset[['charges']]

In [7]:
#split into training set and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(indep, dep, test_size = 1/3, random_state = 0)


from sklearn.preprocessing import StandardScaler
#sc = StandardScaler()
#X_train = sc.fit_transform(X_train)
#X_test = sc.transform(X_test)

In [8]:
from sklearn.ensemble import RandomForestRegressor

In [9]:
from sklearn.model_selection import GridSearchCV
#from sklearn.tree import Random Forest Regressor
param_grid = {'criterion':['squared_error','absolute_error'],
              'max_features': ['sqrt','log2',None],
              'n_estimators':[10,100]} 



grid = GridSearchCV(RandomForestRegressor(), param_grid, refit = True, verbose = 3,n_jobs=-1) 
   
# fitting the model for grid search 
grid.fit(X_train, y_train) 

Fitting 5 folds for each of 12 candidates, totalling 60 fits


  return fit_method(estimator, *args, **kwargs)


In [10]:
# print best parameter after tuning 
#print(grid.best_params_) 
re=grid.cv_results_
#print(re)
grid_predictions = grid.predict(X_test) 
   
# print classification report 
from sklearn.metrics import r2_score
r_score=r2_score(y_test,grid_predictions)

print("The R_score value for best parameter {}:".format(grid.best_params_),r_score)

The R_score value for best parameter {'criterion': 'absolute_error', 'max_features': 'log2', 'n_estimators': 100}: 0.8748660762632172


In [11]:
table=pd.DataFrame.from_dict(re)

In [12]:
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.082791,0.004314,0.014337,0.001659,squared_error,sqrt,10,"{'criterion': 'squared_error', 'max_features':...",0.772547,0.756851,0.806701,0.788908,0.742451,0.773492,0.022724,10
1,0.594601,0.023324,0.03716,0.000959,squared_error,sqrt,100,"{'criterion': 'squared_error', 'max_features':...",0.80094,0.769358,0.83949,0.837727,0.756958,0.800895,0.033974,3
2,0.075066,0.005848,0.010837,0.001156,squared_error,log2,10,"{'criterion': 'squared_error', 'max_features':...",0.793438,0.777656,0.818741,0.784749,0.737418,0.7824,0.026434,8
3,0.534728,0.026812,0.037696,0.001508,squared_error,log2,100,"{'criterion': 'squared_error', 'max_features':...",0.804511,0.765401,0.841337,0.834507,0.758961,0.800943,0.034049,2
4,0.069865,0.003093,0.009604,0.000543,squared_error,,10,"{'criterion': 'squared_error', 'max_features':...",0.790131,0.71485,0.80981,0.77759,0.755519,0.76958,0.032551,11
5,0.613408,0.014022,0.035271,0.00234,squared_error,,100,"{'criterion': 'squared_error', 'max_features':...",0.802845,0.743847,0.831188,0.800725,0.764759,0.788673,0.030773,7
6,0.140111,0.014706,0.009067,0.000547,absolute_error,sqrt,10,"{'criterion': 'absolute_error', 'max_features'...",0.776196,0.749803,0.799204,0.792445,0.77299,0.778128,0.017214,9
7,1.290023,0.035619,0.032936,0.003344,absolute_error,sqrt,100,"{'criterion': 'absolute_error', 'max_features'...",0.797943,0.768298,0.839449,0.825676,0.761268,0.798527,0.030705,4
8,0.142541,0.010549,0.009174,0.001229,absolute_error,log2,10,"{'criterion': 'absolute_error', 'max_features'...",0.794394,0.763294,0.824276,0.823656,0.754137,0.791951,0.02935,5
9,1.208416,0.065301,0.026872,0.001782,absolute_error,log2,100,"{'criterion': 'absolute_error', 'max_features'...",0.809982,0.772955,0.8403,0.826224,0.764245,0.802741,0.02961,1


In [13]:
age_input=float(input("Age:"))
bmi_input=float(input("BMI:"))
children_input=float(input("Children:"))
sex_male_input=int(input("Sex Male 0 or 1:"))
smoker_yes_input=int(input("Smoker Yes 0 or 1:"))

Age: 21
BMI: 25.800
Children: 0
Sex Male 0 or 1: 0
Smoker Yes 0 or 1: 0


In [14]:
Future_Prediction=grid.predict([[age_input,bmi_input,children_input,sex_male_input,smoker_yes_input]])# change the paramter,play with it.
print("Future_Prediction={}".format(Future_Prediction))

Future_Prediction=[2315.6743495]




In [None]:
table=pd.DataFrame.from_dict(re)
best_test_score = table['mean_test_score'].max()
best_test_score

In [None]:
table