In [1]:
import pandas as pd
dataset=pd.read_csv("insurance_pre.csv")

In [2]:
#dataset.columns

In [3]:
#convert dataset as numerical data

dataset=pd.get_dummies(dataset, drop_first=True, dtype=int)

In [4]:
dataset.columns


Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [5]:
# split input and output data

independent=dataset[['age', 'bmi', 'children','sex_male', 'smoker_yes']]
dependent=dataset[['charges']]

In [6]:
independent, dependent

(      age     bmi  children  sex_male  smoker_yes
 0      19  27.900         0         0           1
 1      18  33.770         1         1           0
 2      28  33.000         3         1           0
 3      33  22.705         0         1           0
 4      32  28.880         0         1           0
 ...   ...     ...       ...       ...         ...
 1333   50  30.970         3         1           0
 1334   18  31.920         0         0           0
 1335   18  36.850         0         0           0
 1336   21  25.800         0         0           0
 1337   61  29.070         0         0           1
 
 [1338 rows x 5 columns],
           charges
 0     16884.92400
 1      1725.55230
 2      4449.46200
 3     21984.47061
 4      3866.85520
 ...           ...
 1333  10600.54830
 1334   2205.98080
 1335   1629.83350
 1336   2007.94500
 1337  29141.36030
 
 [1338 rows x 1 columns])

In [7]:
# preprocessing the data in a range

from sklearn.preprocessing import StandardScaler
SS= StandardScaler()
independent = SS.fit_transform(independent)


independent



array([[-1.43876426, -0.45332   , -0.90861367, -1.0105187 ,  1.97058663],
       [-1.50996545,  0.5096211 , -0.07876719,  0.98959079, -0.5074631 ],
       [-0.79795355,  0.38330685,  1.58092576,  0.98959079, -0.5074631 ],
       ...,
       [-1.50996545,  1.0148781 , -0.90861367, -1.0105187 , -0.5074631 ],
       [-1.29636188, -0.79781341, -0.90861367, -1.0105187 , -0.5074631 ],
       [ 1.55168573, -0.26138796, -0.90861367, -1.0105187 ,  1.97058663]])

In [8]:
# model creation using GridserchCV and DeciosnTreeRegressor modules

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

param_grid={ 'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
             'max_features': ['sqrt', 'log2'],
             'splitter': ['best','random'] 
            }
grid=GridSearchCV(DecisionTreeRegressor(),param_grid, refit=True, n_jobs=-1,verbose=3)


In [9]:
#fit the grid

grid.fit(independent,dependent)
print("R_score value for best parameter      = {}".format(grid.best_params_))
results=grid.cv_results_

Fitting 5 folds for each of 16 candidates, totalling 80 fits
R_score value for best parameter      = {'criterion': 'absolute_error', 'max_features': 'log2', 'splitter': 'random'}


In [10]:
table=pd.DataFrame.from_dict(results)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.00501,0.00307,0.001289,0.000454,squared_error,sqrt,best,"{'criterion': 'squared_error', 'max_features':...",0.75181,0.631342,0.695188,0.76165,0.694754,0.706949,0.046922,3
1,0.001809,0.000405,0.000663,0.000267,squared_error,sqrt,random,"{'criterion': 'squared_error', 'max_features':...",0.693808,0.545371,0.65749,0.65901,0.695209,0.650178,0.054859,10
2,0.003361,0.002355,0.000965,0.000414,squared_error,log2,best,"{'criterion': 'squared_error', 'max_features':...",0.750914,0.578394,0.748732,0.706042,0.71006,0.698829,0.063065,5
3,0.001894,0.000468,0.0008,0.000224,squared_error,log2,random,"{'criterion': 'squared_error', 'max_features':...",0.707674,0.571544,0.685584,0.68073,0.703434,0.669793,0.050177,8
4,0.002762,0.000253,0.001022,0.000131,friedman_mse,sqrt,best,"{'criterion': 'friedman_mse', 'max_features': ...",0.740679,0.673811,0.697655,0.729863,0.661206,0.700643,0.030793,4
5,0.001836,0.000647,0.001044,0.000845,friedman_mse,sqrt,random,"{'criterion': 'friedman_mse', 'max_features': ...",0.576775,0.522157,0.662697,0.650773,0.676486,0.617778,0.058954,16
6,0.003076,0.001312,0.000737,0.000294,friedman_mse,log2,best,"{'criterion': 'friedman_mse', 'max_features': ...",0.684203,0.589973,0.75609,0.671563,0.723816,0.685129,0.056163,7
7,0.001385,0.000475,0.000409,7e-05,friedman_mse,log2,random,"{'criterion': 'friedman_mse', 'max_features': ...",0.736472,0.576902,0.62531,0.692407,0.655614,0.657341,0.054755,9
8,0.011821,0.002671,0.000798,0.000435,absolute_error,sqrt,best,"{'criterion': 'absolute_error', 'max_features'...",0.729445,0.573902,0.751773,0.740868,0.777639,0.714725,0.072197,2
9,0.010532,0.00097,0.000433,5e-05,absolute_error,sqrt,random,"{'criterion': 'absolute_error', 'max_features'...",0.574342,0.683088,0.745696,0.674632,0.450502,0.625652,0.103362,15


In [11]:
pred=grid.predict(independent)

In [14]:
#finding r2Score value
from sklearn.metrics import r2_score
r2_score= r2_score(pred,dependent)
r2_score

0.9648176451991102

In [15]:
# save model
import pickle
pickle.dump(grid,open("model_grid_decision_tree_Regression.sav",'wb'))

In [16]:
loaded_model=pickle.load(open("model_grid_decision_tree_Regression.sav",'rb'))

In [17]:
#scale your input before predict

input=SS.transform([[19,27.900,0,1,0]])
input




array([[-1.43876426, -0.45332   , -0.90861367,  0.98959079, -0.5074631 ]])

In [21]:
#model predition with the scaled input..

loaded_model.predict(input)
#print ("future prediction {}".format(prediction))

array([1635.73365])