In [1]:
import pandas as pd
dataset=pd.read_csv("insurance_pre.csv")

In [2]:
#dataset.columns

In [3]:
#convert dataset as numerical data

dataset=pd.get_dummies(dataset, drop_first=True, dtype=int)

In [4]:
dataset.columns


Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [5]:
# split input and output data

independent=dataset[['age', 'bmi', 'children','sex_male', 'smoker_yes']]
dependent=dataset[['charges']]

In [6]:
independent, dependent

(      age     bmi  children  sex_male  smoker_yes
 0      19  27.900         0         0           1
 1      18  33.770         1         1           0
 2      28  33.000         3         1           0
 3      33  22.705         0         1           0
 4      32  28.880         0         1           0
 ...   ...     ...       ...       ...         ...
 1333   50  30.970         3         1           0
 1334   18  31.920         0         0           0
 1335   18  36.850         0         0           0
 1336   21  25.800         0         0           0
 1337   61  29.070         0         0           1
 
 [1338 rows x 5 columns],
           charges
 0     16884.92400
 1      1725.55230
 2      4449.46200
 3     21984.47061
 4      3866.85520
 ...           ...
 1333  10600.54830
 1334   2205.98080
 1335   1629.83350
 1336   2007.94500
 1337  29141.36030
 
 [1338 rows x 1 columns])

In [7]:
# preprocessing the data in a range

from sklearn.preprocessing import StandardScaler
SS= StandardScaler()
independent = SS.fit_transform(independent)
#dependent = SS.fit_transform(dependent)

independent



array([[-1.43876426, -0.45332   , -0.90861367, -1.0105187 ,  1.97058663],
       [-1.50996545,  0.5096211 , -0.07876719,  0.98959079, -0.5074631 ],
       [-0.79795355,  0.38330685,  1.58092576,  0.98959079, -0.5074631 ],
       ...,
       [-1.50996545,  1.0148781 , -0.90861367, -1.0105187 , -0.5074631 ],
       [-1.29636188, -0.79781341, -0.90861367, -1.0105187 , -0.5074631 ],
       [ 1.55168573, -0.26138796, -0.90861367, -1.0105187 ,  1.97058663]])

In [8]:
# model creation using GridserchCV and DeciosnTreeRegressor modules

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

param_grid={ 'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
             'max_features': ['sqrt', 'log2'],
             'splitter': ['best','random'] 
            }
grid=GridSearchCV(DecisionTreeRegressor(),param_grid, refit=True, n_jobs=-1,verbose=3)


In [9]:
#fit the grid
grid.fit(independent,dependent)
results=grid.cv_results_
print("R_score value for best parameter      = {}".format(grid.best_params_))

Fitting 5 folds for each of 16 candidates, totalling 80 fits
R_score value for best parameter      = {'criterion': 'absolute_error', 'max_features': 'sqrt', 'splitter': 'best'}


In [10]:
table=pd.DataFrame.from_dict(results)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001912,0.000196,0.000652,0.000371,squared_error,sqrt,best,"{'criterion': 'squared_error', 'max_features':...",0.792373,0.626247,0.624492,0.695076,0.688207,0.685279,0.061243,3
1,0.00252,0.001199,0.000732,0.000336,squared_error,sqrt,random,"{'criterion': 'squared_error', 'max_features':...",0.685132,0.578212,0.668565,0.661792,0.599561,0.638653,0.041885,12
2,0.002701,0.000281,0.000985,8.7e-05,squared_error,log2,best,"{'criterion': 'squared_error', 'max_features':...",0.779128,0.624315,0.734466,0.653699,0.740039,0.706329,0.057834,2
3,0.002114,6.8e-05,0.001116,0.000319,squared_error,log2,random,"{'criterion': 'squared_error', 'max_features':...",0.657143,0.573697,0.645712,0.598952,0.670382,0.629177,0.036729,14
4,0.002403,0.000956,0.000664,0.000436,friedman_mse,sqrt,best,"{'criterion': 'friedman_mse', 'max_features': ...",0.64184,0.636491,0.443434,0.696118,0.630662,0.609709,0.086374,16
5,0.002294,0.000834,0.001146,0.000497,friedman_mse,sqrt,random,"{'criterion': 'friedman_mse', 'max_features': ...",0.643025,0.585688,0.70219,0.639624,0.640372,0.64218,0.036886,11
6,0.002696,0.000495,0.000597,0.000165,friedman_mse,log2,best,"{'criterion': 'friedman_mse', 'max_features': ...",0.602817,0.613369,0.712083,0.72853,0.728379,0.677036,0.056706,5
7,0.001677,0.000354,0.000551,0.000291,friedman_mse,log2,random,"{'criterion': 'friedman_mse', 'max_features': ...",0.699448,0.571444,0.717789,0.634902,0.633593,0.651435,0.052342,9
8,0.012019,0.002448,0.000659,0.000211,absolute_error,sqrt,best,"{'criterion': 'absolute_error', 'max_features'...",0.760528,0.644542,0.715419,0.71505,0.751948,0.717497,0.040918,1
9,0.010255,0.001493,0.000686,0.000239,absolute_error,sqrt,random,"{'criterion': 'absolute_error', 'max_features'...",0.565653,0.468738,0.68681,0.678426,0.709333,0.621792,0.091296,15


In [11]:
# save model
import pickle
pickle.dump(grid,open("model_grid_decision_tree_Regression.sav",'wb'))

In [12]:
loaded_model=pickle.load(open("model_grid_decision_tree_Regression.sav",'rb'))

In [13]:
prediction=loaded_model.predict([[21,28.900,0,0,1]])
print ("future prediction {}".format(prediction))

future prediction [63770.42801]
[CV 2/5] END criterion=squared_error, max_features=sqrt, splitter=best;, score=0.626 total time=   0.0s
[CV 2/5] END criterion=friedman_mse, max_features=log2, splitter=random;, score=0.571 total time=   0.0s
[CV 1/5] END criterion=absolute_error, max_features=sqrt, splitter=best;, score=0.761 total time=   0.0s
[CV 2/5] END criterion=absolute_error, max_features=sqrt, splitter=best;, score=0.645 total time=   0.0s
[CV 2/5] END criterion=absolute_error, max_features=log2, splitter=random;, score=0.622 total time=   0.0s
[CV 3/5] END criterion=absolute_error, max_features=log2, splitter=random;, score=0.746 total time=   0.0s
[CV 1/5] END criterion=poisson, max_features=log2, splitter=random;, score=0.692 total time=   0.0s
[CV 3/5] END criterion=squared_error, max_features=sqrt, splitter=random;, score=0.669 total time=   0.0s
[CV 3/5] END criterion=friedman_mse, max_features=sqrt, splitter=random;, score=0.702 total time=   0.0s
[CV 5/5] END criterion=f