In [1]:
import pandas as pd
dataset=pd.read_csv("insurance_pre.csv")

In [2]:
#convert dataset as numerical data

dataset=pd.get_dummies(dataset, drop_first=True, dtype=int)
#dataset.columns

In [3]:
# split input and output data

independent=dataset[['age', 'bmi', 'children','sex_male', 'smoker_yes']]
dependent=dataset[['charges']]
independent, dependent

(      age     bmi  children  sex_male  smoker_yes
 0      19  27.900         0         0           1
 1      18  33.770         1         1           0
 2      28  33.000         3         1           0
 3      33  22.705         0         1           0
 4      32  28.880         0         1           0
 ...   ...     ...       ...       ...         ...
 1333   50  30.970         3         1           0
 1334   18  31.920         0         0           0
 1335   18  36.850         0         0           0
 1336   21  25.800         0         0           0
 1337   61  29.070         0         0           1
 
 [1338 rows x 5 columns],
           charges
 0     16884.92400
 1      1725.55230
 2      4449.46200
 3     21984.47061
 4      3866.85520
 ...           ...
 1333  10600.54830
 1334   2205.98080
 1335   1629.83350
 1336   2007.94500
 1337  29141.36030
 
 [1338 rows x 1 columns])

In [6]:
#Split the data for train and test data
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(independent,dependent,test_size=.3, random_state=0)

#x_train,x_test,y_train,y_test

(      age     bmi  children  sex_male  smoker_yes
 1163   18  28.215         0         0           0
 196    39  32.800         0         0           0
 438    52  46.750         5         0           0
 183    44  26.410         0         0           0
 1298   33  27.455         2         1           0
 ...   ...     ...       ...       ...         ...
 763    27  26.030         0         1           0
 835    42  35.970         2         1           0
 1216   40  25.080         0         1           0
 559    19  35.530         0         1           0
 684    33  18.500         1         0           0
 
 [936 rows x 5 columns],
       age     bmi  children  sex_male  smoker_yes
 578    52  30.200         1         1           0
 610    47  29.370         1         0           0
 569    48  40.565         2         1           1
 1034   61  38.380         0         1           0
 198    51  18.050         0         0           0
 ...   ...     ...       ...       ...         ...
 126

In [7]:
# preprocessing the input data in a range

from sklearn.preprocessing import StandardScaler
SS= StandardScaler()
x_train = SS.fit_transform(x_train)

#x_train

array([[-1.5330973 , -0.40713453, -0.89833872, -0.97676557, -0.50466988],
       [-0.03364163,  0.32855417, -0.89833872, -0.97676557, -0.50466988],
       [ 0.89459283,  2.56690911,  3.25603402, -0.97676557, -0.50466988],
       ...,
       [ 0.03776102, -0.91016269, -0.89833872,  1.02378711, -0.50466988],
       [-1.46169465,  0.76659782, -0.89833872,  1.02378711, -0.50466988],
       [-0.46205754, -1.96596021, -0.06746417, -0.97676557, -0.50466988]])

In [9]:
# model creation using GridserchCV and DeciosnTreeRegressor modules

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

param_grid={ 'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
             'max_features': ['sqrt', 'log2'],
             'splitter': ['best','random'] 
            }
grid=GridSearchCV(DecisionTreeRegressor(),param_grid, refit=True, n_jobs=-1,verbose=3)


In [10]:
#fit the grid

grid.fit(x_train,y_train)
print("R_score value for best parameter      = {}".format(grid.best_params_))
results=grid.cv_results_

Fitting 5 folds for each of 16 candidates, totalling 80 fits
R_score value for best parameter      = {'criterion': 'squared_error', 'max_features': 'log2', 'splitter': 'best'}


In [11]:
table=pd.DataFrame.from_dict(results)
#table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003906,0.00406,0.000759,0.00045,squared_error,sqrt,best,"{'criterion': 'squared_error', 'max_features':...",0.666042,0.546668,0.574176,0.555154,0.638511,0.59611,0.047521,9
1,0.001989,0.001137,0.000756,0.00066,squared_error,sqrt,random,"{'criterion': 'squared_error', 'max_features':...",0.661586,0.622552,0.599111,0.603472,0.544381,0.60622,0.037985,8
2,0.002588,0.000635,0.000947,3.9e-05,squared_error,log2,best,"{'criterion': 'squared_error', 'max_features':...",0.74099,0.707906,0.679402,0.671759,0.666885,0.693389,0.027722,1
3,0.001837,0.000707,0.000637,0.000267,squared_error,log2,random,"{'criterion': 'squared_error', 'max_features':...",0.691168,0.527636,0.562651,0.521562,0.603431,0.58129,0.062221,12
4,0.001754,0.000465,0.001104,0.000978,friedman_mse,sqrt,best,"{'criterion': 'friedman_mse', 'max_features': ...",0.592903,0.402874,0.638318,0.677312,0.533858,0.569053,0.095855,14
5,0.001778,0.001568,0.000392,0.000103,friedman_mse,sqrt,random,"{'criterion': 'friedman_mse', 'max_features': ...",0.729119,0.600049,0.63758,0.628633,0.633444,0.645765,0.043704,3
6,0.001391,0.00037,0.000443,0.000215,friedman_mse,log2,best,"{'criterion': 'friedman_mse', 'max_features': ...",0.410425,0.614223,0.68972,0.425558,0.608472,0.549679,0.111384,15
7,0.001715,0.000592,0.000904,0.000414,friedman_mse,log2,random,"{'criterion': 'friedman_mse', 'max_features': ...",0.573028,0.626766,0.40196,0.515362,0.584034,0.54023,0.077743,16
8,0.009008,0.003132,0.000811,0.000612,absolute_error,sqrt,best,"{'criterion': 'absolute_error', 'max_features'...",0.751371,0.59364,0.614981,0.559288,0.654274,0.634711,0.065965,4
9,0.005286,0.002016,0.000438,0.000167,absolute_error,sqrt,random,"{'criterion': 'absolute_error', 'max_features'...",0.555901,0.553483,0.554294,0.634707,0.612513,0.58218,0.034557,11


In [12]:
y_pred=grid.predict(x_test)



In [13]:
#finding r2Score value
from sklearn.metrics import r2_score
r2_score= r2_score(y_test,y_pred)
#r2_score

-2.060311001771934

In [None]:
# save model
#import pickle
#pickle.dump(grid,open("model_grid_decision_tree_Regression.sav",'wb'))

In [None]:
#loaded_model=pickle.load(open("model_grid_decision_tree_Regression.sav",'rb'))

In [14]:
#scale your input before predict

input=SS.transform([[19,27.900,0,1,0]])
input




array([[-1.46169465, -0.45767803, -0.89833872,  1.02378711, -0.50466988]])

In [16]:
#model predition with the scaled input..

grid.predict(input)
print ("future prediction {}".format(prediction))

NameError: name 'prediction' is not defined