# Decision Tree Algorithm - insurance_charge_prediction
# using Grid Search CV 
## MODEL CREATION PHASE

## read the dataset

In [1]:
import pandas as pd 
dataset = pd.read_csv("insurance_pre.csv")
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


## convert categorical data into numerical data

In [2]:
dataset = pd.get_dummies(dataset,drop_first=True,dtype=int)
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


## split Input and Output

In [3]:
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [4]:
independent = dataset[['age', 'bmi', 'children',  'sex_male', 'smoker_yes' ]]
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [5]:
dependent = dataset[['charges']]
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


## model creation 

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

"""
hyper parameters:
criterion   {“squared_error”, “friedman_mse”, “absolute_error”, “poisson”}, default=”squared_error”
splitter   {“best”, “random”}, default=”best”
max_features  int, float or {“sqrt”, “log2”}, default=None

"""
# parameter grid 
param_grid = { 
    "criterion":["squared_error","friedman_mse","absolute_error","poisson"],
    "splitter": ["best","random"],
    "max_features" : ["sqrt","log2"]
}
# model
model = DecisionTreeRegressor()
# model creation 
grid_model = GridSearchCV(model, param_grid, refit=True, verbose=3,n_jobs=-1)
grid_model.fit(independent,dependent)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [6]:
"""
criterion   {“squared_error”, “friedman_mse”, “absolute_error”, “poisson”}, default=”squared_error”
splitter   {“best”, “random”}, default=”best”
max_features  int, float or {“sqrt”, “log2”}, default=None
"""
# from sklearn.tree import DecisionTreeRegressor
# regressor = DecisionTreeRegressor(criterion="absolute_error",splitter="best",random_state=0, max_features="sqrt")
# regressor.fit(x_train,y_train)

'\ncriterion   {“squared_error”, “friedman_mse”, “absolute_error”, “poisson”}, default=”squared_error”\nsplitter   {“best”, “random”}, default=”best”\nmax_features  int, float or {“sqrt”, “log2”}, default=None\n'

In [225]:
from sklearn.metrics import r2_score
r_score = r2_score(y_test, y_predict)
print(f" THE ACCURACY OF THE MODEL : {r_score}")    # THE ACCURACY OF THE MODEL : 0.7538062098354589   --- using criterion="absolute_error",splitter="random"

 THE ACCURACY OF THE MODEL : 0.7885524275111394


## Grid Search CV results 

In [20]:

grid_result = grid_model.cv_results_
grid_result

{'mean_fit_time': array([0.01874223, 0.01434193, 0.02010288, 0.01275072, 0.01477327,
        0.0137867 , 0.01460142, 0.01516838, 0.04044166, 0.02992067,
        0.03972483, 0.04482899, 0.03438993, 0.02205682, 0.02179179,
        0.02221384]),
 'std_fit_time': array([0.00146159, 0.00161645, 0.00407756, 0.00124583, 0.00062465,
        0.00238664, 0.00100545, 0.00478682, 0.00338054, 0.00187969,
        0.00391617, 0.01792501, 0.01945843, 0.00349606, 0.00128593,
        0.0052499 ]),
 'mean_score_time': array([0.01141434, 0.01224799, 0.01059256, 0.00848799, 0.00831246,
        0.0087966 , 0.00865192, 0.00783911, 0.00793343, 0.00824804,
        0.00805812, 0.0111516 , 0.01575646, 0.01186495, 0.0113513 ,
        0.01125369]),
 'std_score_time': array([0.00348599, 0.0026029 , 0.00303932, 0.00058068, 0.00071212,
        0.00122505, 0.00051487, 0.00020867, 0.00044352, 0.00050596,
        0.00029121, 0.00388063, 0.00245862, 0.0015272 , 0.00058215,
        0.00141601]),
 'param_criterion': masked

In [21]:
table = pd.DataFrame.from_dict(grid_result)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.018742,0.001462,0.011414,0.003486,squared_error,sqrt,best,"{'criterion': 'squared_error', 'max_features':...",0.631049,0.582996,0.665566,0.696654,0.579709,0.631195,0.045695,13
1,0.014342,0.001616,0.012248,0.002603,squared_error,sqrt,random,"{'criterion': 'squared_error', 'max_features':...",0.664159,0.528489,0.730067,0.688607,0.737881,0.669841,0.075659,4
2,0.020103,0.004078,0.010593,0.003039,squared_error,log2,best,"{'criterion': 'squared_error', 'max_features':...",0.66904,0.601357,0.663942,0.696479,0.392047,0.604573,0.110729,16
3,0.012751,0.001246,0.008488,0.000581,squared_error,log2,random,"{'criterion': 'squared_error', 'max_features':...",0.692649,0.516758,0.680601,0.735977,0.602309,0.645659,0.077564,11
4,0.014773,0.000625,0.008312,0.000712,friedman_mse,sqrt,best,"{'criterion': 'friedman_mse', 'max_features': ...",0.712757,0.621993,0.740072,0.781941,0.740712,0.719495,0.053526,1
5,0.013787,0.002387,0.008797,0.001225,friedman_mse,sqrt,random,"{'criterion': 'friedman_mse', 'max_features': ...",0.608379,0.617276,0.619081,0.718752,0.65224,0.643146,0.040635,12
6,0.014601,0.001005,0.008652,0.000515,friedman_mse,log2,best,"{'criterion': 'friedman_mse', 'max_features': ...",0.700011,0.642219,0.767435,0.701235,0.722365,0.706653,0.040414,2
7,0.015168,0.004787,0.007839,0.000209,friedman_mse,log2,random,"{'criterion': 'friedman_mse', 'max_features': ...",0.679121,0.592562,0.784833,0.680768,0.641914,0.675839,0.063247,3
8,0.040442,0.003381,0.007933,0.000444,absolute_error,sqrt,best,"{'criterion': 'absolute_error', 'max_features'...",0.708119,0.584803,0.488231,0.734545,0.578903,0.61892,0.090742,15
9,0.029921,0.00188,0.008248,0.000506,absolute_error,sqrt,random,"{'criterion': 'absolute_error', 'max_features'...",0.59009,0.687202,0.682487,0.643858,0.678169,0.656361,0.03649,7


In [22]:
grid_model.best_params_

{'criterion': 'friedman_mse', 'max_features': 'sqrt', 'splitter': 'best'}

In [23]:
grid_model.best_estimator_

## Model Accuracy 

In [24]:
"""
parameter                                                                       Accuracy
{'criterion': 'friedman_mse', 'max_features': 'sqrt', 'splitter': 'best'} -     0.7194952320691046   (best score using grid CV)
"""
grid_model.best_score_

np.float64(0.7194952320691046)

## save the model 

In [25]:
# import pickle 
# pickle.dump(regressor, open(" ","wb"))

In [26]:
grid_model.predict([[ 52, 30.200, 1, 1, 0 ]])



array([9724.53])