- Step 1. Load Dataset

In [1]:
# Dataset: Medical Cost Personal Datasets (https://www.kaggle.com/mirichoi0218/insurance/data)
import pandas as pd

insurance = pd.read_csv('./datasets/insurance/insurance.csv')

# Feature matrix: every column except sex, smoker, region and the target column.
dropped_columns = ['sex', 'smoker', 'region', 'charges']
X = insurance.drop(dropped_columns, axis=1).values

# Regression target: the `charges` column.
y = insurance['charges'].values

In [2]:
# Preprocessing: fill missing values with the column median, then standardize each feature.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and removed in 0.22;
# `sklearn.impute.SimpleImputer` is its replacement and imputes column-wise.
from sklearn.impute import SimpleImputer
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])
# NOTE(review): the pipeline is fitted on every row of X, i.e. before any train/test
# split, so held-out rows influence the fitted medians/means. Consider fitting on the
# training rows only and calling `transform` on the test rows.
X_prepared = num_pipeline.fit_transform(X)  # preprocessed feature matrix



In [3]:
# Split the prepared data into a training set (80%) and a test set (20%).
import numpy as np
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_prepared,
    y,
    test_size=0.2,
    random_state=42,
)

- Step 2. Train decision tree regression models using the dataset.

In [4]:
# Use a single feature only: age (column 0 of X_train).
from sklearn.tree import DecisionTreeRegressor

X_1 = X_train[:, 0].reshape(-1, 1)  # age as an (n_samples, 1) column

# One regressor per candidate depth (2, 3, 4), all with the same random seed.
dt_reg_2, dt_reg_3, dt_reg_4 = (
    DecisionTreeRegressor(max_depth=depth, random_state=42) for depth in (2, 3, 4)
)

dt_reg_2.fit(X_1, y_train)
dt_reg_3.fit(X_1, y_train)
# Kept as the last expression so the notebook displays the fitted estimator.
dt_reg_4.fit(X_1, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=42, splitter='best')

In [5]:
# Case 1: compare the predictions of the three trees on the same data.
# Example inputs: column 0 of the test set, kept two-dimensional as (n_samples, 1).
some_data = X_test[:, :1]
# Actual label of test sample 13: 46151.1245
some_labels = y_test[13]

In [6]:
# Predict test sample 13 with each fitted tree and print the results side by side.
sample = [some_data[13]]
for template, reg in (
    ("depth2 prediction : {}", dt_reg_2),
    ("depth3 prediction : {}", dt_reg_3),
    ("depth4 prediction : {}", dt_reg_4),
):
    print(template.format(reg.predict(sample)))

depth2 prediction : [15678.70375214]
depth3 prediction : [15559.41588868]
depth4 prediction : [14945.46669901]


In [7]:
# Compare the trees by 10-fold cross-validated RMSE on the training data.
from sklearn.model_selection import cross_val_score

# Settings shared by every cross-validation run below.
cv_kwargs = dict(scoring="neg_mean_squared_error", cv=10)

dep2_mse = cross_val_score(dt_reg_2, X_1, y_train, **cv_kwargs)
dep3_mse = cross_val_score(dt_reg_3, X_1, y_train, **cv_kwargs)
dep4_mse = cross_val_score(dt_reg_4, X_1, y_train, **cv_kwargs)

# The scorer returns negated MSE values, so flip the sign before taking the root.
dep2_rmse, dep3_rmse, dep4_rmse = (
    np.sqrt(-neg_mse) for neg_mse in (dep2_mse, dep3_mse, dep4_mse)
)

In [8]:
# Print the mean RMSE across folds for each tree depth.
print('RMSE 비교')
for template, fold_rmse in (
    ('depth2 : {}', dep2_rmse),
    ('depth3 : {}', dep3_rmse),
    ('depth4 : {}', dep4_rmse),
):
    print(template.format(fold_rmse.mean()))

RMSE 비교
depth2 : 11515.884960248783
depth3 : 11581.021680743117
depth4 : 11628.530681730392


- Step 3. Find the best hyperparameter for the picked model.

In [9]:
# Grid search over the tree depth, scored by negated MSE with 5-fold cross-validation.
from sklearn.model_selection import GridSearchCV

dt_reg = DecisionTreeRegressor(max_depth=2, random_state=42)

# Candidate depths: 1 through 10, plus 20.
param_grid = [{'max_depth': list(range(1, 11)) + [20]}]

grid = GridSearchCV(
    estimator=dt_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=True,
)
# Kept as the last expression so the notebook displays the fitted search object.
grid.fit(X_1, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=42, splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [10]:
# Report the cross-validated RMSE of every candidate, then the winning setting.
cvres = grid.cv_results_
for i, params in enumerate(cvres["params"]):
    mean_score = cvres["mean_test_score"][i]
    # Scores are negated MSE values; negate and take the root to get RMSE.
    rmse = np.sqrt(-mean_score)
    print(rmse, params)
print('##########################################################')
print('Best hyperparameter: {}'.format(grid.best_params_))

11623.348263762102 {'max_depth': 1}
11556.991467659314 {'max_depth': 2}
11638.556728090973 {'max_depth': 3}
11679.136735902455 {'max_depth': 4}
11733.598350403343 {'max_depth': 5}
11803.514722170718 {'max_depth': 6}
11832.225228473588 {'max_depth': 7}
11853.750545477398 {'max_depth': 8}
11854.261963822453 {'max_depth': 9}
11861.367709717306 {'max_depth': 10}
11863.502756986974 {'max_depth': 20}
##########################################################
Best hyperparameter: {'max_depth': 2}


- Step 4. Train the picked model again using all features.

In [11]:
# Train the selected model (max_depth=2) on every column of X_train, time the fit,
# and report its 10-fold cross-validated RMSE.
import time
dt_best = DecisionTreeRegressor(max_depth=2, random_state=42)
# perf_counter() is a monotonic, high-resolution clock intended for measuring short
# intervals; time.time() is wall-clock time, which can be too coarse (or jump) for a
# fit that takes only a few milliseconds.
start_time = time.perf_counter()
dt_best.fit(X_train, y_train)
training_time = time.perf_counter() - start_time
print("Best model(depth=2)'s training time using all features : {}".format(training_time))
# The scorer returns negated MSE values, so flip the sign before taking the root.
best_mse = cross_val_score(dt_best, X_train, y_train, scoring="neg_mean_squared_error", cv=10)
best_rmse = np.sqrt(-best_mse)
print("Best model(depth=2)'s RMSE using all features : {}".format(best_rmse.mean()))

Best model(depth=2)'s training time using all features : 0.001991748809814453
Best model(depth=2)'s RMSE using all features : 11524.596486065017
