# Personal Medical Cost Prediction

## Step 1. Select real dataset 
- Using Dataset : Medical Cost Personal Datasets
- 출처 : https://www.kaggle.com/mirichoi0218/insurance/data

In [1]:
import pandas as pd
insurance = pd.read_csv('./datasets/insurance/insurance.csv')
X=insurance.drop(['sex','smoker','region'],axis=1)
print(X.shape)
X.head()

(1338, 4)


Unnamed: 0,age,bmi,children,charges
0,19,27.9,0,16884.924
1,18,33.77,1,1725.5523
2,28,33.0,3,4449.462
3,33,22.705,0,21984.47061
4,32,28.88,0,3866.8552


## Step 2. choose regression algorithms
- Algorithm : Linear / Polynomial / SVR

In [2]:
#1. Linear Regression
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
#2.SVR : Support Vector Regression
from sklearn.svm import SVR
svr_reg = SVR(gamma='scale',C=1.0,epsilon=0.2,kernel='linear')
#3. Decision Tree Regression
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()

## Step3. Train the regression model

In [3]:
#choose good features in the dataset
X=X.drop(['charges'],axis=1)
X=X.values
y=insurance.charges.values

In [4]:
#Do preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import OneHotEncoder

num_pipeline = Pipeline([
        ('imputer', Imputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])
X_prepared = num_pipeline.fit_transform(X) #정제된 X



In [5]:
#training set & test set 나누기
from sklearn.model_selection import train_test_split
import numpy as np
X_train , X_test , y_train , y_test = train_test_split(X_prepared,y,test_size=0.2,random_state=42)

In [6]:
#learning model
import time
lin_reg.fit(X_train,y_train) #linear
svr_reg.fit(X_train,y_train) #svr
tree_reg.fit(X_train,y_train) #Decision Tree

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

## Step 4. Estimate the model 

In [7]:
#case1 : Compare prediction results using same data.
some_data = X_test[13] #예제 data
some_labels = y_test[13] #실제 label : 46151.1245
some_data , some_labels

(array([0.48366788, 1.91712274, 1.58092576]), 46151.1245)

In [8]:
# 예제 데이터의 predict 값을 출력하는 함수
def gotPredict(data):
    print('Linear Regression : {}'.format(lin_reg.predict(data)))
    print('SVR : {}'.format(svr_reg.predict(data)))
    print('Decision Tree Regression : {}'.format(tree_reg.predict(data)))
gotPredict([some_data])

Linear Regression : [19794.64417995]
SVR : [9613.0875249]
Decision Tree Regression : [8944.1151]


In [9]:
#Case 2. Compare the RMSE
#Calculate RMSE
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
lin_mse = cross_val_score(lin_reg, X_train, y_train, scoring="neg_mean_squared_error", cv=10)
lin_rmse = np.sqrt(-lin_mse)

In [10]:
svr_mse = cross_val_score(svr_reg, X_train, y_train, scoring="neg_mean_squared_error", cv=10)
svr_rmse = np.sqrt(-svr_mse)

In [11]:
tree_mse = cross_val_score(tree_reg, X_train, y_train, scoring="neg_mean_squared_error", cv=10)
tree_rmse = np.sqrt(-tree_mse)

In [12]:
#print RMSE
print('Linear Regression : {}'.format(lin_rmse.mean()))
print('SVR : {}'.format(svr_rmse.mean()))
print('Decision Tree Regression : {}'.format(tree_rmse.mean()))

Linear Regression : 11352.407612834704
SVR : 12541.318816329556
Decision Tree Regression : 16510.020350708084


In [13]:
#Use Ridge Regression
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=0.1 , solver='cholesky',random_state=42)
ridge_reg.fit(X_train,y_train)
ridge_reg.predict([some_data])

array([19794.13267391])

In [14]:
#Use Lasso Regression
from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(X_train,y_train)
lasso_reg.predict([some_data])

array([19794.27923724])

## Step 5. Use Early stopping

In [19]:
#1000까지의 epoch 중 증가폭이 생기면 early stopping
from sklearn.base import clone
tree_reg_2 = DecisionTreeRegressor()
minimum_val_error = float('inf')
best_epoch = None
best_model = None
for epoch in range(1000):
    tree_reg_2.fit(X_train,y_train)
    y_val_predict = tree_reg_2.predict(X_test)
    val_error = mean_squared_error(y_test,y_val_predict)
    if val_error < minimum_val_error:
        minimum_val_error = val_error
        best_epoch = epoch
        best_model = clone(tree_reg_2)

In [20]:
best_epoch, best_model

(163, DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))

In [22]:
from sklearn.model_selection import cross_val_score
tree_mse_2 = cross_val_score(tree_reg_2, X_train, y_train, scoring="neg_mean_squared_error", cv=10)
tree_rmse_2 = np.sqrt(-tree_mse_2)
print('RMSE without early stopping : {}'.format(tree_rmse.mean()))
print('RMSE using early stopping : {}'.format(tree_rmse_2.mean()))

RMSE without early stopping : 16510.020350708084
RMSE using early stopping : 16499.11762061345


In [23]:
lin_reg_2 = LinearRegression()
minimum_val_error = float('inf')
best_epoch = None
best_model = None
for epoch in range(1000):
    lin_reg_2.fit(X_train,y_train)
    y_val_predict = lin_reg_2.predict(X_test)
    val_error = mean_squared_error(y_test,y_val_predict)
    if val_error < minimum_val_error:
        minimum_val_error = val_error
        best_epoch = epoch
        best_model = clone(lin_reg_2)

In [24]:
best_epoch, best_model

(0, LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
          normalize=False))

In [25]:
lin_mse_2 = cross_val_score(lin_reg_2, X_train, y_train, scoring="neg_mean_squared_error", cv=10)
lin_rmse_2 = np.sqrt(-lin_mse_2)
print('RMSE without early stopping : {}'.format(lin_rmse.mean()))
print('RMSE using early stopping : {}'.format(lin_rmse_2.mean()))

RMSE without early stopping : 11352.407612834704
RMSE using early stopping : 11352.407612834704
