# Model Evaluation 2
Analyze : tips data from seaborn
Features : Total Bill, sex, smoker, day and time
Target : Tips

Preprocess :
1. One Hot Encoding : sex, smoker, day, time
2. No Treatment : numerical (total_bill)

Random state 2020, splitting 80:20
1. Fit Linear Regression and a Decision Tree (criterion mse, max_depth 5); compute R2 using 5-fold CV
2. Fit the Decision Tree (criterion mse, max_depth 5) and compute MSE on the test set
3. Do hyperparameter tuning (Randomized Search) for the Decision Tree (optimize criterion, min samples leaf, max depth), optimized by MSE using 5-fold CV
4. Compare the results (before and after tuning) on the test set

> ## Library and Data

In [1]:
# dataframe
import pandas as pd
import numpy as np

# model 
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

# evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.model_selection import cross_val_score, RandomizedSearchCV


# visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load seaborn's example "tips" dataset.
tips = sns.load_dataset('tips')
# Bare last expression so the notebook renders the loaded data as the cell output.
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


> ## Data Splitting

In [3]:
# Target: the tip amount.
y = tips['tip']

# Features: select the predictors and one-hot encode the categorical ones in a
# single expression. drop_first = True drops one dummy level per categorical
# column; the numeric total_bill column is passed through unchanged.
x = pd.get_dummies(
    tips[['total_bill', 'sex', 'smoker', 'day', 'time']],
    drop_first = True,
)

In [4]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size = 0.2, random_state = 2020
)

> ## Model : Benchmark

In [5]:
# Benchmark models.
# The tree's split criterion is left at its default, which is squared error (MSE).
# Spelling it as 'mse' was deprecated in scikit-learn 1.0 and removed in 1.2
# (renamed 'squared_error'); relying on the default works on every version.
tree = DecisionTreeRegressor(max_depth = 5, random_state = 2020)
linreg = LinearRegression()

In [6]:
# R2 of the benchmark tree under 5-fold CV on the training split.
# cv is spelled out because the default fold count was 3 before scikit-learn 0.22.
tree_cv = cross_val_score(tree, x_train, y_train, cv = 5, scoring = 'r2')

In [7]:
# Per-fold scores followed by their mean and spread.
for label, value in (
    ('cross validation: ', tree_cv),
    ('mean cross validation: ', tree_cv.mean()),
    ('std cross validation: ', tree_cv.std()),
):
    print(label, value)

cross validation:  [0.11865437 0.09663697 0.13388799 0.60844872 0.41722998]
mean cross validation:  0.27497160666626597
std cross validation:  0.2037583555823742


In [8]:
# R2 of the linear regression under 5-fold CV on the training split.
# cv is spelled out because the default fold count was 3 before scikit-learn 0.22.
linreg_cv = cross_val_score(linreg, x_train, y_train, cv = 5, scoring = 'r2')

In [9]:
# Per-fold scores followed by their mean and spread.
for label, value in (
    ('cross validation: ', linreg_cv),
    ('mean cross validation: ', linreg_cv.mean()),
    ('std cross validation: ', linreg_cv.std()),
):
    print(label, value)

cross validation:  [0.27243371 0.18984475 0.31816682 0.46456813 0.44769515]
mean cross validation:  0.33854171153933316
std cross validation:  0.10458807015801647


> ## Tree Test Score

In [10]:
# Refit the benchmark tree on the full training split.
# The criterion is left at its default (squared error): the 'mse' spelling was
# deprecated in scikit-learn 1.0 and removed in 1.2, while the default is
# accepted by every version.
tree = DecisionTreeRegressor(max_depth = 5, random_state = 2020)
tree.fit(x_train, y_train)

DecisionTreeRegressor(max_depth=5, random_state=2020)

In [11]:
# Predict tips for the held-out test rows with the fitted tree.
y_pred = tree.predict(x_test)
# Test-set MSE; left as the last expression so it is shown as the cell output.
mean_squared_error(y_test, y_pred)

1.1723927020084088

> ## Hyperparameter Tuning 

In [12]:
# Training-set dimensions as (rows, columns).
x_train.shape

(195, 7)

In [13]:
# Criterion names differ across scikit-learn releases: 'mse'/'mae' were renamed
# 'squared_error'/'absolute_error' in 1.0 and the old spellings were removed in 1.2.
# The default criterion is squared error under either name, so read it off a fresh
# estimator and pick the matching absolute-error spelling.
squared_criterion = DecisionTreeRegressor().criterion
absolute_criterion = 'mae' if squared_criterion == 'mse' else 'absolute_error'

hyperparam = {
    'min_samples_leaf':[10, 15, 20, 50], # 4
    'max_depth':[2, 3], # 2
    'criterion':[squared_criterion, absolute_criterion] # 2
}
# 16 combinations

# Base estimator; max_depth and criterion are overridden by each sampled candidate.
tree = DecisionTreeRegressor(max_depth = 5, random_state = 2020)

# n_iter equals the number of combinations, so the whole grid is evaluated.
# The task asks to optimize by MSE: scikit-learn maximizes scores, hence the
# negated metric (values are <= 0 and closer to 0 is better).
randomized_search = RandomizedSearchCV(tree,
                   param_distributions = hyperparam,
                   n_iter = 16,
                   cv = 5,
                   scoring = 'neg_mean_squared_error',
                   random_state = 2020,
                   n_jobs = -1
                  )

In [None]:
# Run the cross-validated search over the sampled hyperparameter candidates on the training split.
randomized_search.fit(x_train, y_train)

In [None]:
# Hyperparameter combination with the best mean cross-validated score.
randomized_search.best_params_

In [None]:
# Mean cross-validated score of the best candidate, measured with the `scoring` metric configured on the search.
randomized_search.best_score_

In [None]:
# Tabulate the per-candidate search results so they can be filtered and inspected.
result_cv = pd.DataFrame(randomized_search.cv_results_)

In [None]:
# Show only the candidates that were evaluated with min_samples_leaf = 10.
result_cv[result_cv['param_min_samples_leaf'] == 10]

> ## Compare Decision Tree Before & After Tuning (LinReg as Reference)

In [None]:
# "Before": the benchmark tree (default squared-error criterion, max_depth 5).
# The criterion is left at its default because the 'mse' spelling was removed
# in scikit-learn 1.2.
tree = DecisionTreeRegressor(max_depth = 5, random_state = 2020)
# "After": built from the hyperparameters the randomized search actually selected,
# rather than hand-typed values (the previous max_depth = 5 was outside the
# searched max_depth values [2, 3]). Requires the search to have been fitted.
tree2 = DecisionTreeRegressor(**randomized_search.best_params_, random_state = 2020)
# Linear regression as a reference point.
linreg = LinearRegression()

In [None]:
def final_eval(model):
    """Fit `model` on the training split and print its test-set MSE and R2.

    Uses the notebook-level `x_train`, `y_train`, `x_test` and `y_test`.
    The estimator is fitted in place and nothing is returned.
    """
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    # Report each metric on its own line, MSE first and then R2.
    for label, metric in (('mse :', mean_squared_error), ('r2 :', r2_score)):
        print(label, metric(y_test, predictions))

In [None]:
# Test-set metrics for the first ("before") tree.
final_eval(tree)

In [None]:
# Test-set metrics for the second ("after") tree.
# NOTE(review): the original note called this model "most stable"; no saved output is present here to confirm that.
final_eval(tree2)

In [None]:
# Test-set metrics for the linear regression.
final_eval(linreg)