# hold-out

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, LeaveOneOut
from sklearn.linear_model import LinearRegression
# Load the 'tips' example dataset through seaborn.
df = sns.load_dataset('tips')

In [2]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [3]:
# Separate the predictors from the regression target: every column except
# y_col becomes a feature.
y_col = 'tip'
X = df.drop(y_col, axis=1)
X

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,Female,No,Sun,Dinner,2
1,10.34,Male,No,Sun,Dinner,3
2,21.01,Male,No,Sun,Dinner,3
3,23.68,Male,No,Sun,Dinner,2
4,24.59,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...
239,29.03,Male,No,Sat,Dinner,3
240,27.18,Female,Yes,Sat,Dinner,2
241,22.67,Male,Yes,Sat,Dinner,2
242,17.82,Male,No,Sat,Dinner,2


In [4]:
# Sub-frame containing only the numeric columns of X. Used for inspection here;
# the same name is reassigned to a plain list of column names in a later cell.
numeric_cols = X.select_dtypes(include=np.number)

In [5]:
numeric_cols

Unnamed: 0,total_bill,size
0,16.99,2
1,10.34,3
2,21.01,3
3,23.68,2
4,24.59,4
...,...,...
239,29.03,3
240,27.18,2
241,22.67,2
242,17.82,2


In [6]:
X.dtypes

total_bill     float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object

In [7]:
# Collect the names of the numeric columns so they can be standardized later.
numeric_cols = list(X.select_dtypes(include=np.number).columns)

In [8]:
numeric_cols

['total_bill', 'size']

In [9]:
# One-hot encode the categorical columns. With drop_first=True a category with
# k levels yields k-1 dummy columns (the first level is dropped).
X = pd.get_dummies(data=X, drop_first=True)
X

Unnamed: 0,total_bill,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,2,1,1,0,0,1,1
1,10.34,3,0,1,0,0,1,1
2,21.01,3,0,1,0,0,1,1
3,23.68,2,0,1,0,0,1,1
4,24.59,4,1,1,0,0,1,1
...,...,...,...,...,...,...,...,...
239,29.03,3,0,1,0,1,0,1
240,27.18,2,1,0,0,1,0,1
241,22.67,2,0,0,0,1,0,1
242,17.82,2,0,1,0,1,0,1


In [10]:
# Target vector: the column of df named by y_col.
y = df[y_col]

In [11]:
# Hold-out split: 30% of the rows are set aside for testing, and the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)
X_train

Unnamed: 0,total_bill,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
61,13.81,2,0,0,0,1,0,1
146,18.64,3,1,1,0,0,0,0
52,34.81,4,1,1,0,0,1,1
66,16.45,2,1,1,0,1,0,1
26,13.37,2,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...
67,3.07,1,1,0,0,1,0,1
192,28.44,2,0,0,0,0,0,0
117,10.65,2,1,1,0,0,0,0
47,32.40,4,0,1,0,0,1,1


In [12]:
# Standardization: the scaler is fitted on the training rows only, and the
# scaled values are written into a copy so X_train itself stays untouched.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train[numeric_cols])
X_train_scaled = X_train.copy()
X_train_scaled[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_train_scaled

Unnamed: 0,total_bill,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
61,-0.682321,-0.616436,0,0,0,1,0,1
146,-0.152575,0.463915,1,1,0,0,0,0
52,1.620922,1.544266,1,1,0,0,1,1
66,-0.392771,-0.616436,1,1,0,1,0,1
26,-0.730580,-0.616436,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...
67,-1.860266,-1.696786,1,0,0,1,0,1
192,0.922272,-0.616436,0,0,0,0,0,0
117,-1.028905,-0.616436,1,1,0,0,0,0
47,1.356598,1.544266,0,1,0,0,1,1


In [13]:
# Scale the test rows with the scaler that was fitted on the training data
# (transform only; the test set is never used for fitting).
test_numeric_scaled = scaler.transform(X_test[numeric_cols])
X_test_scaled = X_test.copy()
X_test_scaled[numeric_cols] = test_numeric_scaled

In [14]:
# Linear regression model: fit on the scaled training set, then predict the
# scaled test set.
from sklearn.linear_model import LinearRegression
model = LinearRegression().fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
y_pred

array([2.82249035, 2.97504474, 2.8260184 , 1.38113692, 3.15154584,
       1.72121268, 2.48332645, 3.03579004, 2.75176346, 4.52560955,
       3.1133346 , 3.14781575, 2.33198109, 2.11518372, 2.93262778,
       4.27846609, 1.83157994, 2.26626275, 2.31085596, 3.24382161,
       3.81889336, 2.85616455, 2.42949782, 2.42039736, 2.20253234,
       2.42509643, 2.81777778, 4.70274951, 3.81268552, 2.38673795,
       2.29194112, 2.20803273, 2.45503466, 1.7743294 , 2.71663745,
       2.22913684, 2.72146912, 2.01205852, 5.85346207, 3.49435578,
       2.26246168, 2.20347519, 2.50905642, 4.41646769, 1.97212663,
       2.78445294, 2.65274212, 3.01652357, 2.73423023, 3.95761528,
       3.9498931 , 2.53992971, 2.71758399, 6.35620823, 1.7434279 ,
       2.33450139, 4.23562521, 3.29319236, 2.41114285, 2.20345847,
       3.72455103, 2.29099827, 3.04008335, 3.74539008, 4.01431996,
       2.26547605, 2.66047323, 3.84238482, 2.17921165, 3.87859588,
       2.59899485, 1.94814647, 3.70801825, 2.11341037])

In [15]:
X_test

Unnamed: 0,total_bill,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
64,17.59,3,0,1,0,1,0,1
63,18.29,4,0,0,0,1,0,1
55,19.49,2,0,1,0,0,1,1
111,7.25,1,1,1,0,1,0,1
225,16.27,2,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...
90,28.97,2,0,0,1,0,0,1
101,15.38,2,1,0,1,0,0,1
75,10.51,2,0,1,0,1,0,1
4,24.59,4,1,1,0,0,1,1


In [16]:
# Accuracy: mean squared error of the hold-out predictions.
from sklearn.metrics import mean_squared_error
mean_squared_error(y_true=y_test, y_pred=y_pred)

0.955080898861715

## LOOCV

In [17]:
# Data preparation: a single feature (total_bill) as a 2-D column array, with
# tip as the target.
X = df[['total_bill']].to_numpy()
y = df['tip']

In [18]:
# Leave-one-out CV: every iteration trains on all rows but one and scores the
# single held-out row; the per-iteration MSEs are collected in mse_list.
loo = LeaveOneOut()
mse_list = []
model = LinearRegression()
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    # split() yields positional indices, so the Series must be sliced with
    # .iloc; plain y[...] is label-based and only works by accident when the
    # index happens to be 0..n-1.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train the model and score the held-out row
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mse_list.append(mse)

In [19]:
# Report the mean and standard deviation of the per-iteration errors in mse_list.
print(f"MSE(LOOCV):{np.mean(mse_list)}")
print(f"std:{np.std(mse_list)}")

MSE(LOOCV):1.0675673489857438
std:2.099794455177631


In [20]:
# Same LOOCV evaluation done through cross_val_score. The scorer returns the
# negated MSE, so the sign of the mean is flipped back for reporting.
from sklearn.model_selection import cross_val_score
cv = LeaveOneOut()
scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=cv,
)
print(f"MSE(LOOCV):{-np.mean(scores)}")
print(f"std:{np.std(scores)}")

MSE(LOOCV):1.0675673489857438
std:2.099794455177631


## k-fold CV

In [21]:
# k-fold CV: shuffle the rows once (fixed seed), split them into k folds and
# collect one test-fold MSE per fold in mse_list.
from sklearn.model_selection import KFold
k = 5
cv = KFold(n_splits=k, shuffle=True, random_state=0)
# Fresh estimator instance for this experiment. (The original line assigned
# the LinearRegression class to a misspelled name, so the loop silently reused
# whatever `model` was left over from an earlier cell.)
model = LinearRegression()
mse_list = []
for train_index, test_index in cv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    # split() yields positional indices, so slice the Series with .iloc
    # instead of label-based y[...].
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train the model and score the held-out fold
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mse_list.append(mse)

In [22]:
# Report the mean and standard deviation of the per-fold MSEs in mse_list.
print(f"MSE({k}FoldCV):{np.mean(mse_list)}")
print(f"std:{np.std(mse_list)}")

MSE(5FoldCV):1.080211088394392
std:0.16170100507039514


In [23]:
# Same k-fold evaluation through cross_val_score, using all available cores.
# The scores are negated MSEs, one per fold.
scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1,
)

In [24]:
scores

array([-0.82130906, -1.07458421, -1.08801239, -1.33238677, -1.084763  ])

In [28]:
# Repeated k-fold CV: the k-fold procedure is run n_repeats times with
# different shuffles, giving k * n_repeats test-fold MSEs in mse_list.
from sklearn.model_selection import KFold, RepeatedKFold
k = 5
n_repeats = 3
cv = RepeatedKFold(n_splits=k, n_repeats=n_repeats, random_state=0)
# Fresh estimator instance for this experiment. (The original line assigned
# the LinearRegression class to a misspelled name, so the loop silently reused
# whatever `model` was left over from an earlier cell.)
model = LinearRegression()
mse_list = []
for train_index, test_index in cv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    # split() yields positional indices, so slice the Series with .iloc
    # instead of label-based y[...].
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train the model and score the held-out fold
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mse_list.append(mse)

In [29]:
# Report the mean and standard deviation over every entry of mse_list
# (one MSE per fold per repeat).
# NOTE(review): the label still reads "{k}FoldCV" although the splitter used to
# fill mse_list is RepeatedKFold — consider renaming the label.
print(f"MSE({k}FoldCV):{np.mean(mse_list)}")
print(f"std:{np.std(mse_list)}")

MSE(5FoldCV):1.0746387233165984
std:0.26517178540898434


In [30]:
mse_list

[0.8213090642766288,
 1.0745842125927976,
 1.0880123892600388,
 1.3323867714930204,
 1.084763004349474,
 1.1587839131131425,
 1.6042084002514578,
 1.0307086207441927,
 0.7120290668798744,
 0.8472985410140899,
 0.8856103319481907,
 1.5248521639391936,
 0.6332659028150582,
 1.200354200262607,
 1.121414266809207]

## Pipeline

In [41]:
# Chain standardization and linear regression into a single estimator so the
# scaler is fitted as part of every fit call on the pipeline.
from sklearn.pipeline import Pipeline
pipline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('model', LinearRegression()),
    ]
)

In [42]:
pipline

In [45]:
# 5-fold CV of the whole pipeline; the scores are negated MSEs, one per fold.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(
    estimator=pipline,
    X=X,
    y=y,
    cv=cv,
    scoring='neg_mean_squared_error',
)

In [46]:
scores

array([-0.82130906, -1.07458421, -1.08801239, -1.33238677, -1.084763  ])