## 5.3 実習: 交差検証とブートストラップ

In [147]:
import time
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import PolynomialFeatures as PF

#### データの読み込み

In [7]:
dataset_path = "./datasets/Auto 2.csv"
df = pd.read_csv(dataset_path, index_col=0)
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
1,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
2,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
3,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
4,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
5,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


### 5.3.1 ホールドアウト検証

#### データの分割  
```df.sample```だと、抽出されなかった側のデータをどう引っ張ってくるかが分からないので```train_test_split```を使った方が良い

In [None]:
train_df = df.sample(frac=0.5, random_state=1)

In [18]:
train_df, test_df = train_test_split(df, train_size=0.5, random_state=1)

In [19]:
train_df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
32,25.0,4,113.0,95,2228,14.0,71,3,toyota corona
57,26.0,4,91.0,70,1955,20.5,71,1,plymouth cricket
259,20.6,6,231.0,105,3380,15.8,78,1,buick century special
34,19.0,6,232.0,100,2634,13.0,71,1,amc gremlin
271,21.1,4,134.0,95,2515,14.8,78,3,toyota celica gt liftback
...,...,...,...,...,...,...,...,...,...
206,28.0,4,97.0,75,2155,16.4,76,3,toyota corolla
258,19.4,6,232.0,90,3210,17.2,78,1,amc concord
74,13.0,8,307.0,130,4098,14.0,72,1,chevrolet chevelle concours (sw)
238,30.5,4,98.0,63,2051,17.0,77,1,chevrolet chevette


In [28]:
lr_model = LinearRegression()
lr_model.fit(np.array(train_df.horsepower).reshape(-1, 1), train_df.mpg)

LinearRegression()

In [31]:
mpg_pred = lr_model.predict(np.array(test_df.horsepower).reshape(-1, 1))
mpg_pred.shape

(196,)

#### MSEの算出

In [34]:
mse(test_df.mpg, mpg_pred)

24.80212062059356

#### 多項回帰 

In [64]:
cubic_features = PF(degree=3, include_bias = False)

train_c_horsepower = cubic_features.fit_transform(np.array(train_df.horsepower).reshape(-1, 1))
train_q_horsepower = train_c_horsepower[:, :2]

test_c_horsepower = cubic_features.fit_transform(np.array(test_df.horsepower).reshape(-1, 1))
test_q_horsepower = test_c_horsepower[:, :2]

In [65]:
lr_q_model = LinearRegression()
lr_q_model.fit(train_q_horsepower, train_df.mpg)

lr_c_model = LinearRegression()
lr_c_model.fit(train_c_horsepower, train_df.mpg)

LinearRegression()

In [67]:
q_mpg_pred = lr_q_model.predict(test_q_horsepower)
c_mpg_pred = lr_c_model.predict(test_c_horsepower)

In [68]:
mse(test_df.mpg, q_mpg_pred), mse(test_df.mpg, c_mpg_pred) 

(18.848292603275663, 18.805111358604574)

### 5.3.2 1つ抜き交差検証
2次、3次の結果は確認していない

In [149]:
loo = LeaveOneOut()
loo_df = loo.split(df)

mses = list()

start = time.time()
for i, j in loo_df:
    train_horsepower =  df.horsepower.to_numpy()[i].reshape(-1, 1)
    test_horsepower = df.horsepower.to_numpy()[j].reshape(-1, 1)
    train_mpg = df.mpg.to_numpy()[i]
    test_mpg = df.mpg.to_numpy()[j]
    
    lr_model.fit(train_horsepower, train_mpg)
    lr_model.predict(test_horsepower)
    mses.append(mse(test_mpg, lr_model.predict(test_horsepower)))
loo_finish = time.time() - start

df_mses = pd.DataFrame(mses)
df_mses.describe()

Unnamed: 0,0
count,392.0
mean,24.231514
std,36.84434
min,0.003546
25%,2.074089
50%,9.727509
75%,31.683015
max,289.448955


### 5.3.3 k分割交差検証

In [150]:
kf = KFold(n_splits=10)
kf_df = kf.split(df)

mses = list()

start = time.time()
for i, j in kf_df:
    train_horsepower =  df.horsepower.to_numpy()[i].reshape(-1, 1)
    test_horsepower = df.horsepower.to_numpy()[j].reshape(-1, 1)
    train_mpg = df.mpg.to_numpy()[i]
    test_mpg = df.mpg.to_numpy()[j]
    
    lr_model.fit(train_horsepower, train_mpg)
    lr_model.predict(test_horsepower)
    mses.append(mse(test_mpg, lr_model.predict(test_horsepower)))
kf_finish = time.time() - start

df_mses = pd.DataFrame(mses)
df_mses.describe()

Unnamed: 0,0
count,10.0
mean,27.439934
std,15.295147
min,15.557633
25%,17.393265
50%,23.09837
75%,27.992216
max,65.934896


In [151]:
loo_finish / kf_finish

26.652291105121293

### 5.3.4 ブートストラップ

In [157]:
idx = df.index.to_numpy()

In [170]:
df.horsepower.to_numpy()[sample_idx].reshape(-1, 1)

array([[110],
       [120],
       [ 95],
       [130],
       [ 90],
       [ 95],
       [225],
       [ 90],
       [105],
       [ 90],
       [ 97],
       [ 46],
       [153],
       [ 75],
       [150],
       [ 63],
       [215],
       [158],
       [ 70],
       [ 75],
       [ 69],
       [140],
       [145],
       [ 74],
       [ 86],
       [165],
       [112],
       [ 62],
       [ 75],
       [140],
       [ 80],
       [112],
       [215],
       [ 76],
       [ 75],
       [ 70],
       [100],
       [140],
       [ 65],
       [ 88],
       [135],
       [ 95],
       [ 88],
       [ 84],
       [ 67],
       [ 75],
       [ 88],
       [ 83],
       [ 90],
       [180],
       [175],
       [ 80],
       [110],
       [140],
       [ 70],
       [ 65],
       [ 83],
       [ 94],
       [ 87],
       [ 52],
       [100],
       [ 90],
       [ 97],
       [ 75],
       [ 88],
       [ 74],
       [ 78],
       [150],
       [ 68],
       [ 90],
       [ 78],
      

In [179]:
idx = [i for i in range(len(df))]
model_result = {'coef': [], 'intercept': []}

for i in range(1000):
    sample_idx = np.random.choice(idx, len(idx))
    train_horsepower =  df.horsepower.to_numpy()[sample_idx].reshape(-1, 1)
    train_mpg = df.mpg.to_numpy()[sample_idx]
    lr_model.fit(train_horsepower, train_mpg)
    model_result['coef'].append(lr_model.coef_[0])
    model_result['intercept'].append(lr_model.intercept_)

model_result_df = pd.DataFrame(model_result)
model_result_df

Unnamed: 0,coef,intercept
0,-0.161855,40.337288
1,-0.163838,40.468166
2,-0.160446,40.355379
3,-0.150274,39.443452
4,-0.148791,38.987215
...,...,...
995,-0.156319,40.108818
996,-0.158825,39.853875
997,-0.155694,39.262025
998,-0.157266,39.959454


In [180]:
model_result_df.describe()

Unnamed: 0,coef,intercept
count,1000.0,1000.0
mean,-0.158203,39.959186
std,0.007626,0.880015
min,-0.185048,37.078968
25%,-0.162961,39.361923
50%,-0.157952,39.952691
75%,-0.153287,40.508845
max,-0.134052,43.11455
