In [None]:
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

# 5.3 Lab: Cross-Validation and the Bootstrap

## 5.3.1 The Validation Set Approach

In [None]:
# Fetch the "Auto" dataset of the R package "ISLR" through statsmodels'
# Rdatasets helper (cache=True is forwarded to that helper) and keep only
# the data table as `df`.
auto_dataset = sm.datasets.get_rdataset("Auto", "ISLR", cache=True)
df = auto_dataset.data

In [None]:
# Preview the first five rows of the loaded table.
df.head(n=5)

In [None]:
# Validation-set approach: horsepower is the single predictor and mpg the
# response. 196 rows go to the training set, the remainder is held out;
# random_state pins the shuffle so the split is reproducible.
X = df["horsepower"]
y = df["mpg"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=196,
    random_state=42,
)

In [None]:
# Fit mpg ~ horsepower on the training rows and report the mean squared
# error on the held-out rows. X_train / X_test are 1-D Series here, so they
# are reshaped into single-column 2-D arrays before being handed to sklearn.
X_train_2d = X_train.to_numpy().reshape(-1, 1)
X_test_2d = X_test.to_numpy().reshape(-1, 1)
lm = LinearRegression().fit(X_train_2d, y_train)
mean_squared_error(y_test, lm.predict(X_test_2d))

In [None]:
# Add the squared and cubed horsepower as explicit columns so that the
# polynomial fits below can select them by name.
for column, power in (("hp_sq", 2), ("hp_cube", 3)):
    df[column] = df["horsepower"].pow(power)

In [None]:
# Validation-set approach for the quadratic model:
# mpg ~ horsepower + horsepower^2, with the same train_size and random_state
# as the linear split.
quadratic_terms = ["horsepower", "hp_sq"]
X = df[quadratic_terms]
y = df["mpg"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=196,
    random_state=42,
)
lm = LinearRegression().fit(X_train, y_train)
quadratic_pred = lm.predict(X_test)
# Held-out mean squared error of the quadratic fit.
mean_squared_error(y_test, quadratic_pred)

In [None]:
# Validation-set approach for the cubic model:
# mpg ~ horsepower + horsepower^2 + horsepower^3, with the same train_size
# and random_state as the linear split.
cubic_terms = ["horsepower", "hp_sq", "hp_cube"]
X = df[cubic_terms]
y = df["mpg"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=196,
    random_state=42,
)
lm = LinearRegression().fit(X_train, y_train)
cubic_pred = lm.predict(X_test)
# Held-out mean squared error of the cubic fit.
mean_squared_error(y_test, cubic_pred)

## 5.3.2 Leave-One-Out Cross-Validation

In [None]:
# Leave-one-out cross-validation of the linear model mpg ~ horsepower.
# Each iteration refits on all rows but one and predicts the row left out;
# the MSE is then computed over the collected out-of-sample predictions.
X = df["horsepower"].to_numpy().reshape(-1, 1)
y = df["mpg"].to_numpy()
loo = LeaveOneOut()
y_true, y_pred = [], []
for rows_in, row_out in loo.split(X):
    X_train, y_train = X[rows_in], y[rows_in]
    X_test, y_test = X[row_out], y[row_out]
    lm = LinearRegression().fit(X_train, y_train)
    # The held-out fold holds exactly one observation, hence the [0].
    y_true.append(y_test[0])
    y_pred.append(lm.predict(X_test)[0])
mean_squared_error(y_true, y_pred)

In [None]:
# Leave-one-out cross-validation of the quadratic model
# mpg ~ horsepower + horsepower^2: refit without one row, predict that row,
# and score all out-of-sample predictions together.
X = df[["horsepower", "hp_sq"]].to_numpy()
y = df["mpg"].to_numpy()
loo = LeaveOneOut()
y_true, y_pred = [], []
for rows_in, row_out in loo.split(X):
    X_train, y_train = X[rows_in], y[rows_in]
    X_test, y_test = X[row_out], y[row_out]
    lm = LinearRegression().fit(X_train, y_train)
    # The held-out fold holds exactly one observation, hence the [0].
    y_true.append(y_test[0])
    y_pred.append(lm.predict(X_test)[0])
mean_squared_error(y_true, y_pred)

In [None]:
# Leave-one-out cross-validation of the cubic model
# mpg ~ horsepower + horsepower^2 + horsepower^3: refit without one row,
# predict that row, and score all out-of-sample predictions together.
X = df[["horsepower", "hp_sq", "hp_cube"]].to_numpy()
y = df["mpg"].to_numpy()
loo = LeaveOneOut()
y_true, y_pred = [], []
for rows_in, row_out in loo.split(X):
    X_train, y_train = X[rows_in], y[rows_in]
    X_test, y_test = X[row_out], y[row_out]
    lm = LinearRegression().fit(X_train, y_train)
    # The held-out fold holds exactly one observation, hence the [0].
    y_true.append(y_test[0])
    y_pred.append(lm.predict(X_test)[0])
mean_squared_error(y_true, y_pred)

## 5.3.3 k-Fold Cross-Validation

In [None]:
# 10-fold cross-validation of polynomial regressions of mpg on horsepower,
# for polynomial degrees 1 through 10. For every degree the out-of-fold
# predictions of all folds are pooled and a single MSE is reported.
y = df["mpg"].values
X = df["horsepower"].values.reshape(-1, 1)

# Standardise the feature before the polynomial expansion. Raw powers up to
# x**10 put the columns of the design matrix on wildly different scales, so
# the matrix becomes severely ill-conditioned and the least-squares solver
# can discard directions as numerically rank-deficient. Polynomials in an
# affinely rescaled x span exactly the same function space, so the fitted
# model is unchanged in exact arithmetic (and using full-data mean/std leaks
# nothing: any shift/scale gives the same fit) -- only the numerics improve.
# `X` itself is left untouched.
X_scaled = (X - X.mean()) / X.std()

validation_dict = dict()
# shuffle + fixed random_state: the same fold assignment is reused for every
# degree, so the degrees are compared on identical splits.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
for degree in range(1, 11):
    validation_dict[degree] = {"true": list(), "predict": list()}
    # include_bias=False: LinearRegression fits its own intercept.
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    Xpoly = poly.fit_transform(X_scaled)
    for train, test in kf.split(Xpoly, y):
        lm = LinearRegression()
        lm.fit(Xpoly[train], y[train])
        validation_dict[degree]["true"].extend(y[test])
        validation_dict[degree]["predict"].extend(lm.predict(Xpoly[test]))

# Pooled out-of-fold MSE per degree.
for degree, folds in validation_dict.items():
    y_true = folds["true"]
    y_pred = folds["predict"]
    mse = mean_squared_error(y_true, y_pred)
    print(f"Degree: {degree}, MSE: {mse:0.3}")

## 5.3.4 The Bootstrap