In [None]:
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, LeaveOneOut
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# 5.3 Lab: Cross-Validation and the Bootstrap

## 5.3.1 The Validation Set Approach

In [None]:
# Fetch the "Auto" table from the R package "ISLR" through statsmodels;
# cache=True asks statsmodels to reuse a cached copy of the download.
df = sm.datasets.get_rdataset(
    dataname="Auto",
    package="ISLR",
    cache=True,
).data

In [None]:
# Show the first five rows as a quick sanity check of the loaded frame.
df.head(n=5)

In [None]:
# Validation-set approach: 196 rows are drawn for training and the rest are
# held out for testing. random_state pins the shuffle so the split is
# reproducible across runs.
X = df["horsepower"]
y = df["mpg"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=196,
    random_state=42,
)

In [None]:
# X_train / X_test hold a single predictor column, while sklearn estimators
# expect a 2-D feature matrix, hence the reshape to (n_samples, 1).
lm = LinearRegression().fit(X_train.to_numpy().reshape(-1, 1), y_train)
# Test-set MSE of the fitted model on the held-out rows.
mean_squared_error(
    y_true=y_test,
    y_pred=lm.predict(X_test.to_numpy().reshape(-1, 1)),
)

In [None]:
# Add squared and cubed horsepower as extra columns on df; later cells
# select them by name to build the polynomial design matrices.
df["hp_sq"] = df["horsepower"].pow(2)
df["hp_cube"] = df["horsepower"].pow(3)

In [None]:
# Quadratic model: mpg regressed on horsepower and horsepower^2, scored on
# the held-out rows. Split size and seed match the linear-model cell.
X = df.loc[:, ["horsepower", "hp_sq"]]
y = df["mpg"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=196,
    random_state=42,
)
lm = LinearRegression().fit(X_train, y_train)
mean_squared_error(y_true=y_test, y_pred=lm.predict(X_test))

In [None]:
# Cubic model: mpg regressed on horsepower, horsepower^2 and horsepower^3,
# scored on the held-out rows. Split size and seed match the earlier cells.
X = df.loc[:, ["horsepower", "hp_sq", "hp_cube"]]
y = df["mpg"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=196,
    random_state=42,
)
lm = LinearRegression().fit(X_train, y_train)
mean_squared_error(y_true=y_test, y_pred=lm.predict(X_test))

## 5.3.2 Leave-One-Out Cross-Validation

In [None]:
# Leave-one-out CV for the linear model (mpg ~ horsepower).
# Each split holds out a single row; a fresh model is fitted on the remaining
# rows and its prediction for the held-out row is collected.
X = df["horsepower"].to_numpy().reshape(-1, 1)  # 2-D feature matrix
y = df["mpg"].to_numpy()
loo = LeaveOneOut()
y_true, y_pred = [], []
for train_index, test_index in loo.split(X):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    lm = LinearRegression().fit(X_train, y_train)
    # y_test and the prediction are length-1 arrays, so extend() adds
    # exactly one value per split.
    y_true.extend(y_test)
    y_pred.extend(lm.predict(X_test))
# MSE over all held-out predictions.
mean_squared_error(y_true, y_pred)

In [None]:
# Leave-one-out CV for the quadratic model (horsepower, horsepower^2).
# Each split holds out a single row; a fresh model is fitted on the remaining
# rows and its prediction for the held-out row is collected.
X = df[["horsepower", "hp_sq"]].to_numpy()
y = df["mpg"].to_numpy()
loo = LeaveOneOut()
y_true, y_pred = [], []
for train_index, test_index in loo.split(X):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    lm = LinearRegression().fit(X_train, y_train)
    # y_test and the prediction are length-1 arrays, so extend() adds
    # exactly one value per split.
    y_true.extend(y_test)
    y_pred.extend(lm.predict(X_test))
# MSE over all held-out predictions.
mean_squared_error(y_true, y_pred)

In [None]:
# Leave-one-out CV for the cubic model (horsepower, horsepower^2, horsepower^3).
# Each split holds out a single row; a fresh model is fitted on the remaining
# rows and its prediction for the held-out row is collected.
X = df[["horsepower", "hp_sq", "hp_cube"]].to_numpy()
y = df["mpg"].to_numpy()
loo = LeaveOneOut()
y_true, y_pred = [], []
for train_index, test_index in loo.split(X):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    lm = LinearRegression().fit(X_train, y_train)
    # y_test and the prediction are length-1 arrays, so extend() adds
    # exactly one value per split.
    y_true.extend(y_test)
    y_pred.extend(lm.predict(X_test))
# MSE over all held-out predictions.
mean_squared_error(y_true, y_pred)