In [1]:
import statsmodels.api as sm
import numpy as np
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

# 5.3 Lab: Cross-Validation and the Bootstrap

## 5.3.1 The Validation Set Approach

In [2]:
df = sm.datasets.get_rdataset("Auto", "ISLR", cache=True).data

  return dataset_meta["Title"].item()


In [3]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
1,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
2,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
3,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
4,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
5,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [4]:
y = df["mpg"]
X = df["horsepower"]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=196, random_state=42)

In [5]:
lm = LinearRegression()
lm.fit(X_train.values.reshape(-1, 1), y_train)
mean_squared_error(y_test, lm.predict(X_test.values.reshape(-1, 1)))

25.5738781896844

In [6]:
df["hp_sq"] = df["horsepower"] **2
df["hp_cube"] = df["horsepower"] **3

In [7]:
y = df["mpg"]
X = df[["horsepower", "hp_sq"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=196, random_state=42)
lm = LinearRegression()
lm.fit(X_train, y_train)
mean_squared_error(y_test, lm.predict(X_test))

22.218020050032855

In [8]:
y = df["mpg"]
X = df[["horsepower", "hp_sq", "hp_cube"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=196, random_state=42)
lm = LinearRegression()
lm.fit(X_train, y_train)
mean_squared_error(y_test, lm.predict(X_test))

22.667675435534864

## 5.3.2 Leave-One-Out Cross-Validation

In [9]:
y = df["mpg"].values
X = df["horsepower"].values.reshape(-1, 1)
loo = LeaveOneOut()
y_true = []
y_pred = []
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    lm = LinearRegression()
    lm.fit(X_train, y_train)
    y_true.append(y_test[0])
    y_pred.append(lm.predict(X_test)[0])
mean_squared_error(y_true, y_pred) 

24.231513517929226

In [10]:
y = df["mpg"].values
X = df[["horsepower", "hp_sq"]].values
loo = LeaveOneOut()
y_true = []
y_pred = []
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    lm = LinearRegression()
    lm.fit(X_train, y_train)
    y_true.append(y_test[0])
    y_pred.append(lm.predict(X_test)[0])
mean_squared_error(y_true, y_pred) 

19.24821312448967

In [11]:
y = df["mpg"].values
X = df[["horsepower", "hp_sq", "hp_cube"]].values
loo = LeaveOneOut()
y_true = []
y_pred = []
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    lm = LinearRegression()
    lm.fit(X_train, y_train)
    y_true.append(y_test[0])
    y_pred.append(lm.predict(X_test)[0])
mean_squared_error(y_true, y_pred) 

19.33498406402931

## 5.3.3 k-Fold Cross-Validation

In [12]:
y = df["mpg"].values
X = df["horsepower"].values.reshape(-1, 1)
validation_dict = dict()
kf = KFold(n_splits=10, shuffle=True, random_state=42)
for degree in range(1,11):
    validation_dict[degree] = {"true": list(), "predict": list()}
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    Xpoly = poly.fit_transform(X)
    for train, test in kf.split(Xpoly, y):
        lm = LinearRegression()
        lm.fit(Xpoly[train], y[train])
        validation_dict[degree]["true"].extend(y[test])
        validation_dict[degree]["predict"].extend(lm.predict(Xpoly[test]))
for degree in validation_dict.keys():
    y_true = validation_dict[degree]["true"]
    y_pred = validation_dict[degree]["predict"]
    mse = mean_squared_error(y_true, y_pred)
    print(f"Degree: {degree}, MSE: {mse:0.3}")

Degree: 1, MSE: 24.2
Degree: 2, MSE: 19.2
Degree: 3, MSE: 19.3
Degree: 4, MSE: 19.3
Degree: 5, MSE: 19.0
Degree: 6, MSE: 18.9
Degree: 7, MSE: 19.1
Degree: 8, MSE: 19.2
Degree: 9, MSE: 19.1
Degree: 10, MSE: 18.9


## 5.3.4 The Bootstrap

### Bootstrapping $\alpha$

In [13]:
df = sm.datasets.get_rdataset("Portfolio", "ISLR", cache=True).data

  return dataset_meta["Title"].item()


In [14]:
def alpha(df):
    x_vals = df["X"].values
    y_vals = df["Y"].values
    cov_mat = np.cov(x_vals, y_vals, ddof=0) #delta degress of freedom defaults to 1 in cov
    x_var = cov_mat[0][0]
    y_var = cov_mat[1][1]
    xy_cov = cov_mat[0][1]
    return (y_var - xy_cov) / (x_var + y_var - (2 * xy_cov))

In [15]:
alpha_ests = np.array([
    alpha(df.sample(frac=1, replace=True))
    for _ in range(1_000)
])

In [16]:
np.mean(alpha_ests)

0.5783434123047262

In [17]:
np.std(alpha_ests)

0.09053838024983998

### Bootstrapping OLS parameter estimates

In [18]:
df = sm.datasets.get_rdataset("Auto", "ISLR", cache=True).data

Get the parameters and standard errors using the usual formulas

In [19]:
auto_ols = sm.OLS(df["mpg"], sm.add_constant(df["horsepower"])).fit()
print(auto_ols.summary())

                            OLS Regression Results                            
Dep. Variable:                    mpg   R-squared:                       0.606
Model:                            OLS   Adj. R-squared:                  0.605
Method:                 Least Squares   F-statistic:                     599.7
Date:                Sun, 05 Jan 2020   Prob (F-statistic):           7.03e-81
Time:                        12:40:30   Log-Likelihood:                -1178.7
No. Observations:                 392   AIC:                             2361.
Df Residuals:                     390   BIC:                             2369.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         39.9359      0.717     55.660      0.0

  return ptp(axis=axis, out=out, **kwargs)


In [20]:
y = df["mpg"].values
X = df["horsepower"].values.reshape(-1, 1)
lm = LinearRegression().fit(X, y)
print(f"Horsepower: {lm.coef_[0]}, Intercept: {lm.intercept_}")

Horsepower: -0.15784473335365365, Intercept: 39.93586102117047


In [21]:
beta_1s = list()
beta_0s = list()
for _ in range(1_000):
    boot_df = df.sample(frac=1, replace=True)
    y = boot_df["mpg"].values
    X = boot_df["horsepower"].values.reshape(-1, 1)
    lm = LinearRegression().fit(X, y)
    beta_1s.append(lm.coef_[0])
    beta_0s.append(lm.intercept_)

beta_1s = np.array(beta_1s)
beta_0s = np.array(beta_0s)
print(f"Bootstrap beta1: {np.mean(beta_1s):0.2}, bootstrap intercept: {np.mean(beta_0s):0.2}, Bootstrap b1 se: {np.std(beta_1s):0.2}, Bootstrap intercept se: {np.std(beta_0s):0.2}")

Bootstrap beta1: -0.16, bootstrap intercept: 4e+01, Bootstrap b1 se: 0.0075, Bootstrap intercept se: 0.85
