In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import patsy

In [4]:
# Load the Auto data set. '?' is declared as the missing-value marker, and every
# row that contains a missing value is then dropped.
auto_raw = pd.read_csv('../data/Auto.csv', na_values='?')
auto = auto_raw.dropna()
# Preview the first rows of the cleaned frame.
auto.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino


In [6]:
from sklearn.model_selection import train_test_split

# 50/50 hold-out split of the rows of `auto`; the fixed random_state makes the
# split reproducible from run to run.
train, valid = train_test_split(
    auto,
    test_size=0.5,
    random_state=42,
)

### Validation set approach

In [8]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [13]:
# Validation-set approach, degree 1: fit mpg on horsepower using the training
# half only, then report the mean squared error on the hold-out half.
lm_fit1 = smf.ols('mpg ~ horsepower', data=train).fit()
# Hand predict() the whole hold-out frame so the formula resolves `horsepower`
# by column name; a bare Series forces statsmodels to guess its orientation.
np.mean((valid.mpg - lm_fit1.predict(valid))**2)

25.573878189684407

In [18]:
# Validation-set approach, degree 2. R's poly() is not a patsy built-in, so the
# quadratic term is spelled out with I(); these are raw (non-orthogonal) powers.
lm_fit2 = smf.ols('mpg ~ horsepower + I(horsepower**2)', data=train).fit()
# Hand predict() the whole hold-out frame so the formula resolves `horsepower`
# by column name; a bare Series forces statsmodels to guess its orientation.
np.mean((valid.mpg - lm_fit2.predict(valid))**2)

22.21802005003901

In [20]:
# Validation-set approach, degree 3: raw cubic polynomial in horsepower, fit on
# the training half and scored by mean squared error on the hold-out half.
lm_fit3 = smf.ols('mpg ~ horsepower + I(horsepower**2) + I(horsepower**3)', data=train).fit()
# Hand predict() the whole hold-out frame so the formula resolves `horsepower`
# by column name; a bare Series forces statsmodels to guess its orientation.
np.mean((valid.mpg - lm_fit3.predict(valid))**2)

22.667675434958934

### Leave-one-out cross-validation (LOOCV)

In [29]:
from sklearn.model_selection import LeaveOneOut, KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

In [25]:
# Leave-one-out CV for the linear model mpg ~ horsepower: each split holds out
# a single row, fits on the rest, and records the squared error on that row.
loo_mse = []
for train_idx, valid_idx in LeaveOneOut().split(auto):
    # Fold-local names: rebinding `train`/`valid` here would silently replace
    # the 50/50 hold-out split used by the validation-set cells.
    fold_train, fold_valid = auto.iloc[train_idx], auto.iloc[valid_idx]
    X_train, y_train = fold_train[['horsepower']], fold_train['mpg']
    X_valid, y_valid = fold_valid[['horsepower']], fold_valid['mpg']
    lm = LinearRegression().fit(X_train, y_train)
    loo_mse.append(mean_squared_error(y_valid, lm.predict(X_valid)))
# The LOOCV estimate is the average of the per-row errors.
np.mean(loo_mse)

24.231513517929226

In [28]:
# Leave-one-out CV for polynomial fits of degree 1 through 5 in horsepower.
# `deg_mse[i]` ends up holding the LOOCV mean squared error for `degrees[i]`.
degrees = np.arange(1, 6)
deg_mse = []
for deg in degrees:
    # The feature expander depends only on the degree, so build it once per
    # degree; fit_transform() below still re-fits it on every training fold.
    poly = PolynomialFeatures(degree=deg)
    loo_mse = []
    for train_idx, valid_idx in LeaveOneOut().split(auto):
        # Fold-local names: rebinding `train`/`valid` here would silently
        # replace the 50/50 hold-out split used by the validation-set cells.
        fold_train, fold_valid = auto.iloc[train_idx], auto.iloc[valid_idx]
        X_train, y_train = fold_train[['horsepower']], fold_train['mpg']
        X_valid, y_valid = fold_valid[['horsepower']], fold_valid['mpg']
        X_train_poly = poly.fit_transform(X_train)
        X_valid_poly = poly.transform(X_valid)
        lm = LinearRegression().fit(X_train_poly, y_train)
        loo_mse.append(mean_squared_error(y_valid, lm.predict(X_valid_poly)))
    deg_mse.append(np.mean(loo_mse))
deg_mse

[24.231513517929226,
 19.24821312448939,
 19.334984064114092,
 19.424430309411886,
 19.033211842978396]

### k-fold cross-validation

In [39]:
# Demonstration on a 10-element dummy array: a KFold built with the same integer
# random_state yields the same shuffled index sets on every pass of the outer
# loop, so each degree would be evaluated on identical folds.
# (No accumulators here: this cell only prints, and resetting `deg_mse` would
# discard the results computed by the cross-validation cells.)
degrees = np.arange(1, 3)
ones = np.ones(10)
for deg in degrees:
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    # The indices returned depend only on the length of `ones`, not its values.
    for train_idx, valid_idx in kf.split(ones):
        print(train_idx, valid_idx)

[0 2 3 4 5 6 7 9] [1 8]
[1 2 3 4 6 7 8 9] [0 5]
[0 1 3 4 5 6 8 9] [2 7]
[0 1 2 3 5 6 7 8] [4 9]
[0 1 2 4 5 7 8 9] [3 6]
[0 2 3 4 5 6 7 9] [1 8]
[1 2 3 4 6 7 8 9] [0 5]
[0 1 3 4 5 6 8 9] [2 7]
[0 1 2 3 5 6 7 8] [4 9]
[0 1 2 4 5 7 8 9] [3 6]


In [34]:
# 10-fold CV for polynomial fits of degree 1 through 10 in horsepower.
# `deg_mse[i]` ends up holding the mean of the per-fold MSEs for `degrees[i]`.
degrees = np.arange(1, 11)
# With an integer random_state every split() call yields the same shuffled
# folds, so one splitter is shared and all degrees are scored on identical folds.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
deg_mse = []
for deg in degrees:
    # The feature expander depends only on the degree, so build it once per
    # degree; fit_transform() below still re-fits it on every training fold.
    poly = PolynomialFeatures(degree=deg)
    fold_mse = []
    for train_idx, valid_idx in kf.split(auto):
        # Fold-local names: rebinding `train`/`valid` here would silently
        # replace the 50/50 hold-out split used by the validation-set cells.
        fold_train, fold_valid = auto.iloc[train_idx], auto.iloc[valid_idx]
        X_train, y_train = fold_train[['horsepower']], fold_train['mpg']
        X_valid, y_valid = fold_valid[['horsepower']], fold_valid['mpg']
        X_train_poly = poly.fit_transform(X_train)
        X_valid_poly = poly.transform(X_valid)
        lm = LinearRegression().fit(X_train_poly, y_train)
        fold_mse.append(mean_squared_error(y_valid, lm.predict(X_valid_poly)))
    deg_mse.append(np.mean(fold_mse))
deg_mse

[24.199808197692484,
 19.228636614267835,
 19.266265346704934,
 19.351092276514912,
 19.02322470453627,
 18.95474301224136,
 19.07799960256754,
 19.15690094766876,
 19.06359682272744,
 18.885403509750095]

## Bootstrap

**TODO:** This section is not implemented yet; I still need to work out how to do the bootstrap.