In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
                         summarize,
                         poly)
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import scipy.stats as stats
import math
from matplotlib.pyplot import subplots

In [None]:
from functools import partial
from sklearn.model_selection import \
     (cross_validate,
      KFold,
      ShuffleSplit)
from sklearn.base import clone
from ISLP.models import sklearn_sm

In [None]:
# Create helper functions for computing predictions and the mean squared error

def predict(X, model):
    """Return the model's predictions for ``X`` as a pandas Series.

    Parameters
    ----------
    X : pandas.DataFrame
        Design matrix. It is passed unchanged to ``model.get_prediction`` and
        its index is reused for the returned Series.
    model : object
        Fitted results object exposing ``get_prediction(X)`` (in this notebook:
        the result of ``sm.OLS(...).fit()``).

    Returns
    -------
    pandas.Series
        Predicted values named ``'y_hat'``, indexed like ``X``.
    """
    prediction = model.get_prediction(X)
    # Prefer `predicted_mean`; fall back to `predicted` for prediction objects
    # that only expose the newer attribute name, so either spelling works.
    try:
        fitted = prediction.predicted_mean
    except AttributeError:
        fitted = prediction.predicted
    # The prediction object hands back a bare array, so re-attach X's row labels.
    return pd.Series(np.asarray(fitted).ravel(), index=X.index, name='y_hat')

def mse(y, y_hat):
    """Compute the mean squared error between observed and predicted values.

    Values are compared by position (any pandas index is ignored), so ``y`` and
    ``y_hat`` must list the records in the same order.

    Parameters
    ----------
    y : array-like
        Observed values (NumPy array, pandas Series, or a plain sequence).
    y_hat : array-like
        Predicted values, with the same shape as ``y``.

    Returns
    -------
    numpy.float64
        Mean of the squared residuals ``(y - y_hat) ** 2``.

    Raises
    ------
    ValueError
        If the inputs have different shapes or contain no records.
    """
    observed = np.asarray(y, dtype=float)
    predicted = np.asarray(y_hat, dtype=float)
    # Reject mismatched shapes up front: subtracting e.g. (n,) from (n, 1)
    # would otherwise broadcast silently into an (n, n) matrix of residuals.
    if observed.shape != predicted.shape:
        raise ValueError(
            f"y and y_hat must have the same shape, got {observed.shape} and {predicted.shape}"
        )
    if observed.size == 0:
        raise ValueError("mse requires at least one record")
    # residual per record -> squared -> averaged over all records
    resid = observed - predicted
    return np.mean(resid ** 2)

In [None]:
# Simulate 150 observations of y = x - 2x^2 + noise, where both x and the
# noise term are normal draws taken from one seeded generator (x first).
rng = np.random.default_rng(1)
x = rng.normal(loc=0.0, scale=1.0, size=150)
y = x - 2 * np.square(x) + rng.normal(loc=0.0, scale=1.0, size=150)

In [None]:
# Build the modelling frame: a constant column plus x, x^2 and x^3
new_x = pd.DataFrame({
    'intercept': x**0,
    'x': x,
    'x_sq': x**2,
    'x_cu': x**3,
})
new_x

In [None]:
# Hold out 33% of the rows as a test set; rows are shuffled with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    new_x,
    y,
    test_size=0.33,
    shuffle=True,
    random_state=314,
)

In [None]:
# Let's graph the training portion of the data we created
fig, ax = plt.subplots()
ax.scatter(x_train['x'], y_train)
# Label the figure so it can be read on its own; the trailing semicolon keeps
# the return value of the last call out of the cell output.
ax.set(xlabel='x', ylabel='y', title='Simulated training data');

In [None]:
# Linear model: regress y on the intercept column and x
model_lin = sm.OLS(endog=y_train, exog=x_train[['intercept', 'x']])
results_lin = model_lin.fit()
summarize(results_lin)

In [None]:
# Training-set predictions from the linear fit
predictions_lin_train = predict(X=x_train[['intercept', 'x']], model=results_lin)

In [None]:
# Plot the linear fit as a solid line over the training data
fig, ax = plt.subplots()
ax.scatter(x_train['x'], y_train)
# Sort by x before drawing, as the quadratic and cubic plots do, so the line is
# traced left to right rather than back and forth through the shuffled rows.
linear = pd.DataFrame({'x': x_train['x'], 'pred_lin': predictions_lin_train}).sort_values('x')
ax.plot(linear['x'], linear['pred_lin'], color='green')
# Label the figure; the trailing semicolon keeps the return value out of the output.
ax.set(xlabel='x', ylabel='y', title='Linear fit (training data)');

In [None]:
# Quadratic model: regress y on the intercept column, x and x^2
model_quad = sm.OLS(endog=y_train, exog=x_train[['intercept', 'x', 'x_sq']])
results_quad = model_quad.fit()
summarize(results_quad)

In [None]:
# Store the quadratic fit's training predictions as a new column on x_train
x_train['pred_sq'] = predict(X=x_train[['intercept', 'x', 'x_sq']], model=results_quad)

In [None]:
# Plot the quadratic fit as a solid line over the training data
fig, ax = plt.subplots()
ax.scatter(x_train['x'], y_train)
# Sort by x so the curve is drawn left to right
quadratic = x_train[['x', 'pred_sq']].sort_values('x')
ax.plot(quadratic['x'], quadratic['pred_sq'], color='red')
# Label the figure; the trailing semicolon keeps the return value out of the output.
ax.set(xlabel='x', ylabel='y', title='Quadratic fit (training data)');

In [None]:
# Cubic model: regress y on the intercept column, x, x^2 and x^3
model_cubic = sm.OLS(endog=y_train, exog=x_train[['intercept', 'x', 'x_sq', 'x_cu']])
results_cubic = model_cubic.fit()
summarize(results_cubic)

In [None]:
# Store the cubic fit's training predictions as a new column on x_train
x_train['pred_cu'] = predict(X=x_train[['intercept', 'x', 'x_sq', 'x_cu']], model=results_cubic)

In [None]:
# Plot the cubic fit as a solid line over the training data
fig, ax = plt.subplots()
ax.scatter(x_train['x'], y_train)
# Sort by x so the curve is drawn left to right
cubic = x_train[['x', 'pred_cu']].sort_values('x')
ax.plot(cubic['x'], cubic['pred_cu'], color='orange')
# Label the figure; the trailing semicolon keeps the return value out of the output.
ax.set(xlabel='x', ylabel='y', title='Cubic fit (training data)');

In [None]:
# Training-set MSE for each fitted model: predict on the training rows, then
# score those predictions against y_train.
predictions_lin_train = predict(X=x_train[['intercept', 'x']], model=results_lin)
print('mse train linear   :', mse(y=y_train, y_hat=predictions_lin_train))

predictions_quad_train = predict(X=x_train[['intercept', 'x', 'x_sq']], model=results_quad)
print('mse train quadratic:', mse(y=y_train, y_hat=predictions_quad_train))

predictions_cubic_train = predict(X=x_train[['intercept', 'x', 'x_sq', 'x_cu']], model=results_cubic)
print('mse train cubic    :', mse(y=y_train, y_hat=predictions_cubic_train))

In [None]:
# Test-set MSE for each fitted model: predict on the held-out rows, then
# score those predictions against y_test.
predictions_lin_test = predict(X=x_test[['intercept', 'x']], model=results_lin)
print('mse test linear   :', mse(y=y_test, y_hat=predictions_lin_test))

predictions_quad_test = predict(X=x_test[['intercept', 'x', 'x_sq']], model=results_quad)
print('mse test quadratic:', mse(y=y_test, y_hat=predictions_quad_test))

predictions_cubic_test = predict(X=x_test[['intercept', 'x', 'x_sq', 'x_cu']], model=results_cubic)
print('mse test cubic    :', mse(y=y_test, y_hat=predictions_cubic_test))

In [None]:
# 5-fold cross-validation of the linear model on the training rows.
# The per-fold scores are averaged and reported as the estimated test MSE
# (NOTE(review): assumes the sklearn_sm scorer returns MSE for OLS - confirm).
M = sklearn_sm(sm.OLS)
M_CV = cross_validate(M, X=x_train[['intercept', 'x']], y=y_train, cv=5)
print(M_CV['test_score'])
cv_error_lin = M_CV['test_score'].mean()
print('estimated mse test:', cv_error_lin)

In [None]:
# 5-fold cross-validation of the quadratic model on the training rows.
# The per-fold scores are averaged and reported as the estimated test MSE
# (NOTE(review): assumes the sklearn_sm scorer returns MSE for OLS - confirm).
M = sklearn_sm(sm.OLS)
M_CV = cross_validate(M, X=x_train[['intercept', 'x', 'x_sq']], y=y_train, cv=5)
print(M_CV['test_score'])
cv_error_quad = M_CV['test_score'].mean()
print('estimated mse test:', cv_error_quad)

In [None]:
# 5-fold cross-validation of the cubic model on the training rows.
# The per-fold scores are averaged and reported as the estimated test MSE
# (NOTE(review): assumes the sklearn_sm scorer returns MSE for OLS - confirm).
M = sklearn_sm(sm.OLS)
M_CV = cross_validate(M, X=x_train[['intercept', 'x', 'x_sq', 'x_cu']], y=y_train, cv=5)
print(M_CV['test_score'])
cv_error_cubic = M_CV['test_score'].mean()
print('estimated mse test:', cv_error_cubic)