In [1]:
import numpy as np
import statsmodels.api as sm
import seaborn as sn
import pandas as pd
from itertools import combinations
from sklearn.model_selection import KFold

In [69]:
def evaluate_model(model, X, y):
    eval = {}
    n, d = X.shape
    eps = model.predict(X) - y.flatten()
    mse = np.sum(np.square(eps)) / n
    var = np.var(eps)
    Q_inv = np.linalg.inv(X.T @ X / n)
    omega = X.T @ np.diag(eps) @ X / n
    eval['pred_hat'] = mse + 2 * np.trace(Q_inv @ omega @ Q_inv) / n
    eval['BIC'] = mse + 2 * np.log(n) * d * var / n
    eval['c_p_mallow'] = mse + 2 * d * var / n
    eval['cross_validation'] = cross_validation(X, y)
    return eval


def cross_validation(X, y, k=5):
    folds = KFold(k)
    mse = 0
    for train_idx, test_idx in folds.split(X):
        model = sm.OLS(y[train_idx], X[train_idx]).fit()
        mse += np.sum(np.square(model.predict(X[test_idx]) - y[test_idx].flatten()))
    return mse / X.shape[0]

In [95]:
models = []
tips = sn.load_dataset('tips')
tips['smoker'] = (tips['smoker'] == 'Yes').astype(int)
y = tips['tip'].to_numpy()
features = ['total_bill', 'size', 'smoker']
for i in range(1, 4):
    models.extend([list(x) for x in combinations(features, i)])
evaluations = {}
for idx, model in enumerate(models):
    X = tips[model]
    X = np.c_[np.ones(X.shape[0]), X]
    ols = sm.OLS(y, X)
    model = ols.fit()
    evaluations[f'Model {idx + 1}'] = evaluate_model(model, X, y)
for n, m in evaluations.items():
    print(n, m)

Model 1 {'pred_hat': 1.036368622218773, 'BIC': 1.1293829363951629, 'c_p_mallow': 1.0530033672902521, 'cross_validation': 1.0605033737602012}
Model 2 {'pred_hat': 1.4583462710170072, 'BIC': 1.580824200221883, 'c_p_mallow': 1.4739138978323705, 'cross_validation': 1.4762143190493433}
Model 3 {'pred_hat': 1.9065414998196406, 'BIC': 2.0783542760985156, 'c_p_mallow': 1.9377962785052085, 'cross_validation': 2.00072390204187}
Model 4 {'pred_hat': 1.014424668771491, 'BIC': 1.1517100030268406, 'c_p_mallow': 1.0395132229610944, 'cross_validation': 1.0423663451131129}
Model 5 {'pred_hat': 1.0356012403150814, 'BIC': 1.170169847788936, 'c_p_mallow': 1.056174754660546, 'cross_validation': 1.0816330343548475}
Model 6 {'pred_hat': 1.4549054839462867, 'BIC': 1.6350293677534635, 'c_p_mallow': 1.4757487937437261, 'cross_validation': 1.5699247673931107}
Model 7 {'pred_hat': 1.0177277486562684, 'BIC': 1.1956041353594216, 'c_p_mallow': 1.04623591578404, 'cross_validation': 1.0790084322353986}
