**Cross validation**

Steps:
- split
  - k fold
  - leave one out
  - leave k out
- eval
- aggragate

Dependency:
- Model class
  - fit
  - predict
- Metrics class (can be part of model class)
  - eval


In [4]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [43]:
import sklearn.datasets as toy_data
import numpy as np

X, y = toy_data.load_diabetes(return_X_y=True)
y = y.reshape(-1, 1)

class kFoldCv:
    def __init__(self, k, X, y):
        self.k = k
        self.X = X
        self.y = y
        self.splits = []

        if X is not None:
            self.split()
        
    def split(self):
        n = len(self.X)
        fold_len = n // self.k
        
        for i in range(self.k):
            eval_idx = list(range(fold_len * i, fold_len * (i+1)))
            train_idx = [j for j in range(n) if j not in eval_idx]
            self.splits.append((train_idx, eval_idx))

    def __len__(self):
        return self.k

    def __getitem__(self, idx):
        train_idx, eval_idx = self.splits[idx]

        return self.X[train_idx], self.y[train_idx], self.X[eval_idx], self.y[eval_idx]

    def __call__(self, X, y):
        self.X = X
        self.y = y

        self.split()

KFCV = kFoldCv(5, X, y)

class LinReg:
    def __init__(self, bias=True):
        self.bias = bias

    def fit(self, X, y):
        if self.bias:
            ones = np.ones_like(y)
            X_u = np.concatenate([X, ones], axis=1)

        self.weights = np.linalg.inv(X_u.T @ X_u) @ (X_u.T) @ (y)

    def predict(self, X):
        if self.bias:
            ones = np.ones(shape=(len(X), 1))
            return np.concatenate([X, ones], axis=1) @ self.weights
        else:
            return X @ self.weights

model = LinReg()

class RMSE:
    def __init__(self):
        pass

    def __call__(self, y_pred, y_true):
        return np.sqrt(np.mean((y_pred - y_true) ** 2))
    
score = RMSE()


def run_cross_validation(cv_iter, model, score, name):
    scores = []
    for i, data in enumerate(cv_iter):
        X_t, y_t, X_e, y_e = data

        mdl = model.fit(X_t, y_t)
        pred = model.predict(X_e)

        scores.append(score(pred, y_e))

    print('======')
    if name is not None:
        print(f'Cross validation run: {name}')

    print(f'Cross validation score mean {np.mean(scores)}, std {np.std(scores)}')
    print(f'Cross validation scores {scores}')
    print('======')

run_cross_validation(KFCV, model, score, 'my CV')


Cross validation run: my CV
Cross validation score mean 54.868924811339184, std 1.4108715713851354
Cross validation scores [52.77838976271778, 54.72325609766083, 57.1663145725575, 55.21340861259678, 54.46325501116304]


In [44]:
from sklearn.linear_model import LinearRegression

model_sklearn = LinearRegression()
run_cross_validation(KFCV, model_sklearn, score, 'my CV over sklearn linear reg')


Cross validation run: my CV over sklearn linear reg
Cross validation score mean 54.86892481133924, std 1.4108715713851592
Cross validation scores [52.778389762717744, 54.72325609766089, 57.1663145725575, 55.21340861259704, 54.463255011163014]
