# Cross Validation
- How cross validation can be invalid due to time variants
- Example: Hospital readmission risk at discharge, over time

In [5]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, TimeSeriesSplit
from sklearn.metrics import roc_auc_score

np.random.seed(42)

n_days = 900
dates = pd.date_range("2020-01-01", periods=n_days, freq="D")

# Time trend: hospital gets better over time (lower readmissions)
time_trend = np.linspace(1.5, -1.5, n_days)

# Patient-level features
avg_length_of_stay = np.random.normal(4, 1.0, n_days)
num_prior_admits = np.random.poisson(1.2, n_days)
is_elderly = np.random.binomial(1, 0.35, n_days)

# True readmission risk (logit scale)
logit = (
    time_trend
    + 0.4 * avg_length_of_stay
    + 0.6 * num_prior_admits
    + 0.7 * is_elderly
)

p = 1 / (1 + np.exp(-logit))
y = np.random.binomial(1, p)

X = pd.DataFrame({
    "avg_length_of_stay": avg_length_of_stay,
    "num_prior_admits": num_prior_admits,
    "is_elderly": is_elderly,
})
X.head()

Unnamed: 0,avg_length_of_stay,num_prior_admits,is_elderly
0,4.496714,0,0
1,3.861736,3,0
2,4.647689,0,0
3,5.52303,3,1
4,3.765847,2,0


In [6]:
model = LogisticRegression(max_iter=1000)

cv_auc_random = cross_val_score(
    model,
    X,
    y,
    cv=5,
    scoring="roc_auc"
)

cv_auc_random.mean()

np.float64(0.6780205792706853)

In [7]:
tscv = TimeSeriesSplit(n_splits=5)

auc_scores = []

for train_idx, test_idx in tscv.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    model.fit(X_train, y_train)
    y_prob = model.predict_proba(X_test)[:, 1]

    auc_scores.append(roc_auc_score(y_test, y_prob))

np.mean(auc_scores)


np.float64(0.7020726829255646)

In [8]:
cv_auc_random.mean(), np.mean(auc_scores)

(np.float64(0.6780205792706853), np.float64(0.7020726829255646))

- This is supposed to show that consecutive time ordered data can leak information to future records and using a time series split can give you a more accurate representation of the cross validation score than regular cross validation.