In [1]:
import time
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import BayesianRidge, LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

sns.set_theme("paper")
pd.options.mode.chained_assignment = None

In [2]:
X_TEST_LOAD_DIR = Path(
    Path.cwd().parent, "data.nosync", "transformed_data", "X_test.parquet"
)
y_TEST_LOAD_DIR = Path(
    Path.cwd().parent, "data.nosync", "transformed_data", "y_test.parquet"
)

X_TRAIN_LOAD_DIR = Path(
    Path.cwd().parent, "data.nosync", "transformed_data", "X_train.parquet"
)
y_TRAIN_LOAD_DIR = Path(
    Path.cwd().parent, "data.nosync", "transformed_data", "y_train.parquet"
)

In [3]:
X_test = pd.read_parquet(X_TEST_LOAD_DIR)
y_test = pd.read_parquet(y_TEST_LOAD_DIR)
y_train = pd.read_parquet(y_TRAIN_LOAD_DIR)
X_train = pd.read_parquet(X_TRAIN_LOAD_DIR)
y_test = np.ravel(y_test) # Required for ridge regression
y_train = np.ravel(y_train)

In [4]:
print(f"y_train: {y_train.shape}")
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_test: {y_test.shape}")

y_train: (9799929, 1)
X_train: (9799929, 19)
X_test: (4899964, 19)
y_test: (4899964, 1)


# Modelling
## Baseline metrics

It's hard to understand how 'good' a result is without a sense of the distribution of target values. 

In [5]:
y_test.describe()

Unnamed: 0,Load
count,4899964.0
mean,1704.915
std,1479.828
min,205.7
25%,826.8534
50%,1247.816
75%,1828.93
max,11110.3


In [6]:
def calculate_metrics(y_test:pd.DataFrame,y_pred:np.ndarray) -> dict:
    rmse = mean_squared_error(y_test, y_pred, squared = False)
    mse = mean_squared_error(y_test, y_pred, squared = True)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"""
          RMSE: {rmse:.2f} 
          MSE: {mse:.2f}  
          MAE: {mae:.2f}  
          R2: {r2:.2f}""")
    return {"RMSE":rmse, "MSE":mse, "MAE":mae, "R2":r2}
    

def calculate_cv_metrics(model, X_train, y_train, n_splits: int = 5) -> dict:
      tscv = TimeSeriesSplit(n_splits=5)
      rmse = np.sqrt(-cross_val_score(model, X_train, y_train, cv=tscv, scoring='neg_mean_squared_error'))
      R2 = cross_val_score(model, X_train, y_train, cv=tscv, scoring='r2')
      return {'RMSE':np.mean(rmse), 'R2':np.mean(R2)}
      

In [7]:
models = [LinearRegression(), Ridge(),BayesianRidge()]
# Lasso Regression does not converge
results = []
timings = []

In [13]:
for model in models:
    start = time.time()
    results.append(calculate_cv_metrics(model, X_train, y_train))
    end = time.time()
    model_time = end - start
    timings.append(model_time)
    print(f"model {model} took {round(model_time, 2)} seconds")

model LinearRegression() took 193.66 seconds
model Ridge() took 37.85 seconds
model BayesianRidge() took 152.9 seconds


In [14]:
results

[{'RMSE': 1254.092020427405, 'R2': 0.3983675148072827},
 {'RMSE': 1253.059966799359, 'R2': 0.39942823507159925},
 {'RMSE': 1254.092020427405, 'R2': 0.3983675148072827},
 {'RMSE': 1253.059966799359, 'R2': 0.39942823507159925},
 {'RMSE': 1254.09132664721, 'R2': 0.3983689156087088}]