In [9]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
import sklearn.metrics as metrics
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score, cross_validate

In [2]:
# Load the modelling table and drop every row that contains a missing value.
# Chaining `.dropna()` (rather than `inplace=True`) builds the cleaned frame in
# one expression, so re-running the cell always starts from the file on disk.
model_data = pd.read_parquet('model_data.gzip').dropna()

# Rows with `date_id` below this value form the training set; the rest are the test set.
TEST_START_DATE_ID = 478

# Column-name pattern selecting the model features. It is defined once so the
# train and test feature sets cannot drift apart.
# NOTE(review): `DataFrame.filter(regex=...)` matches with `re.search`, and the
# trailing `_*` means "zero or more underscores" (it is not a wildcard), so any
# column whose name merely contains `order_book_imbalance` or
# `auction_matched_pct` is kept. Confirm this is the intended selection.
FEATURE_REGEX = 'active_.*|imbalance_buy_sell_flag|order_book_imbalance_*|auction_matched_pct_*'

# Compute the row masks and the feature frame once and reuse them below.
is_train = model_data['date_id'] < TEST_START_DATE_ID
is_test = model_data['date_id'] >= TEST_START_DATE_ID
features = model_data.filter(regex=FEATURE_REGEX)

# `filter` silently returns a zero-column frame when nothing matches; fail here
# with a clear message instead of deep inside the model fit.
assert features.shape[1] > 0, 'FEATURE_REGEX matched no columns in model_data'

# Split the dataset into train and test data
y_train = model_data.loc[is_train, 'target']
X_train = features.loc[is_train]

y_test = model_data.loc[is_test, 'target']
X_test = features.loc[is_test]

# Baseline error: the MAE obtained by always predicting 0, i.e. the mean
# absolute value of the target on each split.
baseline_train_mae = y_train.abs().mean()
baseline_test_mae = y_test.abs().mean()

In [None]:
# Ordinary least-squares model: fit on the training split, predict both splits,
# and compare the mean absolute error (MAE) of each against its baseline.
lin_reg = LinearRegression().fit(X_train, y_train)

y_train_preds = lin_reg.predict(X_train)
train_score = metrics.mean_absolute_error(y_train, y_train_preds)

y_test_preds = lin_reg.predict(X_test)
test_score = metrics.mean_absolute_error(y_test, y_test_preds)

# (label, value) rows are printed in order, separated by one blank line.
mae_report = [
    ('Baseline Train MAE', baseline_train_mae),
    ('Model Train MAE', train_score),
    ('Baseline Test MAE', baseline_test_mae),
    ('Model Test MAE', test_score),
]
print('\n\n'.join(f"{label}: {round(value, 4)!s}" for label, value in mae_report))

Baseline Train MAE: 6.3853

Model Train MAE: 6.3088

Baseline Test MAE: 5.2654

Model Test MAE: 5.2423


In [None]:
# 5-fold cross-validation of the linear regression on the training data, to
# check that its MAE is stable across different subsets.
# NOTE(review): the folds are shuffled, so rows from the same or later
# `date_id` can land in both the fitting and validation parts of a fold, while
# the hold-out split above is by date. If rows are temporally dependent,
# a time-ordered or grouped splitter may give a more honest estimate - confirm.
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)
cv_results = cross_validate(
    lin_reg,
    X_train,
    y_train,
    cv=kf,
    scoring='neg_mean_absolute_error',
    return_train_score=True,
    return_estimator=True,
)

# The scorer returns *negative* MAE (higher is better), so flip the sign for display.
neg_mae_per_fold = cv_results['test_score']
print(f"MAE for each fold: {[round(-score, 4) for score in neg_mae_per_fold]}")
print(f"Average MAE across {k} folds: {-np.mean(neg_mae_per_fold):.4f}")

MAE for each fold: [6.2923, 6.3091, 6.3063, 6.3077, 6.3142]
Average MAE across 5 folds: 6.3059


In [None]:
# Ridge regression: choose the penalty strength `alpha` by grid search, scoring
# each candidate by cross-validated MAE on the `kf` splitter, then evaluate the
# winning model on the train and test splits.
# NOTE(review): the recorded run selects alpha=0.001, the smallest value in the
# grid, so the optimum may lie below the searched range - consider extending
# the grid downward.
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
ridge_reg = Ridge()
grid_search = GridSearchCV(
    estimator=ridge_reg,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=kf,
)
grid_search.fit(X_train, y_train)

# Winning hyperparameter and the corresponding fitted estimator.
best_model = grid_search.best_estimator_
best_alpha = grid_search.best_params_['alpha']

y_train_preds = best_model.predict(X_train)
train_score = metrics.mean_absolute_error(y_train, y_train_preds)

y_test_preds = best_model.predict(X_test)
test_score = metrics.mean_absolute_error(y_test, y_test_preds)

# Report lines are printed in order, separated by one blank line.
ridge_report = [f"Best alpha: {best_alpha}"] + [
    f"{label}: {round(value, 4)!s}"
    for label, value in (
        ('Baseline Train MAE', baseline_train_mae),
        ('Model Train MAE', train_score),
        ('Baseline Test MAE', baseline_test_mae),
        ('Model Test MAE', test_score),
    )
]
print('\n\n'.join(ridge_report))

Best alpha: 0.001

Baseline Train MAE: 6.3853

Model Train MAE: 6.3406

Baseline Test MAE: 5.2654

Model Test MAE: 5.2479


In [12]:
# Lasso regression: choose the penalty strength `alpha` by grid search, scoring
# each candidate by cross-validated MAE on the `kf` splitter, then evaluate the
# winning model on the train and test splits.
# NOTE(review): the recorded run selects alpha=0.001, the smallest value in the
# grid, so the optimum may lie below the searched range - consider extending
# the grid downward.
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
lasso_reg = Lasso()
grid_search = GridSearchCV(
    estimator=lasso_reg,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=kf,
)
grid_search.fit(X_train, y_train)

# Winning hyperparameter and the corresponding fitted estimator.
best_model = grid_search.best_estimator_
best_alpha = grid_search.best_params_['alpha']

y_train_preds = best_model.predict(X_train)
train_score = metrics.mean_absolute_error(y_train, y_train_preds)

y_test_preds = best_model.predict(X_test)
test_score = metrics.mean_absolute_error(y_test, y_test_preds)

# Report lines are printed in order, separated by one blank line.
lasso_report = [f"Best alpha: {best_alpha}"] + [
    f"{label}: {round(value, 4)!s}"
    for label, value in (
        ('Baseline Train MAE', baseline_train_mae),
        ('Model Train MAE', train_score),
        ('Baseline Test MAE', baseline_test_mae),
        ('Model Test MAE', test_score),
    )
]
print('\n\n'.join(lasso_report))

Best alpha: 0.001

Baseline Train MAE: 6.3853

Model Train MAE: 6.3723

Baseline Test MAE: 5.2654

Model Test MAE: 5.2578
