In [6]:
import numpy as np
import pandas as pd
import functools
import statsmodels.api as sm
from Regession_Model import RegressionModel
import itertools

# Load the modelling dataset from disk and keep only the rows that have no
# missing value in any column. Built as one chained expression so that
# `model_data` is never visible in a partially cleaned state.
model_data = pd.read_parquet('model_data.gzip').dropna()

In [2]:
# Split the dataset into train and test data.
# The split is made on date_id: rows strictly below the cutoff are used for
# training, rows at or above the cutoff are held out for testing.
TRAIN_END_DATE_ID = 478

# Feature columns are selected by name. DataFrame.filter applies the pattern with
# re.search, so each alternative keeps every column whose name contains it.
# NOTE(review): the trailing `_*` means "zero or more underscores", not a glob
# wildcard. The pattern is kept exactly as it was so the selected columns do not change.
FEATURE_REGEX = 'active_.*|imbalance_buy_sell_flag|order_book_imbalance_*|auction_matched_pct_*'

# Row masks are computed once and reused for both the features and the target,
# so X and y of each split are guaranteed to cover the same rows.
is_train = model_data['date_id'] < TRAIN_END_DATE_ID
is_test = model_data['date_id'] >= TRAIN_END_DATE_ID
assert is_train.sum() + is_test.sum() == len(model_data), \
    'train/test masks do not partition model_data (unexpected NaN in date_id?)'

y_train = model_data.loc[is_train, 'target']
X_train = model_data.loc[is_train].filter(regex=FEATURE_REGEX)

y_test = model_data.loc[is_test, 'target']
X_test = model_data.loc[is_test].filter(regex=FEATURE_REGEX)

# Initialise model with the train and test datasets
model = RegressionModel(X_train, y_train, X_test, y_test)

In [None]:
# Fit a simple linear regression model to the dataset and evaluate the fit

# Baseline: the MAE obtained by always predicting a target of zero, since
# mean(|y - 0|) == mean(|y|). The later regularised-model cells reuse these values.
baseline_train_mae = y_train.abs().mean()
baseline_test_mae = y_test.abs().mean()
print('Baseline Train MAE: ' + str(baseline_train_mae))
print('\nBaseline Test MAE: ' + str(baseline_test_mae))

# NOTE(review): the methods are invoked through the class with `model` passed
# explicitly. If they are ordinary instance methods, `model.fit_model(...)` is the
# idiomatic spelling - confirm against the Regession_Model module before changing.
RegressionModel.fit_model(model, model_name='linear_regression')

pred_train_y = RegressionModel.predict_model(model, train_or_test='train')
train_eval = RegressionModel.eval_fit(model, pred_train_y, metric_name='all', train_or_test='train')
print('\nTrain Evaluation Metrics:')
# The saved output shows eval_fit returning (metric_name, value) pairs; print one
# "name: value" line per metric rather than an index followed by the raw tuple.
print('\n'.join('{}: {}'.format(name, value) for name, value in train_eval))

pred_test_y = RegressionModel.predict_model(model, train_or_test='test')
test_eval = RegressionModel.eval_fit(model, pred_test_y, metric_name='all', train_or_test='test')
print('\nTest Evaluation Metrics:')
print('\n'.join('{}: {}'.format(name, value) for name, value in test_eval))

Baseline Train MAE: 6.385294214988701

Train Evaluation Metrics:
0:('mae', 6.308841173242167)
1:('mse', 86.86180450049464)
2:('rmse', 9.319968052546889)
3:('r2', 0.020630558318166736)
4:('exp_var', 0.02063055831816607)
5:('corr_coef', 0.14363341643979247)

Baseline MAE: 5.265409877425062

Test Evaluation Metrics:
0:('mae', 5.24225237755717)
1:('mse', 61.50558456337398)
2:('rmse', 7.842549621352356)
3:('r2', 0.0008892122604992059)
4:('exp_var', 0.0009855936263353238)
5:('corr_coef', 0.07460267243442188)


In [4]:
# Fit a ridge regression model to the dataset and evaluate the fit for a range of lambdas.
# Each lambda is passed to fit_model as reg_alpha.

lambdas = [0.1, 0.5, 1, 10]

# baseline_train_mae / baseline_test_mae are computed in the linear-regression cell
# (mean absolute target, i.e. the MAE of always predicting zero).
print('Baseline Train MAE: ' + str(baseline_train_mae))
print('\nBaseline Test MAE: ' + str(baseline_test_mae))

for lambda_i in lambdas:
    # The same `model` object is passed to every call; it is fitted with the current
    # lambda and then evaluated on the train and test splits in turn.
    RegressionModel.fit_model(model, model_name='ridge', reg_alpha=lambda_i)

    pred_train_y = RegressionModel.predict_model(model, train_or_test='train')
    train_eval = RegressionModel.eval_fit(model, pred_train_y, metric_name='all', train_or_test='train')
    print('\nTrain Evaluation Metrics (Lambda = ' + str(lambda_i) + '):')
    # eval_fit output is a sequence of (metric_name, value) pairs (see saved output);
    # print "name: value" instead of an index followed by the raw tuple.
    print('\n'.join('{}: {}'.format(name, value) for name, value in train_eval))

    pred_test_y = RegressionModel.predict_model(model, train_or_test='test')
    test_eval = RegressionModel.eval_fit(model, pred_test_y, metric_name='all', train_or_test='test')
    print('\nTest Evaluation Metrics (Lambda = ' + str(lambda_i) + '):')
    print('\n'.join('{}: {}'.format(name, value) for name, value in test_eval))

Baseline Train MAE: 6.385294214988701

Baseline MAE: 5.265409877425062

Train Evaluation Metrics (Lambda = 0.1):
0:('mae', 6.356405404096494)
1:('mse', 87.95647803424615)
2:('rmse', 9.378511504191172)
3:('r2', 0.008288081509870926)
4:('exp_var', 0.008288081509871259)
5:('corr_coef', 0.09142151278861063)

Test Evaluation Metrics (Lambda = 0.1):
0:('mae', 5.251287571864138)
1:('mse', 61.31853070364139)
2:('rmse', 7.830614963311719)
3:('r2', 0.0039277579547453145)
4:('exp_var', 0.004029828861439433)
5:('corr_coef', 0.06789049751002979)

Train Evaluation Metrics (Lambda = 0.5):
0:('mae', 6.360513068199507)
1:('mse', 88.06021682767191)
2:('rmse', 9.384040538471256)
3:('r2', 0.007118423513673533)
4:('exp_var', 0.007118423513673533)
5:('corr_coef', 0.08480720191540556)

Test Evaluation Metrics (Lambda = 0.5):
0:('mae', 5.253115996061536)
1:('mse', 61.33063958537427)
2:('rmse', 7.8313881008014326)
3:('r2', 0.0037310585094403326)
4:('exp_var', 0.003830214966563328)
5:('corr_coef', 0.06543468536

In [5]:
# Fit a lasso regression model to the dataset and evaluate the fit for a range of lambdas.
# Each lambda is passed to fit_model as reg_alpha.

lambdas = [0.1, 0.5, 1, 10]

# baseline_train_mae / baseline_test_mae are computed in the linear-regression cell
# (mean absolute target, i.e. the MAE of always predicting zero).
print('Baseline Train MAE: ' + str(baseline_train_mae))
print('\nBaseline Test MAE: ' + str(baseline_test_mae))

for lambda_i in lambdas:
    # The same `model` object is passed to every call; it is fitted with the current
    # lambda and then evaluated on the train and test splits in turn.
    RegressionModel.fit_model(model, model_name='lasso', reg_alpha=lambda_i)

    pred_train_y = RegressionModel.predict_model(model, train_or_test='train')
    train_eval = RegressionModel.eval_fit(model, pred_train_y, metric_name='all', train_or_test='train')
    print('\nTrain Evaluation Metrics (Lambda = ' + str(lambda_i) + '):')
    # eval_fit output is a sequence of (metric_name, value) pairs (see saved output);
    # print "name: value" instead of an index followed by the raw tuple.
    print('\n'.join('{}: {}'.format(name, value) for name, value in train_eval))

    pred_test_y = RegressionModel.predict_model(model, train_or_test='test')
    test_eval = RegressionModel.eval_fit(model, pred_test_y, metric_name='all', train_or_test='test')
    print('\nTest Evaluation Metrics (Lambda = ' + str(lambda_i) + '):')
    print('\n'.join('{}: {}'.format(name, value) for name, value in test_eval))

Baseline Train MAE: 6.385294214988701

Baseline MAE: 5.265409877425062

Train Evaluation Metrics (Lambda = 0.1):
0:('mae', 6.3828298190837565)
1:('mse', 88.64643252367043)
2:('rmse', 9.415223445233279)
3:('r2', 0.0005088240217365758)
4:('exp_var', 0.0005088240217365758)
5:('corr_coef', 0.04973508488346686)

Test Evaluation Metrics (Lambda = 0.1):
0:('mae', 5.264941885299262)
1:('mse', 61.53706443341193)
2:('rmse', 7.844556356698059)
3:('r2', 0.0003778460492058766)
4:('exp_var', 0.00048064691202698295)
5:('corr_coef', 0.04073889355649986)

Train Evaluation Metrics (Lambda = 0.5):
0:('mae', 6.385056756420479)
1:('mse', 88.69156092039204)
2:('rmse', 9.41761970565769)
3:('r2', 0.0)
4:('exp_var', 0.0)
5:('corr_coef', -5.582195785449494e-18)

Test Evaluation Metrics (Lambda = 0.5):
0:('mae', 5.267082710378381)
1:('mse', 61.56652949136296)
2:('rmse', 7.846434189577005)
3:('r2', -0.00010079109357108429)
4:('exp_var', 0.0)
5:('corr_coef', 1.9984844999027086e-18)

Train Evaluation Metrics (Lambd

In [7]:
# Fit an elastic net regression model to the dataset and evaluate the fit for a range
# of lambdas (passed as reg_alpha) and L1 ratios (passed as reg_l1_ratio)

lambdas = [0.1, 0.5, 1, 10]
l1_ratios = [0.1, 0.25, 0.5, 0.75, 0.9]

# baseline_train_mae / baseline_test_mae are computed in the linear-regression cell
# (mean absolute target, i.e. the MAE of always predicting zero).
print('Baseline Train MAE: ' + str(baseline_train_mae))
print('\nBaseline Test MAE: ' + str(baseline_test_mae))

# Full grid: every lambda paired with every l1_ratio; itertools.product varies the
# last argument fastest, so all ratios are tried for one lambda before the next.
lambda_ratio_comb = list(itertools.product(lambdas, l1_ratios))

for alpha, l1_ratio in lambda_ratio_comb:
    # Shared suffix identifying this grid point in both the train and test headers
    setting = '(alpha = ' + str(alpha) + ' and l1_ratio = ' + str(l1_ratio) + ')'

    RegressionModel.fit_model(model, model_name='elastic_net', reg_alpha=alpha, reg_l1_ratio=l1_ratio)

    pred_train_y = RegressionModel.predict_model(model, train_or_test='train')
    train_eval = RegressionModel.eval_fit(model, pred_train_y, metric_name='all', train_or_test='train')
    print('\nTrain Evaluation Metrics ' + setting + ':')
    # eval_fit output is a sequence of (metric_name, value) pairs (see saved output);
    # print "name: value" instead of an index followed by the raw tuple.
    print('\n'.join('{}: {}'.format(name, value) for name, value in train_eval))

    pred_test_y = RegressionModel.predict_model(model, train_or_test='test')
    test_eval = RegressionModel.eval_fit(model, pred_test_y, metric_name='all', train_or_test='test')
    print('\nTest Evaluation Metrics ' + setting + ':')
    print('\n'.join('{}: {}'.format(name, value) for name, value in test_eval))

Baseline Train MAE: 6.385294214988701

Baseline MAE: 5.265409877425062

Train Evaluation Metrics (alpha = 0.1 and l1_ratio = 0.1):
0:('mae', 6.377289573509032)
1:('mse', 88.51676327549187)
2:('rmse', 9.408334776967274)
3:('r2', 0.0019708486702253447)
4:('exp_var', 0.0019708486702253447)
5:('corr_coef', 0.04703085631276339)

Test Evaluation Metrics (alpha = 0.1 and l1_ratio = 0.1):
0:('mae', 5.2612535352383425)
1:('mse', 61.47406185905662)
2:('rmse', 7.840539640806404)
3:('r2', 0.001401274281627507)
4:('exp_var', 0.0014979055431916244)
5:('corr_coef', 0.03872926517205623)

Train Evaluation Metrics (alpha = 0.1 and l1_ratio = 0.25):
0:('mae', 6.377615235843961)
1:('mse', 88.52824887803335)
2:('rmse', 9.40894515224918)
3:('r2', 0.0018413481583131608)
4:('exp_var', 0.0018413481583131608)
5:('corr_coef', 0.047496157707605235)

Test Evaluation Metrics (alpha = 0.1 and l1_ratio = 0.25):
0:('mae', 5.261148825694223)
1:('mse', 61.47495089420457)
2:('rmse', 7.840596335369177)
3:('r2', 0.00138683