In [65]:
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import logging

# Request INFO-level output on the root logger so the logging.info(...) calls
# used throughout this notebook are displayed.
logging.basicConfig(level=logging.INFO)

from data_utils import load_info, create_dataloaders, load_preprocessed_data

In [66]:
def extract_data_from_loader(data_loader):
    """Materialise every batch of a data loader into two NumPy arrays.

    Parameters
    ----------
    data_loader : iterable
        Yields ``(batch_inputs, batch_targets)`` pairs of tensors that expose
        ``detach()``, ``cpu()`` and ``numpy()`` (e.g. torch tensors).

    Returns
    -------
    inputs : numpy.ndarray
        All input batches stacked row-wise with ``np.vstack``.
    targets : numpy.ndarray
        All target batches joined along the first axis with ``np.concatenate``.

    Raises
    ------
    ValueError
        If the loader yields no batches.
    """
    inputs, targets = [], []
    for batch_inputs, batch_targets in data_loader:
        # detach()/cpu() keep the conversion valid for tensors that track
        # gradients or live on an accelerator; a bare .numpy() raises there.
        inputs.append(batch_inputs.detach().cpu().numpy())
        targets.append(batch_targets.detach().cpu().numpy())

    if not inputs:
        # Fail with a message that names the real cause instead of NumPy's
        # generic "need at least one array to concatenate".
        raise ValueError("data_loader yielded no batches; nothing to extract")

    return np.vstack(inputs), np.concatenate(targets)

def print_model_outputs(model, x_train, x_valid, x_test):
    """Log the first ten predictions of ``model`` for each data split.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(features)`` whose result can be sliced.
    x_train, x_valid, x_test : array-like
        Feature sets for the train, validation and test splits.

    Returns
    -------
    None
        The predictions are only written to the root logger at INFO level,
        in the order train, validation, test.
    """
    splits = (
        ('Train', x_train),
        ('Validation', x_valid),
        ('Test', x_test),
    )
    for label, features in splits:
        predictions = model.predict(features)
        # Lazy %-style arguments: the message is formatted only if INFO is
        # actually enabled, and the rendered text matches the old f-strings.
        logging.info('%s Predictions: %s', label, predictions[:10])

In [67]:
# Load the preprocessed feature/target arrays and the firm metadata.
input_data, target_data = load_preprocessed_data()
print(input_data.shape, target_data.shape)
firm_info, _ = load_info()

# Build the three loaders; the date arguments are forwarded unchanged to
# create_dataloaders (their exact split semantics live in data_utils).
train_loader, valid_loader, test_loader, _ = create_dataloaders(
    input_data,
    target_data,
    firm_info,
    train_date='2005-01-01',
    valid_date='2010-01-01',
    test_date='2015-11-01',
    batch_size=2000,
)

# Number of batches per loader, reported on stdout and through the logger.
n_train_batches = len(train_loader)
n_valid_batches = len(valid_loader)
n_test_batches = len(test_loader)

print(n_train_batches, n_valid_batches, n_test_batches)

logging.info(f'Train loader length: {n_train_batches}, Valid loader length: {n_valid_batches}, Test loader length: {n_test_batches}')

(576574, 252) (576574, 3)


INFO:root:Train loader length: 114, Valid loader length: 52, Test loader length: 57


114 52 57


In [68]:
# Peek at the first batch of every loader to confirm the batch size and the
# per-sample feature shape. One loop replaces three copy-pasted stanzas; the
# printed lines are unchanged and `first_batch` still ends up holding the
# test loader's first batch.
for split_name, loader in (
    ("Train", train_loader),
    ("Valid", valid_loader),
    ("Test", test_loader),
):
    first_batch = next(iter(loader))
    batch_inputs = first_batch[0]
    print(f"{split_name} loader: Batch size = {len(batch_inputs)}, Features = {batch_inputs[0].shape}")

Train loader: Batch size = 2000, Features = torch.Size([250])
Valid loader: Batch size = 2000, Features = torch.Size([250])
Test loader: Batch size = 2000, Features = torch.Size([250])


In [69]:
# Flatten each loader into in-memory NumPy arrays for scikit-learn.
x_train, y_train = extract_data_from_loader(train_loader)
x_valid, y_valid = extract_data_from_loader(valid_loader)
x_test, y_test = extract_data_from_loader(test_loader)

# Hyper-parameters forwarded to HistGradientBoostingRegressor.
# NOTE(review): with learning_rate=0.001 the logged train loss stays at
# 0.01417 across the first iterations — consider revisiting this value.
params = {
    "max_iter": 1000,
    "max_depth": 2,
    "learning_rate": 0.001,
    "verbose": 1,
    # The fit log reports a separate "validation data" binning step, i.e. the
    # estimator holds out part of the training data for early stopping. That
    # split (and the binning subsample) is random, so pin the seed to make
    # Restart & Run All reproduce the same model.
    "random_state": 42,
}

In [70]:
# Build the regressor from the `params` dict and fit it on the training arrays.
# NOTE(review): the fit log shows the estimator binning its own "validation
# data"; x_valid / y_valid are not passed here — confirm that is intended.
gb_model = HistGradientBoostingRegressor(**params)
gb_model.fit(x_train, y_train)

Binning 0.410 GB of training data: 4.887 s
Binning 0.046 GB of validation data: 0.112 s
Fitting gradient boosted rounds:
[1/1000] 1 tree, 4 leaves, max depth = 2, train loss: 0.01417, val loss: 0.01401, in 0.036s
[2/1000] 1 tree, 4 leaves, max depth = 2, train loss: 0.01417, val loss: 0.01401, in 0.022s
[3/1000] 1 tree, 4 leaves, max depth = 2, train loss: 0.01417, val loss: 0.01400, in 0.019s
[4/1000] 1 tree, 4 leaves, max depth = 2, train loss: 0.01417, val loss: 0.01400, in 0.016s
[5/1000] 1 tree, 4 leaves, max depth = 2, train loss: 0.01417, val loss: 0.01400, in 0.017s
[6/1000] 1 tree, 4 leaves, max depth = 2, train loss: 0.01417, val loss: 0.01400, in 0.025s
[7/1000] 1 tree, 4 leaves, max depth = 2, train loss: 0.01417, val loss: 0.01400, in 0.024s
[8/1000] 1 tree, 4 leaves, max depth = 2, train loss: 0.01417, val loss: 0.01400, in 0.014s
[9/1000] 1 tree, 4 leaves, max depth = 2, train loss: 0.01417, val loss: 0.01400, in 0.013s
[10/1000] 1 tree, 4 leaves, max depth = 2, train lo

In [71]:
# Sanity check: log the first ten predictions for each split.
print_model_outputs(gb_model, x_train, x_valid, x_test)

INFO:root:Train Predictions: [-1.32257570e-05 -1.32257570e-05 -1.32257570e-05 -1.32257570e-05
 -1.32257570e-05 -1.32257570e-05 -1.32257570e-05  1.31894124e-04
  1.31894124e-04 -2.03678582e-03]
INFO:root:Validation Predictions: [-0.00056705 -0.00056705 -0.00056705 -0.00109421 -0.00109421 -0.00056705
 -0.00056705 -0.00056705 -0.00056705 -0.00056705]
INFO:root:Test Predictions: [0.01572081 0.01727967 0.01799925 0.00268044 0.02152127 0.02152127
 0.02152127 0.02728469 0.02728469 0.00916026]


In [73]:
# Predict once per split and reuse the result for every metric, instead of
# re-running predict() on the validation set for each score.
y_valid_pred = gb_model.predict(x_valid)
y_test_pred = gb_model.predict(x_test)

valid_mse = mean_squared_error(y_valid, y_valid_pred)
print(f"Validation MSE: {valid_mse}")

gb_valid_r2 = r2_score(y_valid, y_valid_pred)
print(f"Model Valid R2_score: {gb_valid_r2}")

gb_test_r2 = r2_score(y_test, y_test_pred)
print(f"Model Test R2_score: {gb_test_r2}")

Validation MSE: 0.02584247441752927
Model Valid R2_score: -0.017348727865512226
Model Test R2_score: 0.0055547004707722225
