In [18]:
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import logging

from data_utils import load_info, create_dataloaders, load_preprocessed_data

# Send INFO-and-above log records to model_output.log, each prefixed with a timestamp.
logging.basicConfig(
    format='%(asctime)s - %(message)s',
    level=logging.INFO,
    filename='model_output.log',
)

In [14]:
def extract_data_from_loader(data_loader):
    """Collect every batch yielded by ``data_loader`` into two NumPy arrays.

    Each item produced by ``data_loader`` is unpacked as an
    ``(inputs, targets)`` pair; both members must expose a ``.numpy()``
    method.

    Returns:
        A tuple ``(inputs, targets)``: the per-batch inputs stacked with
        ``np.vstack`` and the per-batch targets joined with
        ``np.concatenate``, both in iteration order.

    Raises:
        ValueError: propagated from NumPy when the loader yields no batches.
    """
    # Convert batch by batch, inputs first and then targets, in loader order.
    batches = [(xb.numpy(), yb.numpy()) for xb, yb in data_loader]
    stacked_inputs = np.vstack([xb for xb, _ in batches])
    joined_targets = np.concatenate([yb for _, yb in batches])
    return stacked_inputs, joined_targets

In [15]:
# Load the preprocessed input/target arrays and report their shapes.
input_data, target_data = load_preprocessed_data()
print(input_data.shape, target_data.shape)

# Only the first value returned by load_info() is used in this cell.
firm_info, _ = load_info()

# Build the train/valid/test loaders; the fourth return value is discarded.
# NOTE(review): the three dates are presumably the split boundaries -- confirm
# their exact meaning against data_utils.create_dataloaders.
train_loader, valid_loader, test_loader, _ = create_dataloaders(
    input_data,
    target_data,
    firm_info,
    train_date='2008-01-01',
    valid_date='2015-01-01',
    test_date='2023-11-01',
    batch_size=2000,
)

# Number of batches in each loader.
print(*map(len, (train_loader, valid_loader, test_loader)))

(576574, 252) (576574, 3)
146 69 72


In [16]:
# Materialise each loader into dense (inputs, targets) NumPy arrays,
# processing the loaders in train -> valid -> test order.
(x_train, y_train), (x_valid, y_valid), (x_test, y_test) = (
    extract_data_from_loader(loader)
    for loader in (train_loader, valid_loader, test_loader)
)

# Hyperparameters forwarded to HistGradientBoostingRegressor(**params).
params = {
    "max_iter": 500,        # maximum number of boosting iterations
    "max_depth": 2,         # shallow trees (at most 4 leaves each)
    "learning_rate": 0.01,
    "verbose": 1,           # print per-iteration progress while fitting
    # The estimator's early-stopping hold-out is drawn at random from the
    # training data (the fit log reports a separate "validation data" bin),
    # so pin the seed to make the fit reproducible on Restart & Run All.
    "random_state": 42,
}

In [19]:
# Fit the gradient-boosting regressor on the training split.
gb_model = HistGradientBoostingRegressor(**params)
gb_model.fit(x_train, y_train)

# Mean squared error on the validation split.
valid_mse = mean_squared_error(y_valid, gb_model.predict(x_valid))
print(f"Validation MSE: {valid_mse}")

# Coefficient of determination (R^2) on the test split. The printed label
# names the metric actually computed: this is r2_score, not an MSE.
gb_test_r2 = r2_score(y_test, gb_model.predict(x_test))
print(f"Model Test R2: {gb_test_r2}")

Binning 0.526 GB of training data: 5.452 s
Binning 0.058 GB of validation data: 0.129 s
Fitting gradient boosted rounds:
[1/500] 1 tree, 4 leaves, max depth = 2, train loss: 0.01262, val loss: 0.01270, in 0.016s
[2/500] 1 tree, 4 leaves, max depth = 2, train loss: 0.01261, val loss: 0.01270, in 0.014s
[3/500] 1 tree, 4 leaves, max depth = 2, train loss: 0.01260, val loss: 0.01269, in 0.014s
[4/500] 1 tree, 4 leaves, max depth = 2, train loss: 0.01260, val loss: 0.01268, in 0.014s
[5/500] 1 tree, 4 leaves, max depth = 2, train loss: 0.01259, val loss: 0.01268, in 0.013s
[6/500] 1 tree, 4 leaves, max depth = 2, train loss: 0.01259, val loss: 0.01268, in 0.012s
[7/500] 1 tree, 4 leaves, max depth = 2, train loss: 0.01258, val loss: 0.01267, in 0.014s
[8/500] 1 tree, 4 leaves, max depth = 2, train loss: 0.01258, val loss: 0.01267, in 0.013s
[9/500] 1 tree, 4 leaves, max depth = 2, train loss: 0.01257, val loss: 0.01266, in 0.014s
[10/500] 1 tree, 4 leaves, max depth = 2, train loss: 0.0125