In [2]:
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

from data_utils import load_info, create_dataloaders, load_preprocessed_data

In [3]:
def extract_data_from_loader(data_loader):
    """Materialise every batch of a data loader into two NumPy arrays.

    Args:
        data_loader: Iterable yielding ``(batch_inputs, batch_targets)`` pairs
            of tensors (objects exposing ``detach()``, ``cpu()`` and
            ``numpy()``).

    Returns:
        tuple: ``(inputs, targets)`` where ``inputs`` is the row-wise stack
        (``np.vstack``) of all input batches and ``targets`` is the
        concatenation (``np.concatenate``) of all target batches along the
        first axis, both in iteration order.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    inputs, targets = [], []
    for batch_inputs, batch_targets in data_loader:
        # detach() + cpu() keep the conversion valid for tensors that track
        # gradients or live on an accelerator; plain CPU tensors convert to
        # the same values as before.
        inputs.append(batch_inputs.detach().cpu().numpy())
        targets.append(batch_targets.detach().cpu().numpy())

    if not inputs:
        # Fail with an explicit message instead of the generic error that
        # np.vstack raises when handed an empty list.
        raise ValueError("data_loader yielded no batches; nothing to extract")

    return np.vstack(inputs), np.concatenate(targets)

In [4]:
# Load the preprocessed feature and target arrays and report their shapes.
input_data, target_data = load_preprocessed_data()
print(input_data.shape, target_data.shape)

# Only the first value returned by load_info() is used in this notebook.
firm_info, _ = load_info()

# Date boundaries and batch size forwarded verbatim to create_dataloaders().
split_options = dict(
    train_date='2008-01-01',
    valid_date='2015-01-01',
    test_date='2023-11-01',
    batch_size=2000,
)
train_loader, valid_loader, test_loader, _ = create_dataloaders(
    input_data, target_data, firm_info, **split_options)

# Length of each loader (presumably the number of batches per split — confirm
# against create_dataloaders in data_utils).
print(len(train_loader), len(valid_loader), len(test_loader))

(576574, 252) (576574, 3)
146 69 72


In [8]:
# Materialise each split as NumPy arrays so scikit-learn can consume them.
x_train, y_train = extract_data_from_loader(train_loader)
x_valid, y_valid = extract_data_from_loader(valid_loader)
x_test, y_test = extract_data_from_loader(test_loader)

# Print the batch size and per-sample feature shape of the first batch drawn
# from each loader (one loop instead of three copy-pasted snippets; the
# printed text is unchanged).
for split_name, split_loader in (
    ("Train", train_loader),
    ("Valid", valid_loader),
    ("Test", test_loader),
):
    first_batch = next(iter(split_loader))
    print(f"{split_name} loader: Batch size = {len(first_batch[0])}, Features = {first_batch[0][0].shape}")

# Keyword arguments passed to HistGradientBoostingRegressor.
params = {
    "max_iter": 500,
    "max_depth": 2,
    "learning_rate": 0.01,
    "verbose": 1,
    # Fixed seed: the estimator uses randomness for its binning subsample and
    # for the internal early-stopping hold-out split, so without a seed the
    # reported metrics change from run to run.
    "random_state": 42,
}

Train loader: Batch size = 2000, Features = torch.Size([250])
Valid loader: Batch size = 2000, Features = torch.Size([250])
Test loader: Batch size = 2000, Features = torch.Size([250])


In [6]:
# Build the boosted-tree regressor from `params` and fit it on the training
# split (fit() returns the estimator itself, so the calls can be chained).
# NOTE(review): only x_train/y_train are passed to fit(), so the "val loss"
# in the verbose log comes from the estimator's own internal hold-out, not
# from x_valid/y_valid — confirm this is the intended early-stopping set.
gb_model = HistGradientBoostingRegressor(**params).fit(x_train, y_train)

# Mean squared error on the validation split.
valid_predictions = gb_model.predict(x_valid)
valid_mse = mean_squared_error(y_valid, valid_predictions)
print(f"Validation MSE: {valid_mse}")

# R^2 score on the test split.
test_predictions = gb_model.predict(x_test)
gb_test_r2 = r2_score(y_test, test_predictions)
print(f"Model Test R2_score: {gb_test_r2}")

Binning 0.526 GB of training data: 7.018 s
Binning 0.058 GB of validation data: 0.276 s
Fitting gradient boosted rounds:
[1/500] 1 tree, 4 leaves, max depth = 2, train loss: 0.01257, val loss: 0.01312, in 0.043s
[2/500] 1 tree, 4 leaves, max depth = 2, train loss: 0.01257, val loss: 0.01312, in 0.037s
[3/500] 1 tree, 4 leaves, max depth = 2, train loss: 0.01256, val loss: 0.01311, in 0.032s
[4/500] 1 tree, 4 leaves, max depth = 2, train loss: 0.01256, val loss: 0.01311, in 0.036s
[5/500] 1 tree, 4 leaves, max depth = 2, train loss: 0.01255, val loss: 0.01310, in 0.028s
[6/500] 1 tree, 4 leaves, max depth = 2, train loss: 0.01255, val loss: 0.01310, in 0.034s
[7/500] 1 tree, 4 leaves, max depth = 2, train loss: 0.01255, val loss: 0.01310, in 0.032s
[8/500] 1 tree, 4 leaves, max depth = 2, train loss: 0.01254, val loss: 0.01309, in 0.037s
[9/500] 1 tree, 4 leaves, max depth = 2, train loss: 0.01253, val loss: 0.01308, in 0.049s
[10/500] 1 tree, 4 leaves, max depth = 2, train loss: 0.0125