In [2]:
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

from data_utils import load_info, create_dataloaders, load_preprocessed_data

In [3]:
def extract_data_from_loader(data_loader):
    """Collect every batch of a data loader into two NumPy arrays.

    Parameters
    ----------
    data_loader : iterable
        Yields ``(batch_inputs, batch_targets)`` pairs of tensors.

    Returns
    -------
    inputs : numpy.ndarray
        All input batches stacked row-wise with ``np.vstack``.
    targets : numpy.ndarray
        All target batches joined along the first axis with ``np.concatenate``.

    Raises
    ------
    ValueError
        If ``data_loader`` yields no batches.
    """
    inputs, targets = [], []
    for batch_inputs, batch_targets in data_loader:
        # detach()/cpu() keep the conversion valid for tensors that track
        # gradients or live on an accelerator; plain CPU tensors are unaffected.
        inputs.append(batch_inputs.detach().cpu().numpy())
        targets.append(batch_targets.detach().cpu().numpy())
    if not inputs:
        # Fail with a clear message instead of NumPy's generic
        # "need at least one array to concatenate".
        raise ValueError("data_loader yielded no batches")
    inputs = np.vstack(inputs)
    targets = np.concatenate(targets)
    return inputs, targets

In [18]:
# Load the preprocessed arrays and split them into train/valid/test loaders.
#
# Printed shapes: (576574, 252) (576574, 3)
#   576574 = total number of rows, 252 = number of feature columns,
#   3 = number of target columns.
# len(loader) is the number of batches, i.e. roughly rows / batch_size.
# Open question: (146+69+72)*2000 = 574000 -- why does this not match 576574?
# NOTE(review): the batch counts printed below sum to the same 287 batches, and
# 287 * 2000 is smaller than the row count, so some rows presumably never reach
# any loader (outside the date range, or partial batches dropped) -- TODO
# confirm in create_dataloaders.
#
# Each batch carries 250 features: input_data supplies 250 feature columns
# (2 of the 252 columns are dropped).

input_data, target_data = load_preprocessed_data()
print(input_data.shape, target_data.shape)
firm_info, _ = load_info()

# The three dates and batch_size are passed straight to create_dataloaders;
# how the dates delimit each split is defined in data_utils -- TODO confirm.
train_loader, valid_loader, test_loader, _ = create_dataloaders(
    input_data, target_data, firm_info,
    train_date='1999-01-01', valid_date='2005-01-01', test_date='2023-11-01', batch_size=2000)

# Number of batches per split.
print(len(train_loader), len(valid_loader), len(test_loader))

(576574, 252) (576574, 3)
47 66 174


In [19]:
# Print the batch size and per-sample feature shape of the first batch drawn
# from each loader. One loop replaces three copy-pasted snippets; the printed
# lines are unchanged.
for loader_name, loader in (
    ("Train", train_loader),
    ("Valid", valid_loader),
    ("Test", test_loader),
):
    first_batch = next(iter(loader))
    # first_batch[0] holds the inputs of the batch; its first element is one sample.
    batch_inputs = first_batch[0]
    print(f"{loader_name} loader: Batch size = {len(batch_inputs)}, Features = {batch_inputs[0].shape}")

Train loader: Batch size = 2000, Features = torch.Size([250])
Valid loader: Batch size = 2000, Features = torch.Size([250])
Test loader: Batch size = 2000, Features = torch.Size([250])


In [20]:
# Materialise each split as NumPy arrays so scikit-learn can consume them.
x_train, y_train = extract_data_from_loader(train_loader)
x_valid, y_valid = extract_data_from_loader(valid_loader)
x_test, y_test = extract_data_from_loader(test_loader)

# Hyper-parameters for HistGradientBoostingRegressor.
params = {
    "max_iter": 1000,       # upper bound on boosting rounds
    "max_depth": 2,         # shallow trees
    "learning_rate": 0.01,
    "verbose": 1,           # log per-iteration train/validation loss
    # Fix the seed so binning subsampling and the internal early-stopping
    # validation split are reproducible across kernel restarts.
    "random_state": 42,
}

In [21]:
# Build the gradient-boosting regressor from `params` and fit it on the
# training split only.
# NOTE(review): the fit log reports a separate "validation data" binning step,
# so early stopping appears to be active with a holdout taken internally from
# x_train; x_valid is not passed to fit() -- TODO confirm this is intended.
gb_model = HistGradientBoostingRegressor(**params)
gb_model.fit(x_train, y_train)

Binning 0.169 GB of training data: 1.746 s
Binning 0.019 GB of validation data: 0.024 s
Fitting gradient boosted rounds:
[1/1000] 1 tree, 4 leaves, max depth = 2, train loss: 0.01061, val loss: 0.00992, in 0.005s
[2/1000] 1 tree, 4 leaves, max depth = 2, train loss: 0.01060, val loss: 0.00991, in 0.006s
[3/1000] 1 tree, 4 leaves, max depth = 2, train loss: 0.01059, val loss: 0.00990, in 0.005s
[4/1000] 1 tree, 4 leaves, max depth = 2, train loss: 0.01057, val loss: 0.00989, in 0.006s
[5/1000] 1 tree, 4 leaves, max depth = 2, train loss: 0.01056, val loss: 0.00988, in 0.005s
[6/1000] 1 tree, 4 leaves, max depth = 2, train loss: 0.01055, val loss: 0.00986, in 0.005s
[7/1000] 1 tree, 4 leaves, max depth = 2, train loss: 0.01054, val loss: 0.00985, in 0.004s
[8/1000] 1 tree, 4 leaves, max depth = 2, train loss: 0.01053, val loss: 0.00984, in 0.006s
[9/1000] 1 tree, 4 leaves, max depth = 2, train loss: 0.01051, val loss: 0.00983, in 0.004s
[10/1000] 1 tree, 4 leaves, max depth = 2, train lo

In [22]:
# Evaluate the fitted model on the validation and test splits.
# Predict once per split and reuse the result for every metric.
valid_pred = gb_model.predict(x_valid)
test_pred = gb_model.predict(x_test)

valid_mse = mean_squared_error(y_valid, valid_pred)
print(f"Validation MSE: {valid_mse}")

# This score is computed on the validation split, so label it as such
# (it was previously printed under the "Test" label).
gb_valid_r2 = r2_score(y_valid, valid_pred)
print(f"Model Validation R2_score: {gb_valid_r2}")

gb_test_r2 = r2_score(y_test, test_pred)
print(f"Model Test R2_score: {gb_test_r2}")

Validation MSE: 0.04687360860731694
Model Test R2_score: -0.39911340452980726
Model Test R2_score: -0.21825897491746082
