In [10]:
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import r2_score
from data_utils import load_info, create_dataloaders, load_preprocessed_data

In [11]:
def extract_data_from_loader(data_loader):
    """Materialise every batch of a loader into two NumPy arrays.

    The loader is iterated exactly once. Each item must unpack into an
    ``(inputs, targets)`` pair whose elements expose a ``.numpy()`` method.

    Args:
        data_loader: Iterable yielding ``(batch_inputs, batch_targets)`` pairs.

    Returns:
        tuple: ``(inputs, targets)`` where ``inputs`` is the row-wise stack
        (``np.vstack``) of all batch inputs and ``targets`` is the
        concatenation (``np.concatenate``, first axis) of all batch targets.

    Raises:
        ValueError: Propagated from NumPy when the loader yields no batches.
    """
    # One pass only, so each batch's inputs and targets stay paired.
    batches = [(features.numpy(), labels.numpy()) for features, labels in data_loader]
    stacked_inputs = np.vstack([pair[0] for pair in batches])
    joined_targets = np.concatenate([pair[1] for pair in batches])
    return stacked_inputs, joined_targets

In [12]:
# Load the preprocessed arrays first, then the firm metadata (the second value
# returned by load_info is not used in this notebook).
input_data, target_data = load_preprocessed_data()
firm_info, _ = load_info()

# Settings forwarded verbatim to create_dataloaders; how the three dates
# delimit the splits is defined in data_utils — TODO confirm there.
loader_settings = {
    'train_date': '2008-01-01',
    'valid_date': '2017-01-01',
    'test_date': '2023-01-01',
    'batch_size': 3000,
}

# The fourth value returned by create_dataloaders is discarded.
train_loader, valid_loader, test_loader, _ = create_dataloaders(
    input_data, target_data, firm_info, **loader_settings
)

In [13]:
# Summarise the raw arrays and the number of batches in each split.
print(f'Input data shape :{input_data.shape}')
print(f'Target data shape: {target_data.shape}')
print(f'Train loader length: {len(train_loader)}')
print(f'Valid loader length: {len(valid_loader)}')
print(f'Test loader length: {len(test_loader)}')

# Inspect the first batch of EACH loader. Previously `first_batch` was
# overwritten three times before any print, so all three lines described the
# test loader's batch. Fetching and printing per loader reports the right one.
# `first_batch` still ends up holding the test loader's first batch.
for split_name, loader in (
    ('Train', train_loader),
    ('Valid', valid_loader),
    ('Test', test_loader),
):
    first_batch = next(iter(loader))
    batch_inputs = first_batch[0]  # element 0 of a batch holds the inputs
    print(f"{split_name} loader: Batch size = {len(batch_inputs)}, Features = {batch_inputs[0].shape}")

Input data shape :(576574, 252)
Target data shape: (576574, 3)
Train loader length: 97
Valid loader length: 58
Test loader length: 32
Train loader: Batch size = 3000, Features = torch.Size([250])
Valid loader: Batch size = 3000, Features = torch.Size([250])
Test loader: Batch size = 3000, Features = torch.Size([250])


In [14]:
# Flatten each split's loader into in-memory NumPy arrays, in the order
# train -> valid -> test, so they can be passed to scikit-learn.
(x_train, y_train), (x_valid, y_valid), (x_test, y_test) = [
    extract_data_from_loader(loader)
    for loader in (train_loader, valid_loader, test_loader)
]

In [16]:
# Sweep over the number of boosting iterations (500..1000 in steps of 100)
# while holding every other hyperparameter fixed.
fixed_params = {
    'max_depth': 2,
    'learning_rate': 0.001,
    'verbose': 0,
    'random_state': 55,
}

params_list = []
for max_iter in range(500, 1001, 100):
    # 'max_iter' is placed first so the printed dict keeps its key order.
    params_list.append({'max_iter': max_iter, **fixed_params})

# Fit one model per configuration from scratch and report R2 on the
# validation and test splits.
# NOTE(review): scikit-learn documents early_stopping='auto' as enabling
# early stopping (with an internal random hold-out of the training data) when
# the sample size exceeds 10000, which presumably applies here — confirm
# whether that is intended given the explicit validation split.
for params in params_list:
    model = HistGradientBoostingRegressor(**params)
    model.fit(x_train, y_train)

    valid_pred = model.predict(x_valid)
    valid_r2 = r2_score(y_valid, valid_pred)

    test_pred = model.predict(x_test)
    test_r2 = r2_score(y_test, test_pred)

    print(f"model params: {params}")
    print(f"Valid R2_score: {valid_r2}")
    print(f"Model Test R2_score: {test_r2}\n")

model params: {'max_iter': 500, 'max_depth': 2, 'learning_rate': 0.001, 'verbose': 0, 'random_state': 55}
Valid R2_score: -0.0008737264947664691
Model Test R2_score: 0.0028772498395432944

model params: {'max_iter': 600, 'max_depth': 2, 'learning_rate': 0.001, 'verbose': 0, 'random_state': 55}
Valid R2_score: -0.002723092437043162
Model Test R2_score: 0.004408893231679145

model params: {'max_iter': 700, 'max_depth': 2, 'learning_rate': 0.001, 'verbose': 0, 'random_state': 55}
Valid R2_score: -0.005028335060579536
Model Test R2_score: 0.005745069852794105

model params: {'max_iter': 800, 'max_depth': 2, 'learning_rate': 0.001, 'verbose': 0, 'random_state': 55}
Valid R2_score: -0.007905699203820893
Model Test R2_score: 0.006731629914078274

model params: {'max_iter': 900, 'max_depth': 2, 'learning_rate': 0.001, 'verbose': 0, 'random_state': 55}
Valid R2_score: -0.010701818482452774
Model Test R2_score: 0.00752796623375962

model params: {'max_iter': 1000, 'max_depth': 2, 'learning_rate'