In [18]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from data_utils import load_info, create_dataloaders, load_preprocessed_data

In [19]:
def extract_data_from_loader(data_loader):
    """Drain a loader into two NumPy arrays.

    Args:
        data_loader: Iterable yielding ``(inputs, targets)`` pairs whose
            elements expose a ``.numpy()`` method (e.g. CPU torch tensors).

    Returns:
        A tuple ``(inputs, targets)``: the per-batch inputs stacked row-wise
        with ``np.vstack`` and the per-batch targets joined along the first
        axis with ``np.concatenate``.

    Raises:
        ValueError: If the loader yields no batches (NumPy cannot stack an
            empty sequence).
    """
    # Convert every batch in a single pass so one-shot iterables also work.
    converted = [(xb.numpy(), yb.numpy()) for xb, yb in data_loader]
    stacked_inputs = np.vstack([pair[0] for pair in converted])
    joined_targets = np.concatenate([pair[1] for pair in converted])
    return stacked_inputs, joined_targets

In [20]:
# Load the preprocessed arrays and the firm metadata. load_info() returns two
# values; only the first is used in this notebook.
input_data, target_data = load_preprocessed_data()
firm_info, _ = load_info()

# Build the train / validation / test loaders. The fourth return value of
# create_dataloaders is discarded.
# NOTE(review): the three dates are presumably the chronological split
# boundaries -- confirm against data_utils.create_dataloaders.
(train_loader, valid_loader, test_loader, _) = create_dataloaders(
    input_data,
    target_data,
    firm_info,
    train_date='2008-01-01',
    valid_date='2017-01-01',
    test_date='2023-01-01',
    batch_size=3000,
)



In [21]:
# Quick sanity check of the raw array shapes and the number of batches per split.
print(input_data.shape, target_data.shape)
print(len(train_loader), len(valid_loader), len(test_loader))

# Report the first batch of EACH loader. The previous version reassigned
# `first_batch` three times before printing, so all three lines described the
# test loader's batch regardless of their label.
for split_name, loader in (
    ("Train", train_loader),
    ("Valid", valid_loader),
    ("Test", test_loader),
):
    # first_batch[0] is the inputs element of the batch; first_batch[0][0] is
    # the first sample in it.
    first_batch = next(iter(loader))
    print(
        f"{split_name} loader: Batch size = {len(first_batch[0])}, "
        f"Features = {first_batch[0][0].shape}"
    )

(576574, 252) (576574, 3)
97 58 32
Train loader: Batch size = 3000, Features = torch.Size([250])
Valid loader: Batch size = 3000, Features = torch.Size([250])
Test loader: Batch size = 3000, Features = torch.Size([250])


In [None]:
# Materialise each split as NumPy arrays so scikit-learn can consume them.
x_train, y_train = extract_data_from_loader(train_loader)
x_valid, y_valid = extract_data_from_loader(valid_loader)
x_test, y_test = extract_data_from_loader(test_loader)

# Hyperparameter grid. n_estimators is fixed; depth and features are swept.
n_estimators = 300
max_depth_list = [3, 4, 5, 6, 7, 9, 11]
max_features_list = [5, 10, 15, 20, 30, 40, 50]

# Model selection is done on the VALIDATION split. The previous version scored
# every configuration on the test split (and never used x_valid / y_valid), so
# any configuration picked from those numbers would have been tuned on test
# data and its reported score would be optimistically biased.
best_model = None
best_params = None
best_valid_r2 = None

for max_depth in max_depth_list:
    for max_features in max_features_list:
        print(f"n_estimators = {n_estimators}, max_depth = {max_depth}, max_features = {max_features}")
        model = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            max_features=max_features,
            random_state=0,  # fixed seed so the sweep is reproducible
            n_jobs=8,
        )
        model.fit(x_train, y_train)
        valid_r2 = r2_score(y_valid, model.predict(x_valid))
        print(f"Model Valid R2_score: {valid_r2}")
        print()

        # Always accept the first model so best_model is never None after the
        # sweep; afterwards keep only strict improvements.
        if best_model is None or valid_r2 > best_valid_r2:
            best_model = model
            best_valid_r2 = valid_r2
            best_params = {"max_depth": max_depth, "max_features": max_features}

# Touch the test split exactly once, with the configuration chosen above.
test_r2 = r2_score(y_test, best_model.predict(x_test))
print(f"Best params (by validation R2): n_estimators = {n_estimators}, "
      f"max_depth = {best_params['max_depth']}, max_features = {best_params['max_features']}")
print(f"Best Valid R2_score: {best_valid_r2}")
print(f"Model Test R2_score: {test_r2}")
