In [7]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import logging

logging.basicConfig(level=logging.INFO)

from data_utils import load_info, create_dataloaders, load_preprocessed_data

In [8]:
def extract_data_from_loader(data_loader):
    inputs, targets = [], []
    for batch_inputs, batch_targets in data_loader:
        inputs.append(batch_inputs.numpy())
        targets.append(batch_targets.numpy())
    inputs = np.vstack(inputs)
    targets = np.concatenate(targets)
    return inputs, targets

def print_model_outputs(model, x_train, x_valid, x_test):
    y_train_pred = model.predict(x_train)
    logging.info(f'Train Predictions: {y_train_pred[:10]}')

    y_valid_pred = model.predict(x_valid)
    logging.info(f'Validation Predictions: {y_valid_pred[:10]}')

    y_test_pred = model.predict(x_test)
    logging.info(f'Test Predictions: {y_test_pred[:10]}')

In [9]:
input_data, target_data = load_preprocessed_data()
print(input_data.shape, target_data.shape)
firm_info, _ = load_info()

train_loader, valid_loader, test_loader, _ = create_dataloaders(
    input_data, target_data, firm_info,
    train_date='2005-01-01', valid_date='2010-01-01', test_date='2015-11-01', batch_size=2000)

print(len(train_loader), len(valid_loader), len(test_loader))

logging.info(f'Train loader length: {len(train_loader)}, Valid loader length: {len(valid_loader)}, Test loader length: {len(test_loader)}')

(576574, 252) (576574, 3)


INFO:root:Train loader length: 114, Valid loader length: 52, Test loader length: 57


114 52 57


In [10]:
first_batch = next(iter(train_loader))
print(f"Train loader: Batch size = {len(first_batch[0])}, Features = {first_batch[0][0].shape}")

first_batch = next(iter(valid_loader))
print(f"Valid loader: Batch size = {len(first_batch[0])}, Features = {first_batch[0][0].shape}")

first_batch = next(iter(test_loader))
print(f"Test loader: Batch size = {len(first_batch[0])}, Features = {first_batch[0][0].shape}")

Train loader: Batch size = 2000, Features = torch.Size([250])
Valid loader: Batch size = 2000, Features = torch.Size([250])
Test loader: Batch size = 2000, Features = torch.Size([250])


In [18]:
x_train, y_train = extract_data_from_loader(train_loader)
x_valid, y_valid = extract_data_from_loader(valid_loader)
x_test, y_test = extract_data_from_loader(test_loader)

params = {
    'n_estimators': 300,
    'max_depth': 7,
    'max_features': 40,
    'random_state': 0,
    'verbose': 1
}

In [19]:
rf = RandomForestRegressor(**params)
rf.fit(x_train, y_train)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:  1.9min
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:  7.8min


In [20]:
print_model_outputs(rf, x_train, x_valid, x_test)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.3s
INFO:root:Train Predictions: [-0.00276654  0.00562419 -0.01025807 -0.02197495 -0.0036918   0.03682539
  0.01371538  0.02001964  0.00371277  0.0205179 ]
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.5s
INFO:root:Validation Predictions: [-0.00151215  0.00221575  0.00361794  0.00068462  0.00344181  0.00341272
  0.00169669  0.00613402  0.00429641  0.00520051]
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.6s
INFO:root:Test Predictions: [-0.00888443 -0.00489393 -0.00234503 -0.00926971  0.00616748  0.01180484
  0.01325438  0.01394711  0.00891432  0.00497028]


In [21]:
valid_mse = mean_squared_error(y_valid, rf.predict(x_valid))
print(f"Validation MSE: {valid_mse}")

gb_valid_r2 = r2_score(y_valid, rf.predict(x_valid))
print(f"Model Valid R2_score: {gb_valid_r2}")

gb_test_r2 = r2_score(y_test, rf.predict(x_test))
print(f"Model Test R2_score: {gb_test_r2}")

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.5s


Validation MSE: 0.02567183560144994


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.5s


Model Valid R2_score: -0.010631136521200846


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.6s


Model Test R2_score: 0.0022987546389720004
