In [30]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import logging
from data_utils import load_info, create_dataloaders, load_preprocessed_data

# Configure the root logger at INFO level so the logging.info(...) calls in
# later cells are emitted into the cell output.
logging.basicConfig(level=logging.INFO)

In [31]:
def extract_data_from_loader(data_loader):
    """Collect every batch of a data loader into two NumPy arrays.

    Parameters
    ----------
    data_loader : iterable
        Yields ``(batch_inputs, batch_targets)`` pairs of torch tensors.
        It is iterated exactly once.

    Returns
    -------
    inputs : numpy.ndarray
        All input batches stacked row-wise with ``np.vstack``.
    targets : numpy.ndarray
        All target batches joined along the first axis with
        ``np.concatenate``.

    Raises
    ------
    ValueError
        If the loader yields no batches.
    """
    inputs, targets = [], []
    for batch_inputs, batch_targets in data_loader:
        # detach()/cpu() keep the conversion valid for tensors that track
        # gradients or live on an accelerator; plain CPU tensors are
        # converted exactly as before.
        inputs.append(batch_inputs.detach().cpu().numpy())
        targets.append(batch_targets.detach().cpu().numpy())
    if not inputs:
        # np.vstack([]) would also raise ValueError, but with an opaque message.
        raise ValueError("data_loader yielded no batches; nothing to extract")
    return np.vstack(inputs), np.concatenate(targets)

In [32]:
# Split boundaries and batch size handed to create_dataloaders, named here so
# they can be tuned in one place instead of being buried in the call.
# NOTE(review): how create_dataloaders interprets each date (start vs. end of
# a split) is not visible in this notebook - confirm in data_utils.
TRAIN_DATE = '2005-01-01'
VALID_DATE = '2010-01-01'
TEST_DATE = '2015-11-01'
BATCH_SIZE = 2000

# Load the preprocessed feature/target arrays and report their shapes.
input_data, target_data = load_preprocessed_data()
print(input_data.shape, target_data.shape)
firm_info, _ = load_info()  # second return value is discarded here

# Build the train/valid/test loaders; the fourth return value is discarded.
train_loader, valid_loader, test_loader, _ = create_dataloaders(
    input_data, target_data, firm_info,
    train_date=TRAIN_DATE, valid_date=VALID_DATE, test_date=TEST_DATE,
    batch_size=BATCH_SIZE)

# Number of batches per split.
print(len(train_loader), len(valid_loader), len(test_loader))

(576574, 252) (576574, 3)


INFO:root:Train loader length: 114, Valid loader length: 52, Test loader length: 57


114 52 57


In [33]:
# Sanity-check each split: pull the first batch and report how many samples it
# holds and the shape of a single sample's feature tensor.
# One loop replaces three copy-pasted stanzas; the printed text is unchanged,
# and first_batch ends up holding the test loader's first batch as before.
for split_name, split_loader in (
    ("Train", train_loader),
    ("Valid", valid_loader),
    ("Test", test_loader),
):
    first_batch = next(iter(split_loader))
    print(f"{split_name} loader: Batch size = {len(first_batch[0])}, Features = {first_batch[0][0].shape}")

Train loader: Batch size = 2000, Features = torch.Size([250])
Valid loader: Batch size = 2000, Features = torch.Size([250])
Test loader: Batch size = 2000, Features = torch.Size([250])


In [70]:
# Flatten every loader into in-memory NumPy arrays, in train -> valid -> test
# order, so the splits can be fed to a scikit-learn estimator.
(x_train, y_train), (x_valid, y_valid), (x_test, y_test) = map(
    extract_data_from_loader,
    (train_loader, valid_loader, test_loader),
)

# Keyword arguments for the RandomForestRegressor built in the next cell.
params = dict(
    n_estimators=300,
    max_depth=5,
    max_features=10,
    random_state=0,
    n_jobs=8,
    verbose=2,
)

In [71]:
# Build the forest from the hyperparameters in `params`, then train it on the
# training split. The fit call stays the last expression of the cell.
rf = RandomForestRegressor(**params)
rf.fit(X=x_train, y=y_train)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


building tree 1 of 300building tree 2 of 300

building tree 3 of 300
building tree 4 of 300
building tree 5 of 300
building tree 6 of 300
building tree 7 of 300
building tree 8 of 300
building tree 9 of 300
building tree 10 of 300
building tree 11 of 300
building tree 12 of 300
building tree 13 of 300
building tree 14 of 300
building tree 15 of 300
building tree 16 of 300
building tree 17 of 300
building tree 18 of 300
building tree 19 of 300
building tree 20 of 300
building tree 21 of 300
building tree 22 of 300
building tree 23 of 300
building tree 24 of 300
building tree 25 of 300
building tree 26 of 300
building tree 27 of 300
building tree 28 of 300
building tree 29 of 300
building tree 30 of 300
building tree 31 of 300
building tree 32 of 300
building tree 33 of 300
building tree 34 of 300
building tree 35 of 300
building tree 36 of 300
building tree 37 of 300


[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    2.0s


building tree 38 of 300
building tree 39 of 300
building tree 40 of 300
building tree 41 of 300
building tree 42 of 300
building tree 43 of 300
building tree 44 of 300
building tree 45 of 300
building tree 46 of 300
building tree 47 of 300
building tree 48 of 300
building tree 49 of 300
building tree 50 of 300
building tree 51 of 300
building tree 52 of 300
building tree 53 of 300
building tree 54 of 300
building tree 55 of 300
building tree 56 of 300
building tree 57 of 300
building tree 58 of 300
building tree 59 of 300
building tree 60 of 300
building tree 61 of 300
building tree 62 of 300
building tree 63 of 300
building tree 64 of 300
building tree 65 of 300
building tree 66 of 300
building tree 67 of 300
building tree 68 of 300
building tree 69 of 300
building tree 70 of 300
building tree 71 of 300
building tree 72 of 300
building tree 73 of 300
building tree 74 of 300
building tree 75 of 300
building tree 76 of 300
building tree 77 of 300
building tree 78 of 300
building tree 79

[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:   10.2s


building tree 158 of 300
building tree 159 of 300
building tree 160 of 300
building tree 161 of 300
building tree 162 of 300
building tree 163 of 300
building tree 164 of 300
building tree 165 of 300
building tree 166 of 300
building tree 167 of 300
building tree 168 of 300
building tree 169 of 300
building tree 170 of 300
building tree 171 of 300
building tree 172 of 300
building tree 173 of 300
building tree 174 of 300
building tree 175 of 300
building tree 176 of 300
building tree 177 of 300
building tree 178 of 300
building tree 179 of 300
building tree 180 of 300
building tree 181 of 300
building tree 182 of 300
building tree 183 of 300
building tree 184 of 300
building tree 185 of 300
building tree 186 of 300
building tree 187 of 300
building tree 188 of 300
building tree 189 of 300
building tree 190 of 300
building tree 191 of 300
building tree 192 of 300
building tree 193 of 300
building tree 194 of 300
building tree 195 of 300
building tree 196 of 300
building tree 197 of 300


[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:   20.5s finished


In [72]:
# Predict on every split and log the first ten predictions as a quick sanity
# check. Values are passed to logging as %s arguments so formatting is deferred
# to the logging framework rather than done eagerly in an f-string; the
# rendered messages are the same.
y_train_pred = rf.predict(x_train)
logging.info('Train Predictions: %s', y_train_pred[:10])

y_valid_pred = rf.predict(x_valid)
logging.info('Validation Predictions: %s', y_valid_pred[:10])

y_test_pred = rf.predict(x_test)
logging.info('Test Predictions: %s', y_test_pred[:10])

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:    0.6s finished
INFO:root:Train Predictions: [-1.84254086e-03 -7.13240082e-05 -3.57405389e-03 -6.61905751e-03
 -1.60823982e-03  1.33928700e-02  6.59201637e-03  8.34129639e-03
  8.86195199e-04  9.31916545e-03]
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:    0.3s finished
INFO:root:Validation Predictions: [ 0.0002011   0.00040452  0.00095835 -0.00168345  0.00140872  0.00171081
 -0.0025994   0.00057192  0.0029753   0.00456655]
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done

In [73]:
# Score the fitted forest on the validation and test splits.
# Each split is predicted exactly once and the result reused; previously the
# validation split was pushed through rf.predict twice.
valid_predictions = rf.predict(x_valid)
test_predictions = rf.predict(x_test)

valid_mse = mean_squared_error(y_valid, valid_predictions)
print(f"Validation MSE: {valid_mse}")

# NOTE(review): the `gb_` prefix is a misnomer - `rf` is a
# RandomForestRegressor. The names are kept so any later cell that reads them
# keeps working.
gb_valid_r2 = r2_score(y_valid, valid_predictions)
print(f"Model Valid R2_score: {gb_valid_r2}")

gb_test_r2 = r2_score(y_test, test_predictions)
print(f"Model Test R2_score: {gb_test_r2}")

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:    0.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    0.1s


Validation MSE: 0.025548840222771697


[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:    0.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    0.1s


Model Valid R2_score: -0.00578913919501689
Model Test R2_score: 0.00614681710087428


[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:    0.3s finished


params = {
    'n_estimators': 300,
    'max_depth': 4,
    'max_features': 20,
    'random_state': 0,
    'n_jobs': 8,
    'verbose': 2
}

Model Valid R2_score: -0.01570253664842114
Model Test R2_score: 0.0028909621154153964


params = {
    'n_estimators': 300,
    'max_depth': 4,
    'max_features': 10,
    'random_state': 0,
    'n_jobs': 8,
    'verbose': 2
}
Model Valid R2_score: -0.009078635956388803
Model Test R2_score: 0.003866744793660004

params = {
    'n_estimators': 300,
    'max_depth': 4,
    'max_features': 5,
    'random_state': 0,
    'n_jobs': 8,
    'verbose': 2
}
Model Valid R2_score: -0.005714613842422223
Model Test R2_score: 0.003244989035436019

params = {
    'n_estimators': 300,
    'max_depth': 5,
    'max_features': 10,
    'random_state': 0,
    'n_jobs': 8,
    'verbose': 2
}
Model Valid R2_score: -0.00578913919501689
Model Test R2_score: 0.00614681710087428

params = {
    'n_estimators': 300,
    'max_depth': 6,
    'max_features': 10,
    'random_state': 0,
    'n_jobs': 8,
    'verbose': 2
}
Model Valid R2_score: -0.007918786990162463
Model Test R2_score: 0.0033831304286952024

params = {
    'n_estimators': 300,
    'max_depth': 6,
    'max_features': 20,
    'random_state': 0,
    'n_jobs': 8,
    'verbose': 2
}
Model Valid R2_score: -0.016825859770961626
Model Test R2_score: 0.0019076460802596173