In [73]:
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import logging
from data_utils import load_info, create_dataloaders, load_preprocessed_data

# Configure the root logger so that messages at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)

In [74]:
def extract_data_from_loader(data_loader):
    """Drain a batched loader into two in-memory NumPy arrays.

    Args:
        data_loader: Iterable yielding ``(batch_inputs, batch_targets)`` pairs
            whose elements both expose a ``.numpy()`` method.

    Returns:
        A tuple ``(inputs, targets)`` where ``inputs`` is every input batch
        stacked row-wise with ``np.vstack`` and ``targets`` is every target
        batch joined along the first axis with ``np.concatenate``.

    Raises:
        ValueError: If the loader yields no batches (NumPy cannot stack an
            empty sequence).
    """
    # Convert every batch exactly once, in iteration order.
    converted = [(xb.numpy(), yb.numpy()) for xb, yb in data_loader]
    stacked_inputs = np.vstack([xb for xb, _ in converted])
    joined_targets = np.concatenate([yb for _, yb in converted])
    return stacked_inputs, joined_targets

In [75]:
# Load the preprocessed arrays and firm metadata, then report the array shapes.
input_data, target_data = load_preprocessed_data()
print(input_data.shape, target_data.shape)
firm_info, _ = load_info()

# Date cut-offs and batch size handed to create_dataloaders
# (their exact split semantics are defined in data_utils).
loader_options = dict(
    train_date='2005-01-01',
    valid_date='2010-01-01',
    test_date='2015-11-01',
    batch_size=2000,
)
train_loader, valid_loader, test_loader, _ = create_dataloaders(
    input_data, target_data, firm_info, **loader_options)

# Number of batches per split, printed once bare and once labelled.
n_train_batches = len(train_loader)
n_valid_batches = len(valid_loader)
n_test_batches = len(test_loader)

print(n_train_batches, n_valid_batches, n_test_batches)

print(f'Train loader length: {n_train_batches}, Valid loader length: {n_valid_batches}, Test loader length: {n_test_batches}')

(576574, 252) (576574, 3)
114 52 57
Train loader length: 114, Valid loader length: 52, Test loader length: 57


In [76]:
# Peek at the first batch of every split to confirm the batch size and the
# per-sample feature shape. One loop replaces three copy-pasted stanzas and
# prints exactly the same three lines.
for split_name, loader in (
    ("Train", train_loader),
    ("Valid", valid_loader),
    ("Test", test_loader),
):
    first_batch = next(iter(loader))
    batch_inputs = first_batch[0]  # element 0 of the batch holds the inputs
    print(f"{split_name} loader: Batch size = {len(batch_inputs)}, Features = {batch_inputs[0].shape}")

Train loader: Batch size = 2000, Features = torch.Size([250])
Valid loader: Batch size = 2000, Features = torch.Size([250])
Test loader: Batch size = 2000, Features = torch.Size([250])


In [77]:
# Materialise each split as NumPy arrays, in train -> valid -> test order.
(x_train, y_train), (x_valid, y_valid), (x_test, y_test) = [
    extract_data_from_loader(loader)
    for loader in (train_loader, valid_loader, test_loader)
]

In [86]:
# Keyword arguments unpacked into HistGradientBoostingRegressor in the fit cell.
params = dict(
    max_iter=1000,
    max_depth=2,
    learning_rate=0.001,
    verbose=1,
    random_state=55,
)

In [87]:
# Fit a histogram-based gradient boosting regressor on the training split,
# configured by `params`.
# NOTE(review): the verbose log for this cell reports a separate "validation data"
# binning step and a per-iteration "val loss", so the estimator appears to hold out
# part of x_train internally. x_valid / y_valid are not passed to fit() here —
# confirm this is the intended validation scheme.
gbrt = HistGradientBoostingRegressor(**params)
gbrt.fit(x_train, y_train)

Binning 0.410 GB of training data: 5.125 s
Binning 0.046 GB of validation data: 0.069 s
Fitting gradient boosted rounds:
[1/1000] 1 tree, 4 leaves, max depth = 2, train loss: 0.01413, val loss: 0.01440, in 0.015s
[2/1000] 1 tree, 4 leaves, max depth = 2, train loss: 0.01413, val loss: 0.01440, in 0.013s
[3/1000] 1 tree, 4 leaves, max depth = 2, train loss: 0.01413, val loss: 0.01440, in 0.015s
[4/1000] 1 tree, 4 leaves, max depth = 2, train loss: 0.01413, val loss: 0.01440, in 0.016s
[5/1000] 1 tree, 4 leaves, max depth = 2, train loss: 0.01413, val loss: 0.01440, in 0.016s
[6/1000] 1 tree, 4 leaves, max depth = 2, train loss: 0.01413, val loss: 0.01440, in 0.013s
[7/1000] 1 tree, 4 leaves, max depth = 2, train loss: 0.01412, val loss: 0.01440, in 0.014s
[8/1000] 1 tree, 4 leaves, max depth = 2, train loss: 0.01412, val loss: 0.01440, in 0.015s
[9/1000] 1 tree, 4 leaves, max depth = 2, train loss: 0.01412, val loss: 0.01440, in 0.015s
[10/1000] 1 tree, 4 leaves, max depth = 2, train lo

In [88]:
# Disabled sanity check: uncomment to print the first ten predictions of the
# fitted model for each split.

# y_train_pred = gbrt.predict(x_train)
# print(f'Train Predictions: {y_train_pred[:10]}')

# y_valid_pred = gbrt.predict(x_valid)
# print(f'Validation Predictions: {y_valid_pred[:10]}')

# y_test_pred = gbrt.predict(x_test)
# print(f'Test Predictions: {y_test_pred[:10]}')

In [89]:
# Score the fitted model on the validation and test splits.
# The recorded output of this cell includes a "Model Valid R2_score" line that
# the previous code never printed; the validation R^2 is computed here so that
# re-running the cell reproduces all three reported metrics.
y_valid_pred = gbrt.predict(x_valid)  # predicted once, reused for both validation metrics

valid_mse = mean_squared_error(y_valid, y_valid_pred)
print(f"Validation MSE: {valid_mse}")

gb_valid_r2 = r2_score(y_valid, y_valid_pred)
print(f"Model Valid R2_score: {gb_valid_r2}")

gb_test_r2 = r2_score(y_test, gbrt.predict(x_test))
print(f"Model Test R2_score: {gb_test_r2}")

Validation MSE: 0.025847455304517986
Model Valid R2_score: -0.017544811992741716
Model Test R2_score: 0.005475882023958989


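## Test Results

Test-set R² obtained with each hyperparameter configuration tried for the gradient boosting model above, in the order the runs were made.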



```py
params = {
    'max_iter': 1000,
    'max_depth': 2,
    'learning_rate': 0.001,
    'verbose': 1,
    'random_state': 55
}
```
Model Test R2_score: 0.005475882023958989


```py
params = {
    'max_iter': 1000,
    'max_depth': 2,
    'learning_rate': 0.001,
    'verbose': 1,
    'random_state': 0
}
```
Model Test R2_score: 0.004847571315040655


```py
params = {
    'max_iter': 1000,
    'max_depth': 2,
    'learning_rate': 0.01,
    'verbose': 1,
    'random_state': 0
}
```
Model Test R2_score: -0.031088544859680045


```py
params = {
    'max_iter': 300,
    'max_depth': 2,
    'learning_rate': 0.01,
    'verbose': 1,
    'random_state': 0
}
```
Model Test R2_score: 2.087641295167586e-06


```py
params = {
    'max_iter': 300,
    'max_depth': 2,
    'learning_rate': 0.1,
    'verbose': 1,
    'random_state': 0
}
```
Model Test R2_score: -0.13830890774752214

```py
params = {
    'max_iter': 500,
    'max_depth': 2,
    'learning_rate': 0.01,
    'verbose': 1,
    'random_state': 0
}
```
Model Test R2_score: -0.0073809926185368635


```py
params = {
    'max_iter': 500,
    'max_depth': 1,
    'learning_rate': 0.01,
    'verbose': 1,
    'random_state': 0
}
```
Model Test R2_score: -0.013684694336023595


```py
params = {
    'max_iter': 300,
    'max_depth': 2,
    'learning_rate': 0.01,
    'verbose': 1,
    'random_state': 0
}
```
Model Test R2_score: -0.003195046649528299


```py
params = {
    'max_iter': 500,
    'max_depth': 2,
    'learning_rate': 0.05,
    'verbose': 1,
    'random_state': 0
}
```
Model Test R2_score: -0.12459085645310419