In [2]:
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import logging
from data_utils import load_info, create_dataloaders, load_preprocessed_data

logging.basicConfig(level=logging.INFO)

In [3]:
def extract_data_from_loader(data_loader):
    inputs, targets = [], []
    for batch_inputs, batch_targets in data_loader:
        inputs.append(batch_inputs.numpy())
        targets.append(batch_targets.numpy())
    inputs = np.vstack(inputs)
    targets = np.concatenate(targets)
    return inputs, targets

In [4]:
input_data, target_data = load_preprocessed_data()
print(input_data.shape, target_data.shape)
firm_info, _ = load_info()

train_loader, valid_loader, test_loader, _ = create_dataloaders(
    input_data, target_data, firm_info,
    train_date='2005-01-01', valid_date='2010-01-01', test_date='2015-11-01', batch_size=2000)

print(len(train_loader), len(valid_loader), len(test_loader))

print(f'Train loader length: {len(train_loader)}, Valid loader length: {len(valid_loader)}, Test loader length: {len(test_loader)}')

(576574, 252) (576574, 3)
114 52 57
Train loader length: 114, Valid loader length: 52, Test loader length: 57


In [5]:
first_batch = next(iter(train_loader))
print(f"Train loader: Batch size = {len(first_batch[0])}, Features = {first_batch[0][0].shape}")

first_batch = next(iter(valid_loader))
print(f"Valid loader: Batch size = {len(first_batch[0])}, Features = {first_batch[0][0].shape}")

first_batch = next(iter(test_loader))
print(f"Test loader: Batch size = {len(first_batch[0])}, Features = {first_batch[0][0].shape}")

Train loader: Batch size = 2000, Features = torch.Size([250])
Valid loader: Batch size = 2000, Features = torch.Size([250])
Test loader: Batch size = 2000, Features = torch.Size([250])


In [6]:
x_train, y_train = extract_data_from_loader(train_loader)
x_valid, y_valid = extract_data_from_loader(valid_loader)
x_test, y_test = extract_data_from_loader(test_loader)

In [9]:
params_list = []

for max_iter in range(100, 1001, 100):
    for max_depth in [1, 2]:
        for learning_rate in [0.1, 0.01, 0.001]:
            params = {
                'max_iter': max_iter,
                'max_depth': max_depth,
                'learning_rate': learning_rate,
                'verbose': 0,
                'random_state': 55
            }
            params_list.append(params)


for params in params_list:
    gbrt = HistGradientBoostingRegressor(**params)
    gbrt.fit(x_train, y_train)

    gb_test_r2 = r2_score(y_test, gbrt.predict(x_test))
    print(f"model params: {params}\n")
    print(f"Model Test R2_score: {gb_test_r2}\n\n")

model params: {'max_iter': 100, 'max_depth': 1, 'learning_rate': 0.1, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.053898400878025665


model params: {'max_iter': 100, 'max_depth': 1, 'learning_rate': 0.01, 'verbose': 0, 'random_state': 55}

Model Test R2_score: 0.0006036166043089652


model params: {'max_iter': 100, 'max_depth': 1, 'learning_rate': 0.001, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.006512814690895707


model params: {'max_iter': 100, 'max_depth': 2, 'learning_rate': 0.1, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.05107032375238174


model params: {'max_iter': 100, 'max_depth': 2, 'learning_rate': 0.01, 'verbose': 0, 'random_state': 55}

Model Test R2_score: 0.005598703077448142


model params: {'max_iter': 100, 'max_depth': 2, 'learning_rate': 0.001, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.0034787941361882346


model params: {'max_iter': 200, 'max_depth': 1, 'learning_rate': 0.1, 'verbose': 0, 'random_sta

## Test Results


```py
params = {
    'max_iter': 1000,
    'max_depth': 2,
    'learning_rate': 0.001,
    'verbose': 1,
    'random_state': 55
}
```

```
model params: {'max_iter': 100, 'max_depth': 1, 'learning_rate': 0.1, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.053898400878025665


model params: {'max_iter': 100, 'max_depth': 1, 'learning_rate': 0.01, 'verbose': 0, 'random_state': 55}

Model Test R2_score: 0.0006036166043089652


model params: {'max_iter': 100, 'max_depth': 1, 'learning_rate': 0.001, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.006512814690895707


model params: {'max_iter': 100, 'max_depth': 2, 'learning_rate': 0.1, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.05107032375238174


model params: {'max_iter': 100, 'max_depth': 2, 'learning_rate': 0.01, 'verbose': 0, 'random_state': 55}

Model Test R2_score: 0.005598703077448142


model params: {'max_iter': 100, 'max_depth': 2, 'learning_rate': 0.001, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.0034787941361882346


model params: {'max_iter': 200, 'max_depth': 1, 'learning_rate': 0.1, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.08560567799360541


model params: {'max_iter': 200, 'max_depth': 1, 'learning_rate': 0.01, 'verbose': 0, 'random_state': 55}

Model Test R2_score: 0.00014794153472164062


model params: {'max_iter': 200, 'max_depth': 1, 'learning_rate': 0.001, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.004390595868119851


model params: {'max_iter': 200, 'max_depth': 2, 'learning_rate': 0.1, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.09434351042108946


model params: {'max_iter': 200, 'max_depth': 2, 'learning_rate': 0.01, 'verbose': 0, 'random_state': 55}

Model Test R2_score: 0.0025493799944396045


model params: {'max_iter': 200, 'max_depth': 2, 'learning_rate': 0.001, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.00038332415644171114


model params: {'max_iter': 300, 'max_depth': 1, 'learning_rate': 0.1, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.08973549877177556


model params: {'max_iter': 300, 'max_depth': 1, 'learning_rate': 0.01, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.003243665746528901


model params: {'max_iter': 300, 'max_depth': 1, 'learning_rate': 0.001, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.003366496364071603


model params: {'max_iter': 300, 'max_depth': 2, 'learning_rate': 0.1, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.13374480662477062


model params: {'max_iter': 300, 'max_depth': 2, 'learning_rate': 0.01, 'verbose': 0, 'random_state': 55}

Model Test R2_score: 0.0004248528390344841


model params: {'max_iter': 300, 'max_depth': 2, 'learning_rate': 0.001, 'verbose': 0, 'random_state': 55}

Model Test R2_score: 0.0011238131053944445


model params: {'max_iter': 400, 'max_depth': 1, 'learning_rate': 0.1, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.08326967657304718


model params: {'max_iter': 400, 'max_depth': 1, 'learning_rate': 0.01, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.006900477427290719


model params: {'max_iter': 400, 'max_depth': 1, 'learning_rate': 0.001, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.0025338616341858966


model params: {'max_iter': 400, 'max_depth': 2, 'learning_rate': 0.1, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.15476678296828505


model params: {'max_iter': 400, 'max_depth': 2, 'learning_rate': 0.01, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.0035121043967565324


model params: {'max_iter': 400, 'max_depth': 2, 'learning_rate': 0.001, 'verbose': 0, 'random_state': 55}

Model Test R2_score: 0.003013840270967294


model params: {'max_iter': 500, 'max_depth': 1, 'learning_rate': 0.1, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.07678012055443673


model params: {'max_iter': 500, 'max_depth': 1, 'learning_rate': 0.01, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.012087838615337354


model params: {'max_iter': 500, 'max_depth': 1, 'learning_rate': 0.001, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.001809268543438547


model params: {'max_iter': 500, 'max_depth': 2, 'learning_rate': 0.1, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.1724683019233313


model params: {'max_iter': 500, 'max_depth': 2, 'learning_rate': 0.01, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.012521630041163245


model params: {'max_iter': 500, 'max_depth': 2, 'learning_rate': 0.001, 'verbose': 0, 'random_state': 55}

Model Test R2_score: 0.004368231015365431


model params: {'max_iter': 600, 'max_depth': 1, 'learning_rate': 0.1, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.07245493358328892


model params: {'max_iter': 600, 'max_depth': 1, 'learning_rate': 0.01, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.016622645951305914


model params: {'max_iter': 600, 'max_depth': 1, 'learning_rate': 0.001, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.0012400472927291961


model params: {'max_iter': 600, 'max_depth': 2, 'learning_rate': 0.1, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.1783795704670026


model params: {'max_iter': 600, 'max_depth': 2, 'learning_rate': 0.01, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.019170317363994327


model params: {'max_iter': 600, 'max_depth': 2, 'learning_rate': 0.001, 'verbose': 0, 'random_state': 55}

Model Test R2_score: 0.005062435389013409


model params: {'max_iter': 700, 'max_depth': 1, 'learning_rate': 0.1, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.06822862961040532


model params: {'max_iter': 700, 'max_depth': 1, 'learning_rate': 0.01, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.021744713524376236


model params: {'max_iter': 700, 'max_depth': 1, 'learning_rate': 0.001, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.0006667095180261651


model params: {'max_iter': 700, 'max_depth': 2, 'learning_rate': 0.1, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.19733940847930653


model params: {'max_iter': 700, 'max_depth': 2, 'learning_rate': 0.01, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.02479308902009092


model params: {'max_iter': 700, 'max_depth': 2, 'learning_rate': 0.001, 'verbose': 0, 'random_state': 55}

Model Test R2_score: 0.005412241234017623


model params: {'max_iter': 800, 'max_depth': 1, 'learning_rate': 0.1, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.06993674554784524


model params: {'max_iter': 800, 'max_depth': 1, 'learning_rate': 0.01, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.028594879658213612


model params: {'max_iter': 800, 'max_depth': 1, 'learning_rate': 0.001, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.00012168905204523739


model params: {'max_iter': 800, 'max_depth': 2, 'learning_rate': 0.1, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.22099427659254323


model params: {'max_iter': 800, 'max_depth': 2, 'learning_rate': 0.01, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.030503176675390264


model params: {'max_iter': 800, 'max_depth': 2, 'learning_rate': 0.001, 'verbose': 0, 'random_state': 55}

Model Test R2_score: 0.005627361287495303


model params: {'max_iter': 900, 'max_depth': 1, 'learning_rate': 0.1, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.06618473809131697


model params: {'max_iter': 900, 'max_depth': 1, 'learning_rate': 0.01, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.037729192258406874


model params: {'max_iter': 900, 'max_depth': 1, 'learning_rate': 0.001, 'verbose': 0, 'random_state': 55}

Model Test R2_score: 0.0002829695204277227


model params: {'max_iter': 900, 'max_depth': 2, 'learning_rate': 0.1, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.22821792438963406


model params: {'max_iter': 900, 'max_depth': 2, 'learning_rate': 0.01, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.03910775650885712


model params: {'max_iter': 900, 'max_depth': 2, 'learning_rate': 0.001, 'verbose': 0, 'random_state': 55}

Model Test R2_score: 0.0056843887823687655


model params: {'max_iter': 1000, 'max_depth': 1, 'learning_rate': 0.1, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.06606081386755802


model params: {'max_iter': 1000, 'max_depth': 1, 'learning_rate': 0.01, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.04608625573200231


model params: {'max_iter': 1000, 'max_depth': 1, 'learning_rate': 0.001, 'verbose': 0, 'random_state': 55}

Model Test R2_score: 0.0006001164519561231


model params: {'max_iter': 1000, 'max_depth': 2, 'learning_rate': 0.1, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.2347506553626837


model params: {'max_iter': 1000, 'max_depth': 2, 'learning_rate': 0.01, 'verbose': 0, 'random_state': 55}

Model Test R2_score: -0.043966289978384365


model params: {'max_iter': 1000, 'max_depth': 2, 'learning_rate': 0.001, 'verbose': 0, 'random_state': 55}

Model Test R2_score: 0.005475882023958989
```


# Results

## Case of low max_iter
```
model params: {'max_iter': 100, 'max_depth': 2, 'learning_rate': 0.01, 'verbose': 0, 'random_state': 55}
Model Test R2_score: 0.005598703077448142

model params: {'max_iter': 200, 'max_depth': 2, 'learning_rate': 0.01, 'verbose': 0, 'random_state': 55}
Model Test R2_score: 0.0025493799944396045
```

## Optimer case
```py
params = {
    'max_iter': 900,
    'max_depth': 2,
    'learning_rate': 0.001,
    'random_state': 55
}
```
`Model Test R2_score: 0.0056843887823687655`
