In [1]:
import sys
import os

# Make the parent of the current working directory importable so the project
# packages imported further down this cell can be resolved.
# The membership check keeps the cell idempotent: re-running it (or the whole
# notebook in the same kernel) no longer piles up duplicate sys.path entries.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

import logging
from datetime import datetime, timezone

import numpy as np
import pandas as pd
import torch

# Configure the root logger once for the whole notebook:
#   - a single console (stream) handler,
#   - "<timestamp> - <level> - <message>" formatting,
#   - INFO as the minimum level.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Enable IPython's autoreload extension so edits to the project modules
# imported below are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
from data.raw.retrievers.alpaca_markets_retriever import AlpacaMarketsRetriever
from config.constants import *
from data.processed.dataset_creation import DatasetCreator
from data.processed.indicators import *
from data.processed.targets import Balanced3ClassClassification
from data.processed.normalization import ZScoreOverWindowNormalizer, ZScoreNormalizer, MinMaxNormalizer
from data.processed.dataset_pytorch import DatasetPytorch
from modeling.trainer import Trainer
from modeling.evaluate import evaluate_lgb_regressor, evaluate_torch_regressor, evaluate_torch_regressor_multiasset
from observability.mlflow_integration import log_experiment

from config.experiments.cur_experiment import config

# Set cuDNN's benchmark flag from the experiment's training config.
torch.backends.cudnn.benchmark = config.train_config.cudnn_benchmark


  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# Retriever for Alpaca Markets data; the Google Drive download is disabled.
retriever = AlpacaMarketsRetriever(download_from_gdrive=False)

# Fetch bars together with quotes for the symbols and date range given in the
# experiment's data config. Each keyword argument has the same name as the
# config attribute it is read from.
retrieval_result = retriever.bars_with_quotes(**{
    field: getattr(config.data_config, field)
    for field in ('symbol_or_symbols', 'start', 'end')
})

In [3]:
# Build the dataset creator from the experiment's data config; every keyword
# argument shares its name with the config attribute it is read from.
dataset_creator = DatasetCreator(**{
    field: getattr(config.data_config, field)
    for field in (
        'features',
        'target',
        'normalizer',
        'missing_values_handler',
        'train_set_last_date',
        'in_seq_len',
        'multi_asset_prediction',
    )
})

# Convert the retrieved data into arrays for the train and test splits.
(
    X_train, y_train, next_return_train, spread_train,
    X_test, y_test, next_return_test, spread_test,
) = dataset_creator.create_dataset_numpy(retrieval_result)

# Display the shape of every array, train split first.
tuple(
    array.shape
    for array in (
        X_train, y_train, next_return_train, spread_train,
        X_test, y_test, next_return_test, spread_test,
    )
)

2025-07-09 17:05:29,534 - INFO - Processing AAPL …
2025-07-09 17:05:30,186 - INFO - Imputing 496 NaN rows out of 97359 with forward fill..
2025-07-09 17:05:30,795 - INFO - Imputing 39 NaN rows with 0.5 sentinel value
2025-07-09 17:05:30,830 - INFO - Processing AMD …
2025-07-09 17:05:31,494 - INFO - Imputing 214 NaN rows out of 97359 with forward fill..
2025-07-09 17:05:32,100 - INFO - Imputing 39 NaN rows with 0.5 sentinel value
2025-07-09 17:05:32,130 - INFO - Processing BABA …
2025-07-09 17:05:32,767 - INFO - Imputing 874 NaN rows out of 97359 with forward fill..
2025-07-09 17:05:33,349 - INFO - Imputing 39 NaN rows with 0.5 sentinel value
2025-07-09 17:05:33,380 - INFO - Processing BITU …
2025-07-09 17:05:34,029 - INFO - Imputing 6493 NaN rows out of 97359 with forward fill..
2025-07-09 17:05:34,612 - INFO - Imputing 39 NaN rows with 0.5 sentinel value
2025-07-09 17:05:34,654 - INFO - Processing CSCO …
2025-07-09 17:05:35,488 - INFO - Imputing 3929 NaN rows out of 97359 with forward

((50, 79969, 60, 15),
 (50, 79969),
 (50, 79969),
 (50, 79969),
 (50, 7311, 60, 15),
 (50, 7311),
 (50, 7311),
 (50, 7311))

In [5]:
# For multi-asset prediction, exchange the first two axes of every array
# (presumably (asset, sample, ...) -> (sample, asset, ...) — confirm against
# DatasetCreator).
# NOTE: this cell is not idempotent — swapping axes 0 and 1 twice restores the
# original layout, so run it exactly once after the dataset-creation cell.
if config.data_config.multi_asset_prediction:
    (
        X_train, y_train, next_return_train, spread_train,
        X_test, y_test, next_return_test, spread_test,
    ) = (
        np.swapaxes(array, 0, 1)
        for array in (
            X_train, y_train, next_return_train, spread_train,
            X_test, y_test, next_return_test, spread_test,
        )
    )

# Display the resulting shape of every array, train split first.
tuple(
    array.shape
    for array in (
        X_train, y_train, next_return_train, spread_train,
        X_test, y_test, next_return_test, spread_test,
    )
)

((79969, 50, 60, 15),
 (79969, 50),
 (79969, 50),
 (79969, 50),
 (7311, 50, 60, 15),
 (7311, 50),
 (7311, 50),
 (7311, 50))

In [6]:
# Sanity check: mean target value of the train and test splits.
y_train.mean(), y_test.mean()

(0.49992302, 0.501705)

In [7]:
# Loader settings shared by the training and evaluation loaders, read from the
# experiment's training config (keyword name == config attribute name).
loader_kwargs = {
    field: getattr(config.train_config, field)
    for field in (
        'batch_size',
        'num_workers',
        'prefetch_factor',
        'pin_memory',
        'persistent_workers',
    )
}

# Training loader: shuffling and dropping the last partial batch follow the
# training config.
train_loader = DatasetPytorch(X_train, y_train, learning_task='regression').as_dataloader(
    shuffle=config.train_config.shuffle,
    drop_last=config.train_config.drop_last,
    **loader_kwargs,
)

# Evaluation loader: never shuffle and never drop the last partial batch.
# Reusing the training values here (as before) could silently exclude test
# samples from the validation metrics whenever the training config enables
# drop_last, and shuffling brings no benefit for evaluation.
test_loader = DatasetPytorch(X_test, y_test, learning_task='regression').as_dataloader(
    shuffle=False,
    drop_last=False,
    **loader_kwargs,
)

In [8]:
# Take the model object stored on the experiment config and display its repr.
# NOTE(review): this binds the config's object itself, not a copy — re-running
# the training cell in the same kernel presumably continues from the already
# trained weights; confirm and re-create the model if a fresh start is needed.
model = config.model_config.model
model

TemporalSpatial(
  (asset_embed): Embedding(50, 16)
  (asset_proj): Linear(in_features=16, out_features=128, bias=False)
  (lstm): LSTM(15, 64, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (spatial_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
  )
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [9]:
# Assemble the trainer: the model and the two loaders come from the cells
# above (the test loader doubles as the validation loader); every remaining
# keyword argument is read from the same-named attribute of the training config.
trainer = Trainer(
    model=model,
    train_loader=train_loader,
    val_loader=test_loader,
    **{
        field: getattr(config.train_config, field)
        for field in (
            'loss_fn',
            'optimizer',
            'scheduler',
            'num_epochs',
            'device',
            'metrics',
            'save_path',
        )
    },
)

In [10]:
# Run training; `train()` returns a (model, history) pair, and `model` is
# rebound to the returned object.
model, history = trainer.train()

2025-07-09 17:07:30,201 - INFO - Epoch 1/1
2025-07-09 17:09:57,491 - INFO - Train Loss: 0.1426         
2025-07-09 17:09:57,502 - INFO - Train Rmse: 0.3757
2025-07-09 17:09:57,503 - INFO - Val   Loss: 0.1167
2025-07-09 17:09:57,506 - INFO - Val   Rmse: 0.3416
2025-07-09 17:09:57,510 - INFO - 


In [12]:
# Evaluate the trained model on the full train and test arrays, together with
# the test-split `next_return` and `spread` arrays.
# NOTE(review): the recorded run of this cell failed with a CUDA
# OutOfMemoryError (22.62 GiB requested on an 8 GiB GPU). The whole arrays are
# passed in one call, so the evaluation presumably needs to run in batches —
# confirm in modeling.evaluate before relying on this cell.
evaluate_torch_regressor_multiasset(model, X_train, y_train, X_test, y_test, next_return_test, spread_test)

OutOfMemoryError: CUDA out of memory. Tried to allocate 22.62 GiB. GPU 0 has a total capacity of 8.00 GiB of which 1.71 GiB is free. Of the allocated memory 2.64 GiB is allocated by PyTorch, and 2.48 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [20]:
# Log the experiment (config, trained model and training history) through the
# project's MLflow integration. The input sample is the first element of the
# first batch drawn from the training loader, moved to the trainer's device.
log_experiment(
    config=config, 
    model=model, 
    history=history,
    input_data_sample=next(iter(train_loader))[0].to(trainer.device))

Registered model 'LSTM Default' already exists. Creating a new version of this model...
2025/06/26 15:35:14 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LSTM Default, version 10


🏃 View run gentle-loon-699 at: http://127.0.0.1:8080/#/experiments/439216085822475480/runs/54deb1104660468d9ffb4e7e278e9cfb
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/439216085822475480


Created version '10' of model 'LSTM Default'.


In [10]:
# LightGBM baseline evaluated on the same train/test arrays.
# NOTE(review): the output recorded below (7371 training points, 37 features)
# does not match the array shapes shown earlier in this notebook, so it appears
# to come from a different run — re-run after a kernel restart before relying
# on these numbers.
evaluate_lgb_regressor(X_train, y_train, X_test, y_test, next_return_test)



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000873 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9435
[LightGBM] [Info] Number of data points in the train set: 7371, number of used features: 37
[LightGBM] [Info] Start training from score 0.497863
Train rmse: 0.26411260601695974, Test rmse: 0.2684210886033184, Baseline rmse: 0.2599985897541046
Expected return: 0.00010183148393891163, Baseline return: 2.569958041931386e-06, Max possible return 0.00048079571570269763


