In [12]:
import sys
import os
# Put the parent of the current working directory on the import path
# (presumably so the project-local `data`, `config`, `modeling` and
# `observability` imports below resolve -- confirm against the repo layout).
# The membership check keeps this cell idempotent: re-running it does not
# append a duplicate entry to sys.path.
_parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

import logging
from datetime import datetime, timezone

import numpy as np
import pandas as pd
import torch

# Root logger setup: emit INFO-and-above records through a stream handler
# (the console / notebook output), each prefixed with timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[logging.StreamHandler()],
)

%reload_ext autoreload
%autoreload 2
from data.raw.retrievers.alpaca_markets_retriever import AlpacaMarketsRetriever
from config.constants import *
from data.processed.dataset_creation import DatasetCreator
from data.processed.indicators import *
from data.processed.targets import Balanced3ClassClassification
from data.processed.normalization import ZScoreOverWindowNormalizer, ZScoreNormalizer, MinMaxNormalizer
from data.processed.dataset_pytorch import DatasetPytorch
from modeling.trainer import Trainer
from modeling.evaluate import evaluate_lgb_regressor, evaluate_torch_regressor, evaluate_torch_regressor_multiasset
from observability.mlflow_integration import log_experiment

from config.experiments.cur_experiment import config

# Set PyTorch's global cuDNN benchmark flag from the experiment's train_config,
# so the choice lives with the rest of the experiment settings.
torch.backends.cudnn.benchmark = config.train_config.cudnn_benchmark


In [13]:
# Build the Alpaca retriever with Google Drive download switched off, then
# request bars for the symbols and date range named in the experiment config.
retriever = AlpacaMarketsRetriever(download_from_gdrive=False)

data_cfg = config.data_config
bars_request = {
    'symbol_or_symbols': data_cfg.symbol_or_symbols,
    'start': data_cfg.start,
    'end': data_cfg.end,
}
retrieval_result = retriever.bars(**bars_request)

Downloading...
From (original): https://drive.google.com/uc?id=1On6h2pn05svQFj20gU_iyCFuWGwhEYPk
From (redirected): https://drive.google.com/uc?id=1On6h2pn05svQFj20gU_iyCFuWGwhEYPk&confirm=t&uuid=ce999821-8544-4a6a-b0e3-a38cf22bfd90
To: c:\Users\ikurnosau\Projects\QuantitativeTrading\intraday-portfolio-management\data\raw\alpaca\temp\1Min_2024-06-01-2025-06-01_AAPL+MSFT+NVDA+GOOGL+GOOG+META+AVGO+AMD+TSM+QCOM+ORCL+INTC+CSCO+IBM+MU+ADBE+TXN+CRM+PANW+AMAT+SQ+PYP.pkl
100%|██████████| 353M/353M [00:13<00:00, 26.5MB/s] 


In [3]:
# Configure the dataset builder entirely from the experiment's data_config.
data_cfg = config.data_config
dataset_creator = DatasetCreator(
    features=data_cfg.features,
    target=data_cfg.target,
    normalizer=data_cfg.normalizer,
    missing_values_handler=data_cfg.missing_values_handler,
    train_set_last_date=data_cfg.train_set_last_date,
    in_seq_len=data_cfg.in_seq_len,
    multi_asset_prediction=data_cfg.multi_asset_prediction,
)

# Turn the retrieved bars into train/test arrays: inputs, targets, next returns.
(
    X_train, y_train, next_return_train,
    X_test, y_test, next_return_test,
) = dataset_creator.create_dataset_numpy(retrieval_result)

# Show the shape of every array as the cell output.
tuple(
    split.shape
    for split in (X_train, y_train, next_return_train, X_test, y_test, next_return_test)
)

2025-07-02 15:03:37,359 - INFO - Processing AAPL …
2025-07-02 15:03:37,680 - INFO - Imputing 496 NaN rows out of 97359 with forward fill..
2025-07-02 15:03:39,444 - INFO - Processing ADBE …
2025-07-02 15:03:39,665 - INFO - Imputing 5392 NaN rows out of 97359 with forward fill..
2025-07-02 15:03:41,505 - INFO - Processing ADI …
2025-07-02 15:03:41,732 - INFO - Imputing 6204 NaN rows out of 97359 with forward fill..
2025-07-02 15:03:43,449 - INFO - Processing AMAT …
2025-07-02 15:03:43,677 - INFO - Imputing 4035 NaN rows out of 97359 with forward fill..
2025-07-02 15:03:45,417 - INFO - Processing AMD …
2025-07-02 15:03:45,669 - INFO - Imputing 214 NaN rows out of 97359 with forward fill..
2025-07-02 15:03:47,430 - INFO - Processing ANET …
2025-07-02 15:03:47,637 - INFO - Imputing 5097 NaN rows out of 97359 with forward fill..
2025-07-02 15:03:49,404 - INFO - Processing AVGO …
2025-07-02 15:03:49,657 - INFO - Imputing 1059 NaN rows out of 97359 with forward fill..
2025-07-02 15:03:51,433 

((49, 79999, 30, 37),
 (49, 79999),
 (49, 79999),
 (49, 7341, 30, 37),
 (49, 7341),
 (49, 7341))

In [4]:
# In multi-asset mode, exchange the first two axes of every array. Judging by
# the shapes displayed in this notebook, this goes from
# (assets, samples, ...) to (samples, assets, ...) -- TODO confirm axis meaning.
#
# `np` is the explicit `import numpy as np` from the imports cell rather than
# a name leaked through a wildcard import.
#
# NOTE: this cell rebinds its own inputs, so it is not idempotent: running it
# a second time without re-running the dataset-creation cell swaps the axes
# back again.
if config.data_config.multi_asset_prediction:
    X_train, y_train, next_return_train, X_test, y_test, next_return_test = (
        np.swapaxes(array, 0, 1)
        for array in (X_train, y_train, next_return_train, X_test, y_test, next_return_test)
    )

# Show the resulting shapes as the cell output.
X_train.shape, y_train.shape, next_return_train.shape, X_test.shape, y_test.shape, next_return_test.shape

((79999, 49, 30, 37),
 (79999, 49),
 (79999, 49),
 (7341, 49, 30, 37),
 (7341, 49),
 (7341, 49))

In [5]:
# Sanity check: mean target value of the train split and of the test split.
tuple(targets.mean() for targets in (y_train, y_test))

(0.50199676, 0.5025806)

In [6]:
# Loader settings shared by the training and evaluation loaders, all taken
# from the experiment's train_config.
loader_kwargs = dict(
    batch_size=config.train_config.batch_size,
    num_workers=config.train_config.num_workers,
    prefetch_factor=config.train_config.prefetch_factor,
    pin_memory=config.train_config.pin_memory,
    persistent_workers=config.train_config.persistent_workers,
)

# Training loader: shuffling and dropping the last partial batch follow the
# experiment configuration.
train_loader = DatasetPytorch(X_train, y_train, learning_task='regression').as_dataloader(
    shuffle=config.train_config.shuffle,
    drop_last=config.train_config.drop_last,
    **loader_kwargs,
)

# Evaluation loader: never shuffle and never drop the last partial batch.
# Reusing the training values here would make validation metrics skip the
# trailing samples whenever drop_last is enabled, and shuffling adds nothing
# to evaluation.
test_loader = DatasetPytorch(X_test, y_test, learning_task='regression').as_dataloader(
    shuffle=False,
    drop_last=False,
    **loader_kwargs,
)

In [7]:
# Take the model object straight from the experiment config; the bare name on
# the last line makes the notebook display its repr as the cell output.
model = config.model_config.model
model

TemporalSpatial(
  (asset_embed): Embedding(49, 16)
  (asset_proj): Linear(in_features=16, out_features=128, bias=False)
  (lstm): LSTM(37, 64, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (spatial_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
  )
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [8]:
# Wire the model and both loaders into a Trainer; every optimisation setting
# (loss, optimizer, scheduler, epochs, device, metrics, checkpoint path) is
# read from the experiment's train_config.
train_cfg = config.train_config
trainer_settings = {
    'loss_fn': train_cfg.loss_fn,
    'optimizer': train_cfg.optimizer,
    'scheduler': train_cfg.scheduler,
    'num_epochs': train_cfg.num_epochs,
    'device': train_cfg.device,
    'metrics': train_cfg.metrics,
    'save_path': train_cfg.save_path,
}
trainer = Trainer(
    model=model,
    train_loader=train_loader,
    val_loader=test_loader,
    **trainer_settings,
)

In [10]:
# Run training and rebind `model` and `history` to the two values it returns.
# NOTE(review): if the run is interrupted (the saved output ends in a
# KeyboardInterrupt), this assignment never happens, so `history` is left
# undefined or holds a value from an earlier run -- later cells that read
# `history` then depend on that leftover kernel state.
model, history = trainer.train()

2025-07-02 15:09:42,624 - INFO - Epoch 1/100
2025-07-02 15:10:26,403 - INFO - Train Loss: 0.1246           
2025-07-02 15:10:26,404 - INFO - Train Rmse: 0.3528
2025-07-02 15:10:26,404 - INFO - Val   Loss: 0.1094
2025-07-02 15:10:26,404 - INFO - Val   Rmse: 0.3305
2025-07-02 15:10:26,404 - INFO - 
2025-07-02 15:10:26,405 - INFO - Epoch 2/100
2025-07-02 15:11:21,429 - INFO - Train Loss: 0.1238           
2025-07-02 15:11:21,429 - INFO - Train Rmse: 0.3516
2025-07-02 15:11:21,430 - INFO - Val   Loss: 0.1098
2025-07-02 15:11:21,430 - INFO - Val   Rmse: 0.3311
2025-07-02 15:11:21,431 - INFO - 
2025-07-02 15:11:21,431 - INFO - Epoch 3/100
2025-07-02 15:12:40,972 - INFO - Train Loss: 0.1234          
2025-07-02 15:12:40,973 - INFO - Train Rmse: 0.3511
2025-07-02 15:12:40,973 - INFO - Val   Loss: 0.1094
2025-07-02 15:12:40,974 - INFO - Val   Rmse: 0.3305
2025-07-02 15:12:40,974 - INFO - 
2025-07-02 15:12:40,975 - INFO - Epoch 4/100
                                                             

KeyboardInterrupt: 

In [22]:
# Evaluate the torch model with the multi-asset evaluation helper, passing the
# train arrays, the test arrays and the test-split next returns.
# NOTE(review): the saved output is a long run of tensor-shape prints, which
# presumably comes from a debug print inside the helper or the model -- confirm
# and remove it there.
evaluate_torch_regressor_multiasset(model, X_train, y_train, X_test, y_test, next_return_test)

torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Size([1024, 49, 30, 37])
torch.Si

In [20]:
# Pull one batch from the training loader and move its first element to the
# trainer's device; it is handed to log_experiment as the input data sample.
first_batch = next(iter(train_loader))
sample_inputs = first_batch[0].to(trainer.device)

# Record the experiment: configuration, model and training history.
log_experiment(
    config=config,
    model=model,
    history=history,
    input_data_sample=sample_inputs,
)

Registered model 'LSTM Default' already exists. Creating a new version of this model...
2025/06/26 15:35:14 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LSTM Default, version 10


🏃 View run gentle-loon-699 at: http://127.0.0.1:8080/#/experiments/439216085822475480/runs/54deb1104660468d9ffb4e7e278e9cfb
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/439216085822475480


Created version '10' of model 'LSTM Default'.


In [10]:
# Run the LightGBM regressor evaluation helper on the same train/test arrays
# and test-split next returns.
# NOTE(review): the saved output reports 7371 training points, which does not
# match the array shapes displayed earlier in this notebook, so it was
# presumably produced by a different run or dataset -- re-run on a fresh kernel
# before trusting these numbers.
evaluate_lgb_regressor(X_train, y_train, X_test, y_test, next_return_test)



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000873 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9435
[LightGBM] [Info] Number of data points in the train set: 7371, number of used features: 37
[LightGBM] [Info] Start training from score 0.497863
Train rmse: 0.26411260601695974, Test rmse: 0.2684210886033184, Baseline rmse: 0.2599985897541046
Expected return: 0.00010183148393891163, Baseline return: 2.569958041931386e-06, Max possible return 0.00048079571570269763


