In [11]:
import logging
import os
import sys
from datetime import datetime, timezone

import numpy as np
import pandas as pd
import torch

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# Root logger setup: emit INFO and above, prefixed with a timestamp and the
# level name, through a single stream (console) handler.
console_handler = logging.StreamHandler()
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[console_handler],
)

%reload_ext autoreload
%autoreload 2
from data.raw.retrievers.alpaca_markets_retriever import AlpacaMarketsRetriever
from config.constants import *
from data.processed.dataset_creation import DatasetCreator
from data.processed.indicators import *
from data.processed.targets import Balanced3ClassClassification
from data.processed.normalization import ZScoreOverWindowNormalizer, ZScoreNormalizer, MinMaxNormalizer
from data.processed.dataset_pytorch import DatasetPytorch
from modeling.trainer import Trainer
from modeling.evaluate import evaluate_lgb_regressor, evaluate_torch_regressor, evaluate_torch_regressor_multiasset

from config.experiments.cur_experiment import config

# Copy the experiment's cudnn_benchmark setting onto torch's cuDNN backend flag
# before any model or loader is built.
torch.backends.cudnn.benchmark = config.train_config.cudnn_benchmark


In [2]:
# Fetch bars for the symbols and date range named in the experiment config.
# With download_from_gdrive=True the recorded run pulled a cached .pkl from
# Google Drive (see the cell output).
retriever = AlpacaMarketsRetriever(download_from_gdrive=True)

bars_request = {
    "symbol_or_symbols": config.data_config.symbol_or_symbols,
    "start": config.data_config.start,
    "end": config.data_config.end,
}
retrieval_result = retriever.bars(**bars_request)

Downloading...
From (original): https://drive.google.com/uc?id=1On6h2pn05svQFj20gU_iyCFuWGwhEYPk
From (redirected): https://drive.google.com/uc?id=1On6h2pn05svQFj20gU_iyCFuWGwhEYPk&confirm=t&uuid=3dbeb1d9-c49f-4cc0-9511-5a9d0febea1d
To: /workspace/intraday-portfolio-management/data/raw/alpaca/bars/1Min_2024-06-01-2025-06-01_AAPL+MSFT+NVDA+GOOGL+GOOG+META+AVGO+AMD+TSM+QCOM+ORCL+INTC+CSCO+IBM+MU+ADBE+TXN+CRM+PANW+AMAT+SQ+PYP.pkl
100%|██████████| 353M/353M [00:03<00:00, 99.4MB/s] 


In [3]:
# Build the dataset creator from the experiment's data settings. Every
# constructor argument is read from the same-named attribute of
# config.data_config.
data_cfg = config.data_config
creator_fields = (
    "features",
    "target",
    "normalizer",
    "missing_values_handler",
    "train_set_last_date",
    "in_seq_len",
    "multi_asset_prediction",
)
dataset_creator = DatasetCreator(
    **{field: getattr(data_cfg, field) for field in creator_fields}
)

# Turn the retrieved bars into train/test arrays: inputs, targets and next returns.
(
    X_train, y_train, next_return_train,
    X_test, y_test, next_return_test,
) = dataset_creator.create_dataset_numpy(retrieval_result)

# Show the shape of each array as the cell output.
tuple(
    arr.shape
    for arr in (X_train, y_train, next_return_train, X_test, y_test, next_return_test)
)

2025-07-02 17:33:01,634 - INFO - Processing AAPL …
2025-07-02 17:33:01,828 - INFO - Imputing 496 NaN rows out of 97359 with forward fill..
  result = getattr(ufunc, method)(*inputs, **kwargs)
2025-07-02 17:33:02,049 - INFO - Imputing 4156 NaN rows with 0.5 sentinel value
2025-07-02 17:33:02,057 - INFO - Processing ADBE …
2025-07-02 17:33:02,197 - INFO - Imputing 5392 NaN rows out of 97359 with forward fill..
  result = getattr(ufunc, method)(*inputs, **kwargs)
2025-07-02 17:33:02,407 - INFO - Imputing 27209 NaN rows with 0.5 sentinel value
2025-07-02 17:33:02,415 - INFO - Processing ADI …
2025-07-02 17:33:02,627 - INFO - Imputing 6204 NaN rows out of 97359 with forward fill..
  result = getattr(ufunc, method)(*inputs, **kwargs)
2025-07-02 17:33:02,834 - INFO - Imputing 35130 NaN rows with 0.5 sentinel value
2025-07-02 17:33:02,842 - INFO - Processing AMAT …
2025-07-02 17:33:02,980 - INFO - Imputing 4035 NaN rows out of 97359 with forward fill..
  result = getattr(ufunc, method)(*inputs

((49, 79969, 60, 15),
 (49, 79969),
 (49, 79969),
 (49, 7311, 60, 15),
 (49, 7311),
 (49, 7311))

In [4]:
# For multi-asset prediction, exchange the first two axes of every array.
# In the recorded run this turns (49, 79969, ...) into (79969, 49, ...), i.e.
# the sample axis comes first (the 49-long axis is presumably the asset axis —
# TODO confirm against DatasetCreator).
#
# NOTE: this cell rebinds its own inputs, so it is NOT idempotent. Running it a
# second time without re-running the dataset-creation cell swaps the axes back.
#
# `np` is imported explicitly in the notebook's import cell rather than relied
# on leaking in through one of the wildcard imports.
if config.data_config.multi_asset_prediction:
    # The tuple of source arrays is evaluated before any name is rebound, so
    # each array is swapped exactly once.
    (
        X_train, y_train, next_return_train,
        X_test, y_test, next_return_test,
    ) = (
        np.swapaxes(arr, 0, 1)
        for arr in (X_train, y_train, next_return_train, X_test, y_test, next_return_test)
    )

# Show the resulting shapes as the cell output.
tuple(
    arr.shape
    for arr in (X_train, y_train, next_return_train, X_test, y_test, next_return_test)
)

((79969, 49, 60, 15),
 (79969, 49),
 (79969, 49),
 (7311, 49, 60, 15),
 (7311, 49),
 (7311, 49))

In [5]:
# Sanity check: mean target value of each split (about 0.50 for both in the recorded run).
y_train.mean(), y_test.mean()

(0.5020159, 0.5024439)

In [6]:
# Build the training and evaluation DataLoaders.
train_cfg = config.train_config

# Settings that legitimately apply to both loaders.
shared_loader_kwargs = dict(
    batch_size=train_cfg.batch_size,
    num_workers=train_cfg.num_workers,
    prefetch_factor=train_cfg.prefetch_factor,
    pin_memory=train_cfg.pin_memory,
    persistent_workers=train_cfg.persistent_workers,
)

# Training loader: shuffling and dropping the last batch follow the experiment config.
train_loader = DatasetPytorch(X_train, y_train, learning_task='regression').as_dataloader(
    shuffle=train_cfg.shuffle,
    drop_last=train_cfg.drop_last,
    **shared_loader_kwargs,
)

# Evaluation loader: never shuffle and never drop the trailing partial batch.
# Reusing the training values here (as the cell did before) would exclude the
# last partial batch from the validation metrics whenever drop_last is enabled
# for training, and shuffling adds nothing to evaluation.
test_loader = DatasetPytorch(X_test, y_test, learning_task='regression').as_dataloader(
    shuffle=False,
    drop_last=False,
    **shared_loader_kwargs,
)

In [7]:
# Take the model instance defined by the experiment config and display its
# module structure as the cell output.
model = config.model_config.model
model

TemporalSpatial(
  (asset_embed): Embedding(49, 16)
  (asset_proj): Linear(in_features=16, out_features=128, bias=False)
  (lstm): LSTM(15, 64, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (spatial_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
  )
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [8]:
# Wire the model and both loaders into the Trainer. The remaining settings are
# read from the same-named attributes of config.train_config. The test loader
# is used as the validation loader.
train_cfg = config.train_config
trainer_fields = (
    "loss_fn",
    "optimizer",
    "scheduler",
    "num_epochs",
    "device",
    "metrics",
    "save_path",
)
trainer = Trainer(
    model=model,
    train_loader=train_loader,
    val_loader=test_loader,
    **{field: getattr(train_cfg, field) for field in trainer_fields},
)

In [9]:
# Run training. train() returns a (model, history) pair; `model` is rebound to
# the returned model and `history` is kept for later logging.
model, history = trainer.train()

2025-07-02 17:33:24,147 - INFO - Epoch 1/20
2025-07-02 17:33:48,512 - INFO - Train Loss: 0.1536        
2025-07-02 17:33:48,513 - INFO - Train Rmse: 0.3856
2025-07-02 17:33:48,513 - INFO - Val   Loss: 0.1100
2025-07-02 17:33:48,514 - INFO - Val   Rmse: 0.3316
2025-07-02 17:33:48,515 - INFO - 
2025-07-02 17:33:48,515 - INFO - Epoch 2/20
2025-07-02 17:34:09,576 - INFO - Train Loss: 0.1259        
2025-07-02 17:34:09,576 - INFO - Train Rmse: 0.3548
2025-07-02 17:34:09,577 - INFO - Val   Loss: 0.1098
2025-07-02 17:34:09,578 - INFO - Val   Rmse: 0.3313
2025-07-02 17:34:09,578 - INFO - 
2025-07-02 17:34:09,579 - INFO - Epoch 3/20
2025-07-02 17:34:30,881 - INFO - Train Loss: 0.1249        
2025-07-02 17:34:30,882 - INFO - Train Rmse: 0.3533
2025-07-02 17:34:30,883 - INFO - Val   Loss: 0.1101
2025-07-02 17:34:30,883 - INFO - Val   Rmse: 0.3317
2025-07-02 17:34:30,884 - INFO - 
2025-07-02 17:34:30,885 - INFO - Epoch 4/20
2025-07-02 17:34:52,253 - INFO - Train Loss: 0.1245        
2025-07-02 17:

In [13]:
# Evaluate the trained model on both splits. The recorded output reports
# train/test/baseline RMSE plus expected, baseline and maximum possible return.
# NOTE(review): trade_asset_count=1 presumably means one asset is traded per
# step — confirm against evaluate_torch_regressor_multiasset.
evaluate_torch_regressor_multiasset(model, X_train, y_train, X_test, y_test, next_return_test, trade_asset_count=1)

Train rmse: 0.3492974042892456, Test rmse: 0.3326488733291626, Baseline rmse: 0.3501397669315338
Expected return: 0.00020289617740858242, Baseline return: 3.7141740904189646e-05, Max possible return 0.0006037059938535094


In [11]:
# Log the experiment's config, trained model and training history. The input
# sample is the first element of the first batch from train_loader, moved to
# the trainer's device.
# NOTE(review): `log_experiment` is not imported anywhere in the visible
# notebook and the recorded output is a NameError. Import it from its module in
# the import cell before running this cell.
log_experiment(
    config=config, 
    model=model, 
    history=history,
    input_data_sample=next(iter(train_loader))[0].to(trainer.device))

NameError: name 'log_experiment' is not defined

In [10]:
# LightGBM regressor evaluated on the same arrays, for comparison with the
# torch model.
# NOTE(review): this cell's execution count is lower than those of the cells
# above it, and the recorded LightGBM log reports 7371 training points, which
# does not match the shapes displayed earlier. The output may come from a
# different run — re-run top to bottom to confirm.
evaluate_lgb_regressor(X_train, y_train, X_test, y_test, next_return_test)



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000873 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9435
[LightGBM] [Info] Number of data points in the train set: 7371, number of used features: 37
[LightGBM] [Info] Start training from score 0.497863
Train rmse: 0.26411260601695974, Test rmse: 0.2684210886033184, Baseline rmse: 0.2599985897541046
Expected return: 0.00010183148393891163, Baseline return: 2.569958041931386e-06, Max possible return 0.00048079571570269763


