In [19]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
from datetime import datetime, timezone

import logging
logging.basicConfig(
    level=logging.INFO,  # Set the logging level
    format='%(asctime)s - %(levelname)s - %(message)s',  # Format for the log messages
    handlers=[
        logging.StreamHandler()  # Log to the console
    ]
)

%reload_ext autoreload
%autoreload 2
from data.raw.retrievers.alpaca_markets_retriever import AlpacaMarketsRetriever
from config.constants import *
from data.processed.dataset_creation import DatasetCreator
from data.processed.indicators import *
from data.processed.targets import Balanced3ClassClassification
from data.processed.normalization import ZScoreOverWindowNormalizer, ZScoreNormalizer, MinMaxNormalizer
from data.processed.missing_values_handling import DummyMissingValuesHandler
from data.processed.dataset_pytorch import DatasetPytorch
from modeling.trainer import Trainer

In [20]:
from config.experiments.cur_experiment import config

In [21]:
retriever = AlpacaMarketsRetriever()

retrieval_result = retriever.bars(
    symbol_or_symbols=config.data_config.symbol_or_symbols, 
    start=config.data_config.start, 
    end=config.data_config.end)

In [22]:
dataset_creator = DatasetCreator(
    features=config.data_config.features,
    target=config.data_config.target,
    normalizer=config.data_config.normalizer,
    missing_values_handler=config.data_config.missing_values_handler,
    train_set_last_date=config.data_config.train_set_last_date, 
    in_seq_len=config.data_config.in_seq_len,
    flatten_sequence=config.data_config.flatten_sequence
)

X_train, y_train, X_test, y_test = dataset_creator.create_dataset_numpy(retrieval_result)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

2025-06-15 19:07:45,930 - INFO - Processing AAPL...
2025-06-15 19:07:45,932 - INFO - Missing values are handled!
2025-06-15 19:07:48,591 - INFO - Features calculated!
2025-06-15 19:07:48,664 - INFO - Features normalized!
2025-06-15 19:07:48,720 - INFO - Target calculated!
2025-06-15 19:07:48,749 - INFO - Dropped 43 rows with NaN values


((63001, 37), (63001,), (17052, 37), (17052,))

In [23]:
train_loader = DatasetPytorch(X_train, y_train).as_dataloader(
    batch_size=config.data_config.batch_size,
    shuffle=config.data_config.shuffle
)
test_loader = DatasetPytorch(X_test, y_test).as_dataloader(
    batch_size=config.data_config.batch_size,
    shuffle=config.data_config.shuffle
)

In [24]:
model = config.model_config.model
model

MLPClassifier(
  (model): Sequential(
    (0): Linear(in_features=37, out_features=128, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU(inplace=True)
    (4): Linear(in_features=64, out_features=3, bias=True)
  )
)

In [25]:
trainer = Trainer(
    model=model,
    train_loader=train_loader,
    val_loader=test_loader,
    loss_fn=config.train_config.loss_fn,
    optimizer=config.train_config.optimizer,
    scheduler=config.train_config.scheduler,
    num_epochs=config.train_config.num_epochs,
    device=config.train_config.device,
    metrics=config.train_config.metrics,
    save_path=config.train_config.save_path
)

In [26]:
trainer.train()

2025-06-15 19:07:50,752 - INFO - Epoch 1/10
2025-06-15 19:08:05,354 - INFO - Train Loss: 1.0695           
2025-06-15 19:08:05,354 - INFO - Train Accuracy: 0.4058
2025-06-15 19:08:05,355 - INFO - Val   Loss: 1.0706
2025-06-15 19:08:05,356 - INFO - Val   Accuracy: 0.4019
2025-06-15 19:08:05,357 - INFO - 
2025-06-15 19:08:05,358 - INFO - Epoch 2/10
2025-06-15 19:08:19,780 - INFO - Train Loss: 1.0566           
2025-06-15 19:08:19,781 - INFO - Train Accuracy: 0.4224
2025-06-15 19:08:19,782 - INFO - Val   Loss: 1.0629
2025-06-15 19:08:19,783 - INFO - Val   Accuracy: 0.4134
2025-06-15 19:08:19,784 - INFO - 
2025-06-15 19:08:19,786 - INFO - Epoch 3/10
2025-06-15 19:08:33,860 - INFO - Train Loss: 1.0528           
2025-06-15 19:08:33,861 - INFO - Train Accuracy: 0.4262
2025-06-15 19:08:33,863 - INFO - Val   Loss: 1.0621
2025-06-15 19:08:33,864 - INFO - Val   Accuracy: 0.4109
2025-06-15 19:08:33,865 - INFO - 
2025-06-15 19:08:33,867 - INFO - Epoch 4/10
2025-06-15 19:08:46,935 - INFO - Train Lo

(MLPClassifier(
   (model): Sequential(
     (0): Linear(in_features=37, out_features=128, bias=True)
     (1): ReLU(inplace=True)
     (2): Linear(in_features=128, out_features=64, bias=True)
     (3): ReLU(inplace=True)
     (4): Linear(in_features=64, out_features=3, bias=True)
   )
 ),
 {'train_loss': [1.069474842446658,
   1.056639622104174,
   1.0527823213818959,
   1.0514044884257416,
   1.0507064070902603,
   1.0501154630907057,
   1.0463590823253923,
   1.0459872663111902,
   1.0458445272026722,
   1.045736674156935],
  'val_loss': [1.0705604401135758,
   1.0629158789400313,
   1.062076203930445,
   1.061362023425147,
   1.0610294354342162,
   1.0604307389840848,
   1.0600359605356184,
   1.0596175279894644,
   1.0593708239174247,
   1.0592253614098226],
  'train_accuracy': [0.40577069578466224,
   0.42239906043676995,
   0.42618080243778567,
   0.4259630523108177,
   0.4265820213306247,
   0.4276771203656679,
   0.4307719654647029,
   0.43186262061960384,
   0.431989588623666

In [None]:
import itertools 
from sklearn.base import clone
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.datasets import make_classification
import lightgbm as lgb


def evaluate_model_accuracy(model, param_grid=None):
    def evaluate_cur_model(cur_model, best_accuracy): 
        cur_model = cur_model.fit(X_train, y_train)
        test_preds = cur_model.predict(X_test)
        best_accuracy = accuracy_score(y_test, test_preds)

        return best_accuracy


    best_accuracy = 0
    best_params = None 
    if param_grid:
        keys, values = zip(*param_grid.items())
        param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

        for param_compbination in param_combinations: 
            cur_model = clone(model).set_params(**param_compbination)
            cur_accuracy = evaluate_cur_model(cur_model, best_accuracy)
            if cur_accuracy > best_accuracy: 
                best_accuracy = cur_accuracy
                best_params = param_compbination
    else: 
        cur_accuracy = evaluate_cur_model(model, best_accuracy)
        if cur_accuracy > best_accuracy: 
            best_accuracy = cur_accuracy
            best_params = None

    print(f'Best accuracy: {best_accuracy}, bestparams: {best_params}')

lgb_model = lgb.LGBMClassifier(
    n_estimators=1000, 
    learning_rate=0.001,
    max_depth=5,
    num_leaves=31, 
    objective='multiclass', 
    num_class=3
    )
evaluate_model_accuracy(lgb_model)



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009587 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9435
[LightGBM] [Info] Number of data points in the train set: 63001, number of used features: 37
[LightGBM] [Info] Start training from score -1.089668
[LightGBM] [Info] Start training from score -1.115096
[LightGBM] [Info] Start training from score -1.091274




Best accuracy: 0.4201853155055125, bestparams: None
