In [71]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
from datetime import datetime, timezone

import logging
# Configure root logging once for the notebook: INFO and above go to the
# console, each record prefixed with a timestamp and its level name.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

%reload_ext autoreload
%autoreload 2
from data.raw.retrievers.alpaca_markets_retriever import AlpacaMarketsRetriever
from config.constants import *
from data.processed.dataset_creation import DatasetCreator
from data.processed.indicators import *
from data.processed.targets import Balanced3ClassClassification
from data.processed.normalization import ZScoreOverWindowNormalizer, ZScoreNormalizer, MinMaxNormalizer
from data.processed.missing_values_handling import DummyMissingValuesHandler

In [25]:
# Instantiate the project's Alpaca Markets retriever with its default
# arguments; the next cell uses it to download price bars.
retriever = AlpacaMarketsRetriever()

In [31]:
# Download AAPL bars for the first five months of 2025. The result is indexed
# by symbol (see the `retrieval_result['AAPL']` lookup in the next cell).
# NOTE(review): start/end are naive datetimes, while the returned `date`
# column and the train/test cutoff used later are UTC-aware -- confirm the
# retriever interprets naive values as UTC.
retrieval_result = retriever.bars(
    start=datetime(2025, 1, 1),
    end=datetime(2025, 6, 1),
    symbol_or_symbols=['AAPL'],
)

In [33]:
# Show the bars retrieved for AAPL (rendered below as the cell output).
retrieval_result['AAPL']

Unnamed: 0,date,open,high,low,close,volume,vwap
0,2025-01-01 00:03:00+00:00,250.4216,250.4216,250.4216,250.4216,189.0,250.421600
1,2025-01-01 00:11:00+00:00,250.4500,250.4500,250.4500,250.4500,221.0,250.450000
2,2025-01-01 00:14:00+00:00,250.4208,250.4208,250.4208,250.4208,502.0,250.420800
3,2025-01-01 00:19:00+00:00,250.4300,250.4300,250.4200,250.4200,1246.0,250.429177
4,2025-01-01 00:20:00+00:00,250.4200,250.4200,250.4200,250.4200,122.0,250.420000
...,...,...,...,...,...,...,...
80091,2025-05-30 23:55:00+00:00,200.2004,200.2200,200.2004,200.2200,442.0,200.213356
80092,2025-05-30 23:56:00+00:00,200.2200,200.2200,200.2200,200.2200,537.0,200.220000
80093,2025-05-30 23:57:00+00:00,200.2300,200.2300,200.2300,200.2300,627.0,200.230000
80094,2025-05-30 23:58:00+00:00,200.2200,200.2792,200.2200,200.2792,2069.0,200.255554


In [None]:
# Describe how the raw bars are turned into a supervised dataset:
#   * features  - name -> callable/indicator evaluated on the bars frame; the
#                 first entries pass raw columns through, the rest are
#                 indicators from data.processed.indicators.
#   * target    - 3-class label derived from the 'close' column.
#   * normalizer / missing_values_handler - post-processing steps.
#   * train_set_last_date - UTC cutoff; presumably rows up to this timestamp
#                 form the training split and later rows the test split
#                 (confirm in DatasetCreator).
#   * in_seq_len - input sequence length; 1 here (presumably one bar per
#                 sample -- confirm in DatasetCreator).
dataset_creator = DatasetCreator(
    features={
        # Raw bar columns.
        "open": lambda data: data['open'],
        "high": lambda data: data['high'],
        "low": lambda data: data['low'],
        "close": lambda data: data['close'],
        "volume": lambda data: data['volume'],
        # Simple one-step return of the close price.
        "return": lambda data: data['close'].pct_change(),
        # Indicators with their default input columns.
        "OBV": OBV(),
        "RSI6": RSI(6),
        "RSI12": RSI(12),
        "EMA3": EMA(3),
        "EMA6": EMA(6),
        "EMA12": EMA(12),
        "ATR14": ATR(14),
        "MFI": MFI(14),
        "ADX14": ADX(14),
        "ADX20": ADX(20),
        "MOM1": MOM(1),
        "MOM3": MOM(3),
        "CCI12": CCI(12),
        "CCI20": CCI(20),
        "ROCR12": ROCR(12),
        "MACD": MACD(),
        "WILLR": WILLR(10),
        "TRIX": TRIX(20),
        "BB_LOW": BollingerBand(BollingerBand.BBType.LOWER),
        "BB_UP": BollingerBand(BollingerBand.BBType.UPPER),
        # Indicators with explicitly named input columns.
        "EMA_26": EMA(26, base_feature="close"),
        "VWAP": VWAP(high_feature='high', low_feature='low', close_feature='close'),
        "ATR_28": ATR(28, high_feature='high', low_feature='low', close_feature='close'),
        "FRL_0": FRL(FRL.FIB_RATIOS[0], high_feature='high', low_feature='low', close_feature='close'),
        "FRL_1": FRL(FRL.FIB_RATIOS[1], high_feature='high', low_feature='low', close_feature='close'),
        "FRL_2": FRL(FRL.FIB_RATIOS[2], high_feature='high', low_feature='low', close_feature='close'),
        "FRL_3": FRL(FRL.FIB_RATIOS[3], high_feature='high', low_feature='low', close_feature='close'),
        "FRL_4": FRL(FRL.FIB_RATIOS[4], high_feature='high', low_feature='low', close_feature='close'),
        # Was keyed "RSI_28" although the indicator is built with 24; the key
        # now matches the value actually computed.
        # NOTE(review): if 28 was the intended value, change the argument
        # instead of the key.
        "RSI_24": RSI(24),
        "Oscillator_K": Oscillator(Oscillator.LineType.K),
        "Oscillator_D": Oscillator(Oscillator.LineType.D),
    },
    target=Balanced3ClassClassification(base_feature='close'),
    # Alternative normalizer kept for experimentation:
    # normalizer=ZScoreOverWindowNormalizer(window=6000),
    normalizer=MinMaxNormalizer(),
    missing_values_handler=DummyMissingValuesHandler(),
    train_set_last_date=datetime(2025, 5, 1, tzinfo=timezone.utc),
    in_seq_len=1
)

In [69]:
# Materialise the train/test arrays from the retrieved bars, then display the
# shape of each array as the cell output.
X_train, y_train, X_test, y_test = dataset_creator.create_dataset_numpy(retrieval_result)
tuple(split.shape for split in (X_train, y_train, X_test, y_test))

((63001, 37), (63001,), (17052, 37), (17052,))

In [70]:
import itertools 
from sklearn.base import clone
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.datasets import make_classification
import lightgbm as lgb



def evaluate_model_accuracy(model, param_grid=None):
    """Fit ``model`` on the training split and report its test-set accuracy.

    Reads the notebook-level globals ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``.

    Args:
        model: Estimator exposing ``fit(X, y)`` (returning the fitted
            estimator) and ``predict(X)``. When ``param_grid`` is given it must
            also work with ``sklearn.base.clone`` and ``set_params``.
        param_grid: Optional mapping of parameter name -> sequence of candidate
            values. Every combination (Cartesian product) is fitted on a fresh
            clone of ``model``. When omitted or empty, ``model`` itself is
            fitted in place.

    Returns:
        Tuple ``(best_accuracy, best_params)``. ``best_params`` is the winning
        parameter dict, or ``None`` when no grid candidate was evaluated.
        The same information is printed.

    Note:
        Grid candidates are ranked by accuracy on the *test* split, so the
        score reported for a grid search is an optimistic estimate; use a
        separate validation split for unbiased model selection.
    """
    def fit_and_score(candidate):
        # Train on the training split, score plain accuracy on the test split.
        fitted = candidate.fit(X_train, y_train)
        return accuracy_score(y_test, fitted.predict(X_test))

    best_accuracy = 0
    best_params = None

    if param_grid:
        names, value_lists = zip(*param_grid.items())
        for values in itertools.product(*value_lists):
            params = dict(zip(names, values))
            accuracy = fit_and_score(clone(model).set_params(**params))
            # `best_params is None` makes sure the first candidate is always
            # recorded, even when its accuracy is exactly 0.
            if best_params is None or accuracy > best_accuracy:
                best_accuracy, best_params = accuracy, params
    else:
        best_accuracy = fit_and_score(model)

    print(f'Best accuracy: {best_accuracy}, bestparams: {best_params}')
    return best_accuracy, best_params

# Baseline gradient-boosted classifier for the 3-class target, evaluated once
# with fixed hyper-parameters (no grid is passed, so `lgb_model` is fitted in
# place).
lgb_model = lgb.LGBMClassifier(
    # Task definition.
    objective='multiclass',
    num_class=3,
    # Boosting schedule.
    n_estimators=1000,
    learning_rate=0.001,
    # Tree complexity.
    num_leaves=31,
    max_depth=5,
)
evaluate_model_accuracy(lgb_model)



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008705 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9435
[LightGBM] [Info] Number of data points in the train set: 63001, number of used features: 37
[LightGBM] [Info] Start training from score -1.089668
[LightGBM] [Info] Start training from score -1.115096
[LightGBM] [Info] Start training from score -1.091274




Best accuracy: 0.4201853155055125, bestparams: None
