In [1]:
# Install notebook dependencies with the %pip magic so they land in the
# environment of the running kernel (a `!pip` shell call may target another one).
# TODO: pin exact versions (e.g. adlfs==2024.12.0, the version resolved in the log below).
%pip install adlfs
%pip install keras-tuner --quiet


Collecting adlfs
  Downloading adlfs-2024.12.0-py3-none-any.whl.metadata (7.7 kB)
Collecting azure-core<2.0.0,>=1.28.0 (from adlfs)
  Downloading azure_core-1.34.0-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.9/42.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting azure-datalake-store<0.1,>=0.0.53 (from adlfs)
  Downloading azure_datalake_store-0.0.53-py2.py3-none-any.whl.metadata (19 kB)
Collecting azure-identity (from adlfs)
  Downloading azure_identity-1.23.0-py3-none-any.whl.metadata (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting azure-storage-blob>=12.17.0 (from adlfs)
  Downloading azure_storage_blob-12.25.1-py3-none-any.whl.metadata (26 kB)
Collecting msal<2,>=1.16.0 (from azure-datalake-store<0.1,>=0.

#### IMPORTS

In [2]:
# Basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yaml
import random
import plotly.express as px
import joblib
from datetime import timedelta
from typing import List, Tuple, Dict,Optional

# Azure
from adlfs import AzureBlobFileSystem

# Models
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler,LabelEncoder
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc ,  precision_score, recall_score,classification_report
from tensorflow.keras.metrics import AUC, Precision, Recall
from sklearn.utils import class_weight
from sklearn.feature_selection import SelectKBest, f_classif



#### FUNCTIONS

In [3]:

def load_data_from_dl(account_name: str,container_name: str,relative_path: str,access_key: str)->pd.DataFrame:
    """
    Reads every Parquet file under an Azure Blob Storage folder and stacks them.

    Files are matched with the glob pattern
    ``<container_name>/<relative_path>/*.parquet`` and concatenated in the
    order returned by the filesystem, with a fresh 0..n-1 index.

    Args:
        - account_name (str): Azure Storage account name.
        - container_name (str): Container holding the data.
        - relative_path (str): Folder inside the container that holds the .parquet files.
        - access_key (str): Storage account access key.
    Returns:
        - df (pd.DataFrame): All matched Parquet files concatenated row-wise.
    Raises:
        - ValueError: If the glob pattern matches no files.
    """
    blob_fs = AzureBlobFileSystem(account_name=account_name, account_key=access_key)

    pattern = f"{container_name}/{relative_path}/*.parquet"
    parquet_paths = blob_fs.glob(pattern)
    print(f"folder: {parquet_paths}")

    if not parquet_paths:
        raise ValueError("Not found .parquet files")

    def _read_one(path):
        # Log each file before reading so a failing blob is easy to identify.
        print(f"Reading files: {path}")
        with blob_fs.open(path, "rb") as handle:
            return pd.read_parquet(handle)

    frames = [_read_one(path) for path in parquet_paths]

    combined = pd.concat(frames, ignore_index=True)
    print(combined.head())
    return combined

In [4]:
def clean_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans a DataFrame by dropping unnecessary columns and mean-imputing numeric gaps.

    Steps:
      1. Drop the forward-looking return / price columns listed in
         ``columns_to_drop`` (missing ones are ignored).
      2. Drop numeric columns that contain no observed value at all: there is
         no mean to impute them with.
      3. Replace the remaining NaNs of each float64/int64/int32 column with
         that column's mean (computed over the whole frame passed in) and
         store the result as float64.

    The input frame is not modified.

    Args:
        df (pd.DataFrame): Input DataFrame.

    Returns:
        pd.DataFrame: Cleaned DataFrame.
    """
    # Columns to drop (consistent with SVM and Naive Bayes pipelines)
    columns_to_drop = [
        'capital_gains', 'ret_next_3m', 'ret_next_6m', 'ret_next_1y',
        "price_lead_3m", "price_lead_6m", "price_lead_1y",
    ]

    print(f"Shape before: {df.shape}")
    df = df.drop(columns=columns_to_drop, errors='ignore').copy()

    numeric_cols = df.select_dtypes(include=["float64", "int64", "int32"]).columns

    # An entirely-NaN column has no mean; drop it explicitly instead of letting
    # the imputation step fail on a column-count mismatch.
    empty_cols = [col for col in numeric_cols if df[col].isna().all()] if len(df) else []
    if empty_cols:
        print(f"Dropping all-NaN numeric columns: {empty_cols}")
        df = df.drop(columns=empty_cols)
        numeric_cols = numeric_cols.drop(empty_cols)

    # Skip imputation when there is nothing numeric to impute.
    if len(numeric_cols) > 0:
        numeric = df[numeric_cols]
        df[numeric_cols] = numeric.fillna(numeric.mean()).astype("float64")

    print(f"Shape after: {df.shape}")
    return df


In [5]:
def build_and_split_sequences_by_symbol(
    df: pd.DataFrame,
    target_column: str,
    sequence_length: int = 60,
    test_size: float = 0.2
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Builds sliding-window sequences per symbol and splits them chronologically.

    For every symbol the rows are sorted by ``date``. Each window holds
    ``sequence_length`` consecutive rows of features and is labelled with the
    target value of the row that immediately follows the window. A window goes
    to the training set when its label row lies in the first
    ``1 - test_size`` fraction of that symbol's rows, otherwise to the test
    set. Symbols with ``sequence_length`` rows or fewer are skipped.

    Args:
        df (pd.DataFrame): Preprocessed DataFrame with features, target,
            ``symbol`` and ``date`` columns. All feature columns must be
            convertible to float32.
        target_column (str): Column name for binary target.
        sequence_length (int): Time window for each sequence.
        test_size (float): Proportion of test samples (applied per symbol).

    Returns:
        Tuple of numpy arrays: X_train, X_test, y_train, y_test.
        X arrays have shape (n_windows, sequence_length, n_features); when no
        window is produced the corresponding arrays are empty with shape (0,).
    """
    X_train, y_train, X_test, y_test = [], [], [], []

    # Identifier and label columns must never be model inputs. The selected
    # target is always excluded, even when it is not one of the standard names.
    non_feature_cols = list(dict.fromkeys(
        ["date", "symbol", "target_3m", "target_6m", "target_1y", target_column]
    ))

    for symbol in df["symbol"].unique():
        df_symbol = df[df["symbol"] == symbol].sort_values("date")
        n_rows = len(df_symbol)

        if n_rows <= sequence_length:
            continue  # skip if not enough data

        target = df_symbol[target_column].astype(int).values
        # Convert once per symbol instead of once per window.
        feature_matrix = (
            df_symbol.drop(columns=non_feature_cols, errors="ignore")
            .values.astype(np.float32)
        )

        split_idx = int(n_rows * (1 - test_size))
        # `label_idx` is the row right after the window [label_idx - L, label_idx).
        for label_idx in range(sequence_length, n_rows):
            X_seq = feature_matrix[label_idx - sequence_length:label_idx]
            y_val = target[label_idx]

            if label_idx < split_idx:
                X_train.append(X_seq)
                y_train.append(y_val)
            else:
                X_test.append(X_seq)
                y_test.append(y_val)

    return (
        np.array(X_train),
        np.array(X_test),
        np.array(y_train),
        np.array(y_test)
    )


In [6]:
def prepare_data_with_symbol_date(df: pd.DataFrame, targets: list, k_best: Optional[int] = None):
    """
    Prepares a DataFrame for modeling: encodes 'symbol', extracts 'date' features,
    imputes missing values, optionally selects top-k features, and applies MinMax scaling.

    Notes:
        - Every column not listed in ``targets`` is treated as a feature. Other
          label-like columns present in ``df`` (e.g. the remaining ``target_*``
          columns) therefore take part in feature selection and scaling unless
          the caller removes them first.
        - The imputer, the feature selector and the scaler are all fitted on
          the full frame passed in; split beforehand if test rows must not
          influence them.
        - Feature columns with no observed value are removed before imputation
          so that column names stay aligned with the imputed matrix.
        - In the returned frame 'symbol' holds the integer label encoding (not
          the scaled value) and 'date' the parsed datetime.

    Args:
        df (pd.DataFrame): Input data with features + target + symbol + date.
        targets (list): List of target column names.
        k_best (int, optional): Number of top features to select. If None, no selection is done.

    Returns:
        pd.DataFrame: Scaled dataframe with selected features plus targets, 'symbol' and 'date'.
        MinMaxScaler: The fitted scaler object.

    Raises:
        ValueError: If feature selection is requested with more than one target column.
    """
    df_clean = df.copy()

    # Convert date and extract calendar features
    df_clean["date"] = pd.to_datetime(df_clean["date"], errors="coerce")
    df_clean["year"] = df_clean["date"].dt.year
    df_clean["month"] = df_clean["date"].dt.month
    df_clean["dayofweek"] = df_clean["date"].dt.dayofweek

    date_col = df_clean["date"]  # store datetime (not string)

    # Encode symbol (remains numeric)
    df_clean["symbol"] = LabelEncoder().fit_transform(df_clean["symbol"].astype(str))

    # Drop original date before scaling
    df_clean = df_clean.drop(columns=["date"], errors="ignore")

    # Encode categorical columns
    for col in df_clean.select_dtypes(include=["object", "category"]).columns:
        df_clean[col] = LabelEncoder().fit_transform(df_clean[col].astype(str))

    feature_cols = [col for col in df_clean.columns if col not in targets]

    # SimpleImputer discards columns that are entirely NaN, which would shift the
    # matrix columns relative to `feature_cols` (wrong names or a shape error
    # further down). Remove such columns explicitly so names and data stay aligned.
    empty_cols = [col for col in feature_cols if df_clean[col].isna().all()]
    if empty_cols:
        print(f"Dropping all-NaN feature columns: {empty_cols}")
        feature_cols = [col for col in feature_cols if col not in empty_cols]

    X = df_clean[feature_cols]
    y = df_clean[targets]

    imputer = SimpleImputer(strategy="mean")
    X_imputed = imputer.fit_transform(X)

    # Select top-k features if k_best specified
    if k_best is not None and k_best < X_imputed.shape[1]:
        # f_classif scores features against a single label vector; flattening
        # several target columns would produce a vector of the wrong length.
        if y.ndim == 2 and y.shape[1] != 1:
            raise ValueError("k_best feature selection requires exactly one target column")
        selector = SelectKBest(score_func=f_classif, k=k_best)
        X_selected = selector.fit_transform(X_imputed, y.values.ravel())
        selected_features = [feature_cols[i] for i in selector.get_support(indices=True)]
    else:
        X_selected = X_imputed
        selected_features = feature_cols

    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X_selected)

    df_scaled = pd.DataFrame(X_scaled, columns=selected_features, index=df_clean.index)
    df_scaled = pd.concat([df_scaled, y], axis=1)

    # Reattach symbol (integer code, overriding any scaled copy) and original date
    df_scaled["symbol"] = df_clean["symbol"].values
    df_scaled["date"] = date_col.values

    print(f"Selected columns for scaling: {selected_features}")
    return df_scaled, scaler

# **MAIN**

In [None]:
# Read the storage settings (account, container, path and key) from the local config file.
with open("config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

storage_cfg = config['storage']
account_name = storage_cfg['storage_account_name']
container_name = storage_cfg['container_name']
relative_path = storage_cfg['relative_path']
access_key = storage_cfg['access_key']

# Load the DataFrame from Azure, then keep only rows where all three forward
# returns are known, since the targets are derived from them.
df_full = load_data_from_dl(
    account_name, container_name, relative_path, access_key
).dropna(subset=['ret_next_3m', 'ret_next_6m', 'ret_next_1y'])



folder: ['smart-wallet-dl/smart_wallet/stock_data_parquet/part-00000-tid-6319656906602864551-70988dce-8c71-437f-9993-81d0d24559f8-53-1-c000.snappy.parquet']
Reading files: smart-wallet-dl/smart_wallet/stock_data_parquet/part-00000-tid-6319656906602864551-70988dce-8c71-437f-9993-81d0d24559f8-53-1-c000.snappy.parquet
         date  symbol      open_v        high         low     close_v  \
0  2023-07-25  TEL.OL   99.110617   99.423126   98.128441   98.128441   
1  2023-07-25    ACIW   23.270000   23.530001   23.129999   23.420000   
2  2023-07-25     TER  114.730270  117.172184  114.730270  116.368141   
3  2023-07-25    ADCT    1.320000    1.330000    1.218000    1.290000   
4  2023-07-25     TEX   59.353587   59.490459   57.574251   57.769783   

      volume  dividends  stock_splits  capital_gains  ...  candle_color  \
0  1170518.0        0.0           0.0            NaN  ...           red   
1   301500.0        0.0           0.0            NaN  ...         green   
2  1256700.0       

In [8]:
# Add binary targets: 1 when the forward return over the horizon exceeds the threshold.
RETURN_THRESHOLD = 0.1
HORIZONS = ("3m", "6m", "1y")

for horizon in HORIZONS:
    df_full[f"target_{horizon}"] = (df_full[f"ret_next_{horizon}"] > RETURN_THRESHOLD).astype(int)

# Class balance of each target
for horizon in HORIZONS:
    print(f" target_{horizon}: {df_full[f'target_{horizon}'].value_counts()}")


 target_3m: target_3m
0    7475299
1    3371248
Name: count, dtype: int64
 target_6m: target_6m
0    6463478
1    4383069
Name: count, dtype: int64
 target_1y: target_1y
0    5554342
1    5292205
Name: count, dtype: int64


In [9]:
# Rank symbols by number of rows once; the three subsets below reuse this ranking
# instead of re-running the same groupby over the full frame for each size.
symbol_counts = (
    df_full.groupby("symbol")
    .size()
    .sort_values(ascending=False)
)

# Subsets restricted to the 20 / 100 / 500 symbols with the most rows.
top_symbols = symbol_counts.head(20).index
df_20_symbols = df_full[df_full["symbol"].isin(top_symbols)].copy()

top_symbols = symbol_counts.head(100).index
df_100_symbols = df_full[df_full["symbol"].isin(top_symbols)].copy()

top_symbols = symbol_counts.head(500).index
df_500_symbols = df_full[df_full["symbol"].isin(top_symbols)].copy()

# XGBOOST

In [29]:
# Target: 3-month horizon, top-20 symbols
target_selected="target_3m"
targets = [target_selected]

df_clean = df_20_symbols.copy()   # symbol subset used for this run

df_clean = df_clean.sort_values(by=["symbol", "date"])
df_clean = clean_columns(df_clean)

# Keep the 30 best-scoring features (k_best=30)
df_processed_minmax, _ = prepare_data_with_symbol_date(df_clean, targets=targets, k_best=30)

print("unique sbs: ",df_processed_minmax.symbol.unique())
print(f"columns: {df_processed_minmax.columns}")
print(f"len: {df_processed_minmax.shape}")

# Identifiers and every target column are removed from the model inputs.
X = df_processed_minmax.drop(columns=["date", "symbol", "target_3m", "target_6m", "target_1y"], errors="ignore")
y = df_processed_minmax[target_selected]

# NOTE(review): rows are ordered by symbol and then date, so with shuffle=False the
# held-out 20% is the tail of that ordering (the last symbols), not the most recent
# dates of every symbol — confirm this is the intended evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

weights = class_weight.compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
weight_dict = {i: w for i, w in enumerate(weights)}

# XGBoost expects scale_pos_weight = n_negative / n_positive. Balanced weights are
# w_c = n_samples / (n_classes * n_c), so that ratio is w_1 / w_0
# (w_0 / w_1 would be the inverse, n_positive / n_negative).
scale_pos_weight = weight_dict[1] / weight_dict[0]

model = XGBClassifier(n_estimators=400, max_depth=5, learning_rate=0.03, colsample_bytree = 0.8,subsample=0.8,  scale_pos_weight=scale_pos_weight)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))


Shape before: (125466, 56)
Shape after: (125466, 49)
Selected columns for scaling: ['symbol', 'open_v', 'high', 'low', 'close_v', 'volume', 'prev_close', 'prev_volume', 'daily_return', 'close_change_pct', 'intraday_volatility', 'price_range', 'log_return', 'is_dividend_day', 'sma_5', 'sma_20', 'bollinger_upper', 'bollinger_lower', 'rel_volume', 'ema_12', 'ema_26', 'macd_histogram', 'tr_1', 'tr_3', 'lower_wick', 'var_95', 'target_6m', 'target_1y', 'year', 'month']
unique sbs:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
columns: Index(['symbol', 'open_v', 'high', 'low', 'close_v', 'volume', 'prev_close',
       'prev_volume', 'daily_return', 'close_change_pct',
       'intraday_volatility', 'price_range', 'log_return', 'is_dividend_day',
       'sma_5', 'sma_20', 'bollinger_upper', 'bollinger_lower', 'rel_volume',
       'ema_12', 'ema_26', 'macd_histogram', 'tr_1', 'tr_3', 'lower_wick',
       'var_95', 'target_6m', 'target_1y', 'year', 'month', 'target_3m',
       'd

In [28]:
# Target: 6-month horizon, top-20 symbols
target_selected="target_6m"
targets = [target_selected]

df_clean = df_20_symbols.copy()   # symbol subset used for this run

df_clean = df_clean.sort_values(by=["symbol", "date"])
df_clean = clean_columns(df_clean)

# Keep the 30 best-scoring features (k_best=30)
df_processed_minmax, _ = prepare_data_with_symbol_date(df_clean, targets=targets, k_best=30)

print("unique sbs: ",df_processed_minmax.symbol.unique())
print(f"columns: {df_processed_minmax.columns}")
print(f"len: {df_processed_minmax.shape}")

# Identifiers and every target column are removed from the model inputs.
X = df_processed_minmax.drop(columns=["date", "symbol", "target_3m", "target_6m", "target_1y"], errors="ignore")
y = df_processed_minmax[target_selected]

# NOTE(review): rows are ordered by symbol and then date, so with shuffle=False the
# held-out 20% is the tail of that ordering (the last symbols), not the most recent
# dates of every symbol — confirm this is the intended evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

weights = class_weight.compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
weight_dict = {i: w for i, w in enumerate(weights)}

# XGBoost expects scale_pos_weight = n_negative / n_positive. Balanced weights are
# w_c = n_samples / (n_classes * n_c), so that ratio is w_1 / w_0
# (w_0 / w_1 would be the inverse, n_positive / n_negative).
scale_pos_weight = weight_dict[1] / weight_dict[0]

model = XGBClassifier(n_estimators=400, max_depth=5, learning_rate=0.03, colsample_bytree = 0.8,subsample=0.8,  scale_pos_weight=scale_pos_weight)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))


Shape before: (125466, 56)
Shape after: (125466, 49)
Selected columns for scaling: ['symbol', 'open_v', 'high', 'low', 'close_v', 'prev_close', 'intraday_volatility', 'price_range', 'sma_5', 'sma_20', 'rsi_14', 'bollinger_upper', 'bollinger_lower', 'ema_12', 'ema_26', 'macd_line', 'macd_signal', 'tr_1', 'tr_2', 'tr_3', 'true_range', 'atr_14', 'candle_body', 'upper_wick', 'roc_10', 'var_95', 'target_3m', 'target_1y', 'year', 'month']
unique sbs:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
columns: Index(['symbol', 'open_v', 'high', 'low', 'close_v', 'prev_close',
       'intraday_volatility', 'price_range', 'sma_5', 'sma_20', 'rsi_14',
       'bollinger_upper', 'bollinger_lower', 'ema_12', 'ema_26', 'macd_line',
       'macd_signal', 'tr_1', 'tr_2', 'tr_3', 'true_range', 'atr_14',
       'candle_body', 'upper_wick', 'roc_10', 'var_95', 'target_3m',
       'target_1y', 'year', 'month', 'target_6m', 'date'],
      dtype='object')
len: (125466, 32)
              precisio

In [30]:
# Target: 1-year horizon, top-20 symbols
target_selected="target_1y"
targets = [target_selected]

df_clean = df_20_symbols.copy()   # symbol subset used for this run

df_clean = df_clean.sort_values(by=["symbol", "date"])
df_clean = clean_columns(df_clean)

# Keep the 30 best-scoring features (k_best=30)
df_processed_minmax, _ = prepare_data_with_symbol_date(df_clean, targets=targets, k_best=30)

print("unique sbs: ",df_processed_minmax.symbol.unique())
print(f"columns: {df_processed_minmax.columns}")
print(f"len: {df_processed_minmax.shape}")

# Identifiers and every target column are removed from the model inputs.
X = df_processed_minmax.drop(columns=["date", "symbol", "target_3m", "target_6m", "target_1y"], errors="ignore")
y = df_processed_minmax[target_selected]

# NOTE(review): rows are ordered by symbol and then date, so with shuffle=False the
# held-out 20% is the tail of that ordering (the last symbols), not the most recent
# dates of every symbol — confirm this is the intended evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

weights = class_weight.compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
weight_dict = {i: w for i, w in enumerate(weights)}

# XGBoost expects scale_pos_weight = n_negative / n_positive. Balanced weights are
# w_c = n_samples / (n_classes * n_c), so that ratio is w_1 / w_0
# (w_0 / w_1 would be the inverse, n_positive / n_negative).
scale_pos_weight = weight_dict[1] / weight_dict[0]

model = XGBClassifier(n_estimators=400, max_depth=5, learning_rate=0.03, colsample_bytree = 0.8,subsample=0.8,  scale_pos_weight=scale_pos_weight)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))


Shape before: (125466, 56)
Shape after: (125466, 49)
Selected columns for scaling: ['symbol', 'open_v', 'high', 'low', 'close_v', 'volume', 'prev_close', 'prev_volume', 'price_range', 'sma_5', 'sma_20', 'gain', 'loss', 'rsi_14', 'bollinger_upper', 'bollinger_lower', 'ema_12', 'ema_26', 'tr_1', 'tr_2', 'tr_3', 'true_range', 'atr_14', 'candle_body', 'upper_wick', 'lower_wick', 'var_95', 'target_3m', 'target_6m', 'year']
unique sbs:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
columns: Index(['symbol', 'open_v', 'high', 'low', 'close_v', 'volume', 'prev_close',
       'prev_volume', 'price_range', 'sma_5', 'sma_20', 'gain', 'loss',
       'rsi_14', 'bollinger_upper', 'bollinger_lower', 'ema_12', 'ema_26',
       'tr_1', 'tr_2', 'tr_3', 'true_range', 'atr_14', 'candle_body',
       'upper_wick', 'lower_wick', 'var_95', 'target_3m', 'target_6m', 'year',
       'target_1y', 'date'],
      dtype='object')
len: (125466, 32)
              precision    recall  f1-score   suppor

# 100 symbols

In [31]:
# Target: 3-month horizon, top-100 symbols
target_selected="target_3m"
targets = [target_selected]

df_clean = df_100_symbols.copy()   # symbol subset used for this run

df_clean = df_clean.sort_values(by=["symbol", "date"])
df_clean = clean_columns(df_clean)

# Keep the 30 best-scoring features (k_best=30)
df_processed_minmax, _ = prepare_data_with_symbol_date(df_clean, targets=targets, k_best=30)

print("unique sbs: ",df_processed_minmax.symbol.unique())
print(f"columns: {df_processed_minmax.columns}")
print(f"len: {df_processed_minmax.shape}")

# Identifiers and every target column are removed from the model inputs.
X = df_processed_minmax.drop(columns=["date", "symbol", "target_3m", "target_6m", "target_1y"], errors="ignore")
y = df_processed_minmax[target_selected]

# NOTE(review): rows are ordered by symbol and then date, so with shuffle=False the
# held-out 20% is the tail of that ordering (the last symbols), not the most recent
# dates of every symbol — confirm this is the intended evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

weights = class_weight.compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
weight_dict = {i: w for i, w in enumerate(weights)}

# XGBoost expects scale_pos_weight = n_negative / n_positive. Balanced weights are
# w_c = n_samples / (n_classes * n_c), so that ratio is w_1 / w_0
# (w_0 / w_1 would be the inverse, n_positive / n_negative).
scale_pos_weight = weight_dict[1] / weight_dict[0]

model = XGBClassifier(n_estimators=400, max_depth=5, learning_rate=0.03, colsample_bytree = 0.8,subsample=0.8,  scale_pos_weight=scale_pos_weight)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))


Shape before: (622891, 56)
Shape after: (622891, 49)
Selected columns for scaling: ['symbol', 'open_v', 'high', 'low', 'close_v', 'prev_close', 'close_change_pct', 'price_range', 'log_return', 'sma_5', 'sma_20', 'gain', 'rsi_14', 'bollinger_upper', 'bollinger_lower', 'ema_12', 'ema_26', 'macd_line', 'macd_signal', 'tr_1', 'tr_2', 'true_range', 'atr_14', 'candle_body', 'momentum_10', 'roc_10', 'var_95', 'target_6m', 'target_1y', 'month']
unique sbs:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]
columns: Index(['symbol', 'open_v', 'high', 'low', 'close_v', 'prev_close',
       'close_change_pct', 'price_range', 'log_return', 'sma_5', 'sma_20',
       'gain', 'rsi_14', 'bollinger_upper', 'bollinger_lower', 'ema_12',
       'ema_26',

In [32]:
# Target: 6-month horizon, top-100 symbols
target_selected="target_6m"
targets = [target_selected]

df_clean = df_100_symbols.copy()   # symbol subset used for this run

df_clean = df_clean.sort_values(by=["symbol", "date"])
df_clean = clean_columns(df_clean)

# Keep the 30 best-scoring features (k_best=30)
df_processed_minmax, _ = prepare_data_with_symbol_date(df_clean, targets=targets, k_best=30)

print("unique sbs: ",df_processed_minmax.symbol.unique())
print(f"columns: {df_processed_minmax.columns}")
print(f"len: {df_processed_minmax.shape}")

# Identifiers and every target column are removed from the model inputs.
X = df_processed_minmax.drop(columns=["date", "symbol", "target_3m", "target_6m", "target_1y"], errors="ignore")
y = df_processed_minmax[target_selected]

# NOTE(review): rows are ordered by symbol and then date, so with shuffle=False the
# held-out 20% is the tail of that ordering (the last symbols), not the most recent
# dates of every symbol — confirm this is the intended evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

weights = class_weight.compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
weight_dict = {i: w for i, w in enumerate(weights)}

# XGBoost expects scale_pos_weight = n_negative / n_positive. Balanced weights are
# w_c = n_samples / (n_classes * n_c), so that ratio is w_1 / w_0
# (w_0 / w_1 would be the inverse, n_positive / n_negative).
scale_pos_weight = weight_dict[1] / weight_dict[0]

model = XGBClassifier(n_estimators=400, max_depth=5, learning_rate=0.03, colsample_bytree = 0.8,subsample=0.8,  scale_pos_weight=scale_pos_weight)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))


Shape before: (622891, 56)
Shape after: (622891, 49)
Selected columns for scaling: ['symbol', 'open_v', 'high', 'low', 'close_v', 'volume', 'prev_close', 'price_range', 'sma_5', 'sma_20', 'gain', 'bollinger_upper', 'bollinger_lower', 'ema_12', 'ema_26', 'macd_line', 'macd_signal', 'tr_1', 'tr_2', 'tr_3', 'true_range', 'atr_14', 'candle_body', 'upper_wick', 'lower_wick', 'momentum_10', 'var_95', 'target_3m', 'target_1y', 'month']
unique sbs:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]
columns: Index(['symbol', 'open_v', 'high', 'low', 'close_v', 'volume', 'prev_close',
       'price_range', 'sma_5', 'sma_20', 'gain', 'bollinger_upper',
       'bollinger_lower', 'ema_12', 'ema_26', 'macd_line', 'macd_signal',
       'tr_1', 'tr_2

In [33]:
# Target: 1-year horizon, top-100 symbols
target_selected="target_1y"
targets = [target_selected]

df_clean = df_100_symbols.copy()   # symbol subset used for this run

df_clean = df_clean.sort_values(by=["symbol", "date"])
df_clean = clean_columns(df_clean)

# Keep the 30 best-scoring features (k_best=30)
df_processed_minmax, _ = prepare_data_with_symbol_date(df_clean, targets=targets, k_best=30)

print("unique sbs: ",df_processed_minmax.symbol.unique())
print(f"columns: {df_processed_minmax.columns}")
print(f"len: {df_processed_minmax.shape}")

# Identifiers and every target column are removed from the model inputs.
X = df_processed_minmax.drop(columns=["date", "symbol", "target_3m", "target_6m", "target_1y"], errors="ignore")
y = df_processed_minmax[target_selected]

# NOTE(review): rows are ordered by symbol and then date, so with shuffle=False the
# held-out 20% is the tail of that ordering (the last symbols), not the most recent
# dates of every symbol — confirm this is the intended evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

weights = class_weight.compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
weight_dict = {i: w for i, w in enumerate(weights)}

# XGBoost expects scale_pos_weight = n_negative / n_positive. Balanced weights are
# w_c = n_samples / (n_classes * n_c), so that ratio is w_1 / w_0
# (w_0 / w_1 would be the inverse, n_positive / n_negative).
scale_pos_weight = weight_dict[1] / weight_dict[0]

model = XGBClassifier(n_estimators=400, max_depth=5, learning_rate=0.03, colsample_bytree = 0.8,subsample=0.8,  scale_pos_weight=scale_pos_weight)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))


Shape before: (622891, 56)
Shape after: (622891, 49)
Selected columns for scaling: ['symbol', 'open_v', 'high', 'low', 'close_v', 'volume', 'prev_close', 'prev_volume', 'price_range', 'sma_5', 'sma_20', 'gain', 'loss', 'bollinger_upper', 'bollinger_lower', 'ema_12', 'ema_26', 'macd_line', 'tr_1', 'tr_2', 'tr_3', 'true_range', 'atr_14', 'candle_body', 'upper_wick', 'lower_wick', 'var_95', 'target_3m', 'target_6m', 'year']
unique sbs:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]
columns: Index(['symbol', 'open_v', 'high', 'low', 'close_v', 'volume', 'prev_close',
       'prev_volume', 'price_range', 'sma_5', 'sma_20', 'gain', 'loss',
       'bollinger_upper', 'bollinger_lower', 'ema_12', 'ema_26', 'macd_line',
       'tr_1', 'tr_2

# 500 symbols

In [34]:
# Target: 3-month horizon, top-500 symbols
target_selected="target_3m"
targets = [target_selected]

df_clean = df_500_symbols.copy()   # symbol subset used for this run

df_clean = df_clean.sort_values(by=["symbol", "date"])
df_clean = clean_columns(df_clean)

# Keep the 30 best-scoring features (k_best=30)
df_processed_minmax, _ = prepare_data_with_symbol_date(df_clean, targets=targets, k_best=30)

print("unique sbs: ",df_processed_minmax.symbol.unique())
print(f"columns: {df_processed_minmax.columns}")
print(f"len: {df_processed_minmax.shape}")

# Identifiers and every target column are removed from the model inputs.
X = df_processed_minmax.drop(columns=["date", "symbol", "target_3m", "target_6m", "target_1y"], errors="ignore")
y = df_processed_minmax[target_selected]

# NOTE(review): rows are ordered by symbol and then date, so with shuffle=False the
# held-out 20% is the tail of that ordering (the last symbols), not the most recent
# dates of every symbol — confirm this is the intended evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

weights = class_weight.compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
weight_dict = {i: w for i, w in enumerate(weights)}

# XGBoost expects scale_pos_weight = n_negative / n_positive. Balanced weights are
# w_c = n_samples / (n_classes * n_c), so that ratio is w_1 / w_0
# (w_0 / w_1 would be the inverse, n_positive / n_negative).
scale_pos_weight = weight_dict[1] / weight_dict[0]

model = XGBClassifier(n_estimators=400, max_depth=5, learning_rate=0.03, colsample_bytree = 0.8,subsample=0.8,  scale_pos_weight=scale_pos_weight)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))


Shape before: (3077632, 56)
Shape after: (3077632, 49)
Selected columns for scaling: ['symbol', 'open_v', 'high', 'low', 'close_v', 'volume', 'prev_close', 'prev_volume', 'daily_return', 'close_change_pct', 'intraday_volatility', 'gap_open', 'log_return', 'is_dividend_day', 'sma_5', 'sma_20', 'rsi_14', 'bollinger_upper', 'bollinger_lower', 'rel_volume', 'ema_12', 'ema_26', 'macd_signal', 'candle_color', 'roc_10', 'var_95', 'target_6m', 'target_1y', 'year', 'month']
unique sbs:  [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 1

In [35]:
# Target: 6-month horizon, top-500 symbols
target_selected="target_6m"
targets = [target_selected]

df_clean = df_500_symbols.copy()   # symbol subset used for this run

df_clean = df_clean.sort_values(by=["symbol", "date"])
df_clean = clean_columns(df_clean)

# Keep the 30 best-scoring features (k_best=30)
df_processed_minmax, _ = prepare_data_with_symbol_date(df_clean, targets=targets, k_best=30)

print("unique sbs: ",df_processed_minmax.symbol.unique())
print(f"columns: {df_processed_minmax.columns}")
print(f"len: {df_processed_minmax.shape}")

# Identifiers and every target column are removed from the model inputs.
X = df_processed_minmax.drop(columns=["date", "symbol", "target_3m", "target_6m", "target_1y"], errors="ignore")
y = df_processed_minmax[target_selected]

# NOTE(review): rows are ordered by symbol and then date, so with shuffle=False the
# held-out 20% is the tail of that ordering (the last symbols), not the most recent
# dates of every symbol — confirm this is the intended evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

weights = class_weight.compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
weight_dict = {i: w for i, w in enumerate(weights)}

# XGBoost expects scale_pos_weight = n_negative / n_positive. Balanced weights are
# w_c = n_samples / (n_classes * n_c), so that ratio is w_1 / w_0
# (w_0 / w_1 would be the inverse, n_positive / n_negative).
scale_pos_weight = weight_dict[1] / weight_dict[0]

model = XGBClassifier(n_estimators=400, max_depth=5, learning_rate=0.03, colsample_bytree = 0.8,subsample=0.8,  scale_pos_weight=scale_pos_weight)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))


Shape before: (3077632, 56)
Shape after: (3077632, 49)
Selected columns for scaling: ['symbol', 'open_v', 'high', 'low', 'close_v', 'volume', 'prev_close', 'prev_volume', 'close_change_pct', 'intraday_volatility', 'price_range', 'log_return', 'sma_5', 'sma_20', 'rsi_14', 'bollinger_upper', 'bollinger_lower', 'ema_12', 'ema_26', 'tr_1', 'true_range', 'atr_14', 'candle_body', 'candle_color', 'roc_10', 'var_95', 'target_3m', 'target_1y', 'year', 'month']
unique sbs:  [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130

In [36]:
# Target: 1-year horizon, top-500 symbols
target_selected="target_1y"
targets = [target_selected]

df_clean = df_500_symbols.copy()   # symbol subset used for this run

df_clean = df_clean.sort_values(by=["symbol", "date"])
df_clean = clean_columns(df_clean)

# Keep the 30 best-scoring features (k_best=30)
df_processed_minmax, _ = prepare_data_with_symbol_date(df_clean, targets=targets, k_best=30)

print("unique sbs: ",df_processed_minmax.symbol.unique())
print(f"columns: {df_processed_minmax.columns}")
print(f"len: {df_processed_minmax.shape}")

# Identifiers and every target column are removed from the model inputs.
X = df_processed_minmax.drop(columns=["date", "symbol", "target_3m", "target_6m", "target_1y"], errors="ignore")
y = df_processed_minmax[target_selected]

# NOTE(review): rows are ordered by symbol and then date, so with shuffle=False the
# held-out 20% is the tail of that ordering (the last symbols), not the most recent
# dates of every symbol — confirm this is the intended evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

weights = class_weight.compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
weight_dict = {i: w for i, w in enumerate(weights)}

# XGBoost expects scale_pos_weight = n_negative / n_positive. Balanced weights are
# w_c = n_samples / (n_classes * n_c), so that ratio is w_1 / w_0
# (w_0 / w_1 would be the inverse, n_positive / n_negative).
scale_pos_weight = weight_dict[1] / weight_dict[0]

model = XGBClassifier(n_estimators=400, max_depth=5, learning_rate=0.03, colsample_bytree = 0.8,subsample=0.8,  scale_pos_weight=scale_pos_weight)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))


Shape before: (3077632, 56)
Shape after: (3077632, 49)
Selected columns for scaling: ['symbol', 'open_v', 'high', 'low', 'close_v', 'prev_close', 'price_range', 'sma_5', 'sma_20', 'gain', 'loss', 'rsi_14', 'bollinger_upper', 'bollinger_lower', 'ema_12', 'ema_26', 'tr_1', 'tr_2', 'tr_3', 'true_range', 'atr_14', 'candle_body', 'upper_wick', 'lower_wick', 'roc_10', 'var_95', 'target_3m', 'target_6m', 'year', 'month']
unique sbs:  [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 1

# Comparativa con target 0.05

In [15]:
# Add targets: a row is labelled 1 when its forward return is strictly greater
# than RETURN_THRESHOLD, otherwise 0. The target columns are written onto
# df_full itself.
# NOTE(review): a NaN forward return compares False and is therefore labelled 0;
# confirm that rows without a known future return are removed upstream.
RETURN_THRESHOLD = 0.05
for horizon in ("3m", "6m", "1y"):
    target_col = f"target_{horizon}"
    df_full[target_col] = (df_full[f"ret_next_{horizon}"] > RETURN_THRESHOLD).astype(int)
    print(f" {target_col}: {df_full[target_col].value_counts()}")

# Rank the symbols by number of rows once (most rows first) and slice the
# ranking for each subset size, instead of repeating the same groupby/sort
# over the full frame for every subset.
symbols_by_row_count = (
    df_full.groupby("symbol")
    .size()
    .sort_values(ascending=False)
    .index
)

# Independent copies restricted to the 20 / 100 / 500 symbols with the most rows.
df_20_symbols = df_full[df_full["symbol"].isin(symbols_by_row_count[:20])].copy()
df_100_symbols = df_full[df_full["symbol"].isin(symbols_by_row_count[:100])].copy()

# top_symbols keeps its original final value (the 500-symbol index).
top_symbols = symbols_by_row_count[:500]
df_500_symbols = df_full[df_full["symbol"].isin(top_symbols)].copy()

 target_3m: target_3m
0    6156205
1    4690342
Name: count, dtype: int64
 target_6m: target_6m
0    5464849
1    5381698
Name: count, dtype: int64
 target_1y: target_1y
1    5971358
0    4875189
Name: count, dtype: int64


In [16]:
# Target 3m, 500 symbols: train and evaluate an XGBoost classifier.
target_selected = "target_3m"
targets = [target_selected]

df_clean = df_500_symbols.copy()  # copy of the 500-symbol subset

df_clean = df_clean.sort_values(by=["symbol", "date"])
df_clean = clean_columns(df_clean)

# Pass k_best=30 here
df_processed_minmax, _ = prepare_data_with_symbol_date(df_clean, targets=targets, k_best=30)

print("unique sbs: ",df_processed_minmax.symbol.unique())
print(f"columns: {df_processed_minmax.columns}")
print(f"len: {df_processed_minmax.shape}")  # prints the (rows, columns) shape

# Features: everything except the identifiers and all three target columns
# (errors="ignore" tolerates columns that are not present).
X = df_processed_minmax.drop(columns=["date", "symbol", "target_3m", "target_6m", "target_1y"], errors="ignore")
y = df_processed_minmax[target_selected]

# shuffle=False holds out the trailing 20% of the rows in their current order.
# NOTE(review): the frame was sorted by symbol and then date above; if
# prepare_data_with_symbol_date keeps that order, the hold-out is the last
# symbols rather than the most recent dates -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

# Assumes y_train contains exactly the labels 0 and 1 (np.unique sorts them),
# so position i in `weights` is the weight of class i.
weights = class_weight.compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
weight_dict = {i: w for i, w in enumerate(weights)}

# "balanced" weights are n_samples / (n_classes * class_count), hence
# weight[1] / weight[0] == n_negative / n_positive, which is the value XGBoost
# suggests for scale_pos_weight. The previous ratio (weight[0] / weight[1])
# was inverted and down-weighted the minority class.
scale_pos_weight = weight_dict[1] / weight_dict[0]

model = XGBClassifier(
    n_estimators=400,
    max_depth=5,
    learning_rate=0.03,
    colsample_bytree=0.8,
    subsample=0.8,
    scale_pos_weight=scale_pos_weight,
)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]  # probability of class 1

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))


Shape before: (3077632, 56)
Shape after: (3077632, 49)
Selected columns for scaling: ['symbol', 'open_v', 'high', 'low', 'close_v', 'prev_close', 'close_change_pct', 'intraday_volatility', 'price_range', 'log_return', 'sma_5', 'sma_20', 'rsi_14', 'bollinger_upper', 'bollinger_lower', 'rel_volume', 'ema_12', 'ema_26', 'tr_1', 'tr_2', 'true_range', 'atr_14', 'upper_wick', 'candle_color', 'roc_10', 'var_95', 'target_6m', 'target_1y', 'year', 'month']
unique sbs:  [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131

In [17]:
# Target 6m, 500 symbols: train and evaluate an XGBoost classifier.
target_selected = "target_6m"
targets = [target_selected]

df_clean = df_500_symbols.copy()  # copy of the 500-symbol subset

df_clean = df_clean.sort_values(by=["symbol", "date"])
df_clean = clean_columns(df_clean)

# Pass k_best=30 here
df_processed_minmax, _ = prepare_data_with_symbol_date(df_clean, targets=targets, k_best=30)

print("unique sbs: ",df_processed_minmax.symbol.unique())
print(f"columns: {df_processed_minmax.columns}")
print(f"len: {df_processed_minmax.shape}")  # prints the (rows, columns) shape

# Features: everything except the identifiers and all three target columns
# (errors="ignore" tolerates columns that are not present).
X = df_processed_minmax.drop(columns=["date", "symbol", "target_3m", "target_6m", "target_1y"], errors="ignore")
y = df_processed_minmax[target_selected]

# shuffle=False holds out the trailing 20% of the rows in their current order.
# NOTE(review): the frame was sorted by symbol and then date above; if
# prepare_data_with_symbol_date keeps that order, the hold-out is the last
# symbols rather than the most recent dates -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

# Assumes y_train contains exactly the labels 0 and 1 (np.unique sorts them),
# so position i in `weights` is the weight of class i.
weights = class_weight.compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
weight_dict = {i: w for i, w in enumerate(weights)}

# "balanced" weights are n_samples / (n_classes * class_count), hence
# weight[1] / weight[0] == n_negative / n_positive, which is the value XGBoost
# suggests for scale_pos_weight. The previous ratio (weight[0] / weight[1])
# was inverted and down-weighted the minority class.
scale_pos_weight = weight_dict[1] / weight_dict[0]

model = XGBClassifier(
    n_estimators=400,
    max_depth=5,
    learning_rate=0.03,
    colsample_bytree=0.8,
    subsample=0.8,
    scale_pos_weight=scale_pos_weight,
)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]  # probability of class 1

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))


Shape before: (3077632, 56)
Shape after: (3077632, 49)
Selected columns for scaling: ['symbol', 'open_v', 'high', 'low', 'close_v', 'prev_close', 'close_change_pct', 'price_range', 'log_return', 'sma_5', 'sma_20', 'rsi_14', 'bollinger_upper', 'bollinger_lower', 'ema_12', 'ema_26', 'tr_1', 'tr_2', 'tr_3', 'true_range', 'atr_14', 'candle_body', 'upper_wick', 'lower_wick', 'roc_10', 'var_95', 'target_3m', 'target_1y', 'year', 'month']
unique sbs:  [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135

In [18]:
# Target 1y, 500 symbols: train and evaluate an XGBoost classifier.
target_selected = "target_1y"
targets = [target_selected]

df_clean = df_500_symbols.copy()  # copy of the 500-symbol subset

df_clean = df_clean.sort_values(by=["symbol", "date"])
df_clean = clean_columns(df_clean)

# Pass k_best=30 here
df_processed_minmax, _ = prepare_data_with_symbol_date(df_clean, targets=targets, k_best=30)

print("unique sbs: ",df_processed_minmax.symbol.unique())
print(f"columns: {df_processed_minmax.columns}")
print(f"len: {df_processed_minmax.shape}")  # prints the (rows, columns) shape

# Features: everything except the identifiers and all three target columns
# (errors="ignore" tolerates columns that are not present).
X = df_processed_minmax.drop(columns=["date", "symbol", "target_3m", "target_6m", "target_1y"], errors="ignore")
y = df_processed_minmax[target_selected]

# shuffle=False holds out the trailing 20% of the rows in their current order.
# NOTE(review): the frame was sorted by symbol and then date above; if
# prepare_data_with_symbol_date keeps that order, the hold-out is the last
# symbols rather than the most recent dates -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

# Assumes y_train contains exactly the labels 0 and 1 (np.unique sorts them),
# so position i in `weights` is the weight of class i.
weights = class_weight.compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
weight_dict = {i: w for i, w in enumerate(weights)}

# "balanced" weights are n_samples / (n_classes * class_count), hence
# weight[1] / weight[0] == n_negative / n_positive, which is the value XGBoost
# suggests for scale_pos_weight. The previous ratio (weight[0] / weight[1])
# was inverted and down-weighted the minority class.
scale_pos_weight = weight_dict[1] / weight_dict[0]

model = XGBClassifier(
    n_estimators=400,
    max_depth=5,
    learning_rate=0.03,
    colsample_bytree=0.8,
    subsample=0.8,
    scale_pos_weight=scale_pos_weight,
)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]  # probability of class 1

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))


Shape before: (3077632, 56)
Shape after: (3077632, 49)
Selected columns for scaling: ['symbol', 'open_v', 'high', 'low', 'close_v', 'prev_close', 'intraday_volatility', 'price_range', 'sma_5', 'sma_20', 'gain', 'loss', 'bollinger_upper', 'bollinger_lower', 'ema_12', 'ema_26', 'tr_1', 'tr_2', 'tr_3', 'true_range', 'atr_14', 'candle_body', 'upper_wick', 'lower_wick', 'roc_10', 'var_95', 'target_3m', 'target_6m', 'year', 'month']
unique sbs:  [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 