In [1]:
!pip install adlfs
!pip install keras-tuner --quiet


Collecting adlfs
  Downloading adlfs-2024.12.0-py3-none-any.whl.metadata (7.7 kB)
Collecting azure-core<2.0.0,>=1.28.0 (from adlfs)
  Downloading azure_core-1.34.0-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.9/42.9 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting azure-datalake-store<0.1,>=0.0.53 (from adlfs)
  Downloading azure_datalake_store-0.0.53-py2.py3-none-any.whl.metadata (19 kB)
Collecting azure-identity (from adlfs)
  Downloading azure_identity-1.23.0-py3-none-any.whl.metadata (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting azure-storage-blob>=12.17.0 (from adlfs)
  Downloading azure_storage_blob-12.25.1-py3-none-any.whl.metadata (26 kB)
Collecting msal<2,>=1.16.0 (from azure-datalake-store<0.1,>=0.

#### IMPORTS

In [2]:
# Basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yaml
import random
import plotly.express as px
import joblib
from datetime import timedelta
from typing import List, Tuple, Dict,Optional

# Azure
from adlfs import AzureBlobFileSystem

from typing import Tuple, List, Optional
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split


# Models
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, roc_auc_score, f1_score, confusion_matrix
from sklearn.feature_selection import VarianceThreshold
from xgboost import XGBClassifier
import joblib

#### FUNCTIONS

In [3]:

def load_data_from_dl(account_name: str,container_name: str,relative_path: str,access_key: str)->pd.DataFrame:
    """
    Loads all Parquet files from an Azure Blob Storage path into a single DataFrame.
    Args:
        - account_name (str): Azure Storage account name.
        - container_name (str): Name of the container.
        - relative_path (str): Path inside the container to search for .parquet files.
        - access_key (str): Storage account access key.
    Returns:
        - df (pd.DataFrame): Combined DataFrame from all found Parquet files.
    Raises:
        - ValueError: If no Parquet files are found in the path.
    """
    abfs = AzureBlobFileSystem(account_name=account_name, account_key=access_key)


    all_files = abfs.glob(f"{container_name}/{relative_path}/*.parquet")
    print(f"folder: {all_files}")

    if not all_files:
        raise ValueError("Not found .parquet files")

    dfs = []
    for f in all_files:
        print(f"Reading files: {f}")
        with abfs.open(f, "rb") as fp:
            dfs.append(pd.read_parquet(fp))

    df = pd.concat(dfs, ignore_index=True)
    print(df.head())
    return df

In [4]:
def model_evaluation(
    y_test: pd.Series,
    y_pred: pd.Series,
    y_prob: pd.Series
) -> Tuple[float, float, float, np.ndarray, float, float]:
    """
    Evaluates the classification model and plots metrics.

    Args:
        y_test (pd.Series): True target values.
        y_pred (pd.Series): Predicted class values.
        y_prob (pd.Series): Predicted probabilities for class 1.

    Returns:
        Tuple containing Accuracy, F1 Score, ROC AUC, Confusion Matrix, Precision, Recall.
    """
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_prob)
    cm = confusion_matrix(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc:.4f}")
    print(f"Confusion Matrix:\n{cm}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")

    fig, ax = plt.subplots(figsize=(5, 4))
    cax = ax.matshow(cm, cmap='Blues')
    fig.colorbar(cax)
    ax.set_xticks([0, 1])
    ax.set_yticks([0, 1])
    ax.set_xticklabels(['No', 'Yes'])
    ax.set_yticklabels(['No', 'Yes'])
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.tight_layout()
    plt.show()

    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(6, 4))
    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', lw=1)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    return acc, f1, roc, cm, precision, recall


In [5]:
def clean_columns(df: pd.DataFrame) -> pd.DataFrame:
  """
  Cleans a DataFrame by dropping unnecessary columns and handling missing values.

  Args:
      df (pd.DataFrame): Input DataFrame.

  Returns:
      pd.DataFrame: Cleaned DataFrame.
  """
  columns_to_drop = [
      'capital_gains',
      'ret_next_3m', 'ret_next_6m', 'ret_next_1y',
      'price_lead_3m', 'price_lead_6m', 'price_lead_1y',
      'open_v', 'high', 'low', 'dividends', 'stock_splits',
      'is_dividend_day', 'is_stock_split', 'gap_open', 'price_range',
      'tr_1', 'tr_2', 'tr_3', 'sma_5', 'bollinger_upper',
      'bollinger_lower', 'ema_12', 'macd_line'
  ]


  print(f"Shape before: {df.shape}")
  df = df.drop(columns=columns_to_drop, errors='ignore').copy()
  numeric_cols = df.select_dtypes(include=["float64", "int64", "int32"]).columns
  imputer = SimpleImputer(strategy="mean")
  df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
  print(f"Shape after: {df.shape}")
  return df;

In [6]:
def prepare_data(
    df: pd.DataFrame,
    targets: list
):
    """
    Prepares a DataFrame for LSTM modeling: imputes missing values and applies MinMax scaling.

    Args:
        df (pd.DataFrame): Input data with features + target + symbol + date.
        targets (list): List of target column names.

    Returns:
        pd.DataFrame: Scaled dataframe with symbol and date preserved.
        MinMaxScaler: The fitted scaler object.
    """
    df_clean = df.copy()

    # Separar columnas a preservar
    symbol_col = df_clean["symbol"]
    date_col = df_clean["date"]

    # Eliminar columnas no necesarias para el modelo
    df_clean = df_clean.drop(columns=["symbol", "date"], errors="ignore")

    # Codificación de categóricas si existieran (seguridad)
    for col in df_clean.select_dtypes(include=["object", "category"]).columns:
        df_clean[col] = LabelEncoder().fit_transform(df_clean[col].astype(str))

    # Separar features y targets
    feature_cols = [col for col in df_clean.columns if col not in targets]
    X = df_clean[feature_cols]
    y = df_clean[targets]

    # Imputación + escalado
    imputer = SimpleImputer(strategy="mean")
    X_imputed = imputer.fit_transform(X)

    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X_imputed)

    df_scaled = pd.DataFrame(X_scaled, columns=feature_cols, index=df_clean.index)
    df_scaled = pd.concat([df_scaled, y], axis=1)

    # Reincorporar columnas para split posterior
    df_scaled["symbol"] = symbol_col.values
    df_scaled["date"] = date_col.values

    print(f"df_scaled: {df_scaled.shape}")
    print(f"columns: {df_scaled.columns}")
    return df_scaled, scaler


In [7]:
def split_by_symbol (df, target_column, test_size=0.2):
    train_rows, test_rows = [], []

    for symbol in df["symbol"].unique():
        df_symbol = df[df["symbol"] == symbol].copy()
        df_symbol = df_symbol.sort_values("date")

        split_idx = int(len(df_symbol) * (1 - test_size))
        train_rows.append(df_symbol.iloc[:split_idx])
        test_rows.append(df_symbol.iloc[split_idx:])

    df_train = pd.concat(train_rows)
    df_test = pd.concat(test_rows)

    X_train = df_train.drop(columns=[target_column])
    y_train = df_train[target_column]

    X_test = df_test.drop(columns=[target_column])
    y_test = df_test[target_column]

    return X_train, X_test, y_train, y_test


In [8]:
def build_and_split_sequences_by_symbol(
    df: pd.DataFrame,
    target_column: str,
    sequence_length: int = 60,
    test_size: float = 0.2
):
    """
    Builds sequential data for each symbol independently and performs temporal train-test split.

    Args:
        df (pd.DataFrame): Preprocessed DataFrame with features and target.
        target_column (str): Column name for binary target.
        sequence_length (int): Time window for each sequence.
        test_size (float): Proportion of test samples (applied per symbol).

    Returns:
        Tuple of numpy arrays: X_train, X_test, y_train, y_test
    """
    X_train, y_train, X_test, y_test = [], [], [], []

    symbols = df["symbol"].unique()
    for symbol in symbols:
        df_symbol = df[df["symbol"] == symbol].copy()
        df_symbol = df_symbol.sort_values("date")

        if len(df_symbol) <= sequence_length:
            continue  # skip if not enough data

        df_symbol[target_column] = df_symbol[target_column].astype(int)
        features = df_symbol.drop(columns=["date", "symbol", "target_3m", "target_6m", "target_1y"], errors="ignore")
        target = df_symbol[target_column].values
        #features.to_csv(f"{symbol}.csv",sep=";")
        #sys.exit(0)

        split_idx = int(len(features) * (1 - test_size))
        for i in range(len(features) - sequence_length):
            if i + sequence_length >= len(features):
                continue  # avoid index error

            X_seq = features.iloc[i:i + sequence_length].values.astype(np.float32)
            y_val = target[i + sequence_length]

            if i + sequence_length < split_idx:
                X_train.append(X_seq)
                y_train.append(y_val)
            else:
                X_test.append(X_seq)
                y_test.append(y_val)

    return (
        np.array(X_train),
        np.array(X_test),
        np.array(y_train),
        np.array(y_test)
    )


In [9]:
def train_base_lstm_classifier(
    X_train,
    X_test,
    y_train,
    y_test,
    sequence_length=60,
    epochs=30,
    batch_size=32,
    patience=4,
    class_weight=None
):
    """
    Trains a simple LSTM classifier with basic architecture and early stopping.

    Args:
        X_train (np.ndarray): Training sequences.
        X_test (np.ndarray): Test sequences.
        y_train (np.ndarray): Training labels.
        y_test (np.ndarray): Test labels.
        sequence_length (int): Length of each input sequence.
        epochs (int): Maximum number of epochs.
        batch_size (int): Batch size.
        patience (int): Early stopping patience.
        class_weight (dict, optional): Class weights to handle imbalance.

    Returns:
        model (Sequential): Trained Keras model.
        X_test (np.ndarray): Test features.
        y_test (np.ndarray): Test labels.
        y_prob (np.ndarray): Predicted probabilities.
        y_pred (np.ndarray): Predicted binary classes.
    """
    model = Sequential([
        LSTM(32, input_shape=(X_train.shape[1], X_train.shape[2])),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss=BinaryCrossentropy(),
        metrics=['accuracy', AUC(name='auc')]
    )

    early_stop = EarlyStopping(
        monitor='val_auc',
        mode='max',
        patience=patience,
        restore_best_weights=True
    )

    model.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[early_stop],
        verbose=1,
        class_weight=class_weight
    )

    y_prob = model.predict(X_test).flatten()
    y_pred = (y_prob > 0.5).astype(int)

    return model, X_test, y_test, y_prob, y_pred


In [10]:
def train_improved_lstm_classifier(
    X_train,
    X_test,
    y_train,
    y_test,
    sequence_length=120,
    epochs=30,
    batch_size=32,
    patience=6,
    learning_rate=1e-4,
    threshold=0.5,
    class_weight=None
):
    """
    Trains a bidirectional LSTM classifier with dropout, batch normalization, and threshold tuning.

    Args:
        X_train, X_test: Input sequences
        y_train, y_test: Labels
        sequence_length (int): Number of timesteps per sequence
        epochs (int): Max epochs
        batch_size (int): Training batch size
        patience (int): Early stopping patience
        learning_rate (float): Optimizer learning rate
        threshold (float): Threshold to binarize output
        class_weight (dict): Optional class weights

    Returns:
        model: Trained Keras model
        X_test, y_test: Evaluation inputs
        y_prob: Predicted probabilities
        y_pred: Binary predictions based on threshold
    """
    model = Sequential([
        Bidirectional(LSTM(64, return_sequences=True), input_shape=(X_train.shape[1], X_train.shape[2])),
        BatchNormalization(),
        Dropout(0.4),
        Bidirectional(LSTM(32, return_sequences=False)),
        BatchNormalization(),
        Dropout(0.4),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss=BinaryCrossentropy(),
        metrics=['accuracy', AUC(name='auc'), Precision(name='precision'), Recall(name='recall')]
    )

    early_stop = EarlyStopping(
        monitor='val_auc',
        mode='max',
        patience=patience,
        restore_best_weights=True
    )

    model.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[early_stop],
        verbose=1,
        class_weight=class_weight
    )

    y_prob = model.predict(X_test).flatten()
    y_pred = (y_prob > threshold).astype(int)

    return model, X_test, y_test, y_prob, y_pred


In [19]:
def model_evaluation(
    y_test: pd.Series,
    y_pred: pd.Series,
    y_prob: pd.Series
) -> Tuple[float, float, float, np.ndarray, float, float]:
    """
    Evaluates the classification model and plots metrics.

    Args:
        y_test (pd.Series): True target values.
        y_pred (pd.Series): Predicted class values.
        y_prob (pd.Series): Predicted probabilities for class 1.

    Returns:
        Tuple containing Accuracy, F1 Score, ROC AUC, Confusion Matrix, Precision, Recall.
    """
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_prob)
    cm = confusion_matrix(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc:.4f}")
    print(f"Confusion Matrix:\n{cm}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")

    fig, ax = plt.subplots(figsize=(5, 4))
    cax = ax.matshow(cm, cmap='Blues')
    fig.colorbar(cax)
    ax.set_xticks([0, 1])
    ax.set_yticks([0, 1])
    ax.set_xticklabels(['No', 'Yes'])
    ax.set_yticklabels(['No', 'Yes'])
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.tight_layout()
    plt.show()

    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(6, 4))
    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', lw=1)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    return acc, f1, roc, cm, precision, recall


# **MAIN**

In [11]:
with open("config.yaml", "r") as file:
    config = yaml.safe_load(file)

account_name = config['storage']['storage_account_name']
container_name = config['storage']['container_name']
relative_path = config['storage']['relative_path']
access_key = config['storage']['access_key']

# Cargar el DataFrame desde Azure
df_full = load_data_from_dl(account_name, container_name, relative_path, access_key)
df_full = df_full.dropna(subset=['ret_next_3m', 'ret_next_6m', 'ret_next_1y'])


folder: ['smart-wallet-dl/smart_wallet/stock_data_parquet/part-00000-tid-6319656906602864551-70988dce-8c71-437f-9993-81d0d24559f8-53-1-c000.snappy.parquet']
Reading files: smart-wallet-dl/smart_wallet/stock_data_parquet/part-00000-tid-6319656906602864551-70988dce-8c71-437f-9993-81d0d24559f8-53-1-c000.snappy.parquet
         date  symbol      open_v        high         low     close_v  \
0  2023-07-25  TEL.OL   99.110617   99.423126   98.128441   98.128441   
1  2023-07-25    ACIW   23.270000   23.530001   23.129999   23.420000   
2  2023-07-25     TER  114.730270  117.172184  114.730270  116.368141   
3  2023-07-25    ADCT    1.320000    1.330000    1.218000    1.290000   
4  2023-07-25     TEX   59.353587   59.490459   57.574251   57.769783   

      volume  dividends  stock_splits  capital_gains  ...  candle_color  \
0  1170518.0        0.0           0.0            NaN  ...           red   
1   301500.0        0.0           0.0            NaN  ...         green   
2  1256700.0       

In [12]:
df_full.to_parquet("df_full_spark.parquet")


In [13]:
#df_full=pd.read_parquet("df_full_spark.parquet")

In [14]:

#Addtargets
df_full["target_3m"] = (df_full["ret_next_3m"] > 0.1).astype(int)
df_full["target_6m"] = (df_full["ret_next_6m"] > 0.1).astype(int)
df_full["target_1y"] = (df_full["ret_next_1y"] > 0.1).astype(int)
print(f" target_3m: {df_full['target_3m'].value_counts()}")
print(f" target_6m: {df_full['target_6m'].value_counts()}")
print(f" target_1y: {df_full['target_1y'].value_counts()}")


 target_3m: target_3m
0    7475299
1    3371248
Name: count, dtype: int64
 target_6m: target_6m
0    6463478
1    4383069
Name: count, dtype: int64
 target_1y: target_1y
0    5554342
1    5292205
Name: count, dtype: int64


#XGBOOST

In [15]:
def prepare_data_with_symbol_date(df: pd.DataFrame, targets: list):
    """
    Prepares a DataFrame for LSTM modeling: encodes 'symbol', extracts 'date' features,
    imputes missing values, and applies MinMax scaling.

    Args:
        df (pd.DataFrame): Input data with features + target + symbol + date.
        targets (list): List of target column names.

    Returns:
        pd.DataFrame: Scaled dataframe with 'symbol' and 'date' incorporated as features.
        MinMaxScaler: The fitted scaler object.
    """
    df_clean = df.copy()

    # Convert date and extract features
    df_clean["date"] = pd.to_datetime(df_clean["date"], errors="coerce")
    df_clean["year"] = df_clean["date"].dt.year
    df_clean["month"] = df_clean["date"].dt.month
    df_clean["dayofweek"] = df_clean["date"].dt.dayofweek

    date_col = df_clean["date"]  # store datetime (not string)

    # Encode symbol (remains numeric)
    df_clean["symbol"] = LabelEncoder().fit_transform(df_clean["symbol"].astype(str))

    # Drop original date before scaling
    df_clean = df_clean.drop(columns=["date"], errors="ignore")

    for col in df_clean.select_dtypes(include=["object", "category"]).columns:
        df_clean[col] = LabelEncoder().fit_transform(df_clean[col].astype(str))

    feature_cols = [col for col in df_clean.columns if col not in targets]
    X = df_clean[feature_cols]
    y = df_clean[targets]

    imputer = SimpleImputer(strategy="mean")
    X_imputed = imputer.fit_transform(X)
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X_imputed)

    df_scaled = pd.DataFrame(X_scaled, columns=feature_cols, index=df_clean.index)
    df_scaled = pd.concat([df_scaled, y], axis=1)

    # Reattach encoded symbol and original date
    df_scaled["symbol"] = df_clean["symbol"].values
    df_scaled["date"] = date_col.values

    print(f"columnas scaler: {X.columns}")
    return df_scaled, scaler


In [19]:
# Crear targets binarios
df_full["target_6m"] = (df_full["ret_next_6m"] > 0.05).astype(int)

# Filtros
top_symbols = df_full["symbol"].unique()
df_clean = df_full[df_full["symbol"].isin(top_symbols)].copy()
df_clean = df_clean.sort_values(by=["symbol", "date"])

# Limpieza e imputación
df_clean = clean_columns(df_clean)

# Preprocesamiento general
targets = ["target_6m"]  # o el que quieras
df_processed_minmax, _ = prepare_data_with_symbol_date(df_clean, targets=targets)
print(f"tipos: {df_processed_minmax.dtypes}")

# Dataset final
X = df_processed_minmax.drop(columns=["date", "symbol", "target_3m", "target_6m", "target_1y"], errors="ignore")
y = df_processed_minmax["target_6m"]

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

# Pesos balanceados
weights = class_weight.compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
weight_dict = {i: w for i, w in enumerate(weights)}

# Entrenamiento
model = XGBClassifier(n_estimators=200, max_depth=4, learning_rate=0.05, scale_pos_weight=weight_dict[0]/weight_dict[1])
model.fit(X_train, y_train)

# Predicción
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Evaluación
print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))


Shape before: (10846547, 56)
Shape after: (10846547, 32)
columnas scaler: Index(['symbol', 'close_v', 'volume', 'prev_close', 'prev_volume',
       'daily_return', 'close_change_pct', 'intraday_volatility', 'log_return',
       'volume_change_pct', 'sma_20', 'delta', 'gain', 'loss', 'rsi_14',
       'rel_volume', 'ema_26', 'macd_signal', 'macd_histogram', 'true_range',
       'atr_14', 'candle_body', 'upper_wick', 'lower_wick', 'candle_color',
       'momentum_10', 'roc_10', 'var_95', 'target_3m', 'target_1y', 'year',
       'month', 'dayofweek'],
      dtype='object')
tipos: symbol                          int64
close_v                       float64
volume                        float64
prev_close                    float64
prev_volume                   float64
daily_return                  float64
close_change_pct              float64
intraday_volatility           float64
log_return                    float64
volume_change_pct             float64
sma_20                        float64

In [16]:
# Crear targets binarios
df_full["target_6m"] = (df_full["ret_next_6m"] > 0.05).astype(int)

# Filtros y orden
top_symbols = df_full["symbol"].unique()
df_clean = df_full[df_full["symbol"].isin(top_symbols)].copy()
df_clean = df_clean.sort_values(by=["symbol", "date"])

# Limpieza e imputación
df_clean = clean_columns(df_clean)

# Preprocesamiento
targets = ["target_6m"]
df_processed_minmax, _ = prepare_data_with_symbol_date(df_clean, targets=targets)

# Definir X e y
X = df_processed_minmax.drop(columns=["date", "symbol", "target_3m", "target_6m", "target_1y"], errors="ignore")
y = df_processed_minmax["target_6m"]

# División temporal
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

# Cálculo de pesos por clase
weights = class_weight.compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
weight_dict = {i: w for i, w in enumerate(weights)}

# Entrenamiento con parámetros ya ajustados
model = XGBClassifier(
    n_estimators=300,
    max_depth=3,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=weight_dict[0] / weight_dict[1],
    random_state=42,
    use_label_encoder=False,
    verbosity=0,
    n_jobs=-1
)
model.fit(X_train, y_train)

# Predicción
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Evaluación
print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))


Shape before: (10846547, 54)
Shape after: (10846547, 30)
columnas scaler: Index(['symbol', 'close_v', 'volume', 'prev_close', 'prev_volume',
       'daily_return', 'close_change_pct', 'intraday_volatility', 'log_return',
       'volume_change_pct', 'sma_20', 'delta', 'gain', 'loss', 'rsi_14',
       'rel_volume', 'ema_26', 'macd_signal', 'macd_histogram', 'true_range',
       'atr_14', 'candle_body', 'upper_wick', 'lower_wick', 'candle_color',
       'momentum_10', 'roc_10', 'var_95', 'year', 'month', 'dayofweek'],
      dtype='object')
              precision    recall  f1-score   support

         0.0       0.64      0.64      0.64   1094399
         1.0       0.63      0.63      0.63   1074911

    accuracy                           0.64   2169310
   macro avg       0.64      0.64      0.64   2169310
weighted avg       0.64      0.64      0.64   2169310

ROC AUC: 0.6836790099500576


In [16]:
print("XGBoost version:", xgb.__version__)

# Crear targets binarios con umbral del 10%
df_full["target_6m"] = (df_full["ret_next_6m"] > 0.10).astype(int)

# Usar todos los símbolos
top_symbols = df_full["symbol"].unique()
df_clean = df_full[df_full["symbol"].isin(top_symbols)].copy()
df_clean = df_clean.sort_values(by=["symbol", "date"])

# Limpieza e imputación
df_clean = clean_columns(df_clean)

# Preprocesamiento
targets = ["target_6m"]
df_processed_minmax, _ = prepare_data_with_symbol_date(df_clean, targets=targets)

# Crear feature de interacción
df_processed_minmax["rsi_x_momentum"] = df_processed_minmax["rsi_14"] * df_processed_minmax["momentum_10"]

# Definir X e y
X = df_processed_minmax.drop(columns=["date", "symbol", "target_3m", "target_6m", "target_1y"], errors="ignore")
y = df_processed_minmax["target_6m"]

# Eliminar columnas con varianza casi nula
selector = VarianceThreshold(threshold=1e-4)
X = pd.DataFrame(selector.fit_transform(X), index=X.index)

# Split temporal
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

# Pesos balanceados
weights = class_weight.compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
weight_dict = {i: w for i, w in enumerate(weights)}
scale_weight = weight_dict[0] / weight_dict[1]

# Param grid para GridSearchCV (27 combinaciones)
param_grid = {
    "n_estimators": [200, 300, 400],
    "max_depth": [3, 4, 5],
    "learning_rate": [0.01, 0.02, 0.03],
    "subsample": [0.8],
    "colsample_bytree": [0.8],
    "scale_pos_weight": [scale_weight]
}

# Modelo base
model = XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    verbosity=0,
    n_jobs=-1
)

# Grid Search optimizando F1
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="f1",
    cv=3,
    verbose=1,
    n_jobs=-1
)
grid.fit(X_train, y_train)

# Entrenamiento final
best_model = grid.best_estimator_
best_model.fit(X_train, y_train)

# Predicción con threshold ajustado
y_prob = best_model.predict_proba(X_test)[:, 1]
y_pred = (y_prob > 0.4).astype(int)

# Evaluación
print("Mejores parámetros:", grid.best_params_)
print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))
print("F1 Score:", f1_score(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
print("Confusion Matrix:")
print(cm)
print(f"TN={tn}, FP={fp}, FN={fn}, TP={tp}")


XGBoost version: 2.1.4
Shape before: (10846547, 56)
Shape after: (10846547, 32)
columnas scaler: Index(['symbol', 'close_v', 'volume', 'prev_close', 'prev_volume',
       'daily_return', 'close_change_pct', 'intraday_volatility', 'log_return',
       'volume_change_pct', 'sma_20', 'delta', 'gain', 'loss', 'rsi_14',
       'rel_volume', 'ema_26', 'macd_signal', 'macd_histogram', 'true_range',
       'atr_14', 'candle_body', 'upper_wick', 'lower_wick', 'candle_color',
       'momentum_10', 'roc_10', 'var_95', 'target_3m', 'target_1y', 'year',
       'month', 'dayofweek'],
      dtype='object')
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Mejores parámetros: {'colsample_bytree': 0.8, 'learning_rate': 0.03, 'max_depth': 5, 'n_estimators': 400, 'scale_pos_weight': np.float64(0.6791664425130032), 'subsample': 0.8}
              precision    recall  f1-score   support

         0.0       0.68      0.82      0.75   1295892
         1.0       0.62      0.43      0.51    873418



In [18]:

# Crear targets binarios con umbral del 10%
df_full["target_6m"] = (df_full["ret_next_6m"] > 0.10).astype(int)

# Usar todos los símbolos
top_symbols = df_full["symbol"].unique()
df_clean = df_full[df_full["symbol"].isin(top_symbols)].copy()
df_clean = df_clean.sort_values(by=["symbol", "date"])

# Limpieza e imputación
df_clean = clean_columns(df_clean)

# Preprocesamiento
targets = ["target_6m"]
df_processed_minmax, _ = prepare_data_with_symbol_date(df_clean, targets=targets)

# Definir X e y
X = df_processed_minmax.drop(columns=["date", "symbol", "target_3m", "target_6m", "target_1y"], errors="ignore")
y = df_processed_minmax["target_6m"]

# Eliminar columnas con varianza casi nula
selector = VarianceThreshold(threshold=1e-4)
X = pd.DataFrame(selector.fit_transform(X), index=X.index)

# Split temporal
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

# Pesos balanceados
weights = class_weight.compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
weight_dict = {i: w for i, w in enumerate(weights)}
scale_weight = weight_dict[0] / weight_dict[1]

# Param grid para GridSearchCV (27 combinaciones)
param_grid = {
    "n_estimators": [200, 300, 400],
    "max_depth": [3, 4, 5],
    "learning_rate": [0.01, 0.02, 0.03],
    "subsample": [0.8],
    "colsample_bytree": [0.8],
    "scale_pos_weight": [scale_weight]
}

# Modelo base
model = XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    verbosity=0,
    n_jobs=-1
)

# Grid Search optimizando F1
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="f1",
    cv=3,
    verbose=1,
    n_jobs=-1
)
grid.fit(X_train, y_train)

# Entrenamiento final
best_model = grid.best_estimator_
best_model.fit(X_train, y_train)

# Predicción con threshold ajustado
y_prob = best_model.predict_proba(X_test)[:, 1]
y_pred = (y_prob > 0.4).astype(int)

# Evaluación
print("Mejores parámetros:", grid.best_params_)
print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))
print("F1 Score:", f1_score(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
print("Confusion Matrix:")
print(cm)
print(f"TN={tn}, FP={fp}, FN={fn}, TP={tp}")


XGBoost version: 2.1.4
Shape before: (10846547, 56)
Shape after: (10846547, 32)
columnas scaler: Index(['symbol', 'close_v', 'volume', 'prev_close', 'prev_volume',
       'daily_return', 'close_change_pct', 'intraday_volatility', 'log_return',
       'volume_change_pct', 'sma_20', 'delta', 'gain', 'loss', 'rsi_14',
       'rel_volume', 'ema_26', 'macd_signal', 'macd_histogram', 'true_range',
       'atr_14', 'candle_body', 'upper_wick', 'lower_wick', 'candle_color',
       'momentum_10', 'roc_10', 'var_95', 'target_3m', 'target_1y', 'year',
       'month', 'dayofweek'],
      dtype='object')
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Mejores parámetros: {'colsample_bytree': 0.8, 'learning_rate': 0.03, 'max_depth': 5, 'n_estimators': 400, 'scale_pos_weight': np.float64(0.6791664425130032), 'subsample': 0.8}
              precision    recall  f1-score   support

         0.0       0.68      0.83      0.75   1295892
         1.0       0.63      0.42      0.50    873418



In [21]:
print("XGBoost version:", xgb.__version__)

# Crear targets binarios con umbral del 7%
df_full["target_6m"] = (df_full["ret_next_6m"] > 0.07).astype(int)

# Usar todos los símbolos
top_symbols = df_full["symbol"].unique()
df_clean = df_full[df_full["symbol"].isin(top_symbols)].copy()
df_clean = df_clean.sort_values(by=["symbol", "date"])

# Limpieza e imputación
df_clean = clean_columns(df_clean)

# Preprocesamiento
targets = ["target_6m"]
df_processed_minmax, _ = prepare_data_with_symbol_date(df_clean, targets=targets)

# Definir X e y
X = df_processed_minmax.drop(columns=["date", "symbol", "target_3m", "target_6m", "target_1y"], errors="ignore")
y = df_processed_minmax["target_6m"]

# Eliminar columnas con varianza casi nula
selector = VarianceThreshold(threshold=1e-4)
X = pd.DataFrame(selector.fit_transform(X), index=X.index)

# Split temporal
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

# Pesos balanceados
weights = class_weight.compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
weight_dict = {i: w for i, w in enumerate(weights)}
scale_weight = weight_dict[0] / weight_dict[1]

# Param grid para GridSearchCV (27 combinaciones)
param_grid = {
    "n_estimators": [200, 300, 400],
    "max_depth": [3, 4, 5],
    "learning_rate": [0.01, 0.02, 0.03],
    "subsample": [0.8],
    "colsample_bytree": [0.8],
    "scale_pos_weight": [scale_weight]
}

# Modelo base
model = XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    verbosity=0,
    n_jobs=-1
)

# Grid Search optimizando F1
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="f1",
    cv=3,
    verbose=1,
    n_jobs=-1
)
grid.fit(X_train, y_train)

# Entrenamiento final
best_model = grid.best_estimator_
best_model.fit(X_train, y_train)

# Predicción con threshold ajustado
y_prob = best_model.predict_proba(X_test)[:, 1]
y_pred = (y_prob > 0.4).astype(int)

# Evaluación
print("Mejores parámetros:", grid.best_params_)
print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))
print("F1 Score:", f1_score(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
print("Confusion Matrix:")
print(cm)
print(f"TN={tn}, FP={fp}, FN={fn}, TP={tp}")


XGBoost version: 2.1.4
Shape before: (10846547, 56)
Shape after: (10846547, 32)
columnas scaler: Index(['symbol', 'close_v', 'volume', 'prev_close', 'prev_volume',
       'daily_return', 'close_change_pct', 'intraday_volatility', 'log_return',
       'volume_change_pct', 'sma_20', 'delta', 'gain', 'loss', 'rsi_14',
       'rel_volume', 'ema_26', 'macd_signal', 'macd_histogram', 'true_range',
       'atr_14', 'candle_body', 'upper_wick', 'lower_wick', 'candle_color',
       'momentum_10', 'roc_10', 'var_95', 'target_3m', 'target_1y', 'year',
       'month', 'dayofweek'],
      dtype='object')
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Mejores parámetros: {'colsample_bytree': 0.8, 'learning_rate': 0.03, 'max_depth': 5, 'n_estimators': 400, 'scale_pos_weight': np.float64(0.8483387558593741), 'subsample': 0.8}
              precision    recall  f1-score   support

         0.0       0.70      0.58      0.64   1176642
         1.0       0.59      0.70      0.64    992668



In [23]:

# Crear targets binarios con umbral del 5%
df_full["target_6m"] = (df_full["ret_next_6m"] > 0.05).astype(int)

# Usar todos los símbolos
top_symbols = df_full["symbol"].unique()
df_clean = df_full[df_full["symbol"].isin(top_symbols)].copy()
df_clean = df_clean.sort_values(by=["symbol", "date"])

# Limpieza e imputación
df_clean = clean_columns(df_clean)

# Preprocesamiento
targets = ["target_6m"]
df_processed_minmax, _ = prepare_data_with_symbol_date(df_clean, targets=targets)

# Definir X e y
X = df_processed_minmax.drop(columns=["date", "symbol", "target_3m", "target_6m", "target_1y"], errors="ignore")
y = df_processed_minmax["target_6m"]

# Eliminar columnas con varianza casi nula
selector = VarianceThreshold(threshold=1e-4)
X = pd.DataFrame(selector.fit_transform(X), index=X.index)

# Split temporal
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

# Pesos balanceados
weights = class_weight.compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
weight_dict = {i: w for i, w in enumerate(weights)}
scale_weight = weight_dict[0] / weight_dict[1]

# Param grid para GridSearchCV (27 combinaciones)
param_grid = {
    "n_estimators": [200, 300, 400],
    "max_depth": [3, 4, 5],
    "learning_rate": [0.01, 0.02, 0.03],
    "subsample": [0.8],
    "colsample_bytree": [0.8],
    "scale_pos_weight": [scale_weight]
}

# Modelo base
model = XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    verbosity=0,
    n_jobs=-1
)

# Grid Search optimizando F1
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="f1",
    cv=3,
    verbose=1,
    n_jobs=-1
)
grid.fit(X_train, y_train)

# Entrenamiento final
best_model = grid.best_estimator_
best_model.fit(X_train, y_train)

# Predicción con threshold ajustado
y_prob = best_model.predict_proba(X_test)[:, 1]
y_pred = (y_prob > 0.4).astype(int)

# Evaluación
print("Mejores parámetros:", grid.best_params_)
print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))
print("F1 Score:", f1_score(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
print("Confusion Matrix:")
print(cm)
print(f"TN={tn}, FP={fp}, FN={fn}, TP={tp}")


XGBoost version: 2.1.4
Shape before: (10846547, 56)
Shape after: (10846547, 32)
columnas scaler: Index(['symbol', 'close_v', 'volume', 'prev_close', 'prev_volume',
       'daily_return', 'close_change_pct', 'intraday_volatility', 'log_return',
       'volume_change_pct', 'sma_20', 'delta', 'gain', 'loss', 'rsi_14',
       'rel_volume', 'ema_26', 'macd_signal', 'macd_histogram', 'true_range',
       'atr_14', 'candle_body', 'upper_wick', 'lower_wick', 'candle_color',
       'momentum_10', 'roc_10', 'var_95', 'target_3m', 'target_1y', 'year',
       'month', 'dayofweek'],
      dtype='object')
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Mejores parámetros: {'colsample_bytree': 0.8, 'learning_rate': 0.03, 'max_depth': 5, 'n_estimators': 400, 'scale_pos_weight': np.float64(0.9854333077829515), 'subsample': 0.8}
              precision    recall  f1-score   support

         0.0       0.73      0.41      0.53   1094399
         1.0       0.59      0.85      0.69   1074911



In [28]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, roc_auc_score, f1_score, confusion_matrix
from sklearn.feature_selection import VarianceThreshold
from xgboost import XGBClassifier
import joblib

print("XGBoost version:", xgb.__version__)

# Crear target binario con umbral del 5%
df_full["target_6m"] = (df_full["ret_next_6m"] > 0.05).astype(int)

# Ordenar y filtrar
top_symbols = df_full["symbol"].unique()
df_clean = df_full[df_full["symbol"].isin(top_symbols)].copy()
df_clean = df_clean.sort_values(by=["symbol", "date"])

# Limpieza e imputación
df_clean = clean_columns(df_clean)

# Preprocesamiento
targets = ["target_6m"]
df_processed_minmax, _ = prepare_data_with_symbol_date(df_clean, targets=targets)

# Feature extra
df_processed_minmax["rsi_x_momentum"] = df_processed_minmax["rsi_14"] * df_processed_minmax["momentum_10"]

# Definir X e y
X = df_processed_minmax.drop(columns=["date", "symbol", "target_3m", "target_6m", "target_1y"], errors="ignore")
y = df_processed_minmax["target_6m"]

# VarianceThreshold
selector = VarianceThreshold(threshold=1e-4)
X_sel = pd.DataFrame(selector.fit_transform(X), index=X.index)
features_selected = X.columns[selector.get_support()].tolist()

# División temporal
X_train, X_test, y_train, y_test = train_test_split(X_sel, y, shuffle=False, test_size=0.2)

# Balanceo
weights = class_weight.compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
scale_weight = 0.9854333077829515  # hiperparámetro óptimo encontrado

# Entrenamiento con hiperparámetros óptimos
model = XGBClassifier(
    n_estimators=400,
    max_depth=5,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_weight,
    random_state=42,
    use_label_encoder=False,
    verbosity=0,
    n_jobs=-1
)
model.fit(X_train, y_train)

# Predicción
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = (y_prob > 0.4).astype(int)

# Evaluación
print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))
print("F1 Score:", f1_score(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
print("Confusion Matrix:")
print(cm)
print(f"TN={tn}, FP={fp}, FN={fn}, TP={tp}")

# Guardar artefactos
joblib.dump(model, "model_xgb_6m.pkl")
joblib.dump(selector, "selector_xgb_6m.pkl")
joblib.dump(features_selected, "features_xgb_6m.pkl")


XGBoost version: 2.1.4
Shape before: (10846547, 56)
Shape after: (10846547, 32)
columnas scaler: Index(['symbol', 'close_v', 'volume', 'prev_close', 'prev_volume',
       'daily_return', 'close_change_pct', 'intraday_volatility', 'log_return',
       'volume_change_pct', 'sma_20', 'delta', 'gain', 'loss', 'rsi_14',
       'rel_volume', 'ema_26', 'macd_signal', 'macd_histogram', 'true_range',
       'atr_14', 'candle_body', 'upper_wick', 'lower_wick', 'candle_color',
       'momentum_10', 'roc_10', 'var_95', 'target_3m', 'target_1y', 'year',
       'month', 'dayofweek'],
      dtype='object')
              precision    recall  f1-score   support

         0.0       0.73      0.41      0.52   1094399
         1.0       0.58      0.85      0.69   1074911

    accuracy                           0.63   2169310
   macro avg       0.66      0.63      0.61   2169310
weighted avg       0.66      0.63      0.61   2169310

ROC AUC: 0.7067042595732963
F1 Score: 0.6927564358869502
Confusion Matrix:

['features_xgb_6m.pkl']

In [29]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, roc_auc_score, f1_score, confusion_matrix
from sklearn.feature_selection import VarianceThreshold
from xgboost import XGBClassifier
import joblib

print("XGBoost version:", xgb.__version__)

# Crear target binario con umbral del 5%
df_full["target_3m"] = (df_full["ret_next_3m"] > 0.05).astype(int)

# Ordenar y filtrar
top_symbols = df_full["symbol"].unique()
df_clean = df_full[df_full["symbol"].isin(top_symbols)].copy()
df_clean = df_clean.sort_values(by=["symbol", "date"])

# Limpieza e imputación
df_clean = clean_columns(df_clean)

# Preprocesamiento
targets = ["target_3m"]
df_processed_minmax, _ = prepare_data_with_symbol_date(df_clean, targets=targets)

# Feature extra
df_processed_minmax["rsi_x_momentum"] = df_processed_minmax["rsi_14"] * df_processed_minmax["momentum_10"]

# Definir X e y
X = df_processed_minmax.drop(columns=["date", "symbol", "target_3m", "target_6m", "target_1y"], errors="ignore")
y = df_processed_minmax["target_3m"]

# VarianceThreshold
selector = VarianceThreshold(threshold=1e-4)
X_sel = pd.DataFrame(selector.fit_transform(X), index=X.index)
features_selected = X.columns[selector.get_support()].tolist()

# División temporal
X_train, X_test, y_train, y_test = train_test_split(X_sel, y, shuffle=False, test_size=0.2)

# Balanceo
weights = class_weight.compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
scale_weight = 0.9854333077829515  # reutilizamos valor óptimo

# Entrenamiento
model = XGBClassifier(
    n_estimators=400,
    max_depth=5,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_weight,
    random_state=42,
    use_label_encoder=False,
    verbosity=0,
    n_jobs=-1
)
model.fit(X_train, y_train)

# Predicción
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = (y_prob > 0.4).astype(int)

# Evaluación
print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))
print("F1 Score:", f1_score(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
print("Confusion Matrix:")
print(cm)
print(f"TN={tn}, FP={fp}, FN={fn}, TP={tp}")

# Guardar artefactos
joblib.dump(model, "model_xgb_3m.pkl")
joblib.dump(selector, "selector_xgb_3m.pkl")
joblib.dump(features_selected, "features_xgb_3m.pkl")


XGBoost version: 2.1.4
Shape before: (10846547, 56)
Shape after: (10846547, 32)
columnas scaler: Index(['symbol', 'close_v', 'volume', 'prev_close', 'prev_volume',
       'daily_return', 'close_change_pct', 'intraday_volatility', 'log_return',
       'volume_change_pct', 'sma_20', 'delta', 'gain', 'loss', 'rsi_14',
       'rel_volume', 'ema_26', 'macd_signal', 'macd_histogram', 'true_range',
       'atr_14', 'candle_body', 'upper_wick', 'lower_wick', 'candle_color',
       'momentum_10', 'roc_10', 'var_95', 'target_6m', 'target_1y', 'year',
       'month', 'dayofweek'],
      dtype='object')
              precision    recall  f1-score   support

         0.0       0.73      0.53      0.61   1232988
         1.0       0.54      0.74      0.63    936322

    accuracy                           0.62   2169310
   macro avg       0.63      0.63      0.62   2169310
weighted avg       0.65      0.62      0.62   2169310

ROC AUC: 0.6910185207210477
F1 Score: 0.6252430231329451
Confusion Matrix:

['features_xgb_3m.pkl']

In [30]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, roc_auc_score, f1_score, confusion_matrix
from sklearn.feature_selection import VarianceThreshold
from xgboost import XGBClassifier
import joblib

print("XGBoost version:", xgb.__version__)

# Crear target binario con umbral del 5%
df_full["target_1y"] = (df_full["ret_next_1y"] > 0.05).astype(int)

# Ordenar y filtrar
top_symbols = df_full["symbol"].unique()
df_clean = df_full[df_full["symbol"].isin(top_symbols)].copy()
df_clean = df_clean.sort_values(by=["symbol", "date"])

# Limpieza e imputación
df_clean = clean_columns(df_clean)

# Preprocesamiento
targets = ["target_1y"]
df_processed_minmax, _ = prepare_data_with_symbol_date(df_clean, targets=targets)

# Feature extra
df_processed_minmax["rsi_x_momentum"] = df_processed_minmax["rsi_14"] * df_processed_minmax["momentum_10"]

# Definir X e y
X = df_processed_minmax.drop(columns=["date", "symbol", "target_3m", "target_6m", "target_1y"], errors="ignore")
y = df_processed_minmax["target_1y"]

# VarianceThreshold
selector = VarianceThreshold(threshold=1e-4)
X_sel = pd.DataFrame(selector.fit_transform(X), index=X.index)
features_selected = X.columns[selector.get_support()].tolist()

# División temporal
X_train, X_test, y_train, y_test = train_test_split(X_sel, y, shuffle=False, test_size=0.2)

# Balanceo
weights = class_weight.compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
scale_weight = 0.9854333077829515  # mismo valor

# Entrenamiento
model = XGBClassifier(
    n_estimators=400,
    max_depth=5,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_weight,
    random_state=42,
    use_label_encoder=False,
    verbosity=0,
    n_jobs=-1
)
model.fit(X_train, y_train)

# Predicción
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = (y_prob > 0.4).astype(int)

# Evaluación
print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))
print("F1 Score:", f1_score(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
print("Confusion Matrix:")
print(cm)
print(f"TN={tn}, FP={fp}, FN={fn}, TP={tp}")

# Guardar artefactos
joblib.dump(model, "model_xgb_1y.pkl")
joblib.dump(selector, "selector_xgb_1y.pkl")
joblib.dump(features_selected, "features_xgb_1y.pkl")


XGBoost version: 2.1.4
Shape before: (10846547, 56)
Shape after: (10846547, 32)
columnas scaler: Index(['symbol', 'close_v', 'volume', 'prev_close', 'prev_volume',
       'daily_return', 'close_change_pct', 'intraday_volatility', 'log_return',
       'volume_change_pct', 'sma_20', 'delta', 'gain', 'loss', 'rsi_14',
       'rel_volume', 'ema_26', 'macd_signal', 'macd_histogram', 'true_range',
       'atr_14', 'candle_body', 'upper_wick', 'lower_wick', 'candle_color',
       'momentum_10', 'roc_10', 'var_95', 'target_3m', 'target_6m', 'year',
       'month', 'dayofweek'],
      dtype='object')
              precision    recall  f1-score   support

         0.0       0.74      0.36      0.49    971505
         1.0       0.63      0.90      0.74   1197805

    accuracy                           0.66   2169310
   macro avg       0.69      0.63      0.61   2169310
weighted avg       0.68      0.66      0.63   2169310

ROC AUC: 0.7295725889101499
F1 Score: 0.742566203571656
Confusion Matrix:


['features_xgb_1y.pkl']