In [0]:
%run /Workspace/Users/jorgegarciaotero@gmail.com/tfm_databricks/config/database_connector

In [0]:

# Basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yaml
import random
import plotly.express as px
import joblib
from datetime import timedelta
from typing import List, Tuple, Dict,Optional

# Azure

from typing import Tuple, List, Optional
import pandas as pd
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import pandas as pd


# Models
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler,LabelEncoder
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc ,  precision_score, recall_score
from sklearn.decomposition import PCA
from sklearn.svm import SVC

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import AUC




In [0]:
def prepare_data(df, targets, scaler_type='standard', drop_cols=['date', 'symbol'], encode_symbol: bool = False):
    """
    Prepare data for ML models: encode categoricals, impute missing values and scale features.

    Steps, in order:
      1. Drop ``drop_cols`` (``'symbol'`` is kept when ``encode_symbol`` is True).
      2. Label-encode every remaining object/category column. Values are cast
         to ``str`` first, so missing values become their own encoded class.
      3. Mean-impute and scale every column that is not listed in ``targets``.
      4. Re-attach the target columns, untouched by imputation and scaling.

    The imputer and the scaler are fitted on all rows of ``df``; split the data
    before calling this function if the statistics must come from a training
    subset only.

    Args:
        - df (pd.DataFrame): Full input DataFrame. It is not modified.
        - targets (list): List of target columns; all must exist in ``df``.
        - scaler_type (str): 'standard' (StandardScaler) or 'minmax' (MinMaxScaler).
        - drop_cols (list): Columns to drop from the DataFrame. Missing columns
          are ignored. The default list is never mutated.
        - encode_symbol (bool): Whether to keep and encode 'symbol' as a feature.

    Returns:
        - df_scaled (pd.DataFrame): Scaled features with targets, indexed like ``df``.
        - scaler: The fitted scaler object.

    Raises:
        ValueError: If ``scaler_type`` is neither 'standard' nor 'minmax'.
        KeyError: If a target column is missing from ``df``.
    """
    # Fail fast on typos: previously any unknown value silently selected MinMaxScaler.
    if scaler_type not in ('standard', 'minmax'):
        raise ValueError(
            f"scaler_type must be 'standard' or 'minmax', got {scaler_type!r}"
        )

    df_clean = df.copy()

    # Optionally keep 'symbol' so it can be encoded as a feature
    if encode_symbol and 'symbol' in df_clean.columns:
        drop_cols = [col for col in drop_cols if col != 'symbol']

    # Drop unwanted columns
    df_clean = df_clean.drop(columns=drop_cols, errors='ignore')

    # Encode categorical variables (including 'symbol' when it is kept)
    for col in df_clean.select_dtypes(include=['object', 'category']).columns:
        df_clean[col] = LabelEncoder().fit_transform(df_clean[col].astype(str))

    # Split features and targets
    feature_cols = [col for col in df_clean.columns if col not in targets]
    X = df_clean[feature_cols]
    y = df_clean[targets]

    # SimpleImputer(strategy='mean') discards columns that contain no observed
    # value, which would leave fewer output columns than names in feature_cols
    # and break the DataFrame reconstruction below. Drop such columns
    # explicitly so the names and the data stay aligned.
    has_data = X.notna().any(axis=0).to_numpy(dtype=bool)
    if not has_data.all():
        empty_cols = [col for col, keep in zip(feature_cols, has_data) if not keep]
        print(f"Dropping all-NaN feature columns: {empty_cols}")
        X = X.loc[:, has_data]
        feature_cols = list(X.columns)

    # Impute missing values
    imputer = SimpleImputer(strategy='mean')
    X_imputed = imputer.fit_transform(X)

    # Scale
    scaler = StandardScaler() if scaler_type == 'standard' else MinMaxScaler()
    X_scaled = scaler.fit_transform(X_imputed)

    # Rebuild the scaled DataFrame on the original index
    df_scaled = pd.DataFrame(X_scaled, columns=feature_cols, index=X.index)

    # Re-attach the targets
    df_scaled = pd.concat([df_scaled, y], axis=1)

    return df_scaled, scaler



In [0]:
def model_evaluation(
    y_test: pd.Series,
    y_pred: pd.Series,
    y_prob: pd.Series
) -> Tuple[float, float, float, np.ndarray, float, float]:
    """
    Evaluate a binary classifier: print its metrics and plot the confusion
    matrix and the ROC curve.

    Args:
        - y_test (pd.Series): True target values.
        - y_pred (pd.Series): Predicted class values.
        - y_prob (pd.Series): Predicted scores/probabilities for class 1.
    Returns:
        - Tuple (accuracy, f1, roc_auc, confusion_matrix, precision, recall),
          in that order.
    """
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_prob)
    cm = confusion_matrix(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc:.4f}")
    print(f"Confusion Matrix:\n {cm}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")

    # Plot confusion matrix (explicit Axes API so the title cannot land on
    # another axes, e.g. the colorbar's).
    fig_cm, ax_cm = plt.subplots(figsize=(5, 4))
    heatmap = ax_cm.matshow(cm, cmap='Blues')
    fig_cm.colorbar(heatmap)
    ax_cm.set_xticks([0, 1])
    ax_cm.set_yticks([0, 1])
    ax_cm.set_xticklabels(['No', 'Yes'])
    ax_cm.set_yticklabels(['No', 'Yes'])
    ax_cm.set_xlabel("Predicted")
    ax_cm.set_ylabel("Actual")
    ax_cm.set_title("Confusion Matrix")
    fig_cm.tight_layout()
    plt.show()

    # Plot ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)

    fig_roc, ax_roc = plt.subplots(figsize=(6, 4))
    ax_roc.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    ax_roc.plot([0, 1], [0, 1], 'k--', lw=1)  # chance-level diagonal
    ax_roc.set_xlabel("False Positive Rate")
    ax_roc.set_ylabel("True Positive Rate")
    ax_roc.set_title("ROC Curve")
    ax_roc.legend()
    ax_roc.grid(True)
    fig_roc.tight_layout()
    plt.show()

    return acc, f1, roc, cm, precision, recall



In [0]:
def train_base_lstm_classifier(
    df: pd.DataFrame,
    target_column: str = 'target_1y',
    sequence_length: int = 20,
    epochs: int = 100,
    batch_size: int = 32,
    patience: int = 5,
    min_accuracy: float = 0.75,  # Desired minimum accuracy (not enforced anywhere in this function)
    validation_split: float = 0.2
):
    """
    Train a baseline LSTM for binary classification with early stopping.

    Every column of ``df`` other than ``target_column`` is used as a feature,
    so any other label column must be removed by the caller beforehand.
    Sliding windows of ``sequence_length`` consecutive rows are built in row
    order, and each window is labelled with the target of the row that
    immediately follows it. The target is binarised as ``target > 0``.

    The windows are split chronologically (no shuffling): the last 20% form
    the test set, and the last ``validation_split`` fraction of the remaining
    training windows is used as the validation set for early stopping. The
    test set is never shown to the training loop.

    Args:
        - df (pd.DataFrame): Features plus the target column, already ordered
          as the sequences should be read.
        - target_column (str): Name of the label column.
        - sequence_length (int): Number of rows per input window.
        - epochs (int): Maximum number of training epochs.
        - batch_size (int): Mini-batch size.
        - patience (int): Epochs without ``val_accuracy`` improvement before stopping.
        - min_accuracy (float): Currently unused; kept for interface compatibility.
        - validation_split (float): Fraction (0-1, exclusive) of the training
          windows held out for early stopping.

    Returns:
        - model: trained model
        - X_test, y_test: test set
        - y_pred_prob, y_pred_class: predictions on the test set

    Raises:
        ValueError: If ``df`` has too few rows to build a single window.
        KeyError: If ``target_column`` is not in ``df``.
    """
    # Prepare data: convert to NumPy once instead of slicing the DataFrame
    # on every loop iteration.
    feature_columns = [col for col in df.columns if col != target_column]
    features = df[feature_columns].to_numpy()
    labels = (df[target_column] > 0).astype(int).to_numpy()

    n_windows = len(df) - sequence_length
    if n_windows <= 0:
        raise ValueError(
            f"Need more than sequence_length={sequence_length} rows to build "
            f"at least one window, got {len(df)}."
        )

    # Window i covers rows [i, i + sequence_length) and is labelled with row i + sequence_length.
    X = np.stack([features[start:start + sequence_length] for start in range(n_windows)])
    y = labels[sequence_length:]

    # Chronological split: the tail of the data is the held-out test set.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=False
    )

    # Early stopping (with restore_best_weights) selects the model on the
    # validation data, so that data must not be the test set; carve it from
    # the tail of the training windows instead.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=validation_split, shuffle=False
    )

    # LSTM model
    model = Sequential()
    model.add(LSTM(64, input_shape=(X.shape[1], X.shape[2]), return_sequences=False))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss=BinaryCrossentropy(),
        metrics=['accuracy', AUC()]
    )

    # Early-stopping callback driven by validation accuracy
    early_stop = EarlyStopping(
        monitor='val_accuracy',
        patience=patience,
        verbose=1,
        restore_best_weights=True,
        mode='max'
    )

    # Train
    model.fit(
        X_fit, y_fit,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[early_stop],
        verbose=1
    )

    # Predict on the untouched test set
    y_pred_prob = model.predict(X_test).flatten()
    y_pred_class = (y_pred_prob > 0.5).astype(int)

    return model, X_test, y_test, y_pred_prob, y_pred_class

In [0]:
def clean_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop a fixed list of columns and mean-impute the remaining numeric ones.

    The columns removed are listed in ``columns_to_drop`` below (identifiers,
    the ``ret_next_*`` / ``price_lead_*`` columns and several raw price and
    indicator columns); names that are absent from ``df`` are ignored.
    Columns of dtype float64, int64 or int32 are then mean-imputed with
    ``SimpleImputer``. Numeric columns with no observed value at all are left
    untouched (still all-NaN), because a mean cannot be computed for them.

    The shape before and after is printed. ``df`` itself is not modified.

    Args:
        df (pd.DataFrame): Input frame.

    Returns:
        pd.DataFrame: A new frame without the dropped columns and with
        missing numeric values filled by the column mean.
    """
    columns_to_drop = [
        'date', 'symbol', 'capital_gains',
        'ret_next_3m', 'ret_next_6m', 'ret_next_1y',
        'price_lead_3m', 'price_lead_6m', 'price_lead_1y',
        'open_v', 'high', 'low', 'dividends', 'stock_splits',
        'is_dividend_day', 'is_stock_split', 'gap_open', 'price_range',
        'tr_1', 'tr_2', 'tr_3', 'sma_5', 'bollinger_upper',
        'bollinger_lower', 'ema_12', 'macd_line'
    ]
    print(f"Shape before: {df.shape}")
    df = df.drop(columns=columns_to_drop, errors='ignore').copy()
    numeric_cols = df.select_dtypes(include=["float64", "int64", "int32"]).columns

    # SimpleImputer(strategy="mean") drops columns with no observed value, so
    # its output would be narrower than the assignment target; it also fails
    # on an empty selection. Impute only columns that have at least one value.
    has_data = df[numeric_cols].notna().any(axis=0).to_numpy(dtype=bool)
    imputable_cols = numeric_cols[has_data]
    if len(imputable_cols) > 0:
        imputer = SimpleImputer(strategy="mean")
        df[imputable_cols] = imputer.fit_transform(df[imputable_cols])

    print(f"Shape after: {df.shape}")
    return df


#### MAIN

In [0]:
# Create the input widgets and set their default values.
dbutils.widgets.text("storage_account", "smartwalletjorge", "Storage Account")
dbutils.widgets.text("container", "smart-wallet-dl", "Container")
dbutils.widgets.text("database", "smart_wallet", "Database")
# "date" is read below, so it has to be declared too; otherwise the notebook
# only runs when a caller happens to pass a "date" parameter.
dbutils.widgets.text("date", "", "Date")

storage_account = dbutils.widgets.get("storage_account")
container = dbutils.widgets.get("container")
database_name = dbutils.widgets.get("database")
# An empty widget value is passed on as None.
date_value = dbutils.widgets.get("date") or None

# DatabaseConnector comes from the notebook pulled in by the %run cell at the top.
db_connector = DatabaseConnector()
print(f"database_name :{database_name}")

df = db_connector.read_table_from_path(container, database_name, "stock_data_parquet", date_value, "parquet")
    

In [0]:
# Sanity check: size of the table loaded in the previous cell.
# NOTE(review): presumably `df` is a Spark DataFrame, in which case this
# triggers a full scan — confirm.
df.count()

In [0]:
# Keep only the 50 symbols with the most rows.
# NOTE(review): `df_full` is not assigned in any earlier visible cell (the
# loaded frame is named `df`), and `F` is not imported in the import cell;
# both are presumably provided by hidden kernel state or by the %run
# notebook — confirm, otherwise this cell fails on Restart & Run All.
# NOTE(review): `df_full` is used with the Spark API here but with pandas
# semantics in later cells (column assignment, `.shape`) — confirm where the
# conversion is meant to happen.
top_symbols_df = df_full.groupBy("symbol").count().orderBy(F.desc("count")).limit(50)
top_symbols = [record["symbol"] for record in top_symbols_df.collect()]
df_clean = df_full.filter(F.col("symbol").isin(top_symbols))


In [0]:
# Sanity check: size of the data after keeping only the top symbols.
df_clean.count()

In [0]:
# Binary labels: 1 when the forward return over the horizon exceeds 10%, else 0.
# NOTE(review): with pandas semantics a missing `ret_next_*` value compares as
# False and is therefore labelled 0 rather than left missing — confirm that
# rows without a known future return are meant to count as negatives.
df_full["target_3m"] = df_full["ret_next_3m"].gt(0.10).astype(int)
df_full["target_6m"] = df_full["ret_next_6m"].gt(0.10).astype(int)
df_full["target_1y"] = df_full["ret_next_1y"].gt(0.10).astype(int)

In [0]:
# (rows, columns) of the frame after the target columns were added.
df_full.shape

In [0]:
# Keep an independent copy of the rows belonging to the most recent date.
last_day = df_full['date'].max()
df_last = df_full.loc[df_full['date'].eq(last_day)].copy()
df_last.shape

In [0]:
# List the columns available in df_clean before it is passed to prepare_data.
print(df_clean.columns)

In [0]:
# Clean the last-day rows, then remove the label columns so that only
# features remain.
df_last_clean = clean_columns(df_last).drop(
    columns=['target_3m', 'target_6m', 'target_1y']
)

In [0]:

targets = ['target_3m', 'target_6m', 'target_1y']

# NOTE(review): prepare_data expects a pandas DataFrame that contains every
# column in `targets`; confirm that `df_clean` has been converted and
# labelled accordingly before this cell runs.
df_processed_minmax, scaler_minmax = prepare_data(df_clean, targets, scaler_type='minmax', encode_symbol=True)

# prepare_data re-attaches all three targets, and train_base_lstm_classifier
# treats every column other than `target_column` as a feature. Drop the other
# horizons' labels so they are not leaked into the model as inputs.
other_targets = [name for name in targets if name != 'target_3m']
df_train_3m = df_processed_minmax.drop(columns=other_targets)

# Train the model for target_3m
model_3m, X_test_3m, y_test_3m, y_prob_3m, y_pred_3m = train_base_lstm_classifier(
    df_train_3m,
    target_column='target_3m'
)

# Evaluation for target_3m
model_evaluation(
    y_test=pd.Series(y_test_3m),
    y_pred=pd.Series(y_pred_3m),
    y_prob=pd.Series(y_prob_3m)
)
