Modelo de prueba incorporando el sentimiento completo (FULL SENTIMIENTO). LSTM con horizonte de 5 días (5d).

In [9]:


import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix)
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import regularizers


# Seed every random number generator the training run can draw from.
# `set_random_seed` seeds Python's `random`, NumPy and TensorFlow in one call;
# previously Python's `random` was left unseeded, so a Restart & Run All was
# not guaranteed to reproduce the same weights.
# NOTE(review): some GPU kernels may still be nondeterministic; confirm whether
# `tf.config.experimental.enable_op_determinism()` is also wanted.
SEED = 42
tf.keras.utils.set_random_seed(SEED)


# Run metadata; these values are copied into the results table at evaluation time.
NOMBRE = "LSTM"
TIPO_MODELO = "LSTM"
HORIZONTE = "5d"
USA_SENTIMIENTO = 1

# Load the dataset, order it by date and use the date as the index.
sp_path = "../../datos/sp500_sent_FULL.csv"
df = (
    pd.read_csv(sp_path, parse_dates=["Date"])
    .sort_values("Date")
    .set_index("Date")
)


In [10]:

# Label column for the 5-day horizon.
Y = df["Target_5d"]

# Columns removed from the feature matrix: both target columns, the forward
# return, the OHLCV price/volume columns, and the `sentiment_mean` / `n_news`
# columns. Every remaining column of `df` is used as a feature.
# NOTE(review): presumably `Return_5d_forward` is dropped because it encodes
# the label's future information -- confirm against the dataset builder.
excluded_columns = [
    "Target_1d",
    "Target_5d",
    "Return_5d_forward",
    "Close",
    "High",
    "Low",
    "Open",
    "Volume",
    "sentiment_mean",
    "n_news",
]
X = df.drop(columns=excluded_columns)



In [11]:


# Date-based split: rows before 2022-01-01 train the model, the rest are test.
train_mask = df.index < "2022-01-01"
test_mask = ~train_mask

X_train_raw = X.loc[train_mask]
X_test_raw = X.loc[test_mask]
y_train = Y.loc[train_mask]
y_test = Y.loc[test_mask]

# Standardise the features; the scaler statistics come from the training rows
# only and are then applied unchanged to the test rows.
scaler = StandardScaler()
scaler.fit(X_train_raw)
X_train_scaled = scaler.transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Plain arrays for positional indexing when the sequences are built.
y_train_arr = y_train.values
y_test_arr = y_test.values

In [None]:

def make_sequences(X: np.ndarray, y: np.ndarray, lookback: int):
    """Build fixed-length sliding windows and their labels for a sequence model.

    Sample ``k`` pairs the rows ``X[k : k + lookback]`` with the label
    ``y[k + lookback]``. The window therefore ends on the row *before* the
    labelled one: row ``i`` is never part of the window that predicts ``y[i]``.

    Parameters
    ----------
    X : array-like, first axis of length ``n``
        Feature rows in the order they should be windowed.
    y : array-like of length ``n``
        One label per row of ``X``.
    lookback : int
        Number of consecutive rows in each window; must be at least 1.

    Returns
    -------
    (X_seq, y_seq)
        ``X_seq`` is float32 with shape ``(max(n - lookback, 0), lookback, ...)``
        where ``...`` are the trailing dimensions of ``X``; ``y_seq`` is int32
        with shape ``(max(n - lookback, 0),)``. When there are too few rows for
        a single window the arrays are empty but keep those trailing
        dimensions, so downstream code can still read ``X_seq.shape[-1]``.

    Raises
    ------
    ValueError
        If ``lookback < 1``, if ``X`` and ``y`` differ in length, or if a label
        that would be emitted is NaN/infinite (it cannot be cast to int).
    """
    if lookback < 1:
        raise ValueError(f"lookback must be >= 1, got {lookback}")

    # Cast once up front so the fancy-indexed copy below is already float32.
    features = np.asarray(X, dtype=np.float32)
    labels = np.asarray(y)
    if len(features) != len(labels):
        raise ValueError(
            f"X and y must have the same length, got {len(features)} and {len(labels)}"
        )

    # The first `lookback` rows have no complete window behind them.
    labels = labels[lookback:]
    if np.issubdtype(labels.dtype, np.floating) and not np.isfinite(labels).all():
        raise ValueError("y contains NaN or infinite labels; cannot cast to int32")

    n_samples = max(len(features) - lookback, 0)
    # Row k of `window_idx` holds the positions k, k + 1, ..., k + lookback - 1.
    window_idx = np.arange(n_samples)[:, None] + np.arange(lookback)[None, :]

    X_seq = features[window_idx]
    y_seq = labels.astype(np.int32)
    return X_seq, y_seq

# Number of past rows fed to the model for each prediction.
lookback = 10

# Train and test are windowed separately, so no window spans the split date.
X_train_seq, y_train_seq = make_sequences(
    X_train_scaled, y_train_arr, lookback
)
X_test_seq, y_test_seq = make_sequences(
    X_test_scaled, y_test_arr, lookback
)

# Dates of the labelled test rows: the first `lookback` test rows only serve
# as history for later windows and get no prediction of their own.
test_dates_seq = X_test_raw.index[lookback:]
first_test_date = test_dates_seq.min()
last_test_date = test_dates_seq.max()

print(f"Entrenamiento: {X_train_seq.shape}")
print(f"Prueba (Test): {X_test_seq.shape}")

print("X_train_seq:", X_train_seq.shape, "y_train_seq:", y_train_seq.shape)
print("X_test_seq :", X_test_seq.shape,  "y_test_seq :", y_test_seq.shape)

print("Fechas test (seq):", first_test_date, "->", last_test_date)

Entrenamiento: (2257, 10, 18)
Prueba (Test): (534, 10, 18)
X_train_seq: (2257, 10, 18) y_train_seq: (2257,)
X_test_seq : (534, 10, 18) y_test_seq : (534,)
Fechas test (seq): 2022-01-18 00:00:00 -> 2024-03-04 00:00:00


In [13]:


# Feature count is the last axis of the windowed training array.
n_features = X_train_seq.shape[-1]

# One LSTM layer that emits only its final state, followed by a small dense
# head with dropout and a sigmoid output for the binary label.
layer_stack = [
    layers.Input(shape=(lookback, n_features)),
    layers.LSTM(64, return_sequences=False),
    layers.Dropout(0.2),
    layers.Dense(32, activation="relu"),
    layers.Dropout(0.2),
    layers.Dense(1, activation="sigmoid"),
]
model = models.Sequential(layer_stack)

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
model.compile(
    optimizer=optimizer,
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

model.summary()


In [14]:

# Stop once validation loss has not improved for 10 epochs and roll back to
# the best weights seen.
early_stopping = EarlyStopping(
    monitor="val_loss", patience=10, restore_best_weights=True
)
# Halve the learning rate after 5 epochs without improvement, down to 1e-5.
lr_on_plateau = ReduceLROnPlateau(
    monitor="val_loss", factor=0.5, patience=5, min_lr=1e-5
)
callbacks = [early_stopping, lr_on_plateau]

# Hold out the last 20% of the training sequences for validation; the
# remaining leading part is what the model is fitted on.
val_size = int(len(X_train_seq) * 0.2)

X_train_seq2 = X_train_seq[:-val_size]
y_train_seq2 = y_train_seq[:-val_size]

X_val_seq = X_train_seq[-val_size:]
y_val_seq = y_train_seq[-val_size:]

history = model.fit(
    X_train_seq2,
    y_train_seq2,
    validation_data=(X_val_seq, y_val_seq),
    epochs=100,
    batch_size=32,
    callbacks=callbacks,
    verbose=1,
)

Epoch 1/100


[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.6024 - loss: 0.6721 - val_accuracy: 0.6497 - val_loss: 0.6479 - learning_rate: 0.0010
Epoch 2/100
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.6052 - loss: 0.6579 - val_accuracy: 0.6497 - val_loss: 0.6452 - learning_rate: 0.0010
Epoch 3/100
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6141 - loss: 0.6481 - val_accuracy: 0.6519 - val_loss: 0.6459 - learning_rate: 0.0010
Epoch 4/100
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6290 - loss: 0.6396 - val_accuracy: 0.6541 - val_loss: 0.6489 - learning_rate: 0.0010
Epoch 5/100
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6279 - loss: 0.6296 - val_accuracy: 0.6430 - val_loss: 0.6639 - learning_rate: 0.0010
Epoch 6/100
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step 

In [15]:
# Predicted probability of the positive class for every test sequence.
y_proba = model.predict(X_test_seq, verbose=0).ravel()

# Probabilities at or above the cut-off are labelled 1, the rest 0.
threshold = 0.52
y_pred = (y_proba >= threshold).astype(int)

# Run metadata first, then the test-set scores. ROC AUC is computed from the
# probabilities; every other score uses the thresholded predictions.
run_info = {
    "Modelo": NOMBRE,
    "tipo:modelo": TIPO_MODELO,
    "horizonte": HORIZONTE,
    "usa_sentimiento": USA_SENTIMIENTO,
}
scores = {
    "Acc": accuracy_score(y_test_seq, y_pred),
    "B_Acc": balanced_accuracy_score(y_test_seq, y_pred),
    "F1": f1_score(y_test_seq, y_pred),
    "ROC": roc_auc_score(y_test_seq, y_proba),
    "Conf_Matrix": confusion_matrix(y_test_seq, y_pred),
}
metrics = {**run_info, **scores}

# Single-row results table; the confusion matrix stays in `df_res` but is
# left out of the displayed view.
df_res = pd.DataFrame([metrics])

df_res.drop(columns="Conf_Matrix")

Unnamed: 0,Modelo,tipo:modelo,horizonte,usa_sentimiento,Acc,B_Acc,F1,ROC
0,LSTM,LSTM,5d,1,0.565543,0.5005,0.721823,0.576501
