### Notebook *NB05a – Modelos de Machine Learning con sentimiento FULL (horizonte 1 día)*  
**Autor:** Jesús Daniel Romeral Cortina

**Objetivo:**  
Entrenar y evaluar distintos modelos de machine learning que combinan variables financieras del S&P 500 con variables de sentimiento extraídas de noticias financieras, con el fin de analizar si incorporar información de sentimiento mejora la predicción direccional a 1 día respecto al baseline sin sentimiento.

In [1]:
import json
import random

import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Tuned hyper-parameters for the 1-day horizon, stored as JSON.
# The model cells below index this mapping by model name
# ("LogisticRegression", "RandomForest", "HistGradientBoosting").
with open("../../resultados/best_params_ml_1d.json", mode="r") as params_file:
    BEST_PARAMS_1D = json.loads(params_file.read())


In [3]:
# Run metadata: evaluar_modelo copies these three values into every results row.
TIPO_MODELO = "ML"  # written to the "tipo:modelo" column
HORIZONTE = "1d"  # written to the "horizonte" column
USA_SENTIMIENTO = 1  # written to the "usa_sentimiento" column (1 in this with-sentiment notebook)

In [None]:
# Global seed for the Python and NumPy random generators.
# The previous version also called tf.random.set_seed, but TensorFlow is never
# imported (nor used) in this notebook, so the cell raised NameError on a fresh
# kernel. Only the generators that are actually available are seeded here.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [4]:
# Input CSV read by pd.read_csv in the next cell (despite the name, this is the dataset path, not a model file).
MODEL_PATH   = "../../datos/sp500_sent_FULL.csv" 
# Destination CSV for the per-model metrics table written at the end of the notebook.
OUT_PATH = "../../resultados/resultados_ml_1d_SENT_FULL.csv"


In [5]:
# Read the dataset with "Date" parsed as datetimes, order the rows
# chronologically and use the date as the index (the train/test split
# below compares the index against a cut-off date).
df = (
    pd.read_csv(MODEL_PATH, parse_dates=["Date"])
    .sort_values("Date")
    .set_index("Date")
)

In [6]:
# Columns left out of the feature matrix: both target columns, the forward
# return, the raw price/volume columns and the two raw sentiment columns.
# NOTE(review): "sentiment_mean" and "n_news" are dropped even though this is
# the with-sentiment notebook; presumably other sentiment-derived columns
# remain in the frame. Confirm against the dataset schema.
COLUMNAS_EXCLUIDAS = [
    "Target_1d",
    "Target_5d",
    "Return_5d_forward",
    "Close",
    "High",
    "Low",
    "Open",
    "Volume",
    "sentiment_mean",
    "n_news",
]

# Label for the 1-day horizon and the remaining columns as features.
Y = df["Target_1d"]
X = df.drop(columns=COLUMNAS_EXCLUIDAS)


In [7]:

# Chronological hold-out: rows dated before 2022-01-01 are used for training,
# every later row for testing.
train_mask = df.index < "2022-01-01"

X_train_raw = X.loc[train_mask]
X_test_raw = X.loc[~train_mask]
y_train = Y.loc[train_mask]
y_test = Y.loc[~train_mask]

# The scaler is fitted on the training rows only and then applied to both
# partitions, so no test-set statistics enter the transformation.
scaler = StandardScaler()
scaler.fit(X_train_raw)
X_train_scaled = scaler.transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

In [8]:

def evaluar_modelo(y_test, y_pred, y_proba, nombre):
    """Build one results row with the evaluation metrics of a fitted model.

    Args:
        y_test: True labels of the evaluation set.
        y_pred: Predicted class labels for the same rows.
        y_proba: Scores passed to ``roc_auc_score`` (callers pass the second
            column of ``predict_proba``).
        nombre: Label stored under the "Modelo" key.

    Returns:
        dict with the model label, the run metadata read from the module-level
        constants TIPO_MODELO, HORIZONTE and USA_SENTIMIENTO, the accuracy,
        balanced accuracy, F1 and ROC AUC scores, and the confusion matrix.
    """
    # Identification fields go first so they lead the results table.
    fila = {
        "Modelo": nombre,
        "tipo:modelo": TIPO_MODELO,
        "horizonte": HORIZONTE,
        "usa_sentimiento": USA_SENTIMIENTO,
    }

    # Metrics computed from the hard class predictions.
    metricas_de_clase = (
        ("Acc", accuracy_score),
        ("B_Acc", balanced_accuracy_score),
        ("F1", f1_score),
    )
    for clave, metrica in metricas_de_clase:
        fila[clave] = metrica(y_test, y_pred)

    # ROC AUC is the only metric that uses the scores instead of the labels.
    fila["ROC"] = roc_auc_score(y_test, y_proba)
    fila["Conf_Matrix"] = confusion_matrix(y_test, y_pred)
    return fila


# Accumulator: each model cell appends its metrics row here.
resultados = []

In [9]:
# 1. Dummy Classifier (Baseline)
# strategy="most_frequent" gives a reference that ignores the features;
# the other models are compared against this row.
dummy = DummyClassifier(strategy="most_frequent").fit(X_train_raw, y_train)

y_proba = dummy.predict_proba(X_test_raw)[:, 1]
y_pred = dummy.predict(X_test_raw)

fila_dummy = evaluar_modelo(y_test, y_pred, y_proba, "Dummy_MostFreq")
resultados.append(fila_dummy)

In [10]:
# 2. Logistic Regression
# The fixed settings act as defaults and the tuned values loaded from the JSON
# take precedence. Merging them into one dict means a key present in both no
# longer raises "TypeError: got multiple values for keyword argument".
logreg_params = {
    "max_iter": 1000,
    "class_weight": "balanced",
    # Seeded like the Random Forest and HistGradientBoosting cells so repeated
    # runs give the same metrics; scikit-learn only uses it for the solvers
    # that shuffle the data (sag, saga, liblinear).
    "random_state": 42,
    **BEST_PARAMS_1D["LogisticRegression"],
}
logreg = LogisticRegression(**logreg_params)

# This is the only model trained on the standardized features.
logreg.fit(X_train_scaled, y_train)
y_pred = logreg.predict(X_test_scaled)
# Second predict_proba column = score of the second class in logreg.classes_
# (assumes a binary 0/1 target — TODO confirm).
y_proba = logreg.predict_proba(X_test_scaled)[:, 1]
resultados.append(evaluar_modelo(y_test, y_pred, y_proba, "Logistic_Reg"))


In [11]:
# 3. Random Forest
# Fixed settings (feature subsampling, seed, all CPU cores) plus the tuned
# hyper-parameters stored under the "RandomForest" key.
rf_tuned = BEST_PARAMS_1D["RandomForest"]
rf = RandomForestClassifier(
    max_features='sqrt',
    random_state=42,
    n_jobs=-1,
    **rf_tuned
).fit(X_train_raw, y_train)  # trained on the unscaled features

y_proba = rf.predict_proba(X_test_raw)[:, 1]
y_pred = rf.predict(X_test_raw)

fila_rf = evaluar_modelo(y_test, y_pred, y_proba, "Random_Forest")
resultados.append(fila_rf)

In [12]:
# 4. HistGradientBoosting (Scikit-Learn native)
# Seeded estimator configured with the tuned hyper-parameters stored under the
# "HistGradientBoosting" key; trained on the unscaled features.
hgb_tuned = BEST_PARAMS_1D["HistGradientBoosting"]
hgb = HistGradientBoostingClassifier(random_state=42, **hgb_tuned)
hgb = hgb.fit(X_train_raw, y_train)

y_proba = hgb.predict_proba(X_test_raw)[:, 1]
y_pred = hgb.predict(X_test_raw)

fila_hgb = evaluar_modelo(y_test, y_pred, y_proba, "Hist_GB")
resultados.append(fila_hgb)



In [13]:
# One row per evaluated model.
df_res = pd.DataFrame(resultados)

# The confusion matrices are array-valued, so they are left out of both the
# CSV and the printed table; df_res itself still holds them.
tabla_metricas = df_res.drop(columns="Conf_Matrix")
tabla_metricas.to_csv(OUT_PATH, index=False)

print("Resultados guardados en:", OUT_PATH)
print(tabla_metricas)

Resultados guardados en: ../../resultados/resultados_ml_1d_SENT_FULL.csv
           Modelo tipo:modelo horizonte  usa_sentimiento       Acc     B_Acc  \
0  Dummy_MostFreq          ML        1d                1  0.494485  0.500000   
1    Logistic_Reg          ML        1d                1  0.496324  0.497276   
2   Random_Forest          ML        1d                1  0.492647  0.497655   
3         Hist_GB          ML        1d                1  0.487132  0.490497   

         F1       ROC  
0  0.661747  0.500000  
1  0.534014  0.502494  
2  0.649746  0.498844  
3  0.605375  0.527557  
