### Notebook *NB02a – Modelos de Machine Learning sin sentimiento (horizonte 1 día)*  
**Autor:** Jesús Daniel Romeral Cortina

**Objetivo:**
Entrenar y evaluar distintos modelos de machine learning utilizando exclusivamente variables financieras del S&P 500, sin incorporar información de sentimiento, con el fin de establecer un baseline de referencia para la predicción direccional a 1 día.


In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler 
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier 
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, confusion_matrix, roc_auc_score

In [None]:
# Run metadata: evaluar_modelo copies these three values into every result row
# so the saved metrics identify which experiment produced them.
TIPO_MODELO = "ML"
HORIZONTE = "1d"
USA_SENTIMIENTO = 0  # 0: this notebook does not use sentiment features

In [None]:
# MODEL_PATH: CSV dataset loaded with pd.read_csv below.
# OUT_PATH: CSV file the final metrics table is written to.
MODEL_PATH   = "../../datos/sp500_model.csv"
OUT_PATH = "../../resultados/resultados_ml_1d.csv"

In [4]:
# Load the modelling dataset with "Date" parsed as datetime, order the rows
# chronologically and use the date as the index.
df = (
    pd.read_csv(MODEL_PATH, parse_dates=["Date"])
    .sort_values("Date")
    .set_index("Date")
)

In [5]:
# Preview the first rows to check the columns and the Date index.
df.head()

Unnamed: 0_level_0,Close,High,Low,Open,Volume,Return,Target_1d,Return_5d_forward,Target_5d,ret_lag_1,ret_lag_2,ret_lag_3,ret_lag_4,ret_lag_5,ret_ma_5,ret_std_5,ret_ma_10,ret_std_10
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2013-01-02,1462.420044,1462.430054,1426.189941,1426.189941,4202600000,0.025403,0,-0.000957,0,0.016942,-0.01105,-0.001218,-0.004787,-0.00244,0.005058,0.015419,0.002286,0.012203
2013-01-03,1459.369995,1465.469971,1455.530029,1462.420044,3829730000,-0.002086,1,0.008737,1,0.025403,0.016942,-0.01105,-0.001218,-0.004787,0.005598,0.01503,0.000928,0.011814
2013-01-04,1466.469971,1467.939941,1458.98999,1459.369995,3424290000,0.004865,0,0.003805,1,-0.002086,0.025403,0.016942,-0.01105,-0.001218,0.006815,0.01458,0.002174,0.011468
2013-01-07,1461.890015,1466.469971,1456.619995,1466.469971,3304970000,-0.003123,0,0.006013,1,0.004865,-0.002086,0.025403,0.016942,-0.01105,0.0084,0.012423,0.001313,0.011515
2013-01-08,1457.150024,1461.890015,1451.640015,1461.890015,3601600000,-0.003242,1,0.010424,1,-0.003123,0.004865,-0.002086,0.025403,0.016942,0.004363,0.012231,0.001926,0.011035


In [None]:
# Label: the Target_1d column.
Y = df["Target_1d"]

# Features: everything except both targets, the 5-day forward return and the
# raw OHLCV columns. What remains is Return, its lags and the rolling
# mean/std columns.
columnas_excluidas = [
    "Target_1d",
    "Target_5d",
    "Return_5d_forward",
    "Close",
    "High",
    "Low",
    "Open",
    "Volume",
]
X = df.drop(columns=columnas_excluidas)


In [7]:

# Chronological hold-out: rows dated before 2022-01-01 form the training set,
# all remaining rows form the test set.
train_mask = df.index < "2022-01-01"
test_mask = ~train_mask

X_train_raw, y_train = X.loc[train_mask], Y.loc[train_mask]
X_test_raw, y_test = X.loc[test_mask], Y.loc[test_mask]

# Standardize using statistics learned from the training rows only, then apply
# the same transformation to the test rows.
scaler = StandardScaler().fit(X_train_raw)
X_train_scaled = scaler.transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

In [None]:

def evaluar_modelo(y_test, y_pred, y_proba, nombre):
    """Build one results row with the evaluation metrics of a fitted model.

    Args:
        y_test: True labels of the evaluation set.
        y_pred: Predicted labels for the same rows.
        y_proba: Scores passed to ``roc_auc_score`` (the callers pass the
            second column of ``predict_proba``).
        nombre: Model name stored under the "Modelo" key.

    Returns:
        dict with the model name, the run metadata taken from the notebook
        constants (TIPO_MODELO, HORIZONTE, USA_SENTIMIENTO) and the metrics
        "Acc", "B_Acc", "F1", "ROC" and "Conf_Matrix".
    """
    # Identification / run-metadata part of the row.
    # NOTE(review): "tipo:modelo" uses a colon while the other keys use
    # underscores -- possibly a typo; kept as-is because it becomes the CSV
    # column name.
    fila = {
        "Modelo": nombre,
        "tipo:modelo": TIPO_MODELO,
        "horizonte": HORIZONTE,
        "usa_sentimiento": USA_SENTIMIENTO,
    }

    # (key, metric function, estimate it is computed from), in output order.
    metricas = (
        ("Acc", accuracy_score, y_pred),
        ("B_Acc", balanced_accuracy_score, y_pred),
        ("F1", f1_score, y_pred),
        ("ROC", roc_auc_score, y_proba),
        ("Conf_Matrix", confusion_matrix, y_pred),
    )
    for clave, funcion, estimacion in metricas:
        fila[clave] = funcion(y_test, estimacion)
    return fila


# Accumulates one metrics dict per evaluated model; the model cells append to it.
resultados = []

In [9]:
# 1. Dummy Classifier (Baseline)
# Reference baseline using the "most_frequent" strategy, fitted on the
# unscaled features.
dummy = DummyClassifier(strategy="most_frequent")
dummy.fit(X_train_raw, y_train)

y_proba = dummy.predict_proba(X_test_raw)[:, 1]  # second predict_proba column
y_pred = dummy.predict(X_test_raw)

resultados.append(
    evaluar_modelo(y_test, y_pred, y_proba, nombre="Dummy_MostFreq")
)

In [10]:
# 2. Logistic Regression
# Trained and evaluated on the standardized features.
logreg = LogisticRegression(
    C=0.1,
    solver="liblinear",
    class_weight="balanced",
    max_iter=1000,
)
logreg.fit(X_train_scaled, y_train)

y_proba = logreg.predict_proba(X_test_scaled)[:, 1]  # second predict_proba column
y_pred = logreg.predict(X_test_scaled)

resultados.append(
    evaluar_modelo(y_test, y_pred, y_proba, nombre="Logistic_Reg")
)


In [11]:
# 3. Random Forest
# Trained on the unscaled features; random_state is fixed for reproducibility.
rf_params = {
    "n_estimators": 300,
    "max_depth": 5,
    "min_samples_leaf": 100,
    "max_features": "sqrt",
    "n_jobs": -1,
    "random_state": 42,
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train_raw, y_train)

y_proba = rf.predict_proba(X_test_raw)[:, 1]  # second predict_proba column
y_pred = rf.predict(X_test_raw)

resultados.append(
    evaluar_modelo(y_test, y_pred, y_proba, nombre="Random_Forest")
)

In [12]:
# 4. HistGradientBoosting (Scikit-Learn native)
# Trained on the unscaled features; random_state is fixed for reproducibility.
hgb = HistGradientBoostingClassifier(
    learning_rate=0.01,
    max_iter=200,
    max_depth=3,
    random_state=42,
)
hgb.fit(X_train_raw, y_train)

y_proba = hgb.predict_proba(X_test_raw)[:, 1]  # second predict_proba column
y_pred = hgb.predict(X_test_raw)

resultados.append(
    evaluar_modelo(y_test, y_pred, y_proba, nombre="Hist_GB")
)

In [13]:
# Persist the metrics table.
# The output directory is derived from OUT_PATH (instead of being hard-coded)
# so that changing OUT_PATH cannot leave to_csv pointing at a missing folder.
out_dir = os.path.dirname(OUT_PATH)
if out_dir:  # os.makedirs("") raises, so skip when OUT_PATH has no directory part
    os.makedirs(out_dir, exist_ok=True)

df_res = pd.DataFrame(resultados)

# The confusion matrices are arrays and are left out of the flat CSV and of the
# printed summary; they remain available in df_res["Conf_Matrix"].
df_metrics = df_res.drop(columns="Conf_Matrix")
df_metrics.to_csv(OUT_PATH, index=False)

print("Resultados guardados en:", OUT_PATH)
print(df_metrics)


Resultados guardados en: ../../resultados/resultados_ml_1d.csv
           Modelo tipo:modelo horizonte  usa_sentimiento       Acc     B_Acc  \
0  Dummy_MostFreq          ML        1d                0  0.494485  0.500000   
1    Logistic_Reg          ML        1d                0  0.503676  0.503981   
2   Random_Forest          ML        1d                0  0.492647  0.497979   
3         Hist_GB          ML        1d                0  0.500000  0.504481   

         F1       ROC  
0  0.661747  0.500000  
1  0.514388  0.493180  
2  0.656716  0.493897  
3  0.643045  0.518790  
