Modelo de prueba (experimento): metemos FULL SENTIMIENTO con los modelos ML, horizonte 5d.

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler 
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier 
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, confusion_matrix, roc_auc_score

# --- Experiment configuration ------------------------------------------------
MODEL_PATH = "../../datos/sp500_sent_model.csv"
# Not referenced in the visible cells; presumably experiment metadata used
# when results are logged or exported -- TODO confirm.
TIPO_MODELO = "ML"
HORIZONTE = "5d"
USA_SENTIMIENTO = 1

# --- Load the modelling table, sorted and indexed by date --------------------
df = (
    pd.read_csv(MODEL_PATH, parse_dates=["Date"])
    .sort_values("Date")
    .set_index("Date")
)

# --- Target and feature matrix -----------------------------------------------
# The label for this experiment is the 5-day target column.
Y = df["Target_5d"]

# Every column of df that is NOT listed below is used as a feature.
X = df.drop(
    columns=[
        # The label itself, plus columns that (judging by their names) are
        # other targets / forward-looking returns -- verify nothing else in
        # the CSV looks ahead.
        "Target_1d",
        "Target_5d",
        "Return_5d_forward",
        # Presumably raw OHLCV levels -- TODO confirm.
        "Close",
        "High",
        "Low",
        "Open",
        "Volume",
        # NOTE(review): these two sentiment columns are excluded even though
        # USA_SENTIMIENTO = 1; presumably derived sentiment features remain
        # in X -- confirm against the CSV schema.
        "sentiment_mean",
        "n_news",
    ]
)

In [2]:

# Chronological hold-out: rows dated before SPLIT_DATE train the models,
# everything from SPLIT_DATE onwards is the test set.
# NOTE(review): if Target_5d is built from prices up to 5 days ahead, the last
# training rows use information from the test period -- confirm whether a gap
# (purge) around the cut-off is needed.
SPLIT_DATE = "2022-01-01"
train_mask = df.index < SPLIT_DATE

X_train_raw = X.loc[train_mask]
X_test_raw = X.loc[~train_mask]
y_train = Y.loc[train_mask]
y_test = Y.loc[~train_mask]

# Standardisation statistics are learned on the training rows only and then
# reused unchanged on the test rows.
scaler = StandardScaler().fit(X_train_raw)
X_train_scaled = scaler.transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

In [3]:

def evaluar_modelo(y_real, y_pred, y_proba, nombre):
    """Score one model's predictions and return the metrics as a dict.

    Args:
        y_real: Ground-truth labels.
        y_pred: Hard class predictions; used for accuracy, balanced accuracy,
            F1 and the confusion matrix.
        y_proba: Scores handed to ``roc_auc_score`` (the visible callers pass
            column 1 of ``predict_proba``).
        nombre: Display name stored under the ``"Modelo"`` key.

    Returns:
        dict with the keys ``Modelo``, ``Acc``, ``B_Acc``, ``F1``, ``ROC`` and
        ``Conf_Matrix``, in that order.
    """
    return dict(
        Modelo=nombre,
        Acc=accuracy_score(y_real, y_pred),
        B_Acc=balanced_accuracy_score(y_real, y_pred),
        F1=f1_score(y_real, y_pred),
        ROC=roc_auc_score(y_real, y_proba),
        Conf_Matrix=confusion_matrix(y_real, y_pred),
    )


# Accumulates one metrics dict per model. The model cells append to this list,
# so re-running one of them adds a duplicate row unless this cell is re-run
# first to reset it.
resultados = []

In [4]:
# 1. Baseline -- DummyClassifier
# Ignores the features and always predicts the most frequent training class;
# any real model should beat these numbers.
dummy = DummyClassifier(strategy="most_frequent").fit(X_train_raw, y_train)

y_pred = dummy.predict(X_test_raw)
# Column 1 of predict_proba is the second entry of dummy.classes_
# (presumably the positive class of a binary 0/1 target -- TODO confirm).
y_proba = dummy.predict_proba(X_test_raw)[:, 1]

resultados.append(
    evaluar_modelo(y_test, y_pred, y_proba, nombre="Dummy_MostFreq")
)

In [5]:
# 2. Logistic regression
# Trained and evaluated on the standardised features (X_*_scaled).
logreg = LogisticRegression(
    C=0.1,
    solver="liblinear",
    class_weight="balanced",
    max_iter=1000,
).fit(X_train_scaled, y_train)

y_pred = logreg.predict(X_test_scaled)
# Column 1 of predict_proba is the second entry of logreg.classes_
# (presumably the positive class of a binary 0/1 target -- TODO confirm).
y_proba = logreg.predict_proba(X_test_scaled)[:, 1]

resultados.append(
    evaluar_modelo(y_test, y_pred, y_proba, nombre="Logistic_Reg")
)


In [6]:
# 3. Random forest
# Trained and evaluated on the unscaled features (X_*_raw).
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=5,
    min_samples_leaf=100,
    max_features="sqrt",
    n_jobs=-1,
    random_state=42,  # fixed seed so the run is repeatable
).fit(X_train_raw, y_train)

y_pred = rf.predict(X_test_raw)
# Column 1 of predict_proba is the second entry of rf.classes_
# (presumably the positive class of a binary 0/1 target -- TODO confirm).
y_proba = rf.predict_proba(X_test_raw)[:, 1]

resultados.append(
    evaluar_modelo(y_test, y_pred, y_proba, nombre="Random_Forest")
)

In [7]:
# 4. HistGradientBoosting (scikit-learn's native gradient boosting)
# Trained and evaluated on the unscaled features (X_*_raw).
hgb = HistGradientBoostingClassifier(
    learning_rate=0.01,
    max_iter=200,
    max_depth=3,
    random_state=42,  # fixed seed so the run is repeatable
).fit(X_train_raw, y_train)

y_pred = hgb.predict(X_test_raw)
# Column 1 of predict_proba is the second entry of hgb.classes_
# (presumably the positive class of a binary 0/1 target -- TODO confirm).
y_proba = hgb.predict_proba(X_test_raw)[:, 1]

resultados.append(
    evaluar_modelo(y_test, y_pred, y_proba, nombre="Hist_GB")
)



In [8]:
# One row per evaluated model, in the order the model cells were run.
df_res = pd.DataFrame(resultados)

# Display the scalar metrics only; the confusion matrices stay available in
# df_res["Conf_Matrix"] because drop() returns a new frame.
df_res.drop("Conf_Matrix", axis="columns")


Unnamed: 0,Modelo,Acc,B_Acc,F1,ROC
0,Dummy_MostFreq,0.556985,0.5,0.715466,0.5
1,Logistic_Reg,0.571691,0.531456,0.697009,0.565562
2,Random_Forest,0.553309,0.4967,0.712426,0.445449
3,Hist_GB,0.549632,0.493824,0.70868,0.472741
