In [3]:
from pathlib import Path
import pandas as pd
import numpy as np

# Locate the repository root: the first directory, starting at the current
# working directory and walking up at most six parents, that contains both
# "data" and "src". Fail loudly instead of carrying on with an unchecked path.
_cwd = Path.cwd()
ROOT = next(
    (p for p in [_cwd, *_cwd.parents][:7] if (p/"data").exists() and (p/"src").exists()),
    None,
)
if ROOT is None:
    raise FileNotFoundError(
        f"No directory containing both 'data' and 'src' found within six levels above {_cwd}"
    )

# Load the base tables: the bar table and the labeled events table.
df = pd.read_parquet(ROOT/"data/interim/ES_5m_2021_2024.parquet")
ev = pd.read_parquet(ROOT/"data/processed/events/events_labeled_2021_2024.parquet")

# -- Ensure the session columns exist in df
if "session_id" not in df.columns:
    # running count of the NewSession flag gives one id per session
    df["session_id"] = df["NewSession"].cumsum()
if "idx_in_session" not in df.columns:
    # 0-based position of each row inside its session
    df["idx_in_session"] = df.groupby("session_id").cumcount()

# -- Build session_tag from idx_in_session. Both windows are inclusive on both
#    ends; every row outside them keeps the default "ASIA" tag.
EU_BARS  = (108, 185)
USA_BARS = (186, 275)
idx2tag = pd.Series("ASIA", index=df.index)
idx2tag.loc[df["idx_in_session"].between(*EU_BARS)] = "EU"
idx2tag.loc[df["idx_in_session"].between(*USA_BARS)] = "USA"

# Attach session_tag to the events table by looking each event's idx up in df's index
ev["session_tag"] = ev["idx"].map(idx2tag)

# Events whose idx is absent from df's index map to NaN and would otherwise
# disappear silently from the EU+USA filter below, so report them.
n_unmapped = int(ev["session_tag"].isna().sum())
if n_unmapped:
    print(f"WARNING: {n_unmapped} events have an idx missing from df.index (session_tag is NaN)")

# (optional) persist a new version of the events table that includes session_tag
out_ev = ROOT/"data/processed/events/events_labeled_2021_2024.plus_session.parquet"
ev.to_parquet(out_ev, engine="pyarrow", compression="zstd", index=False)
print("Guardado:", out_ev)

# Keep only EU + USA events
ev_euusa = ev[ev["session_tag"].isin(["EU","USA"])].reset_index(drop=True)
print("Eventos EU+USA:", ev_euusa.shape, ev_euusa["session_tag"].value_counts().to_dict())



Guardado: c:\Users\jmbf2\OneDrive\Trading\Machine Learning\ZoneBasedPricePrediction\data\processed\events\events_labeled_2021_2024.plus_session.parquet
Eventos EU+USA: (8541, 13) {'USA': 8541}


In [5]:
# --- B) Entrenar + calibrar (auto-build si falta el parquet) ---
import numpy as np, pandas as pd, joblib, json
from pathlib import Path
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import average_precision_score, brier_score_loss
from sklearn.preprocessing import label_binarize
from importlib import reload

# Locate the repository root: the first directory, starting at the current
# working directory and walking up at most six parents, that contains both
# "data" and "src". Fail loudly instead of carrying on with an unchecked path.
_cwd = Path.cwd()
ROOT = next(
    (p for p in [_cwd, *_cwd.parents][:7] if (p/"data").exists() and (p/"src").exists()),
    None,
)
if ROOT is None:
    raise FileNotFoundError(
        f"No directory containing both 'data' and 'src' found within six levels above {_cwd}"
    )

# Input tables and the cached supervised dataset used by this cell.
DF_PATH = ROOT/"data/interim/ES_5m_2021_2024.parquet"
EV_PATH = ROOT/"data/processed/events/events_labeled_2021_2024.parquet"
PQ      = ROOT/"data/processed/features/supervised_EUUSA_EXT_k2p5.parquet"

# --- helper: asegurar ev_euusa (construye session_tag si no existe) ---
def load_ev_euusa():
    """Read the bar and event tables and keep only EU/USA-tagged events.

    The bar table is read from ``DF_PATH`` and the events from ``EV_PATH``.
    Missing ``session_id`` / ``idx_in_session`` columns are added to the bar
    table. When the events have no ``session_tag`` column, one is derived by
    looking each event's ``idx`` up in a per-row tag series built from
    ``idx_in_session`` (108-185 -> "EU", 186-275 -> "USA", both inclusive,
    everything else "ASIA").

    Returns:
        tuple: ``(bars, ev_euusa)`` - the (possibly augmented) bar table and
        the events whose ``session_tag`` is "EU" or "USA", re-indexed from 0.
    """
    bars = pd.read_parquet(DF_PATH)
    events = pd.read_parquet(EV_PATH)

    # session bookkeeping columns, created only when absent
    if "session_id" not in bars.columns:
        bars["session_id"] = bars["NewSession"].cumsum()
    if "idx_in_session" not in bars.columns:
        bars["idx_in_session"] = bars.groupby("session_id").cumcount()

    # row label -> session tag, decided by the position inside the session
    pos = bars["idx_in_session"]
    tag_by_row = (
        pd.Series("ASIA", index=bars.index)
        .mask(pos.between(108,185), "EU")
        .mask(pos.between(186,275), "USA")
    )

    if "session_tag" not in events.columns:
        events["session_tag"] = events["idx"].map(tag_by_row)

    keep = events["session_tag"].isin(["EU","USA"])
    ev_euusa = events[keep].reset_index(drop=True)
    return bars, ev_euusa

# --- helper: construir dataset EXT k=2.5 si falta ---
def build_ext_k25():
    """Build the EXT k=2.5 supervised dataset and write it to ``PQ``.

    Features and targets come from ``make_supervised_from_events`` applied to
    the output of ``load_ev_euusa()``. Numeric feature columns are NaN-filled
    with 0.0 and cast to float32, the target is appended as a ``target``
    column, and the frame is saved as a zstd-compressed parquet (the parent
    directory is created if needed). Returns nothing.
    """
    from ppz.pipelines.build_dataset import make_supervised_from_events

    bars, events = load_ev_euusa()

    builder_options = dict(
        tick_size=0.25, n_short=20, n_long=60,
        L_prev_touches=60, r_touch_ticks=6, drop_none=False,
        add_mvc=True, add_orderflow=True, of_k=2.5, of_k_ext=4.5,
    )
    X_ext, y_ext = make_supervised_from_events(bars, events, **builder_options)

    # numeric features: no NaNs, compact float32 storage
    numeric_cols = X_ext.select_dtypes(include="number").columns
    X_ext[numeric_cols] = X_ext[numeric_cols].fillna(0.0).astype("float32")

    # features + target in one frame (target attached positionally)
    dataset = X_ext.copy()
    dataset["target"] = y_ext.values

    PQ.parent.mkdir(parents=True, exist_ok=True)
    dataset.to_parquet(PQ, engine="pyarrow", compression="zstd", index=False)
    print("Construido y guardado:", PQ, "| shape:", dataset.shape)

# --- load the supervised parquet, building it first when it does not exist ---
if not PQ.exists():
    print("No existe:", PQ.name, "-> construyendo…")
    build_ext_k25()

# --- training + calibration inputs ---
D  = pd.read_parquet(PQ)
df = pd.read_parquet(DF_PATH)

# Chronological split: each dataset row is dated by the bar time found at its
# `idx` label in df. TR: before 2024-01-01, VA: 2024-01-01 up to (not
# including) 2024-07-01, TE: from 2024-07-01 on.
t   = df["Time"].dt.tz_convert("Europe/Madrid")
idx = D["idx"].astype(int)
event_time = t.loc[idx]
TR = (event_time < "2024-01-01").values
VA = (event_time >= "2024-01-01").values & (event_time < "2024-07-01").values
TE = (event_time >= "2024-07-01").values

# Feature matrix: everything except target / idx / zone_type; numeric columns
# are NaN-filled with 0.0 and stored as float32.
X = D.drop(columns=["target","idx","zone_type"], errors="ignore").copy()
num = X.select_dtypes(include="number").columns
X[num] = X[num].fillna(0.0).astype("float32")

# String labels and the sorted list of distinct classes (taken over all rows).
y = D["target"].astype(str).values
classes = np.unique(y).tolist()
classes.sort()

# Gradient-boosting classifier fitted on the TRAIN rows only.
hgb_params = dict(
    learning_rate=0.06, max_leaf_nodes=25, min_samples_leaf=160, l2_regularization=0.25,
    random_state=42,
)
hgb = HistGradientBoostingClassifier(**hgb_params)
hgb.fit(X[TR].to_numpy(), y[TR])

# One-vs-rest isotonic calibration fitted on the VALID split.
# The model is scored once; its output columns follow hgb.classes_, so each
# class in `classes` is looked up there and the columns are re-ordered to match.
_raw_va = hgb.predict_proba(X[VA].to_numpy())
_model_classes = list(hgb.classes_)
P_va = np.column_stack([_raw_va[:, _model_classes.index(c)] for c in classes])
Y_va_bin = label_binarize(y[VA], classes=classes)
isos, P_va_cal = {}, np.zeros_like(P_va)
for k,c in enumerate(classes):
    # one monotone map per class: raw probability -> observed frequency;
    # inputs outside the fitted range are clipped to it at transform time
    ir = IsotonicRegression(out_of_bounds="clip").fit(P_va[:,k], Y_va_bin[:,k])
    isos[c] = ir; P_va_cal[:,k] = ir.transform(P_va[:,k])

def predict_proba_cal(Xm):
    """Return calibrated, row-normalized class probabilities for ``Xm``.

    Args:
        Xm: feature matrix accepted by ``hgb.predict_proba``.

    Returns:
        Array with one column per entry of ``classes`` (in that order). Each
        column is the model probability passed through that class's isotonic
        map from ``isos``; rows are then divided by their sum. A row whose
        calibrated scores are all zero is left as all zeros.
    """
    # Score once; the model's columns follow hgb.classes_, so map by label.
    raw = hgb.predict_proba(Xm)
    model_classes = list(hgb.classes_)
    P = np.column_stack([raw[:, model_classes.index(c)] for c in classes])
    Pcal = np.column_stack([isos[c].transform(P[:,k]) for k,c in enumerate(classes)])
    # avoid division by zero for rows with no calibrated mass
    s = Pcal.sum(axis=1, keepdims=True); s[s==0]=1.0
    return Pcal/s

# Evaluate the calibrated model on the TEST split.
P_te = predict_proba_cal(X[TE].to_numpy()); y_te = y[TE]
Y_te_bin = label_binarize(y_te, classes=classes)
ap_macro = average_precision_score(Y_te_bin, P_te, average="macro")
# mean of the per-class one-vs-rest Brier scores
brier    = np.mean([brier_score_loss((y_te==c).astype(int), P_te[:,i]) for i,c in enumerate(classes)])
ap_pc    = {c: average_precision_score((y_te==c).astype(int), P_te[:,i]) for i,c in enumerate(classes)}
print("AP_macro:", round(ap_macro,4), "| AP:", {k:round(v,4) for k,v in ap_pc.items()}, "| Brier:", round(brier,5))

# Persist the model bundle (classifier, class order, per-class calibrators)
# and a JSON report with the TEST metrics.
ART = ROOT/"models/artifacts"; ART.mkdir(parents=True, exist_ok=True)
art_path = ART/"HGB_EUUSA_EXTk2p5_calibrated_isotonic.joblib"
joblib.dump({"model":hgb,"classes":classes,"isos":isos}, art_path)
report = {
    "name":"HGB_EUUSA_EXTk2p5", "ap_macro_test": float(ap_macro),
    "ap_per_class_test": {k: float(v) for k,v in ap_pc.items()},
    "brier_test": float(brier), "n_features": int(X.shape[1]),
}
# context manager so the report file is flushed and closed deterministically
with open(ART/"HGB_EUUSA_EXTk2p5_report_test.json", "w", encoding="utf-8") as fh:
    json.dump(report, fh, indent=2)
print("Guardado artefactos en:", ART)


No existe: supervised_EUUSA_EXT_k2p5.parquet -> construyendo…


  (np.nanmax(buy_ratio)  >= k_ext) if np.isfinite(np.nanmax(buy_ratio))  else False
  (np.nanmax(sell_ratio) >= k_ext) if np.isfinite(np.nanmax(sell_ratio)) else False


Construido y guardado: c:\Users\jmbf2\OneDrive\Trading\Machine Learning\ZoneBasedPricePrediction\data\processed\features\supervised_EUUSA_EXT_k2p5.parquet | shape: (8541, 33)
AP_macro: 0.3599 | AP: {'breakout': 0.3299, 'none': 0.265, 'rebound': 0.4847} | Brier: 0.21406
Guardado artefactos en: c:\Users\jmbf2\OneDrive\Trading\Machine Learning\ZoneBasedPricePrediction\models\artifacts


In [7]:
# --- C) Re-tuning de umbrales por zona (VALIDACIÓN) ---
from pathlib import Path
import numpy as np, pandas as pd, json, joblib

# Locate the repository root: the first directory, starting at the current
# working directory and walking up at most six parents, that contains both
# "data" and "src". Fail loudly instead of carrying on with an unchecked path.
_cwd = Path.cwd()
ROOT = next(
    (p for p in [_cwd, *_cwd.parents][:7] if (p/"data").exists() and (p/"src").exists()),
    None,
)
if ROOT is None:
    raise FileNotFoundError(
        f"No directory containing both 'data' and 'src' found within six levels above {_cwd}"
    )

# Inputs (dataset, model bundle, bar and event tables) and the output JSON
# that will hold the per-zone thresholds.
PQ       = ROOT/"data/processed/features/supervised_EUUSA_EXT_k2p5.parquet"
ART      = ROOT/"models/artifacts/HGB_EUUSA_EXTk2p5_calibrated_isotonic.joblib"
DF_PATH  = ROOT/"data/interim/ES_5m_2021_2024.parquet"
EV_PATH  = ROOT/"data/processed/events/events_labeled_2021_2024.parquet"
OUTJSON  = ROOT/"models/artifacts/rebound_thresholds_by_zone_k2p5.json"

# load the supervised dataset and the bar table
D  = pd.read_parquet(PQ)
df = pd.read_parquet(DF_PATH)

# make sure zone_type is available (left-join from the events table on idx when missing)
if "zone_type" not in D.columns:
    ev = pd.read_parquet(EV_PATH)[["idx","zone_type"]]
    D = D.merge(ev, on="idx", how="left")

# VALID split by time: rows dated from 2024-01-01 up to (not including) 2024-07-01,
# where a row's date is the bar time found at its idx label in df
t   = df["Time"].dt.tz_convert("Europe/Madrid")
idx = D["idx"].astype(int).values
event_time = t.loc[idx]
VA_mask = ((event_time >= "2024-01-01") & (event_time < "2024-07-01")).to_numpy()

# features (numeric columns NaN-filled with 0.0 and cast to float32), restricted to VALID
features = D.drop(columns=["target","idx","zone_type"], errors="ignore").copy()
num_cols = features.select_dtypes(include=[np.number]).columns
features[num_cols] = features[num_cols].fillna(0.0).astype("float32")
Xva = features.iloc[VA_mask]

# labels and zone types of the VALID rows, as strings
yva = D["target"].astype(str).values[VA_mask]
zva = D["zone_type"].astype(str).values[VA_mask]

# load the calibrated model bundle: classifier, class order, per-class calibrators
art = joblib.load(ART)
model = art["model"]
classes = art["classes"]
isos = art["isos"]
# class label -> column position in the calibrated probability matrix
c2i = dict(zip(classes, range(len(classes))))

def proba_cal(X):
    """Return calibrated, row-normalized class probabilities for ``X``.

    Args:
        X: feature matrix accepted by ``model.predict_proba``.

    Returns:
        Array with one column per entry of ``classes`` (in that order). Each
        column is the model probability passed through that class's isotonic
        map from ``isos``; rows are then divided by their sum. A row whose
        calibrated scores are all zero is left as all zeros.

    Raises:
        ValueError: if a label in ``classes`` is not among ``model.classes_``.
    """
    # predict_proba columns follow model.classes_, so look each label up there
    # (as the training cell does) instead of assuming the stored `classes`
    # list shares that order. The model is scored once, not once per class.
    raw = model.predict_proba(X)
    model_classes = list(model.classes_)
    P = np.column_stack([raw[:, model_classes.index(c)] for c in classes])
    Pcal = np.column_stack([isos[c].transform(P[:,i]) for i,c in enumerate(classes)])
    # avoid division by zero for rows with no calibrated mass
    s = Pcal.sum(axis=1, keepdims=True); s[s==0]=1.0
    return Pcal/s

# Calibrated probabilities on the VALID rows, split into one vector per class.
Pva = proba_cal(Xva.to_numpy())
pr = Pva[:, c2i["rebound"]]
pb = Pva[:, c2i["breakout"]]
pn = Pva[:, c2i["none"]]

def f1_macro_bin(y_true, y_hat):
    """Macro-averaged F1 of the binary problem "rebound" vs. everything else.

    Args:
        y_true: array of true labels; any label other than "rebound" counts
            as "none".
        y_hat: array of predicted labels, "rebound" or "none".

    Returns:
        The mean of the F1 score for "rebound" and the F1 score for "none".
        A 1e-9 term in every denominator keeps empty classes from dividing
        by zero (their F1 then comes out as 0).
    """
    def _f1(tp, fp, fn):
        # F1 from raw counts, with epsilon-guarded denominators
        precision = tp/(tp+fp+1e-9)
        recall = tp/(tp+fn+1e-9)
        return 2*precision*recall/(precision+recall+1e-9)

    is_reb = np.asarray(y_true=="rebound")
    not_reb = ~is_reb
    said_reb = (y_hat=="rebound")
    said_none = (y_hat=="none")

    reb_hits   = np.sum(said_reb & is_reb)     # predicted rebound, was rebound
    reb_false  = np.sum(said_reb & not_reb)    # predicted rebound, was not
    reb_missed = np.sum(said_none & is_reb)    # predicted none, was rebound
    none_hits  = np.sum(said_none & not_reb)   # predicted none, was not rebound

    # a false "rebound" is a miss for "none", and a missed rebound is a false "none"
    f1_rebound = _f1(reb_hits, reb_false, reb_missed)
    f1_none = _f1(none_hits, reb_missed, reb_false)
    return 0.5*(f1_rebound+f1_none)

def search_zone(mask):
    """Grid-search the rebound decision thresholds on the rows picked by ``mask``.

    A row is predicted "rebound" when all four conditions hold:
    max(p_rebound, p_breakout) >= t_event, p_none <= t_none,
    p_rebound >= t_r and p_rebound - p_breakout >= m_rb; otherwise "none".
    Every combination of the fixed grids is scored with ``f1_macro_bin``
    against the true labels, and the first combination reaching the highest
    score is kept.

    Args:
        mask: boolean selector over the VALID rows (applied to pr, pb, pn, yva).

    Returns:
        tuple: ``(best_score, best_params)`` with ``best_params`` a dict of
        ``t_event``, ``t_none``, ``t_r`` and ``m_rb``.
    """
    pr_z, pb_z, pn_z = pr[mask], pb[mask], pn[mask]
    y_z = yva[mask]
    econf = np.maximum(pr_z, pb_z)   # confidence that "there is an event"
    margin = pr_z - pb_z             # rebound lead over breakout

    # same visiting order as four nested loops, outermost first
    grid = (
        (t_event, t_none, t_r, m_rb)
        for t_event in (0.55, 0.60, 0.65)
        for t_none in (0.50, 0.55, 0.60)
        for t_r in (0.575, 0.60, 0.625)
        for m_rb in (0.0, 0.05, 0.10)
    )

    best_score, best_params = -1.0, None
    for t_event, t_none, t_r, m_rb in grid:
        fires = (econf>=t_event) & (pn_z<=t_none) & (pr_z>=t_r) & (margin>=m_rb)
        y_hat = np.where(fires, "rebound", "none")
        score = f1_macro_bin(y_z, y_hat)
        if score > best_score:   # strict: ties keep the earlier combination
            best_score = score
            best_params = dict(t_event=t_event, t_none=t_none, t_r=t_r, m_rb=m_rb)
    return best_score, best_params

# Tune thresholds separately for every zone type present in VALID.
best_by_zone = {}
for z in pd.Series(zva).unique():
    m = (zva==z)
    # zones with fewer than 80 VALID rows are skipped, to avoid overfitting
    # thresholds on very few examples
    if m.sum() >= 80:
        score, params = search_zone(m)
        best_by_zone[z] = {"f1_macro_val": float(score), "params": params}

# Persist the per-zone thresholds as JSON.
OUTJSON.parent.mkdir(parents=True, exist_ok=True)
OUTJSON.write_text(json.dumps(best_by_zone, indent=2), encoding="utf-8")

print("Guardado:", OUTJSON)
best_by_zone



Guardado: c:\Users\jmbf2\OneDrive\Trading\Machine Learning\ZoneBasedPricePrediction\models\artifacts\rebound_thresholds_by_zone_k2p5.json


{'USA_IBH': {'f1_macro_val': 0.4813475993615316,
  'params': {'t_event': 0.55, 't_none': 0.5, 't_r': 0.575, 'm_rb': 0.0}},
 'USA_IBL': {'f1_macro_val': 0.36296039090058924,
  'params': {'t_event': 0.55, 't_none': 0.5, 't_r': 0.6, 'm_rb': 0.0}},
 'VWAP': {'f1_macro_val': 0.3197979795746071,
  'params': {'t_event': 0.55, 't_none': 0.5, 't_r': 0.575, 'm_rb': 0.0}},
 'PDL_prev': {'f1_macro_val': 0.3203124997772827,
  'params': {'t_event': 0.55, 't_none': 0.5, 't_r': 0.575, 'm_rb': 0.0}},
 'VAL_D1': {'f1_macro_val': 0.37179487155654173,
  'params': {'t_event': 0.55, 't_none': 0.5, 't_r': 0.575, 'm_rb': 0.0}},
 'POC_D1': {'f1_macro_val': 0.3676676262020822,
  'params': {'t_event': 0.55, 't_none': 0.5, 't_r': 0.575, 'm_rb': 0.0}},
 'VAH_D1': {'f1_macro_val': 0.380566801351401,
  'params': {'t_event': 0.55, 't_none': 0.5, 't_r': 0.575, 'm_rb': 0.0}},
 'PDH_prev': {'f1_macro_val': 0.4146126757054678,
  'params': {'t_event': 0.55, 't_none': 0.5, 't_r': 0.575, 'm_rb': 0.0}}}

In [8]:
import numpy as np, pandas as pd, json, joblib
from pathlib import Path
from sklearn.metrics import precision_score, recall_score

# --- robust paths ---
# Locate the repository root: the first directory, starting at the current
# working directory and walking up at most six parents, that contains both
# "data" and "src". Fail loudly instead of carrying on with an unchecked path.
_cwd = Path.cwd()
ROOT = next(
    (p for p in [_cwd, *_cwd.parents][:7] if (p/"data").exists() and (p/"src").exists()),
    None,
)
if ROOT is None:
    raise FileNotFoundError(
        f"No directory containing both 'data' and 'src' found within six levels above {_cwd}"
    )

# Inputs: dataset, bar table, model bundle, per-zone thresholds, events table.
PQ      = ROOT/"data/processed/features/supervised_EUUSA_EXT_k2p5.parquet"
DF_PATH = ROOT/"data/interim/ES_5m_2021_2024.parquet"
ART     = ROOT/"models/artifacts/HGB_EUUSA_EXTk2p5_calibrated_isotonic.joblib"
TH_PATH = ROOT/"models/artifacts/rebound_thresholds_by_zone_k2p5.json"
EV_PATH = ROOT/"data/processed/events/events_labeled_2021_2024.parquet"

# --- load data and model ---
D  = pd.read_parquet(PQ)
df = pd.read_parquet(DF_PATH)

# calibrated model bundle: classifier, class order, per-class calibrators
art = joblib.load(ART)
model = art["model"]
classes = art["classes"]
isos = art["isos"]
# class label -> column position in the calibrated probability matrix
c2i = dict(zip(classes, range(len(classes))))

# make sure zone_type is available (left-join from the events table on idx when missing)
if "zone_type" not in D.columns:
    ev = pd.read_parquet(EV_PATH)[["idx","zone_type"]]
    D = D.merge(ev, on="idx", how="left")

# TEST split (2024 H2): rows whose bar time, looked up by idx in df, is on or after 2024-07-01
t   = df["Time"].dt.tz_convert("Europe/Madrid")
idx = D["idx"].astype(int).values
event_time = t.loc[idx]
TE_mask = (event_time >= "2024-07-01").to_numpy()

# features (numeric columns NaN-filled with 0.0 and cast to float32), restricted to TEST
X = D.drop(columns=["target","idx","zone_type"], errors="ignore").copy()
num = X.select_dtypes(include=[np.number]).columns
X[num] = X[num].fillna(0.0).astype("float32")
Xte = X.iloc[TE_mask].to_numpy()

# labels and zone types of the TEST rows, as strings
y_true_full = D["target"].astype(str).values[TE_mask]
z_full      = D["zone_type"].astype(str).values[TE_mask]

# proba calibrada
def proba_cal(Xm):
    """Return calibrated, row-normalized class probabilities for ``Xm``.

    Args:
        Xm: feature matrix accepted by ``model.predict_proba``.

    Returns:
        Array with one column per entry of ``classes`` (in that order). Each
        column is the model probability passed through that class's isotonic
        map from ``isos``; rows are then divided by their sum. A row whose
        calibrated scores are all zero is left as all zeros.

    Raises:
        ValueError: if a label in ``classes`` is not among ``model.classes_``.
    """
    # predict_proba columns follow model.classes_, so look each label up there
    # instead of assuming the stored `classes` list shares that order.
    # The model is scored once, not once per class.
    raw = model.predict_proba(Xm)
    model_classes = list(model.classes_)
    P = np.column_stack([raw[:, model_classes.index(c)] for c in classes])
    Pcal = np.column_stack([isos[c].transform(P[:,i]) for i,c in enumerate(classes)])
    # avoid division by zero for rows with no calibrated mass
    s = Pcal.sum(axis=1, keepdims=True); s[s==0]=1.0
    return Pcal/s

# Calibrated probabilities on the TEST rows, one vector per class of interest.
Pte = proba_cal(Xte)
pr, pb, pn = Pte[:,c2i["rebound"]], Pte[:,c2i["breakout"]], Pte[:,c2i["none"]]
# confidence that "there is an event": the larger of rebound / breakout
event_conf = np.maximum(pr, pb)

# per-zone thresholds; read inside a context manager so the handle is closed,
# and as UTF-8 to match how the tuning cell writes the file
with open(TH_PATH, "r", encoding="utf-8") as fh:
    thr = json.load(fh)

# Per-zone precision/recall on TEST using the tuned thresholds.
REPORT_COLUMNS = [
    "zone_type", "support",
    "precision_rebound", "recall_rebound",
    "precision_none", "recall_none",
]
rows = []
for z in sorted(pd.Series(z_full).dropna().unique()):
    # only zones that have tuned thresholds ...
    if z not in thr:
        continue
    p = thr[z]["params"]
    m = (z_full == z)
    # ... and at least 30 TEST rows are reported
    if m.sum() < 30:
        continue

    # decisions (binary rebound/none): all four threshold conditions must hold
    y_hat = np.where(
        (event_conf[m] >= p["t_event"]) &
        (pn[m] <= p["t_none"]) &
        (pr[m] >= p["t_r"]) &
        ((pr[m] - pb[m]) >= p["m_rb"]),
        "rebound", "none"
    )
    y_bin_true = (y_true_full[m] == "rebound").astype(int)
    y_bin_hat  = (y_hat == "rebound").astype(int)

    prec_r = precision_score(y_bin_true, y_bin_hat, zero_division=0)
    rec_r  = recall_score(y_bin_true, y_bin_hat, zero_division=0)

    # metrics for the "none" class (labels inverted)
    yb_true_n = 1 - y_bin_true
    yb_hat_n  = 1 - y_bin_hat
    prec_n = precision_score(yb_true_n, yb_hat_n, zero_division=0)
    rec_n  = recall_score(yb_true_n, yb_hat_n, zero_division=0)

    rows.append({
        "zone_type": z,
        "support": int(m.sum()),
        "precision_rebound": round(prec_r,3),
        "recall_rebound": round(rec_r,3),
        "precision_none": round(prec_n,3),
        "recall_none": round(rec_n,3),
    })

# Explicit columns keep the frame well-formed (and sortable) even when no zone
# qualified and `rows` is empty.
tabla = pd.DataFrame(rows, columns=REPORT_COLUMNS).sort_values("zone_type").reset_index(drop=True)
tabla


Unnamed: 0,zone_type,support,precision_rebound,recall_rebound,precision_none,recall_none
0,PDH_prev,118,0.167,0.02,0.562,0.926
1,PDL_prev,58,0.0,0.0,0.534,1.0
2,POC_D1,123,0.4,0.034,0.517,0.953
3,USA_IBH,181,0.519,0.152,0.494,0.854
4,USA_IBL,153,0.167,0.012,0.449,0.93
5,VAH_D1,111,0.4,0.04,0.547,0.951
6,VAL_D1,83,0.333,0.029,0.575,0.958
7,VWAP,201,0.0,0.0,0.508,0.981
