In [2]:
from pathlib import Path
import pandas as pd

# 1) Detecta la raíz del repo (sube directorios hasta encontrar /data)
ROOT = Path.cwd()
while not (ROOT / "data").exists() and ROOT.parent != ROOT:
    ROOT = ROOT.parent

print("ROOT:", ROOT)
EV_DIR = ROOT / "data" / "processed" / "events"
print("Events dir:", EV_DIR.resolve())

# 2) Lista lo que hay (por si el nombre difiere)
print("Archivos disponibles:")
for p in sorted(EV_DIR.glob("events_labeled_*.parquet")):
    print(" -", p.name)

# 3) Carga robusta: si existe el exacto, úsalo; si no, coge el primero que matchea
target = EV_DIR / "events_labeled_2021_2024.parquet"
if not target.exists():
    cands = sorted(EV_DIR.glob("events_labeled_2021_2024*.parquet"))
    if not cands:
        raise FileNotFoundError("No hay ningún 'events_labeled_2021_2024*.parquet' en " + str(EV_DIR))
    target = cands[0]
    print("Usando candidato:", target.name)

ev_all = pd.read_parquet(target)


ROOT: c:\Users\jmbf2\OneDrive\Trading\Machine Learning\ZoneBasedPricePrediction\notebooks
Events dir: C:\Users\jmbf2\OneDrive\Trading\Machine Learning\ZoneBasedPricePrediction\notebooks\data\processed\events
Archivos disponibles:
 - events_labeled_2021_2024_ALL.parquet
 - events_labeled_2021_2024_EU.parquet
 - events_labeled_2021_2024_EUUSA.parquet
 - events_labeled_2021_2024_USA.parquet
Usando candidato: events_labeled_2021_2024_ALL.parquet


In [3]:
# === Re-train baseline con condiciones MVC + primer toque estáticas + slope VWAP ===
import numpy as np, pandas as pd, json, joblib
from pathlib import Path
from importlib import reload

# 1) Carga base
from ppz.io.paths import events_path, interim_path
df     = pd.read_parquet(interim_path("ES_5m_2021_2024.parquet"))
ev_all = pd.read_parquet(events_path("events_labeled_2021_2024.parquet"))


# 2) Filtro “primer toque” para zonas estáticas (VWAP no se filtra)
STATIC_ZONES = {"PDH_prev","PDL_prev","VAH_D1","POC_D1","VAL_D1","USA_IBH","USA_IBL"}

ev_sorted = ev_all.sort_values(["session_id","idx"]).reset_index(drop=True)
mask_static  = ev_sorted["zone_type"].isin(STATIC_ZONES)
mask_vwap    = ev_sorted["zone_type"].eq("VWAP")

# primer toque por (session_id, zone_type)
first_touch = (
    ev_sorted[mask_static]
      .groupby(["session_id","zone_type"], as_index=False)
      .head(1)
      .assign(_keep=True)[["idx","_keep"]]
)

ev_ft = ev_sorted.merge(first_touch, on="idx", how="left")
ev_ft["_keep"] = ev_ft["_keep"].fillna(False)
ev_ft = pd.concat([
    ev_ft[mask_vwap],           # todos los VWAP
    ev_ft[mask_static & ev_ft["_keep"]]  # primer toque en estáticas
], axis=0).sort_values("idx").reset_index(drop=True)
ev_ft = ev_ft.drop(columns=["_keep"])

print("Eventos totales:", len(ev_all), "| Elegibles tras condición:", len(ev_ft))

# 3) Dataset supervisado con MVC features
import ppz.pipelines.build_dataset as bd; reload(bd)

X, y = bd.make_supervised_from_events(
    df, ev_ft,
    tick_size=0.25, n_short=20, n_long=60,
    L_prev_touches=60, r_touch_ticks=6,
    drop_none=False,          # mantenemos las 3 clases para referencia
    add_mvc=True,             # << MVC activado
    add_orderflow=False       # OF off por ahora
)

# 4) Añadir pendiente de VWAP (ticks/20 velas) como feature extra
N_SLOPE = 20
vwap = df["VWAP"].to_numpy(dtype=float)
slope = np.empty_like(vwap)
slope[:] = np.nan
slope[N_SLOPE:] = (vwap[N_SLOPE:] - vwap[:-N_SLOPE]) / (N_SLOPE * 0.25)  # ticks por vela
slope = pd.Series(slope)

idx_evt = X["idx"].astype(int).values
X["vwap_slope_n20"] = slope.iloc[idx_evt].astype(np.float32).fillna(0.0)
X["is_vwap_event"]  = ev_ft["zone_type"].eq("VWAP").astype(np.int8).values

# 5) Tipos compactos y guardado del dataset (para reproducibilidad)
num = X.select_dtypes(include=[np.number]).columns
X[num] = X[num].astype(np.float32, errors="ignore")
Path("data/processed/features").mkdir(parents=True, exist_ok=True)
Ds = X.copy(); Ds["target"] = y
Ds.to_parquet("data/processed/features/supervised_MVC_FIRSTTOUCH.parquet",
              engine="pyarrow", compression="zstd", index=False)

# 6) Split temporal TR/VA/TE
t = df["Time"].dt.tz_convert("Europe/Madrid")
TR = (t.loc[idx_evt] < "2024-01-01").values
VA = (t.loc[idx_evt] >= "2024-01-01").values & (t.loc[idx_evt] < "2024-07-01").values
TE = (t.loc[idx_evt] >= "2024-07-01").values

# 7) Entrenamiento HGB + calibración isotónica OvR
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.isotonic import IsotonicRegression
from sklearn.preprocessing import label_binarize
from sklearn.metrics import average_precision_score, brier_score_loss, confusion_matrix

Xtr = X.drop(columns=["idx","zone_type"], errors="ignore").iloc[TR].to_numpy()
Xva = X.drop(columns=["idx","zone_type"], errors="ignore").iloc[VA].to_numpy()
Xte = X.drop(columns=["idx","zone_type"], errors="ignore").iloc[TE].to_numpy()

y_arr = Ds["target"].astype(str).values
ytr, yva, yte = y_arr[TR], y_arr[VA], y_arr[TE]
classes = sorted(np.unique(y_arr))

hgb = HistGradientBoostingClassifier(
    learning_rate=0.08,
    max_leaf_nodes=31,
    min_samples_leaf=60,
    l2_regularization=0.1,
    class_weight=None,     # si 'breakout' queda muy flojo, prueba 'balanced'
    random_state=42
).fit(Xtr, ytr)

# calibración OvR
def proba_mat(model, Xmat, classes):
    cols = [list(model.classes_).index(c) for c in classes]
    return np.column_stack([model.predict_proba(Xmat)[:,i] for i in cols])

P_va = proba_mat(hgb, Xva, classes)
Y_va_bin = label_binarize(yva, classes=classes)
isos = {}
P_va_cal = np.zeros_like(P_va)
for k,c in enumerate(classes):
    ir = IsotonicRegression(out_of_bounds="clip").fit(P_va[:,k], Y_va_bin[:,k])
    isos[c] = ir
    P_va_cal[:,k] = ir.transform(P_va[:,k])

def predict_proba_cal(Xmat):
    P = proba_mat(hgb, Xmat, classes)
    Pcal = np.column_stack([isos[c].transform(P[:,k]) for k,c in enumerate(classes)])
    s = Pcal.sum(axis=1, keepdims=True); s[s==0]=1.0
    return Pcal/s

# 8) Métricas en TEST (multiclase)
P_te = predict_proba_cal(Xte)
ap_macro = average_precision_score(label_binarize(yte, classes=classes), P_te, average="macro")
ap_per_class = {c: average_precision_score((yte==c).astype(int), P_te[:,i]) for i,c in enumerate(classes)}
brier = np.mean([brier_score_loss((yte==c).astype(int), P_te[:,i]) for i,c in enumerate(classes)])
y_pred = np.array(classes)[P_te.argmax(1)]
cm = confusion_matrix(yte, y_pred, labels=classes).tolist()

print("=== HGB_MVC_FIRSTTOUCH ===")
print("AP macro (test):", round(ap_macro,4))
print("AP por clase:", {k: round(v,4) for k,v in ap_per_class.items()})
print("Brier OvR medio (test):", round(brier,5))
print("CM (test) [rows=true, cols=pred] (classes={}):".format(classes), cm)

# 9) Guardar artefactos
Path("models/artifacts").mkdir(parents=True, exist_ok=True)
joblib.dump({"model":hgb,"classes":classes,"isos":isos},
            "models/artifacts/HGB_MVC_FIRSTTOUCH_calibrated.joblib")
json.dump({
    "name":"HGB_MVC_FIRSTTOUCH",
    "ap_macro_test": float(ap_macro),
    "ap_per_class_test": {k: float(v) for k,v in ap_per_class.items()},
    "brier_test": float(brier),
    "classes": classes
}, open("models/artifacts/HGB_MVC_FIRSTTOUCH_report_test.json","w"), indent=2)
print("Guardado artefactos en models/artifacts/")


ImportError: cannot import name 'events_path' from 'ppz.io.paths' (C:\Users\jmbf2\OneDrive\Trading\Machine Learning\ZoneBasedPricePrediction\src\ppz\io\paths.py)