In [1]:
# === 0) Imports ===
import os, json, joblib
import numpy as np, pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    average_precision_score, brier_score_loss, confusion_matrix, classification_report
)
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import HistGradientBoostingClassifier

# === 1) Carga df base (para el split temporal) ===
DF_PATH = "data/interim/ES_5m_2021_2024.parquet"
df = pd.read_parquet(DF_PATH)
t_series = df["Time"].dt.tz_convert("Europe/Madrid")  # para mapping idx→fecha

def split_by_time(X_idx):
    t = t_series.loc[X_idx]
    train = (t < "2024-01-01")
    val   = (t >= "2024-01-01") & (t < "2024-07-01")
    test  = (t >= "2024-07-01")
    return train.values, val.values, test.values

# === 2) utilidades métricas ===
def ap_macro(y_true, proba, classes):
    # y_true string -> binarizado OvR
    Y = np.zeros((len(y_true), len(classes)), dtype=int)
    for i,c in enumerate(classes):
        Y[:,i] = (y_true == c).astype(int)
    aps = []
    for i,_ in enumerate(classes):
        aps.append(average_precision_score(Y[:,i], proba[:,i]))
    return float(np.mean(aps)), dict(zip(classes, aps))

def eval_report(y_true, proba, classes):
    y_pred = classes[proba.argmax(1)]
    ap_mac, ap_per_class = ap_macro(y_true, proba, classes)
    # Brier (multiclase: media de Brier OvR)
    Y = np.zeros((len(y_true), len(classes)))
    for i,c in enumerate(classes):
        Y[:,i] = (y_true == c).astype(int)
    brier = float(np.mean((proba - Y)**2))
    cm = confusion_matrix(y_true, y_pred, labels=list(classes))
    clf_rep = classification_report(y_true, y_pred, labels=list(classes), target_names=list(classes), output_dict=True)
    return {
        "ap_macro": ap_mac,
        "ap_per_class": ap_per_class,
        "brier_ovr_mean": brier,
        "confusion_matrix": cm.tolist(),
        "classification_report": clf_rep,
    }

# === 3) entrenamiento + calibración + evaluación ===
def train_eval_one(DATA_PATH, name, random_state=42):
    print(f"\n=== {name} ===")
    Path("models/artifacts").mkdir(parents=True, exist_ok=True)

    # cargar dataset (X + target)
    D = pd.read_parquet(DATA_PATH)
    y_str = D["target"].astype(str).values
    X = D.drop(columns=["target", "idx", "zone_type"], errors="ignore").copy()

    # asegurar tipos numéricos y rellenar NaNs
    for c in X.columns:
        if X[c].dtype == "bool": X[c] = X[c].astype(np.int8)
    num = X.select_dtypes(include=[np.number]).columns
    X[num] = X[num].astype(np.float32)
    X[num] = X[num].fillna(0.0)

    # split temporal
    idx_series = D["idx"].astype(int) if "idx" in D.columns else None
    if idx_series is None:
        raise ValueError("El dataset debe contener la columna 'idx' para mapear a fechas.")
    tr, va, te = split_by_time(idx_series)

    Xtr, ytr = X.loc[tr].to_numpy(), y_str[tr]
    Xva, yva = X.loc[va].to_numpy(), y_str[va]
    Xte, yte = X.loc[te].to_numpy(), y_str[te]

    # clases y sample_weight por clase (inverso de la frecuencia en train)
    classes = np.unique(ytr)
    counts = pd.Series(ytr).value_counts()
    inv_freq = counts.sum() / (len(counts) * counts)
    sw_map = inv_freq.to_dict()
    sw_tr = np.array([sw_map[y] for y in ytr], dtype=np.float32)

    # modelo base
    hgb = HistGradientBoostingClassifier(
        learning_rate=0.06,
        max_depth=None,
        max_leaf_nodes=31,
        min_samples_leaf=50,
        l2_regularization=0.0,
        random_state=random_state
    )
    hgb.fit(Xtr, ytr, sample_weight=sw_tr)

    # calibración (isotónica) sobre VALIDACIÓN
    calib = CalibratedClassifierCV(hgb, method="isotonic", cv="prefit")
    calib.fit(Xva, yva)

    # evaluación
    proba_te = calib.predict_proba(Xte)
    metrics_test = eval_report(yte, proba_te, calib.classes_)
    print("AP macro (test):", round(metrics_test["ap_macro"], 4))
    print("AP por clase (test):", {k: round(v,4) for k,v in metrics_test["ap_per_class"].items()})
    print("Brier OvR medio (test):", round(metrics_test["brier_ovr_mean"], 5))
    print("CM (test):")
    print(np.array(metrics_test["confusion_matrix"]))

    # guardar artefactos
    base = f"models/artifacts/{name}"
    joblib.dump(calib, f"{base}_calibrated_isotonic.joblib")
    with open(f"{base}_report_test.json","w",encoding="utf-8") as f:
        json.dump(metrics_test, f, indent=2, ensure_ascii=False)
    print("Guardado:", f"{base}_calibrated_isotonic.joblib")

    # devolver cosas útiles (p.ej. para comparar ALL vs EU+USA)
    return {
        "name": name,
        "counts_train": counts.to_dict(),
        "ap_macro_test": metrics_test["ap_macro"],
        "ap_per_class_test": metrics_test["ap_per_class"],
        "brier_test": metrics_test["brier_ovr_mean"],
        "cm_test": metrics_test["confusion_matrix"],
    }

# === 4) Ejecutar para ALL y EU+USA (23 features) ===
res_all   = train_eval_one("data/processed/features/supervised_ALL.parquet",   "HGB_ALL_f23")
res_euusa = train_eval_one("data/processed/features/supervised_EUUSA.parquet", "HGB_EUUSA_f23")

res_all, res_euusa



=== HGB_ALL_f23 ===
AP macro (test): 0.4347
AP por clase (test): {'breakout': 0.3376, 'none': 0.3138, 'rebound': 0.6525}
Brier OvR medio (test): 0.19268
CM (test):
[[  93   21  601]
 [  63   40  642]
 [  69   40 1620]]
Guardado: models/artifacts/HGB_ALL_f23_calibrated_isotonic.joblib

=== HGB_EUUSA_f23 ===
AP macro (test): 0.4231
AP por clase (test): {'breakout': 0.3831, 'none': 0.2436, 'rebound': 0.6427}
Brier OvR medio (test): 0.19695
CM (test):
[[ 35   2 469]
 [ 13   0 389]
 [ 15   3 972]]
Guardado: models/artifacts/HGB_EUUSA_f23_calibrated_isotonic.joblib


({'name': 'HGB_ALL_f23',
  'counts_train': {'rebound': 10696, 'none': 4726, 'breakout': 4104},
  'ap_macro_test': 0.434655856245721,
  'ap_per_class_test': {'breakout': 0.337646328576001,
   'none': 0.3138268531590387,
   'rebound': 0.6524943870021234},
  'brier_test': 0.1926751684104424,
  'cm_test': [[93, 21, 601], [63, 40, 642], [69, 40, 1620]]},
 {'name': 'HGB_EUUSA_f23',
  'counts_train': {'rebound': 6232, 'breakout': 2958, 'none': 2623},
  'ap_macro_test': 0.4231125213905547,
  'ap_per_class_test': {'breakout': 0.3830821263378881,
   'none': 0.2435837823778073,
   'rebound': 0.6426716554559688},
  'brier_test': 0.19695132128305023,
  'cm_test': [[35, 2, 469], [13, 0, 389], [15, 3, 972]]})

In [2]:
import numpy as np, pandas as pd, joblib
from sklearn.metrics import f1_score, precision_recall_fscore_support

def load_data(PQ, DF_PATH="data/interim/ES_5m_2021_2024.parquet"):
    D  = pd.read_parquet(PQ)               # X + target + idx + zone_type
    df = pd.read_parquet(DF_PATH)
    t  = df["Time"].dt.tz_convert("Europe/Madrid")
    idx = D["idx"].astype(int)
    tr = (t.loc[idx] < "2024-01-01").values
    va = (t.loc[idx] >= "2024-01-01").values & (t.loc[idx] < "2024-07-01").values
    te = (t.loc[idx] >= "2024-07-01").values
    X = D.drop(columns=["target", "idx", "zone_type"], errors="ignore")
    y = D["target"].astype(str).values
    num = X.select_dtypes(include=[np.number]).columns
    X[num] = X[num].fillna(0.0).astype(np.float32)
    return X.to_numpy(), y, tr, va, te

def tune_thresholds(calib_path, data_path):
    X, y, tr, va, te = load_data(data_path)
    model = joblib.load(calib_path)
    classes = model.classes_
    c2i = {c:i for i,c in enumerate(classes)}

    Pva = model.predict_proba(X[va])
    yva = y[va]

    best = None
    for th_r in np.linspace(0.55, 0.85, 16):      # umbral rebound
      for th_b in np.linspace(0.50, 0.80, 16):    # umbral breakout
        for margin in (0.00, 0.05, 0.10, 0.15):   # margen frente a la 2ª prob.
          # regla:
          #  - pred 'rebound' si p_r >= th_r y p_r - max(otros) >= margin
          #  - pred 'breakout' si p_b >= th_b y p_b - max(otros) >= margin
          #  - si ninguna dispara, pred 'none'
          pr = Pva[:, c2i["rebound"]]; pb = Pva[:, c2i["breakout"]]; pn = Pva[:, c2i["none"]]
          top2 = np.sort(Pva, axis=1)[:, -2]
          yhat = np.full(len(yva), "none", dtype=object)
          yhat[(pr >= th_r) & (pr - top2 >= margin)] = "rebound"
          yhat[(pb >= th_b) & (pb - top2 >= margin)] = "breakout"

          # macro-F1 en validación
          f1 = f1_score(yva, yhat, labels=["breakout","none","rebound"], average="macro")
          if (best is None) or (f1 > best[0]):
              best = (f1, th_r, th_b, margin)
    return best  # (f1_macro, th_r, th_b, margin)

def eval_with_thresholds(calib_path, data_path, th_r, th_b, margin):
    X, y, tr, va, te = load_data(data_path)
    model = joblib.load(calib_path)
    classes = model.classes_
    c2i = {c:i for i,c in enumerate(classes)}
    Pte = model.predict_proba(X[te]); yte = y[te]
    pr = Pte[:, c2i["rebound"]]; pb = Pte[:, c2i["breakout"]]
    top2 = np.sort(Pte, axis=1)[:, -2]
    yhat = np.full(len(yte), "none", dtype=object)
    yhat[(pr >= th_r) & (pr - top2 >= margin)] = "rebound"
    yhat[(pb >= th_b) & (pb - top2 >= margin)] = "breakout"
    rep = precision_recall_fscore_support(yte, yhat, labels=["breakout","none","rebound"], zero_division=0)
    return {"precision":rep[0].tolist(),"recall":rep[1].tolist(),"f1":rep[2].tolist(),"support":rep[3].tolist()}

# === aplica a EU+USA (recomendado) ===
best = tune_thresholds("models/artifacts/HGB_EUUSA_f23_calibrated_isotonic.joblib",
                       "data/processed/features/supervised_EUUSA.parquet")
print("mejor en VAL (macro-F1, th_r, th_b, margin) ->", best)
rep = eval_with_thresholds("models/artifacts/HGB_EUUSA_f23_calibrated_isotonic.joblib",
                           "data/processed/features/supervised_EUUSA.parquet",
                           best[1], best[2], best[3])
rep


mejor en VAL (macro-F1, th_r, th_b, margin) -> (0.3482834155100163, 0.55, 0.5, 0.0)


{'precision': [0.6428571428571429, 0.2406847935548842, 0.6251402918069585],
 'recall': [0.017786561264822136, 0.5945273631840796, 0.5626262626262626],
 'f1': [0.03461538461538462, 0.3426523297491039, 0.5922381711855397],
 'support': [506, 402, 990]}

In [4]:
import numpy as np, pandas as pd, joblib
from sklearn.metrics import precision_recall_fscore_support

# === utilidades ya conocidas ===
def load_data(PQ, DF_PATH="data/interim/ES_5m_2021_2024.parquet"):
    D  = pd.read_parquet(PQ)
    df = pd.read_parquet(DF_PATH)
    t  = df["Time"].dt.tz_convert("Europe/Madrid")
    idx = D["idx"].astype(int)
    tr = (t.loc[idx] < "2024-01-01").values
    va = (t.loc[idx] >= "2024-01-01").values & (t.loc[idx] < "2024-07-01").values
    te = (t.loc[idx] >= "2024-07-01").values
    X = D.drop(columns=["target","idx","zone_type"], errors="ignore")
    y = D["target"].astype(str).values
    num = X.select_dtypes(include=[np.number]).columns
    X[num] = X[num].fillna(0.0).astype(np.float32)
    return X.to_numpy(), y, tr, va, te

def decide_hier(P, classes, t_event, t_none, t_r, t_b, m_rb=0.0, m_br=0.0):
    c2i = {c:i for i,c in enumerate(classes)}
    pr, pb, pn = P[:,c2i["rebound"]], P[:,c2i["breakout"]], P[:,c2i["none"]]
    # 1) gate evento vs none
    event_conf = np.maximum(pr, pb)
    yhat = np.where((event_conf >= t_event) & (pn <= t_none), "?", "none")

    # 2) dentro de evento, decidir r vs b con umbrales y márgenes
    mask = (yhat == "?")
    # márgenes uno-contra-uno
    choose_r = (pr >= t_r) & (pr - pb >= m_rb)
    choose_b = (pb >= t_b) & (pb - pr >= m_br)
    yhat[mask & choose_r] = "rebound"
    yhat[mask & (~choose_r) & choose_b] = "breakout"
    yhat[mask & (~choose_r) & (~choose_b)] = "none"
    return yhat

# === búsqueda en validación (EU+USA) ===
X, y, tr, va, te = load_data("data/processed/features/supervised_EUUSA.parquet")
model = joblib.load("models/artifacts/HGB_EUUSA_f23_calibrated_isotonic.joblib")
classes = model.classes_
Pva = model.predict_proba(X[va]); yva = y[va]

best = None
for t_event in np.linspace(0.50, 0.75, 6):
  for t_none in np.linspace(0.35, 0.55, 5):
    for t_r in np.linspace(0.55, 0.75, 5):
      for t_b in np.linspace(0.45, 0.70, 6):
        for m_rb in (0.00, 0.05, 0.10):
          for m_br in (0.00, 0.05, 0.10):
            yhat = decide_hier(Pva, classes, t_event, t_none, t_r, t_b, m_rb, m_br)
            P,R,F,S = precision_recall_fscore_support(
                yva, yhat, labels=["breakout","none","rebound"], zero_division=0
            )
            f1_macro = float(F.mean())
            # opcional: priorizar recall de breakout
            score = f1_macro + 0.05*R[0]
            cand = (score, f1_macro, R[0], (t_event,t_none,t_r,t_b,m_rb,m_br))
            if (best is None) or (cand > best):
                best = cand

best


(0.13180610889774239,
 0.13180610889774239,
 0.0,
 (0.5, 0.4, 0.65, 0.5, 0.1, 0.1))

## Con Features de Order Flow

In [6]:
# --- Resolución robusta de rutas (independiente del CWD) ---
from pathlib import Path
import pandas as pd, numpy as np, joblib, json
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import average_precision_score, brier_score_loss
from sklearn.preprocessing import label_binarize

def find_repo_root(max_up=6):
    p = Path.cwd()
    for _ in range(max_up):
        if (p/"data").exists() and (p/"src").exists():
            return p
        p = p.parent
    return Path.cwd()  # fallback

ROOT = find_repo_root()
DATA_DIR = ROOT / "data"
FEAT_DIR = DATA_DIR / "processed" / "features"
EV_DIR = DATA_DIR / "processed" / "events"
INT_DIR = DATA_DIR / "interim"

print("ROOT  :", ROOT)
print("FEATS :", FEAT_DIR)
print("EVENTS:", EV_DIR)
print("INTERM:", INT_DIR)

# --- localizar dataset extendido (match: supervised_*_EXT.parquet) ---
def find_ext():
    cands = sorted(FEAT_DIR.glob("supervised_*_EXT.parquet"))
    if not cands:
        # como en tu screenshot el nombre es exactamente supervised_EUUSA_EXT.parquet
        p = FEAT_DIR / "supervised_EUUSA_EXT.parquet"
        if p.exists():
            return p
        raise FileNotFoundError(f"No encontré 'supervised_*_EXT.parquet' en {FEAT_DIR}")
    print("Usando:", cands[0].name)
    return cands[0]

PQ = find_ext()
DF_PATH = INT_DIR / "ES_5m_2021_2024.parquet"

# --- carga datasets ---
D  = pd.read_parquet(PQ)
df = pd.read_parquet(DF_PATH)

# split temporal (TR < 2024-01-01, VA 2024H1, TE 2024H2)
t   = df["Time"].dt.tz_convert("Europe/Madrid")
idx = D["idx"].astype(int)
TR  = (t.loc[idx] < "2024-01-01").values
VA  = (t.loc[idx] >= "2024-01-01").values & (t.loc[idx] < "2024-07-01").values
TE  = (t.loc[idx] >= "2024-07-01").values

# X, y
X = D.drop(columns=["target","idx","zone_type"], errors="ignore").copy()
num = X.select_dtypes(include=[np.number]).columns
X[num] = X[num].fillna(0.0).astype(np.float32)
y = D["target"].astype(str).values
classes = np.array(sorted(np.unique(y))).tolist()
print("Shapes -> X:", X.shape, "| y:", y.shape, "| clases:", classes)


ROOT  : c:\Users\jmbf2\OneDrive\Trading\Machine Learning\ZoneBasedPricePrediction
FEATS : c:\Users\jmbf2\OneDrive\Trading\Machine Learning\ZoneBasedPricePrediction\data\processed\features
EVENTS: c:\Users\jmbf2\OneDrive\Trading\Machine Learning\ZoneBasedPricePrediction\data\processed\events
INTERM: c:\Users\jmbf2\OneDrive\Trading\Machine Learning\ZoneBasedPricePrediction\data\interim
Usando: supervised_EUUSA_EXT.parquet
Shapes -> X: (8541, 29) | y: (8541,) | clases: ['breakout', 'none', 'rebound']


In [7]:
# --- modelo base HGB (ligeramente regularizado) ---
hgb = HistGradientBoostingClassifier(
    learning_rate=0.08,
    max_depth=None,
    max_leaf_nodes=31,
    min_samples_leaf=60,
    l2_regularization=0.1,
    class_weight=None,        # si breakout sigue flojo: prueba 'balanced'
    random_state=42
).fit(X[TR].to_numpy(), y[TR])

# --- calibración isotónica one-vs-rest sobre VALIDACIÓN ---
P_va = np.column_stack([hgb.predict_proba(X[VA].to_numpy())[:, list(hgb.classes_).index(c)] for c in classes])

# entrenar un isotónico por clase (OvR)
isos = {}
P_va_cal = np.zeros_like(P_va)
Y_va_bin = label_binarize(y[VA], classes=classes)
for k, c in enumerate(classes):
    ir = IsotonicRegression(out_of_bounds="clip")
    ir.fit(P_va[:,k], Y_va_bin[:,k])
    isos[c] = ir
    P_va_cal[:,k] = ir.transform(P_va[:,k])

# función de predicción calibrada
def predict_proba_cal(Xmat):
    P = np.column_stack([hgb.predict_proba(Xmat)[:, list(hgb.classes_).index(c)] for c in classes])
    Pcal = np.column_stack([isos[c].transform(P[:,k]) for k,c in enumerate(classes)])
    # re-normaliza por fila para evitar drift numérico
    s = Pcal.sum(axis=1, keepdims=True)
    s[s==0] = 1.0
    return Pcal / s

# --- métricas en TEST ---
P_te_cal = predict_proba_cal(X[TE].to_numpy())
y_te = y[TE]

# AP macro
ap_macro = average_precision_score(label_binarize(y_te, classes=classes), P_te_cal, average="macro")

# Brier OvR medio
brier = np.mean([brier_score_loss((y_te==c).astype(int), P_te_cal[:,i]) for i,c in enumerate(classes)])

# AP por clase
ap_per_class = {c: average_precision_score((y_te==c).astype(int), P_te_cal[:,i]) for i,c in enumerate(classes)}

print("=== HGB_EUUSA_EXT ===")
print("AP macro (test):", round(ap_macro,4))
print("AP por clase:", {k:round(v,4) for k,v in ap_per_class.items()})
print("Brier OvR medio (test):", round(brier,5))

# guardar artefactos
Path("models/artifacts").mkdir(parents=True, exist_ok=True)
joblib.dump({"model":hgb,"classes":classes,"isos":isos}, "models/artifacts/HGB_EUUSA_EXT_calibrated_isotonic.joblib")

json.dump({
    "name":"HGB_EUUSA_EXT",
    "ap_macro_test": float(ap_macro),
    "ap_per_class_test": {k: float(v) for k,v in ap_per_class.items()},
    "brier_test": float(brier),
    "n_features": int(X.shape[1]),
}, open("models/artifacts/HGB_EUUSA_EXT_report_test.json","w"), indent=2)

print("Guardado:",
      "models/artifacts/HGB_EUUSA_EXT_calibrated_isotonic.joblib",
      "models/artifacts/HGB_EUUSA_EXT_report_test.json")


=== HGB_EUUSA_EXT ===
AP macro (test): 0.3561
AP por clase: {'breakout': 0.3071, 'none': 0.2542, 'rebound': 0.507}
Brier OvR medio (test): 0.21362
Guardado: models/artifacts/HGB_EUUSA_EXT_calibrated_isotonic.joblib models/artifacts/HGB_EUUSA_EXT_report_test.json


In [10]:
# opcional: fija CWD al raíz del repo (si el notebook vive en ./notebooks/)
import os, sys
from pathlib import Path
repo_root = Path(__file__).resolve().parents[1] if "__file__" in globals() else Path.cwd().parents[0]
os.chdir(repo_root)
print("CWD fijado a:", os.getcwd())

CWD fijado a: c:\Users\jmbf2\OneDrive\Trading\Machine Learning\ZoneBasedPricePrediction


In [None]:
# === 0) helpers de ruta ===EUUSA
from pathlib import Path
import numpy as np, pandas as pd, joblib, json
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import average_precision_score, brier_score_loss
from sklearn.preprocessing import label_binarize
from sklearn.inspection import permutation_importance

ROOT = Path.cwd()
for _ in range(6):
    if (ROOT/"data").exists() and (ROOT/"src").exists(): break
    ROOT = ROOT.parent

DF_PATH = ROOT/"data/interim/ES_5m_2021_2024.parquet"
EV_PATH = ROOT/"data/processed/events/events_labeled_2021_2024.parquet"   # ajusta si usas EUUSA
print("ROOT:", ROOT)

# === 1) carga base y splits temporales ===
df = pd.read_parquet(DF_PATH)
events_labeled = pd.read_parquet(EV_PATH)

t = df["Time"].dt.tz_convert("Europe/Madrid")
def make_splits(idx):
    TR = (t.loc[idx] < "2024-01-01").values
    VA = (t.loc[idx] >= "2024-01-01").values & (t.loc[idx] < "2024-07-01").values
    TE = (t.loc[idx] >= "2024-07-01").values
    return TR, VA, TE

# === 2) construir datasets on-the-fly para ablation ===
from importlib import reload
import ppz.pipelines.build_dataset as bd; reload(bd)

def build_set(add_mvc=False, add_of=False, of_k=3.0):
    X, y = bd.make_supervised_from_events(
        df, events_labeled,
        tick_size=0.25, n_short=20, n_long=60,
        L_prev_touches=60, r_touch_ticks=6, drop_none=False,
        add_mvc=add_mvc, add_orderflow=add_of, of_k=of_k, of_k_ext=5.0
    )
    num = X.select_dtypes(include=[np.number]).columns
    X[num] = X[num].fillna(0.0).astype(np.float32)
    return X, y

sets = {
    "base":  build_set(add_mvc=False, add_of=False),
    "mvc":   build_set(add_mvc=True,  add_of=False),
    "of3x":  build_set(add_mvc=False, add_of=True,  of_k=3.0),
    "ext3x": build_set(add_mvc=True,  add_of=True,  of_k=3.0),
}

# === 3) sanity check rápido (NaNs, constantes) en EXT ===
X_ext, y_ext = sets["ext3x"]
nan_cols = X_ext.columns[X_ext.isna().any()].tolist()
const_cols = [c for c in X_ext.columns if X_ext[c].nunique(dropna=True)<=1]
print("NaN cols:", nan_cols[:10], "...", len(nan_cols))
print("Const cols:", const_cols[:10], "...", len(const_cols))

# === 4) entrenamiento y métricas macro/por clase ===
def train_eval(X, y, name="set"):
    idx = X["idx"].astype(int).values
    TR, VA, TE = make_splits(idx)
    Xfit = X.drop(columns=["idx","zone_type"], errors="ignore").copy()
    y = y.astype(str).values
    classes = sorted(np.unique(y).tolist())

    hgb = HistGradientBoostingClassifier(
        learning_rate=0.06,
        max_leaf_nodes=25,
        min_samples_leaf=120,
        l2_regularization=0.2,
        random_state=42,
    ).fit(Xfit[TR].to_numpy(), y[TR])

    # isotónica OvR en VA
    P_va = np.column_stack([hgb.predict_proba(Xfit[VA].to_numpy())[:, list(hgb.classes_).index(c)] for c in classes])
    Y_va_bin = label_binarize(y[VA], classes=classes)
    isos, P_va_cal = {}, np.zeros_like(P_va)
    for k,c in enumerate(classes):
        ir = IsotonicRegression(out_of_bounds="clip").fit(P_va[:,k], Y_va_bin[:,k])
        isos[c] = ir; P_va_cal[:,k] = ir.transform(P_va[:,k])

    # test
    def pred_cal(Xm):
        P = np.column_stack([hgb.predict_proba(Xm)[:, list(hgb.classes_).index(c)] for c in classes])
        Pcal = np.column_stack([isos[c].transform(P[:,k]) for k,c in enumerate(classes)])
        s = Pcal.sum(axis=1, keepdims=True); s[s==0]=1.0
        return Pcal/s

    Xte = Xfit[TE].to_numpy(); yte = y[TE]
    Pte = pred_cal(Xte)
    ap_macro = average_precision_score(label_binarize(yte, classes=classes), Pte, average="macro")
    ap_pc = {c: average_precision_score((yte==c).astype(int), Pte[:,i]) for i,c in enumerate(classes)}
    brier = np.mean([brier_score_loss((yte==c).astype(int), Pte[:,i]) for i,c in enumerate(classes)])

    return {"name":name, "ap_macro":ap_macro, "ap":ap_pc, "brier":brier, "model":hgb, "classes":classes, "Xfit":Xfit, "TR":TR, "VA":VA, "TE":TE, "y":y}

res = {k: train_eval(*v, name=k) for k,v in sets.items()}
for k,v in res.items():
    print(f"{k:6s} | AP_macro={v['ap_macro']:.3f} | AP_reb={v['ap'].get('rebound',np.nan):.3f} | Brier={v['brier']:.3f}")

# === 5) (opcional) permutation importance en VALIDACIÓN para EXT, target=rebote (brier-like) ===
# OJO: puede tardar; reduce n_repeats si va lento.
from sklearn.metrics import log_loss
def perm_importance_rebound(v, n_repeats=5, max_feats=30):
    model, Xfit, VA, y, classes = v["model"], v["Xfit"], v["VA"], v["y"], v["classes"]
    Xv = Xfit[VA]; yv = (y[VA]=="rebound").astype(int)
    # usa proba(rebound) del modelo calibrado aproximando con predict_proba sin isotónica (proxy)
    pr = model.predict_proba(Xv.to_numpy())[:, list(model.classes_).index("rebound")]
    base = log_loss(yv, np.clip(pr,1e-6,1-1e-6))
    cols = Xv.columns.tolist()
    rng = np.random.default_rng(42)
    scores=[]
    for _ in range(n_repeats):
        for c in cols[:max_feats]:
            Xp = Xv.copy()
            Xp[c] = rng.permutation(Xp[c].values)
            pr_p = model.predict_proba(Xp.to_numpy())[:, list(model.classes_).index("rebound")]
            scores.append((c, base - log_loss(yv, np.clip(pr_p,1e-6,1-1e-6))))
    imp = pd.DataFrame(scores, columns=["feature","gain"]).groupby("feature")["gain"].mean().sort_values(ascending=False)
    return imp

# imp = perm_importance_rebound(res["ext3x"]) ; imp.head(20)


ROOT: c:\Users\jmbf2\OneDrive\Trading\Machine Learning\ZoneBasedPricePrediction


  imb_buy_strength_avg=float(buy_strength[buy_strength>0].mean() if buy_cnt>0 else 0.0),
  ret = ret.dtype.type(ret / rcount)
  imb_sell_strength_avg=float(sell_strength[sell_strength>0].mean() if sell_cnt>0 else 0.0),
  imb_buy_strength_avg=float(buy_strength[buy_strength>0].mean() if buy_cnt>0 else 0.0),
  ret = ret.dtype.type(ret / rcount)
  imb_sell_strength_avg=float(sell_strength[sell_strength>0].mean() if sell_cnt>0 else 0.0),


NaN cols: [] ... 0
Const cols: [] ... 0
base   | AP_macro=0.354 | AP_reb=0.495 | Brier=0.212
mvc    | AP_macro=0.351 | AP_reb=0.495 | Brier=0.214
of3x   | AP_macro=0.345 | AP_reb=0.482 | Brier=0.213
ext3x  | AP_macro=0.357 | AP_reb=0.499 | Brier=0.213


In [13]:
# Mini grid de OF: k y k_ext
grid = [(2.0,4.0),(2.5,4.5),(3.0,5.0),(3.5,5.5)]
results = []
for k, kext in grid:
    Xk, yk = bd.make_supervised_from_events(
        df, events_labeled,
        tick_size=0.25, n_short=20, n_long=60,
        L_prev_touches=60, r_touch_ticks=6, drop_none=False,
        add_mvc=True, add_orderflow=True, of_k=k, of_k_ext=kext
    )
    num = Xk.select_dtypes(include=[np.number]).columns
    Xk[num] = Xk[num].fillna(0.0).astype(np.float32)
    r = train_eval(Xk, yk, name=f"ext k={k}")
    results.append((k, kext, r["ap_macro"], r["ap"].get("rebound", np.nan), r["brier"], Xk.shape[1]))

pd.DataFrame(results, columns=["k","k_ext","AP_macro","AP_rebound","Brier","n_feat"]).sort_values("AP_rebound", ascending=False)


  imb_sell_strength_avg=float(sell_strength[sell_strength>0].mean() if sell_cnt>0 else 0.0),
  ret = ret.dtype.type(ret / rcount)
  imb_buy_strength_avg=float(buy_strength[buy_strength>0].mean() if buy_cnt>0 else 0.0),
  imb_sell_strength_avg=float(sell_strength[sell_strength>0].mean() if sell_cnt>0 else 0.0),
  ret = ret.dtype.type(ret / rcount)
  imb_buy_strength_avg=float(buy_strength[buy_strength>0].mean() if buy_cnt>0 else 0.0),
  imb_buy_strength_avg=float(buy_strength[buy_strength>0].mean() if buy_cnt>0 else 0.0),
  ret = ret.dtype.type(ret / rcount)
  imb_sell_strength_avg=float(sell_strength[sell_strength>0].mean() if sell_cnt>0 else 0.0),
  imb_buy_strength_avg=float(buy_strength[buy_strength>0].mean() if buy_cnt>0 else 0.0),
  ret = ret.dtype.type(ret / rcount)
  imb_sell_strength_avg=float(sell_strength[sell_strength>0].mean() if sell_cnt>0 else 0.0),


Unnamed: 0,k,k_ext,AP_macro,AP_rebound,Brier,n_feat
1,2.5,4.5,0.360059,0.519927,0.212116,31
2,3.0,5.0,0.357398,0.498636,0.212673,31
3,3.5,5.5,0.35704,0.495235,0.212117,31
0,2.0,4.0,0.358506,0.494386,0.213423,31


## Refino del entreno con MVC

In [1]:
# === Construcción (o carga) del dataset MVC-dir EU+USA ===
from importlib import reload
from pathlib import Path
import numpy as np, pandas as pd, json, joblib
import sys
sys.path.append("../src")
from ppz.pipelines import build_dataset as bd  # el archivo anterior
from ppz.features.mvc import MvcFlagsParams

ROOT = Path.cwd()
for _ in range(6):
    if (ROOT/"data").exists() and (ROOT/"src").exists(): break
    ROOT = ROOT.parent

DF_PATH = ROOT/"data/interim/ES_5m_2021_2024.parquet"
EV_PATH = ROOT/"data/processed/events/events_labeled_2021_2024.parquet"
OUT_FEAT = ROOT/"data/processed/features/supervised_EUUSA_MVCdir.parquet"

df = pd.read_parquet(DF_PATH)
ev = pd.read_parquet(EV_PATH)

# Build (si no existe) o carga
if not OUT_FEAT.exists():
    X_mvc, y_mvc = bd.make_supervised_from_events_mvcdir(
        df, ev,
        mvc_params=MvcFlagsParams(tick_size=0.25, mvc_lower=1/3, mvc_upper=2/3,
                                  vwap_slope_window=20, vwap_flat_th_ticks_per_bar=0.05,
                                  first_touch_static=True),
        subset_sessions=("EU","USA"),
        tick_size=0.25, n_short=20, n_long=60,
        L_prev_touches=60, r_touch_ticks=6,
        drop_none=False
    )
    bd.save_supervised_mvcdir(X_mvc, y_mvc, OUT_FEAT)

D  = pd.read_parquet(OUT_FEAT)
df = pd.read_parquet(DF_PATH)  # para cortes temporales

# === Split temporal: TR < 2024-01-01, VA 2024H1, TE 2024H2 ===
t   = df["Time"].dt.tz_convert("Europe/Madrid")
idx = D["idx"].astype(int)
TR  = (t.loc[idx] < "2024-01-01").values
VA  = (t.loc[idx] >= "2024-01-01").values & (t.loc[idx] < "2024-07-01").values
TE  = (t.loc[idx] >= "2024-07-01").values

# === X / y binario: rebound vs no-rebound ===
y_str = D["target"].astype(str).values
y_bin = (y_str == "rebound").astype(int)

# Drop columnas no predictoras
X = D.drop(columns=["target","idx","zone_type"], errors="ignore").copy()
num = X.select_dtypes(include=[np.number]).columns
X[num] = X[num].fillna(0.0).astype(np.float32)

Xtr, ytr = X.iloc[TR].to_numpy(), y_bin[TR]
Xva, yva = X.iloc[VA].to_numpy(), y_bin[VA]
Xte, yte = X.iloc[TE].to_numpy(), y_bin[TE]

# === Modelo base HGB binario ===
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import average_precision_score, brier_score_loss, precision_recall_fscore_support, confusion_matrix
from sklearn.isotonic import IsotonicRegression

hgb = HistGradientBoostingClassifier(
    learning_rate=0.08,
    max_leaf_nodes=31,
    min_samples_leaf=60,
    l2_regularization=0.1,
    class_weight="balanced",
    random_state=42
).fit(Xtr, ytr)

# === Calibración isotónica (en VALIDACIÓN) ===
p_va = hgb.predict_proba(Xva)[:,1]
iso  = IsotonicRegression(out_of_bounds="clip").fit(p_va, yva)

def predict_proba_cal(Xm):
    p = hgb.predict_proba(Xm)[:,1]
    return iso.transform(p)

# === Métricas ===
p_te_cal = predict_proba_cal(Xte)

ap_reb   = average_precision_score(yte, p_te_cal)
brier    = brier_score_loss(yte, p_te_cal)
# Umbral por F1 en VALIDACIÓN (grid sencillo)
ths = np.linspace(0.4, 0.8, 9)
f1_va, th_best = -1.0, 0.6
p_va_cal = predict_proba_cal(Xva)
for th in ths:
    yhat = (p_va_cal >= th).astype(int)
    _, _, f1, _ = precision_recall_fscore_support(yva, yhat, average="binary", zero_division=0)
    if f1 > f1_va:
        f1_va, th_best = f1, th

yhat_te = (p_te_cal >= th_best).astype(int)
prec, rec, f1, sup = precision_recall_fscore_support(yte, yhat_te, average=None, labels=[1,0], zero_division=0)
cm = confusion_matrix(yte, yhat_te, labels=[1,0])

print("=== Rebound binario (EU+USA, MVC-dir) ===")
print(f"AP_rebound (test): {ap_reb:.4f}")
print(f"Brier (test):      {brier:.5f}")
print(f"Best th (VAL):     {th_best:.2f}")
print("CM (test) [rows=true, cols=pred] [rebound, not]:")
print(cm)

# === Guarda artefactos ===
ART_DIR = ROOT/"models/artifacts"; ART_DIR.mkdir(parents=True, exist_ok=True)
joblib.dump({"model":hgb, "iso":iso, "th_best":float(th_best)}, ART_DIR/"HGB_rebound_EUUSA_MVCdir.joblib")
json.dump({
    "name":"HGB_rebound_EUUSA_MVCdir",
    "ap_rebound_test": float(ap_reb),
    "brier_test": float(brier),
    "th_best_val": float(th_best),
    "cm_test": cm.tolist()
}, open(ART_DIR/"HGB_rebound_EUUSA_MVCdir_report.json","w"), indent=2)
print("Guardado:", str(ART_DIR/"HGB_rebound_EUUSA_MVCdir.joblib"), str(ART_DIR/"HGB_rebound_EUUSA_MVCdir_report.json"))


ImportError: cannot import name 'make_supervised_from_events' from partially initialized module 'ppz.pipelines.build_dataset' (most likely due to a circular import) (c:\Users\jmbf2\OneDrive\Trading\Machine Learning\ZoneBasedPricePrediction\notebooks\../src\ppz\pipelines\build_dataset.py)