In [1]:
# Prospección MVC: ¿“agotamiento / impulso” discrimina rebote vs ruptura?
# -----------------------------------------------------------------------------------
# Requisitos previos:
#  - data/interim/ES_5m_2021_2024.parquet           (df base con OHLC, Volume, Delta, MVC, NewSession, Ask/Bid)
#  - data/processed/events/events_labeled_2021_2024.parquet  (eventos con columnas: idx, zone_type, label, Time, ...)

from pathlib import Path
import numpy as np, pandas as pd

# -------------------- Config --------------------
# Price increment; slopes are divided by it to be expressed in ticks.
TICK = 0.25
# Number of bars used for the VWAP slope (about 100 minutes on 5-minute bars).
N_VWAP_SLOPE = 20
# |slope| <= 0.05 ticks/bar is classified as 'flat'.
VWAP_SLOPE_FLAT_TH = 0.05
# Restrict events to the EU+USA part of the session.
ONLY_EU_USA = True
# Keep only the first touch per session for static zones.
USE_FIRST_TOUCH_STATIC = True
# Output folder (relative to the working directory); created if missing.
OUT_DIR = Path("experiments/runs/mvc_probe")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# "Static" zones (first touch only) vs "dynamic" zones (every touch kept).
STATIC_ZONES = {
    "PDH_prev",
    "PDL_prev",
    "VAH_D1",
    "POC_D1",
    "VAL_D1",
    "USA_IBH",
    "USA_IBL",
}
DYNAMIC_ZONES = {
    "VWAP",
    "VWAP_p1s",
    "VWAP_m1s",
}

# -------------------- Locate project root and load data --------------------
# Walk upward from the working directory until a folder containing both
# `data/` and `src/` is found. Seven candidates are tested: the working
# directory and its six nearest ancestors (the original loop fell through to
# the sixth ancestor without checking it; it is now checked explicitly).
ROOT = Path.cwd()
for _ in range(7):
    if (ROOT / "data").exists() and (ROOT / "src").exists():
        break
    ROOT = ROOT.parent
else:
    # Fail here with a clear message instead of letting read_parquet fail
    # later on a path built from an unrelated ancestor directory.
    raise FileNotFoundError(
        "Could not find a project root (a directory containing both 'data' "
        f"and 'src') at or above {Path.cwd()}"
    )

DF_PATH = ROOT / "data/interim/ES_5m_2021_2024.parquet"
EV_PATH = ROOT / "data/processed/events/events_labeled_2021_2024.parquet"

# Base bars and labelled events.
df = pd.read_parquet(DF_PATH)
ev = pd.read_parquet(EV_PATH)

# -------------------- Intraday derivations (session_id, idx_in_session) --------------------
# Only derive the columns the base frame does not already carry.
if "session_id" not in df.columns:
    # Running count of session starts gives one id per session.
    df["session_id"] = df["NewSession"].cumsum()
if "idx_in_session" not in df.columns:
    # Zero-based bar position inside each session.
    df["idx_in_session"] = df.groupby("session_id").cumcount()

# Bring both columns onto the events by looking up each event's `idx`
# in the index of `df`.
for _col in ("session_id", "idx_in_session"):
    if _col not in ev.columns:
        ev[_col] = ev["idx"].map(df[_col])

# EU+USA filter, expressed as a range of intraday bar positions (inclusive).
if ONLY_EU_USA:
    in_window = ev["idx_in_session"].between(108, 275)
    ev = ev[in_window].reset_index(drop=True)

# -------------------- First touch for static zones --------------------
if USE_FIRST_TOUCH_STATIC:
    touch_keys = ["session_id", "zone_type"]
    ev = ev.sort_values(touch_keys + ["idx"]).reset_index(drop=True)
    # Rank of each touch within its (session, zone), starting at 1.
    # Note: the rank is computed after the EU+USA filter above.
    ev["touch_rank"] = ev.groupby(touch_keys).cumcount() + 1
    is_dynamic = ev["zone_type"].isin(DYNAMIC_ZONES)
    first_static = ev["zone_type"].isin(STATIC_ZONES) & ev["touch_rank"].eq(1)
    # Keep every dynamic touch and the first static touch; events whose zone
    # is in neither set are dropped.
    ev = ev[is_dynamic | first_static].reset_index(drop=True)

# -------------------- VWAP (per session) + slope --------------------
# Session VWAP = cumsum(typical_price * Volume) / cumsum(Volume), with both
# cumulative sums restarting at every session.
typ = (df["High"] + df["Low"] + df["Close"]) / 3.0
gp = df.groupby("session_id", group_keys=False)
# Vectorised grouped cumsum: same values as a per-session apply, without a
# Python-level call per group and without DataFrameGroupBy.apply.
cum_pv = (typ * df["Volume"]).groupby(df["session_id"]).cumsum()
cum_v = gp["Volume"].cumsum()
df["VWAP_sess"] = cum_pv / cum_v

# Short slope (ticks/bar) over N_VWAP_SLOPE bars. The lag is taken within the
# session, so the first N_VWAP_SLOPE bars of each session have a NaN slope.
v = df["VWAP_sess"]
v_lag = v.groupby(df["session_id"]).shift(N_VWAP_SLOPE)
df["vwap_slope_ticks_per_bar"] = ((v - v_lag) / max(N_VWAP_SLOPE, 1)) / TICK

# Slope classification
def slope_class(x, th=VWAP_SLOPE_FLAT_TH):
    """Label a slope value as "up", "down", "flat" or "nan".

    Args:
        x: Slope value; missing values (per ``pd.isna``) yield "nan".
        th: Threshold. Values above ``th`` are "up", values below ``-th``
            are "down", everything else is "flat".

    Returns:
        One of the strings "nan", "up", "down", "flat".
    """
    if pd.isna(x):
        label = "nan"
    elif x > th:
        label = "up"
    elif x < -th:
        label = "down"
    else:
        label = "flat"
    return label


# Element-wise classification using the default threshold.
df["vwap_slope_class"] = df["vwap_slope_ticks_per_bar"].map(slope_class)

# -------------------- Event subset: 'rebound' vs 'breakout' only --------------------
keep_labels = ["rebound", "breakout"]
ev2 = ev.loc[ev["label"].isin(keep_labels)].copy()
ev2["label_bin"] = ev2["label"].eq("rebound").astype(int)  # 1=rebound, 0=breakout

# -------------------- Pull OHLC/MVC of the event bar --------------------
# Rows of `df` are looked up by label using each event's `idx`.
cols_ohlc = ["Open", "High", "Low", "Close", "MVC"]
event_idx = ev2["idx"].values
ohlc = df.loc[event_idx, cols_ohlc].reset_index(drop=True)
ev2 = pd.concat([ev2.reset_index(drop=True), ohlc], axis=1)

# -------------------- Features: MVC x candle type --------------------
EPS = 1e-6
# Bar range, floored at EPS so zero-range bars do not divide by zero.
bar_range = (ev2["High"] - ev2["Low"]).clip(lower=EPS)
body_top = ev2[["Close", "Open"]].max(axis=1)
body_bottom = ev2[["Close", "Open"]].min(axis=1)
body_size = (ev2["Close"] - ev2["Open"]).abs()

# All ratios are fractions of the bar range, clipped to [0, 1].
ev2["body_ratio"] = (body_size / bar_range).clip(0, 1)
ev2["upper_wick_ratio"] = ((body_top - ev2["High"]).abs() / bar_range).clip(0, 1)
ev2["lower_wick_ratio"] = ((body_bottom - ev2["Low"]).abs() / bar_range).clip(0, 1)
# Close >= Open counts as an up bar (a zero-body bar is treated as up).
ev2["dir_up"] = (ev2["Close"] >= ev2["Open"]).astype(int)
# Position of MVC inside the Low-High range: 0 = at Low, 1 = at High.
ev2["mvc_pos_frac"] = ((ev2["MVC"] - ev2["Low"]) / bar_range).clip(0, 1)

# "Big body" threshold
BIG = 0.60
is_big = ev2["body_ratio"] >= BIG
is_bull = ev2["dir_up"] == 1
mvc_in_lower_third = ev2["mvc_pos_frac"] <= (1/3)
mvc_in_upper_third = ev2["mvc_pos_frac"] >= (2/3)

# Impulse: big-bodied up bar with MVC in the lower third, or big-bodied
# down bar with MVC in the upper third.
ev2["impulse_flag"] = (
    is_big & ((is_bull & mvc_in_lower_third) | (~is_bull & mvc_in_upper_third))
).astype(int)

# Exhaustion: the mirror case (up bar with MVC in the upper third, or
# down bar with MVC in the lower third).
ev2["exhaustion_flag"] = (
    is_big & ((is_bull & mvc_in_upper_third) | (~is_bull & mvc_in_lower_third))
).astype(int)

# -------------------- Attach VWAP slope (informational; meant for VWAP zones) --------------------
slope_rows = df.loc[event_idx, ["vwap_slope_ticks_per_bar", "vwap_slope_class"]]
ev2["vwap_slope_ticks_per_bar"] = slope_rows["vwap_slope_ticks_per_bar"].values
ev2["vwap_slope_class"] = slope_rows["vwap_slope_class"].astype(str).values

# -------------------- Tablas de prospección --------------------
def rate_rebound(s):
    """Return the share of entries equal to 1 (the rebound rate).

    Args:
        s: Any sequence accepted by ``pd.Series`` holding 0/1 labels.

    Returns:
        The fraction of values equal to 1 as a float, or NaN when empty.
    """
    labels = pd.Series(s)
    if len(labels) > 0:
        return float(labels.eq(1).mean())
    return np.nan

# 0) Baseline per zone (rebound rate)
baseline = (ev2.groupby("zone_type")["label_bin"].apply(rate_rebound)
            .rename("rebound_rate_baseline"))


def _flag_lift_table(events, flag_col, short, long):
    """Rebound rate per zone split by a 0/1 flag, plus lift and support.

    Args:
        events: Event frame with ``zone_type``, ``label_bin`` and ``flag_col``.
        flag_col: Name of the 0/1 flag column to split on.
        short: Tag used in the rate/support column names (e.g. "exh").
        long: Tag used in the lift column name (e.g. "exhaustion").

    Returns:
        DataFrame indexed by zone with columns ``reb_rate_<short>0``,
        ``reb_rate_<short>1``, ``lift_<long>_pp`` and ``support_<short>1``.
        The lift is a difference of proportions (0.05 means 5 percentage
        points), not a value already multiplied by 100.
    """
    grouped = events.groupby(["zone_type", flag_col])["label_bin"]
    # Force both flag values as columns so a flag that never takes 0 or 1
    # yields NaN rates instead of a column-count mismatch / KeyError.
    rates = grouped.apply(rate_rebound).unstack().reindex(columns=[0, 1])
    rates.columns = [f"reb_rate_{short}0", f"reb_rate_{short}1"]
    rates[f"lift_{long}_pp"] = rates[f"reb_rate_{short}1"] - rates[f"reb_rate_{short}0"]
    support = grouped.count().unstack(fill_value=0).reindex(columns=[0, 1], fill_value=0)
    rates[f"support_{short}1"] = support[1]
    return rates


# 1) Lift by 'exhaustion_flag'
tab_exh = _flag_lift_table(ev2, "exhaustion_flag", "exh", "exhaustion")

# 2) Lift by 'impulse_flag'
tab_imp = _flag_lift_table(ev2, "impulse_flag", "imp", "impulse")

# 3) VWAP zones: condition on slope class (down/flat/up)
vw = ev2[ev2["zone_type"].isin({"VWAP","VWAP_p1s","VWAP_m1s"})].copy()
vw_tab = None
if len(vw):
    vw_cells = vw.groupby(["zone_type","vwap_slope_class","exhaustion_flag"])["label_bin"]
    vw_tab = vw_cells.apply(rate_rebound).unstack(fill_value=np.nan)
    # rename columns for clarity (one per flag value actually present)
    vw_tab.columns = [f"reb_rate_exh{c}" for c in vw_tab.columns]
    # support (event count) per cell
    sup = vw_cells.count().unstack(fill_value=0)
    sup.columns = [f"support_exh{c}" for c in sup.columns]
    vw_tab = vw_tab.join(sup)

# 4) Combine into the overall summary, best baseline first
summary = baseline.to_frame().join(tab_exh).join(tab_imp)
summary = summary.sort_values("rebound_rate_baseline", ascending=False)

# -------------------- Save and display --------------------
# Round rate/lift columns on a copy so `summary` keeps full precision.
summary_rounded = summary.copy()
for c in summary_rounded.columns:
    if "rate" in c or "lift" in c:
        summary_rounded[c] = summary_rounded[c].round(3)

summary_path = OUT_DIR/"mvc_probe_summary_by_zone.csv"
summary_rounded.to_csv(summary_path, index=True)

# NOTE(review): `display` is not imported in the visible code; presumably the
# notebook (IPython) built-in — confirm before running as a plain script.
print("=== Prospección MVC · Rebote vs Ruptura (solo primer toque en estáticas) ===")
print(f"Eventos analizados: {len(ev2)}  | Zonas: {ev2['zone_type'].nunique()}")
print("\n> Resumen por ZONA (tasas y lifts en puntos porcentuales):")
display(summary_rounded)

if vw_tab is not None:
    vw_tab_rounded = vw_tab.copy()
    for c in vw_tab_rounded.columns:
        if "reb_rate" in c:
            vw_tab_rounded[c] = vw_tab_rounded[c].round(3)
    vwap_path = OUT_DIR/"mvc_probe_vwap_by_slope.csv"
    vw_tab_rounded.to_csv(vwap_path, index=True)
    # Message fixed: "Rebate" was a typo for "Rebote" (rebound).
    print("\n> VWAP · Rebote por pendiente (down/flat/up) y exhaustion_flag:")
    display(vw_tab_rounded)

# Also persist the analysed event dataset for deeper follow-up in a notebook
ev_out = OUT_DIR/"mvc_probe_events_dataset.parquet"
ev2.to_parquet(ev_out, engine="pyarrow", compression="zstd", index=False)
print("\nGuardado:")
print(" -", summary_path)
# `vwap_path` only exists when the VWAP table was built above.
if vw_tab is not None:
    print(" -", vwap_path)
print(" -", ev_out)


=== Prospección MVC · Rebote vs Ruptura (solo primer toque en estáticas) ===
Eventos analizados: 3930  | Zonas: 8

> Resumen por ZONA (tasas y lifts en puntos porcentuales):


Unnamed: 0_level_0,rebound_rate_baseline,reb_rate_exh0,reb_rate_exh1,lift_exhaustion_pp,support_exh1,reb_rate_imp0,reb_rate_imp1,lift_impulse_pp,support_imp1
zone_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
USA_IBH,0.76,0.775,0.721,-0.054,154,0.761,0.754,-0.007,61
USA_IBL,0.723,0.69,0.823,0.133,130,0.719,0.771,0.052,48
VAH_D1,0.653,0.662,0.622,-0.041,74,0.64,0.818,0.178,22
VAL_D1,0.602,0.62,0.547,-0.073,64,0.607,0.529,-0.078,17
PDH_prev,0.594,0.576,0.636,0.061,88,0.589,0.652,0.063,23
VWAP,0.594,0.595,0.589,-0.006,297,0.6,0.529,-0.071,119
POC_D1,0.582,0.613,0.495,-0.118,93,0.587,0.519,-0.069,27
PDL_prev,0.506,0.508,0.5,-0.008,58,0.509,0.478,-0.031,23



> VWAP · Rebate por pendiente (down/flat/up) y exhaustion_flag:


Unnamed: 0_level_0,Unnamed: 1_level_0,reb_rate_exh0,reb_rate_exh1,support_exh0,support_exh1
zone_type,vwap_slope_class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
VWAP,down,0.588,0.57,520,142
VWAP,flat,0.752,0.528,101,36
VWAP,up,0.567,0.63,441,119



Guardado:
 - experiments\runs\mvc_probe\mvc_probe_summary_by_zone.csv
 - experiments\runs\mvc_probe\mvc_probe_vwap_by_slope.csv
 - experiments\runs\mvc_probe\mvc_probe_events_dataset.parquet


## Mejora 

In [1]:
# notebooks/scratch/23_mvc_probe_v2.ipynb  (célula Python)
from pathlib import Path
import sys

import numpy as np
import pandas as pd

# --- locate project root
# Walk upward from the working directory until a folder containing both
# `data/` and `src/` is found. Seven candidates are tested: the working
# directory and its six nearest ancestors (the original loop fell through to
# the sixth ancestor without checking it; it is now checked explicitly).
ROOT = Path.cwd()
for _ in range(7):
    if (ROOT / "data").exists() and (ROOT / "src").exists():
        break
    ROOT = ROOT.parent
else:
    raise FileNotFoundError(
        "Could not find a project root (a directory containing both 'data' "
        f"and 'src') at or above {Path.cwd()}"
    )

# Make the project package importable from the detected root rather than
# from "../src", which only resolves when the working directory happens to be
# exactly one level below the root.
_SRC_DIR = str(ROOT / "src")
if _SRC_DIR not in sys.path:
    sys.path.append(_SRC_DIR)
from ppz.features.mvc import annotate_mvc_directional_flags, MvcFlagsParams

DF_PATH = ROOT / "data/interim/ES_5m_2021_2024.parquet"
EV_PATH = ROOT / "data/processed/events/events_labeled_2021_2024.parquet"
OUTDIR = ROOT / "experiments/runs/mvc_probe_v2"
OUTDIR.mkdir(parents=True, exist_ok=True)

# Base bars and labelled events.
df = pd.read_parquet(DF_PATH)
ev = pd.read_parquet(EV_PATH)

# Mantén EU+USA si lo deseas (opcional):
# df = df.copy(); df["session_id"] = df["NewSession"].cumsum()
# idx_in_sess = df.groupby("session_id").cumcount()
# ev = ev.copy(); ev["idx_in_session"] = ev["idx"].map(idx_in_sess)
# ev = ev[ev["idx_in_session"].between(108, 275)].reset_index(drop=True)

# 1) Annotate events. Per the original note this adds directional MVC flags,
#    the VWAP slope, and first-touch handling for static zones; the helper's
#    implementation is not visible here.
mvc_settings = {
    "tick_size": 0.25,
    "mvc_lower": 1/3,
    "mvc_upper": 2/3,
    "vwap_slope_window": 20,
    "vwap_flat_th_ticks_per_bar": 0.05,
    "first_touch_static": True,
}
params = MvcFlagsParams(**mvc_settings)
ev_mvc = annotate_mvc_directional_flags(df, ev, params)

# 2) Prospecting subset: REBOUND vs BREAKOUT only (other labels dropped)
is_rebound_or_breakout = ev_mvc["label"].isin(["rebound", "breakout"])
ev_rb = ev_mvc.loc[is_rebound_or_breakout].copy()
# Binary target: 1 = rebound, 0 = breakout.
ev_rb["label_bin"] = ev_rb["label"].eq("rebound").astype(int)

def rate_rebound(s: pd.Series) -> float:
    """Return the fraction of values in *s* equal to 1, or NaN when *s* is empty."""
    if not len(s):
        return np.nan
    hits = s == 1
    return float(hits.mean())

# --- A) Baseline rebound rate per zone, computed on the annotated subset
labels_by_zone = ev_rb.groupby("zone_type")["label_bin"]
base = labels_by_zone.apply(rate_rebound)
base = base.rename("reb_baseline")

# --- B) Lifts por flags direccionales
def two_rate(df_, flag):
    """Rebound rate per zone for flag == 0 and flag == 1, with lift and support.

    Args:
        df_: Frame with ``zone_type``, ``label_bin`` and the ``flag`` column.
        flag: Name of the 0/1 flag column to split on.

    Returns:
        DataFrame indexed by zone with columns ``reb_rate_<flag>0``,
        ``reb_rate_<flag>1``, ``lift_<flag>_pp`` (rate at 1 minus rate at 0)
        and ``support_<flag>1`` (number of rows with flag == 1).
    """
    per_cell = df_.groupby(["zone_type", flag], dropna=False)["label_bin"]

    def _share_of_ones(values):
        return (values == 1).mean() if len(values) else np.nan

    col_off = f"reb_rate_{flag}0"
    col_on = f"reb_rate_{flag}1"

    # Rates per (zone, flag value); both flag values are forced as columns
    # so a value that never occurs shows up as NaN.
    rates = per_cell.apply(_share_of_ones).unstack()
    rates = rates.reindex(columns=[0, 1])
    rates.columns = [col_off, col_on]

    # Row counts per cell; absent cells become 0 before the int cast.
    counts = per_cell.size().unstack()
    counts = counts.reindex(columns=[0, 1]).fillna(0).astype(int)

    rates[f"lift_{flag}_pp"] = rates[col_on] - rates[col_off]
    rates[f"support_{flag}1"] = counts[1]
    return rates



# One rate/lift/support table per directional flag.
tab_exh_up    = two_rate(ev_rb, "exh_up")
tab_exh_down  = two_rate(ev_rb, "exh_down")
tab_rej_up    = two_rate(ev_rb, "reject_up")
tab_rej_down  = two_rate(ev_rb, "reject_down")

# Join everything on the zone index, starting from the baseline.
summary = base.to_frame()
for t in [tab_exh_up, tab_exh_down, tab_rej_up, tab_rej_down]:
    summary = summary.join(t, how="outer")

# Sort and round. The baseline column is rounded too: its name does not
# contain "reb_rate", so it was previously left at full precision next to
# 3-decimal columns.
summary = summary.sort_values("reb_baseline", ascending=False)
for c in summary.columns:
    if c == "reb_baseline" or "reb_rate" in c or "lift_" in c:
        summary[c] = summary[c].round(3)

summary_path = OUTDIR/"mvc_flags_dir_summary_by_zone.csv"
summary.to_csv(summary_path, index=True)

print("=== MVC direccional · Rebote vs Ruptura (primer toque en estáticas) ===")
print(f"Eventos analizados: {len(ev_rb)}  | Zonas: {ev_rb['zone_type'].nunique()}")
display(summary)

# --- C) VWAP zones: condition on slope class (down / flat / up)
vwap_zone_names = ["VWAP", "VWAP_p1s", "VWAP_m1s"]
vw = ev_rb[ev_rb["zone_type"].isin(vwap_zone_names)].copy()
vw_csv_paths = []  # CSVs written below; stays empty when there are no VWAP events
if len(vw):
    def vwap_flag_block(df_, flag):
        """Rebound rate and support per (zone, slope class), split by a 0/1 flag.

        Returns a DataFrame indexed by (zone_type, vwap_slope_class) with
        columns ``reb_rate_<flag>0/1`` and ``support_<flag>0/1``.
        """
        per_cell = df_.groupby(["zone_type", "vwap_slope_class", flag], dropna=False)["label_bin"]

        # Rates, with both flag values forced as columns.
        rates = per_cell.apply(lambda s: (s==1).mean() if len(s) else np.nan).unstack()
        rates = rates.reindex(columns=[0, 1])
        rates.columns = [f"reb_rate_{flag}0", f"reb_rate_{flag}1"]

        # Row counts per cell; absent cells become 0 before the int cast.
        counts = per_cell.size().unstack()
        counts = counts.reindex(columns=[0, 1]).fillna(0).astype(int)
        counts.columns = [f"support_{flag}0", f"support_{flag}1"]

        return rates.join(counts)

    vw_exh_up   = vwap_flag_block(vw, "exh_up")
    vw_exh_down = vwap_flag_block(vw, "exh_down")
    vw_rej_up   = vwap_flag_block(vw, "reject_up")
    vw_rej_down = vwap_flag_block(vw, "reject_down")

    # Save each table and remember its path for the listing at the end.
    vw_exports = [
        (vw_exh_up, OUTDIR/"vwap_by_slope_exh_up.csv"),
        (vw_exh_down, OUTDIR/"vwap_by_slope_exh_down.csv"),
        (vw_rej_up, OUTDIR/"vwap_by_slope_reject_up.csv"),
        (vw_rej_down, OUTDIR/"vwap_by_slope_reject_down.csv"),
    ]
    for _table, _csv_path in vw_exports:
        _table.to_csv(_csv_path)
        vw_csv_paths.append(_csv_path)

    # Only the two exhaustion tables are displayed; the reject tables are
    # saved but not shown.
    print("\n> VWAP · Rebote por pendiente y flags (exh/reject):")
    display(vw_exh_up.round(3))
    display(vw_exh_down.round(3))

# Annotated dataset for later analysis
ev_rb_out = OUTDIR/"mvc_flags_dir_events_dataset.parquet"
ev_rb.to_parquet(ev_rb_out, engine="pyarrow", compression="zstd", index=False)
print("\nGuardado:")
print(" -", summary_path)
for _csv_path in vw_csv_paths:
    print(" -", _csv_path)
print(" -", ev_rb_out)


=== MVC direccional · Rebote vs Ruptura (primer toque en estáticas) ===
Eventos analizados: 3930  | Zonas: 8


Unnamed: 0_level_0,reb_baseline,reb_rate_exh_up0,reb_rate_exh_up1,lift_exh_up_pp,support_exh_up1,reb_rate_exh_down0,reb_rate_exh_down1,lift_exh_down_pp,support_exh_down1,reb_rate_reject_up0,reb_rate_reject_up1,lift_reject_up_pp,support_reject_up1,reb_rate_reject_down0,reb_rate_reject_down1,lift_reject_down_pp,support_reject_down1
zone_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
USA_IBH,0.760417,0.776,0.737,-0.039,224,0.768,0.5,-0.268,16,0.76,,,0,0.76,,,0
USA_IBL,0.723404,0.727,0.676,-0.051,37,0.694,0.777,0.083,184,0.723,,,0,0.723,,,0
VAH_D1,0.652597,0.653,0.652,-0.001,92,0.644,0.702,0.058,47,0.653,,,0,0.653,,,0
VAL_D1,0.60223,0.612,0.556,-0.056,45,0.595,0.623,0.028,69,0.602,,,0,0.602,,,0
PDH_prev,0.593857,0.584,0.612,0.027,103,0.591,0.618,0.027,34,0.594,,,0,0.594,,,0
VWAP,0.593819,0.599,0.575,-0.024,292,0.588,0.621,0.034,243,0.594,,,0,0.594,1.0,0.406,1
POC_D1,0.582173,0.576,0.602,0.026,83,0.592,0.549,-0.043,82,0.582,,,0,0.582,,,0
PDL_prev,0.506024,0.505,0.512,0.007,41,0.51,0.491,-0.019,55,0.506,,,0,0.506,,,0



> VWAP · Rebote por pendiente y flags (exh/reject):


Unnamed: 0_level_0,Unnamed: 1_level_0,reb_rate_exh_up0,reb_rate_exh_up1,support_exh_up0,support_exh_up1
zone_type,vwap_slope_class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
VWAP,down,0.583,0.589,460,202
VWAP,flat,0.762,0.5,101,36
VWAP,up,0.581,0.574,506,54


Unnamed: 0_level_0,Unnamed: 1_level_0,reb_rate_exh_down0,reb_rate_exh_down1,support_exh_down0,support_exh_down1
zone_type,vwap_slope_class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
VWAP,down,0.592,0.5,610,52
VWAP,flat,0.67,0.8,112,25
VWAP,up,0.558,0.633,394,166



Guardado:
 - c:\Users\jmbf2\OneDrive\Trading\Machine Learning\ZoneBasedPricePrediction\experiments\runs\mvc_probe_v2\mvc_flags_dir_summary_by_zone.csv
 - c:\Users\jmbf2\OneDrive\Trading\Machine Learning\ZoneBasedPricePrediction\experiments\runs\mvc_probe_v2\vwap_by_slope_exh_up.csv
 - c:\Users\jmbf2\OneDrive\Trading\Machine Learning\ZoneBasedPricePrediction\experiments\runs\mvc_probe_v2\vwap_by_slope_exh_down.csv
 - c:\Users\jmbf2\OneDrive\Trading\Machine Learning\ZoneBasedPricePrediction\experiments\runs\mvc_probe_v2\vwap_by_slope_reject_up.csv
 - c:\Users\jmbf2\OneDrive\Trading\Machine Learning\ZoneBasedPricePrediction\experiments\runs\mvc_probe_v2\vwap_by_slope_reject_down.csv
 - c:\Users\jmbf2\OneDrive\Trading\Machine Learning\ZoneBasedPricePrediction\experiments\runs\mvc_probe_v2\mvc_flags_dir_events_dataset.parquet
