### Librerias

In [None]:
import pandas as pd
from importlib import reload
import sys
sys.path.append("../src")

from ppz.io.paths import interim_path, events_path

from ppz.labeling.events import LabelParams, label_events, join_labels_with_events
import ppz.labeling.events as lab


### Importamos los datos pretratados de velas y eventos

In [18]:
df = pd.read_parquet("data/interim/ES_5m_2021_2024.parquet")
events_df = pd.read_parquet("data/processed/events/events_first_pass_2021_2024.parquet")
df.shape, events_df.shape

((282944, 28), (8541, 6))

In [19]:

df.to_parquet(interim_path("ES_5m_2021_2024.parquet"), engine="pyarrow", compression="zstd", index=False)
events_df.to_parquet(events_path("events_first_pass_2021_2024.parquet"), engine="pyarrow", compression="zstd", index=False)

df = pd.read_parquet(interim_path("ES_5m_2021_2024.parquet"))
events_df = pd.read_parquet(events_path("events_first_pass_2021_2024.parquet"))


In [20]:
reload(lab)

params = LabelParams(
    tick_size=0.25,
    H_horizon_bars=12,
    p_inval_ticks=6,
    X_rebound_ticks=10,
    Y_break_confirm_ticks=16,
    restrict_same_session=True,
)

labels_df = label_events(df, events_df, params)
events_labeled = join_labels_with_events(events_df, labels_df)

print("Distribución de etiquetas:")
print(events_labeled["label"].value_counts(dropna=False))

# KPIs rápidos por zona
by_zone = events_labeled.groupby("zone_type")["label"].value_counts().unstack(fill_value=0)
by_zone["total"] = by_zone.sum(1)
by_zone = by_zone.sort_values("total", ascending=False)
by_zone.head(10)


Distribución de etiquetas:
label
rebound     4173
breakout    2286
none        2082
Name: count, dtype: int64


label,breakout,none,rebound,total
zone_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
VWAP,552,433,807,1792
USA_IBH,311,328,817,1456
USA_IBL,314,311,692,1317
POC_D1,277,263,473,1013
PDH_prev,209,203,392,804
VAH_D1,202,182,377,761
VAL_D1,189,192,327,708
PDL_prev,232,170,288,690


### Guardamos el etiquetado

In [4]:
import pandas as pd, json, datetime
from dataclasses import asdict
from ppz.io.paths import events_path

# `events_labeled` viene de join_labels_with_events(...)
out_parquet = events_path("events_labeled_2021_2024.parquet")
events_labeled.to_parquet(out_parquet, engine="pyarrow", compression="zstd", index=False)

# (Opcional) metadatos del etiquetado
meta = {
    "created_at": datetime.datetime.now().isoformat(),
    "n_events": int(len(events_labeled)),
    "label_counts": events_labeled["label"].value_counts(dropna=False).to_dict(),
    "params": asdict(params),  # LabelParams usado
    "sources": {
        "df_base": "data/interim/ES_5m_2021_2024.parquet",
        "events_first_pass": "data/processed/events/events_first_pass_2021_2024.parquet",
    }
}
with open(events_path("events_labeled_2021_2024.meta.json"), "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2, ensure_ascii=False)

out_parquet


WindowsPath('C:/Users/jmbf2/OneDrive/Trading/Machine Learning/ZoneBasedPricePrediction/data/processed/events/events_labeled_2021_2024.parquet')

In [15]:
import numpy as np
from ppz.utils.session_tag import add_session_tag_by_index

df = add_session_tag_by_index(df, eu=(108,185), usa=(186,264))
df["session_tag"].value_counts()


session_tag
ASIA    122164
EU       80568
USA      80212
Name: count, dtype: int64

In [12]:
events_labeled["session_tag"] = events_labeled["idx"].map(df["session_tag"])

counts = events_labeled.groupby("session_tag")["label"].value_counts().unstack(fill_value=0)
counts["total"] = counts.sum(1)
rates = (counts.T / counts["total"]).T
counts, rates[["none","rebound","breakout"]].round(3)


(label        breakout  none  rebound  total
 session_tag                                
 ASIA               24   175      116    315
 USA              2262  1907     4057   8226,
 label         none  rebound  breakout
 session_tag                          
 ASIA         0.556    0.368     0.076
 USA          0.232    0.493     0.275)

In [13]:
tmp = events_labeled.pivot_table(index="zone_type", columns="session_tag",
                                 values="label", aggfunc=lambda s: (s=="none").mean())
none_rate_zone_sess = (tmp.reindex(columns=["USA","EU","ASIA"]).fillna(0)
                          .sort_values(["USA","EU","ASIA"], ascending=False))
none_rate_zone_sess.round(3).head(15)


session_tag,USA,EU,ASIA
zone_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
VAL_D1,0.265,0.0,0.455
POC_D1,0.249,0.0,0.543
PDL_prev,0.237,0.0,0.522
PDH_prev,0.236,0.0,0.579
VAH_D1,0.23,0.0,0.481
VWAP,0.228,0.0,0.615
USA_IBL,0.224,0.0,0.565
USA_IBH,0.211,0.0,0.559


In [14]:
mask = events_labeled["session_tag"].isin(["EU","USA"])
events_eu_usa = events_labeled[mask].reset_index(drop=True)
events_eu_usa.to_parquet("data/processed/events/events_labeled_2021_2024_EUUSA.parquet",
                         engine="pyarrow", compression="zstd", index=False)


In [8]:
# 1) ¿Los idx de events están en el rango del df?
print("idx min/max events:", int(events_labeled["idx"].min()), int(events_labeled["idx"].max()))
print("df len:", len(df))

# 2) ¿El session_id del evento coincide con el session_id en df en ese idx?
chk = events_labeled.head(20).copy()
chk["session_id_df"] = chk["idx"].map(df["session_id"])
chk[["idx","session_id","session_id_df","zone_type","label"]]


idx min/max events: 198 282920
df len: 282944


Unnamed: 0,idx,session_id,session_id_df,zone_type,label
0,198,1,1,USA_IBL,rebound
1,465,2,2,USA_IBH,rebound
2,468,2,2,VWAP,rebound
3,476,2,2,POC_D1,breakout
4,512,2,2,USA_IBH,breakout
5,514,2,2,VAH_D1,rebound
6,525,2,2,VAH_D1,none
7,530,2,2,USA_IBH,rebound
8,734,3,3,VAH_D1,breakout
9,799,3,3,USA_IBH,rebound
