In [None]:
# ====== Holiday labelling with one fixed window for every holiday ======
# Input files.
PATH_PROP = "events_with_duration_no_end_clean_propagated.csv"  # propagated event panel
PATH_HOL = "holidays.csv"                                       # one row per holiday

# Column names inside holidays.csv.
HOL_COUNTRY_COL = "Country Code"
HOL_DATE_COL = "Date"        # single holiday date (YYYY-MM-DD / etc.)
HOL_NAME_COL = "Holiday"     # holiday name; set to None when names are not needed

# Inclusive window around each holiday date D: [D - PRE_DAYS, D + POST_DAYS].
# Example: PRE_DAYS=1, POST_DAYS=2 covers D-1 .. D+2.
PRE_DAYS, POST_DAYS = 0, 4

# One canonical IANA timezone per country (the 12 countries in the panel).
COUNTRY_TZ = dict(
    AR="America/Argentina/Buenos_Aires",
    AU="Australia/Sydney",
    BR="America/Sao_Paulo",
    CA="America/Toronto",
    CN="Asia/Shanghai",
    DE="Europe/Berlin",
    FR="Europe/Paris",
    GB="Europe/London",
    JP="Asia/Tokyo",
    PL="Europe/Warsaw",
    TR="Europe/Istanbul",
    US="America/New_York",
)

# ====== CODE ======
import pandas as pd
import numpy as np
from zoneinfo import ZoneInfo
from collections import defaultdict

def norm_cc(x: str) -> str:
    """Normalize a country name or code to the ISO2 code used in this notebook.

    The input is converted with ``str()``, trimmed and upper-cased, then resolved
    in this order:
      1. known aliases / full names (e.g. "UK", "UNITED KINGDOM", "TÜRKIYE"),
      2. known ISO3 codes (e.g. "DEU" -> "DE"),
      3. anything else is returned as-is, so genuine ISO2 codes pass through.

    Aliases are checked first because a two-letter alias such as "UK" would
    otherwise be taken for a valid ISO2 code and never reach the alias table.

    Args:
        x: Country name, ISO2 code or ISO3 code (any case, optional padding).

    Returns:
        The ISO2 code when the input is recognised, otherwise the cleaned input.
    """
    code = str(x).strip().upper()
    iso3_to_iso2 = {"ARG":"AR","AUS":"AU","BRA":"BR","CAN":"CA","CHN":"CN","DEU":"DE",
                    "FRA":"FR","GBR":"GB","JPN":"JP","POL":"PL","TUR":"TR","USA":"US"}
    name_map = {
        "ARGENTINA":"AR","AUSTRALIA":"AU","BRAZIL":"BR","CANADA":"CA","CHINA":"CN",
        "GERMANY":"DE","FRANCE":"FR","UNITED KINGDOM":"GB","UK":"GB","JAPAN":"JP",
        "POLAND":"PL","TURKEY":"TR","TURKIYE":"TR","TÜRKIYE":"TR","UNITED STATES":"US",
        "UNITED STATES OF AMERICA":"US","USA":"US"
    }
    if code in name_map:
        return name_map[code]
    if code in iso3_to_iso2:
        return iso3_to_iso2[code]
    return code

# 1) Load events. Country codes are trimmed and upper-cased so they line up
#    with the COUNTRY_TZ keys and the normalized holiday country codes.
ev = pd.read_csv(PATH_PROP, parse_dates=["timestamp"])
ev["country"] = ev["country"].astype(str).str.upper().str.strip()

# 2) Load holidays; dates that cannot be parsed become NaT and are skipped below.
hol = pd.read_csv(PATH_HOL)
hol["country"] = hol[HOL_COUNTRY_COL].map(norm_cc)
hol["holiday_date"] = pd.to_datetime(hol[HOL_DATE_COL], errors="coerce").dt.date

# Per-country calendar: every date covered by [D - PRE_DAYS, D + POST_DAYS].
cal_dates = {}
name_by_date = defaultdict(list)  # (country, date) -> holiday names covering that date
collect_names = HOL_NAME_COL in hol.columns  # False when HOL_NAME_COL is None / absent
for cc, g in hol.groupby("country", sort=False):
    covered = set()
    for _, r in g.iterrows():
        if pd.isna(r["holiday_date"]):
            continue
        start = r["holiday_date"] - pd.Timedelta(days=PRE_DAYS)
        end   = r["holiday_date"] + pd.Timedelta(days=POST_DAYS)
        dr = pd.date_range(start, end, freq="D").date
        covered.update(dr)
        if collect_names:
            for d in dr:
                name_by_date[(cc, d)].append(str(r.get(HOL_NAME_COL, "")))
    cal_dates[cc] = covered

# 3) Convert timestamps to the local calendar date of each country.
#    tz-naive timestamps are assumed to be UTC; tz-aware ones are converted
#    directly (calling tz_localize on them would raise a TypeError).
parts = []
for cc, g in ev.groupby("country", sort=False):
    tz = COUNTRY_TZ.get(cc, "UTC")  # unknown countries fall back to UTC
    ts = g["timestamp"]
    if not isinstance(ts.dtype, pd.DatetimeTZDtype):
        ts = ts.dt.tz_localize("UTC")
    loc_date = ts.dt.tz_convert(ZoneInfo(tz)).dt.date

    gg = g.copy()
    gg["local_date"] = loc_date
    # membership in the country's holiday window
    covered = cal_dates.get(cc, set())
    gg["is_holiday_local"] = gg["local_date"].isin(covered)
    # holiday names (several names when windows of different holidays overlap)
    gg["holiday_names_local"] = [
        ", ".join(name_by_date.get((cc, d), [])) if (cc, d) in name_by_date else pd.NA
        for d in gg["local_date"]
    ]
    parts.append(gg)

# Rows come back grouped by country (order of first appearance), with a fresh index.
ev_local = pd.concat(parts, ignore_index=True)

# 4) save
out_path = "events_with_duration_no_end_clean_propagated_holiday_local.csv"
ev_local.to_csv(out_path, index=False)
print("Saved:", out_path)


Saved: events_with_duration_no_end_clean_propagated_holiday_local.csv


In [None]:
# === Local-time holiday labeling with dynamic windows and NAME CONCATENATION ===
import pandas as pd
import numpy as np
from zoneinfo import ZoneInfo
from collections import defaultdict

# ---- Input / output files ----
EV_LOCAL_CSV = "events_with_duration_no_end_clean_propagated_holiday_local (2).csv"  # or keep ev_local in memory
HOLIDAY_CSV = "holidays.csv"
OUT_CSV_HOL = "events_with_duration_no_end_clean_propagated_holiday_local_v2.csv"

# ---- Window applied when neither the CSV nor DYNAMIC_WINDOWS gives one ----
DEFAULT_PRE_DAYS, DEFAULT_POST_DAYS = 0, 0

# ---- Per-holiday (pre_days, post_days) rules ----
# A string key is a name substring valid for every country;
# a (country, name) tuple key applies to that exact holiday in that country.
DYNAMIC_WINDOWS = {
    "CHRISTMAS":          (7, 5),
    "SPRING FESTIVAL":    (1, 5),
    "GOLDEN WEEK":        (1, 5),
    "THANKSGIVING":       (1, 2),
    ("GB", "BOXING DAY"): (1, 2),
}

# ---- Country -> canonical IANA timezone (swap the city if another fits better) ----
COUNTRY_TZ = dict(
    AR="America/Argentina/Buenos_Aires",
    AU="Australia/Sydney",
    BR="America/Sao_Paulo",
    CA="America/Toronto",
    CN="Asia/Shanghai",
    DE="Europe/Berlin",
    FR="Europe/Paris",
    GB="Europe/London",
    JP="Asia/Tokyo",
    PL="Europe/Warsaw",
    TR="Europe/Istanbul",
    US="America/New_York",
)

def norm_cc(x: str) -> str:
    """Normalize a country string to the ISO2 code used in this notebook.

    The input is converted with ``str()``, trimmed and upper-cased, then resolved
    in this order:
      1. known aliases / full names (e.g. "UK", "UNITED KINGDOM", "TÜRKIYE"),
      2. known ISO3 codes (e.g. "DEU" -> "DE"),
      3. anything else is returned as-is, so genuine ISO2 codes pass through.

    Aliases are looked up before anything else: a two-letter alias such as "UK"
    would otherwise be accepted as if it were an ISO2 code.

    Args:
        x: Country name, ISO2 code or ISO3 code (any case, optional padding).

    Returns:
        The ISO2 code when the input is recognised, otherwise the cleaned input.
    """
    code = str(x).strip().upper()
    iso3_to_iso2 = {"ARG":"AR","AUS":"AU","BRA":"BR","CAN":"CA","CHN":"CN","DEU":"DE",
                    "FRA":"FR","GBR":"GB","JPN":"JP","POL":"PL","TUR":"TR","USA":"US"}
    names = {
        "ARGENTINA":"AR","AUSTRALIA":"AU","BRAZIL":"BR","CANADA":"CA","CHINA":"CN",
        "GERMANY":"DE","FRANCE":"FR","UNITED KINGDOM":"GB","UK":"GB","JAPAN":"JP",
        "POLAND":"PL","TURKEY":"TR","TURKIYE":"TR","TÜRKIYE":"TR","UNITED STATES":"US",
        "UNITED STATES OF AMERICA":"US","USA":"US",
    }
    if code in names:
        return names[code]
    if code in iso3_to_iso2:
        return iso3_to_iso2[code]
    return code

def pick_window(country: str, name: str, pre_csv, post_csv):
    """
    Decide (pre, post) days for one holiday row, in this priority order:
      1) If the CSV provides 'pre_days' and/or 'post_days', use them
         (a missing one of the two counts as 0).
      2) If DYNAMIC_WINDOWS has a (country, name) tuple rule for this exact
         holiday, use it.
      3) If DYNAMIC_WINDOWS has a string rule that is a substring of the
         holiday name, use the first such rule in dict order.
      4) Fall back to DEFAULT_PRE_DAYS / DEFAULT_POST_DAYS.

    Rule matching is case-insensitive on both sides: the holiday row and the
    rule keys are upper-cased before comparison, so a rule written as
    "Christmas" behaves the same as "CHRISTMAS".

    Args:
        country: Country code of the holiday row.
        name: Holiday name of the row.
        pre_csv: Per-row pre-days value from the CSV, or NaN/NA when absent.
        post_csv: Per-row post-days value from the CSV, or NaN/NA when absent.

    Returns:
        Tuple ``(pre_days, post_days)`` of ints.
    """
    # 1) explicit per-row values win over every rule
    if pd.notna(pre_csv) or pd.notna(post_csv):
        pre  = 0 if pd.isna(pre_csv)  else int(pre_csv)
        post = 0 if pd.isna(post_csv) else int(post_csv)
        return pre, post

    country_up = str(country).upper()
    name_up = str(name).upper()

    # 2) exact (country, name) rules
    for key, window in DYNAMIC_WINDOWS.items():
        if isinstance(key, tuple) and len(key) == 2:
            if (str(key[0]).upper(), str(key[1]).upper()) == (country_up, name_up):
                return int(window[0]), int(window[1])

    # 3) name-substring rules shared by all countries
    for key, window in DYNAMIC_WINDOWS.items():
        if isinstance(key, str) and key.upper() in name_up:
            return int(window[0]), int(window[1])

    # 4) global default
    return int(DEFAULT_PRE_DAYS), int(DEFAULT_POST_DAYS)

# ---- 1) Load events ----
# Reuse an in-memory `ev_local` when one exists (e.g. left by an earlier cell);
# otherwise read it from EV_LOCAL_CSV.
try:
    ev_local  # use in-memory df if already defined
except NameError:
    ev_local = pd.read_csv(EV_LOCAL_CSV, parse_dates=["timestamp"])

ev_local["country"] = ev_local["country"].astype(str).str.upper().str.strip()

# ---- 2) Load holidays ----
hol = pd.read_csv(HOLIDAY_CSV)
# Case-insensitive lookup of the three required columns; the default names are
# used when no column matches (the indexing below then raises a KeyError).
cols = {c.lower(): c for c in hol.columns}
cc_col   = cols.get("country code", "Country Code")
date_col = cols.get("date", "Date")
name_col = cols.get("holiday", "Holiday")

# Optional CSV-level per-row windows
pre_col  = next((c for c in ["pre_days","pre","preDays","PreDays"] if c in hol.columns), None)
post_col = next((c for c in ["post_days","post","postDays","PostDays"] if c in hol.columns), None)

hol["country"] = hol[cc_col].map(norm_cc)
hol["holiday_name"] = hol[name_col].astype(str)
hol["holiday_date"] = pd.to_datetime(hol[date_col], errors="coerce").dt.date

# ---- 3) Build coverage sets per country; collect names PER DATE and CONCAT ----
covered_dates = {}                     # country -> set of all covered dates
names_by_date = defaultdict(list)      # (country, date) -> list of names (de-duplicated when joined)
window_by_date = {}                    # (country, date) -> (pre, post) used (element-wise max if overlapping)

for _, r in hol.dropna(subset=["holiday_date"]).iterrows():
    cc   = r["country"]
    name = r["holiday_name"]
    pre, post = pick_window(cc, name,
                            r[pre_col] if pre_col else pd.NA,
                            r[post_col] if post_col else pd.NA)
    start = r["holiday_date"] - pd.Timedelta(days=pre)
    end   = r["holiday_date"] + pd.Timedelta(days=post)
    for d in pd.date_range(start, end, freq="D").date:
        covered_dates.setdefault(cc, set()).add(d)
        names_by_date[(cc, d)].append(name)
        prev = window_by_date.get((cc, d))
        if prev is None:
            window_by_date[(cc, d)] = (pre, post)
        else:
            window_by_date[(cc, d)] = (max(prev[0], pre), max(prev[1], post))

# ---- 4) Convert UTC -> local date by country; label membership & CONCAT names ----
parts = []
for cc, g in ev_local.groupby("country", sort=False):
    tz = COUNTRY_TZ.get(cc, "UTC")  # unknown countries fall back to UTC
    # tz-naive timestamps are treated as UTC; tz-aware ones are converted directly.
    # isinstance(dtype, DatetimeTZDtype) replaces the deprecated
    # pd.api.types.is_datetime64tz_dtype check.
    ts = g["timestamp"]
    if not isinstance(ts.dtype, pd.DatetimeTZDtype):
        ts = ts.dt.tz_localize("UTC")
    loc_date = ts.dt.tz_convert(ZoneInfo(tz)).dt.date

    gg = g.copy()
    gg["local_date"] = loc_date
    cov = covered_dates.get(cc, set())

    gg["is_holiday_local"] = gg["local_date"].isin(cov)

    # --- CONCATENATION: for dates covered by one or more holidays, dedupe and join names ---
    gg["holiday_names_local"] = [
        ", ".join(sorted(set(names_by_date.get((cc, d), [])))) if (cc, d) in names_by_date else pd.NA
        for d in gg["local_date"]
    ]

    # Expose the window used for that date (defaults for dates outside every window)
    gg["holiday_window_pre_days"] = [
        window_by_date.get((cc, d), (DEFAULT_PRE_DAYS, DEFAULT_POST_DAYS))[0] for d in gg["local_date"]
    ]
    gg["holiday_window_post_days"] = [
        window_by_date.get((cc, d), (DEFAULT_PRE_DAYS, DEFAULT_POST_DAYS))[1] for d in gg["local_date"]
    ]
    parts.append(gg)

# Rows come back grouped by country (order of first appearance), with a fresh index.
ev_local = pd.concat(parts, ignore_index=True)

# ---- 5) Save result with concatenated names ----
ev_local.to_csv(OUT_CSV_HOL, index=False)
print("Saved:", OUT_CSV_HOL)


  if pd.api.types.is_datetime64tz_dtype(ts):


Saved: events_with_duration_no_end_clean_propagated_holiday_local_v2.csv


In [None]:
# ===== FX builder (auto-detect needed pairs from ev_local & pull from IMF) =====
# Comments in ENGLISH.

import pandas as pd, numpy as np, re
from pathlib import Path

# Files: event panel in, IMF exchange-rate export in, daily FX table out.
EV_LOCAL_PATH = "events_with_duration_no_end_clean_propagated_holiday_local_v2.csv"
IMF_PATH = "dataset_2025-11-07T11_42_37.999821392Z_DEFAULT_INTEGRATION_IMF.STA_ER_4.0.1.csv"
FX_OUT = "fx_daily_rates.csv"

# Country -> local currency unit (LCU) for the 12 countries.
LCU = dict(
    AR="ARS", AU="AUD", BR="BRL", CA="CAD", CN="CNY", DE="EUR",
    FR="EUR", GB="GBP", JP="JPY", PL="PLN", TR="TRY", US="USD",
)

# LCU -> candidate spellings of the country name in the IMF file.
COUNTRY_NAMES_BY_LCU = {
    "ARS": ["Argentina", "ARGENTINA"],
    "TRY": ["Türkiye", "Turkey", "TURKEY", "Turkey, Republic of", "Türkiye, Republic of"],
    "JPY": ["Japan", "JAPAN"],
    "PLN": ["Poland", "POLAND"],
    "BRL": ["Brazil", "BRAZIL"],
    "CNY": ["China", "CHINA"],
    "AUD": ["Australia", "AUSTRALIA"],
    "CAD": ["Canada", "CANADA"],
    # Euro Area first; Germany / France are kept as fallbacks.
    "EUR": ["Euro Area", "EURO AREA", "Germany", "GERMANY", "France", "FRANCE"],
    "GBP": ["United Kingdom", "UNITED KINGDOM", "UK", "GBR", "Great Britain"],
    "USD": ["United States", "UNITED STATES", "USA", "United States of America"],
}

# 1) detect needed FX pairs from events (currency != LCU)
#    A pair is (base = currency the price is quoted in, quote = the country's LCU).
ev = pd.read_csv(EV_LOCAL_PATH, parse_dates=["timestamp"])
ev["country"]  = ev["country"].astype(str).str.upper().str.strip()
ev["currency"] = ev["currency"].astype(str).str.upper().str.strip()
# Countries missing from the LCU map get NaN here; NaN never equals the
# currency, so such rows also show up in `needs` (with a NaN quote).
ev["LCU"]      = ev["country"].map(LCU)
needs = (ev.loc[ev["currency"]!=ev["LCU"], ["currency","LCU"]]
           .drop_duplicates()
           .rename(columns={"currency":"base","LCU":"quote"}))
if needs.empty:
    # nothing to build; write an empty-but-valid file
    pd.DataFrame(columns=["date","base","quote","rate"]).to_csv(FX_OUT, index=False)
    fx_df = pd.read_csv(FX_OUT)  # legacy variable name
    print("[FX] No FX needed; wrote empty fx_daily_rates.csv")
else:
    # Everything from here on in this cell runs only when at least one pair is needed.
    print("[FX] Pairs needed:", needs.to_dict(orient="records"))

    # 2) load IMF wide file
    #    Read below via its COUNTRY and INDICATOR columns plus period-labelled columns.
    imf = pd.read_csv(IMF_PATH)

    def pick_rows(imf_df, country_names):
        """Return the rows of ``imf_df`` whose COUNTRY equals one of ``country_names``.

        The comparison covers the whole cell value, ignores case and treats each
        name literally (names are regex-escaped). An empty ``country_names``
        yields an empty frame with the same columns instead of failing on a
        scalar ``False`` mask.

        Args:
            imf_df: IMF table with a ``COUNTRY`` column.
            country_names: Iterable of accepted country spellings.

        Returns:
            The matching subset of ``imf_df`` (original index preserved).
        """
        country = imf_df["COUNTRY"].astype(str)
        # Start from an all-False Series so the mask is valid even with no names.
        mask = pd.Series(False, index=imf_df.index, dtype=bool)
        for nm in country_names:
            mask |= country.str.fullmatch(re.escape(str(nm)), case=False, na=False)
        return imf_df[mask]

    def time_columns(df):
        """List the columns of ``df`` whose label looks like a period.

        Accepted labels: ``YYYY``, ``YYYY[-]Qn`` (n in 1..4) and ``YYYY[-]Mmm``.
        Column order is preserved.
        """
        period_label = re.compile(r"^(\d{4})(?:[-]?(Q[1-4]|M\d{2}))?$|^\d{4}Q[1-4]$|^\d{4}M\d{2}$")
        matched = []
        for col in df.columns:
            if period_label.match(str(col)) is not None:
                matched.append(col)
        return matched

    def period_to_month_end(s):
        """Map a period label to the last calendar day of the period, as a date.

        ``YYYY`` -> 31 December, ``YYYY[-]Qn`` -> last day of the quarter,
        ``YYYY[-]Mmm`` -> last day of that month. Any other label gives None.
        """
        label = str(s)

        def _last_day(year, month):
            # Month-end through a monthly pandas Period.
            return pd.Period(f"{year}-{month:02d}", freq="M").end_time.date()

        if re.fullmatch(r"\d{4}", label):
            return _last_day(int(label), 12)

        quarterly = re.match(r"^(\d{4})-?Q([1-4])$", label)
        if quarterly:
            # Quarter n closes in month 3 * n.
            return _last_day(int(quarterly.group(1)), 3 * int(quarterly.group(2)))

        monthly = re.match(r"^(\d{4})-?M(\d{2})$", label)
        if monthly:
            return _last_day(int(monthly.group(1)), int(monthly.group(2)))

        return None

    def imf_series_USD_to_LCU(imf_df, lcu_code):
        """Build the USD -> ``lcu_code`` series from the wide IMF table.

        Rows are selected by the country names registered for ``lcu_code`` and by
        an INDICATOR containing "Domestic currency per US Dollar". Period columns
        are unpivoted, dated at period end, de-duplicated per date (last row
        wins) and coerced to numbers.

        Returns:
            DataFrame(date, base, quote, rate) sorted by date with base "USD";
            empty (same columns) when no row or no period column matches.
        """
        schema = ["date","base","quote","rate"]

        candidates = pick_rows(imf_df, COUNTRY_NAMES_BY_LCU.get(lcu_code, []))
        is_usd_rate = candidates["INDICATOR"].astype(str).str.contains(
            "Domestic currency per US Dollar", case=False, na=False)
        rows = candidates[is_usd_rate]
        if rows.empty:
            return pd.DataFrame(columns=schema)

        period_cols = time_columns(rows)
        if not period_cols:
            return pd.DataFrame(columns=schema)

        # wide -> long: one row per (country, indicator, period) holding a value
        tall = rows.melt(id_vars=["COUNTRY","INDICATOR"], value_vars=period_cols,
                         var_name="period", value_name="rate")
        tall = tall.dropna(subset=["rate"])
        tall["date"] = tall["period"].map(period_to_month_end)
        tall = tall.dropna(subset=["date"])
        tall = tall.sort_values("date").drop_duplicates("date", keep="last")

        result = tall[["date"]].assign(base="USD", quote=lcu_code,
                                       rate=pd.to_numeric(tall["rate"], errors="coerce"))
        result = result.dropna(subset=["rate"]).sort_values("date")
        return result.reset_index(drop=True)

    def monthly_to_daily(g):
        """Expand one pair's dated rates to every calendar day, carrying values forward.

        ``g`` needs the columns date, base, quote and rate. The result spans the
        first to the last date in ``g``; days without an observation take the
        most recent earlier rate.
        """
        ordered = g.sort_values("date")
        every_day = pd.date_range(ordered["date"].min(), ordered["date"].max(), freq="D").date
        calendar = pd.DataFrame({"date": every_day}).assign(
            base=ordered["base"].iloc[0],
            quote=ordered["quote"].iloc[0],
        )
        daily = calendar.merge(ordered[["date","rate"]], on="date", how="left")
        daily = daily.sort_values("date")
        daily["rate"] = daily["rate"].ffill()
        return daily

    # 3) build one daily series per needed pair
    fx_parts = []
    for base_ccy, quote_ccy in zip(needs["base"], needs["quote"]):
        usd_to_quote = imf_series_USD_to_LCU(imf, quote_ccy)

        if base_ccy == "USD":
            # direct case: the IMF series already is USD -> quote
            if usd_to_quote.empty:
                print(f"[WARN] IMF has no USD->{quote_ccy}. Upload a small CSV with columns [date,base,quote,rate] for USD{quote_ccy}.")
            else:
                fx_parts.append(monthly_to_daily(usd_to_quote))
            continue

        # generic cross: base->quote = (USD->quote) / (USD->base)
        usd_to_base = imf_series_USD_to_LCU(imf, base_ccy)
        if usd_to_quote.empty or usd_to_base.empty:
            print(f"[WARN] Cannot build {base_ccy}->{quote_ccy} via USD cross (missing USD->{quote_ccy} or USD->{base_ccy}).")
            continue

        # only dates present in both legs survive the inner merge
        paired = usd_to_quote.merge(usd_to_base, on="date", suffixes=("_uq","_ub"))
        cross = paired.assign(base=base_ccy, quote=quote_ccy,
                              rate=paired["rate_uq"] / paired["rate_ub"])
        fx_parts.append(monthly_to_daily(cross[["date","base","quote","rate"]]))

    # 4) combine, save, and report
    if fx_parts:
        fx_daily = pd.concat(fx_parts, ignore_index=True)
    else:
        fx_daily = pd.DataFrame(columns=["date","base","quote","rate"])
    fx_daily.to_csv(FX_OUT, index=False)
    fx_df = pd.read_csv(FX_OUT)  # legacy variable for old code paths
    n_pairs = len(fx_daily[["base","quote"]].drop_duplicates())
    print("Saved FX file:", FX_OUT, "| rows:", len(fx_daily),
          "| pairs:", n_pairs)



[FX] Pairs needed: [{'base': 'USD', 'quote': 'ARS'}, {'base': 'USD', 'quote': 'TRY'}]
Saved FX file: fx_daily_rates.csv | rows: 54514 | pairs: 2


In [None]:
# --- Stretch fx_daily_rates.csv to cover full event date range ---
import pandas as pd, numpy as np

EV = "events_with_duration_no_end_clean_propagated_holiday_local_v2.csv"
FX = "fx_daily_rates.csv"

# Calendar span of the events (dates taken from the `timestamp` column).
ev = pd.read_csv(EV, parse_dates=["timestamp"])
event_days = ev["timestamp"].dt.date
dmin, dmax = event_days.min(), event_days.max()

# Load the FX table and normalise it: lower-case headers, plain dates,
# upper-case currency codes, numeric rates, one row per (date, base, quote).
fx = pd.read_csv(FX)
fx.columns = [c.lower() for c in fx.columns]
date_col = next((c for c in ("date", "day") if c in fx.columns), "timestamp")
fx[date_col] = pd.to_datetime(fx[date_col], errors="coerce").dt.date
for ccy_col in ("base", "quote"):
    fx[ccy_col] = fx[ccy_col].astype(str).str.upper().str.strip()
fx["rate"] = pd.to_numeric(fx["rate"], errors="coerce")
fx = fx.dropna(subset=[date_col,"base","quote","rate"])
fx = fx.drop_duplicates([date_col,"base","quote"], keep="last")

def stretch(g):
    """Re-index one (base, quote) group onto every day from ``dmin`` to ``dmax``.

    Days without a rate take the last known earlier rate; days before the
    first known rate take the first known rate. Uses the module-level
    ``dmin``, ``dmax`` and ``date_col``.
    """
    known = g[[date_col,"rate"]].rename(columns={date_col:"date"})
    calendar = pd.DataFrame({"date": pd.date_range(dmin, dmax, freq="D").date}).assign(
        base=g["base"].iloc[0],
        quote=g["quote"].iloc[0],
    )
    filled = calendar.merge(known, on="date", how="left")
    # as-of: forward-fill first, then back-fill the leading gap
    filled["rate"] = filled["rate"].ffill().bfill()
    return filled

# Stretch every (base, quote) pair separately and stack the results.
# Iterating the groups (instead of groupby.apply) keeps the grouping columns
# in each group frame, which `stretch` reads, and avoids the deprecated
# "apply operated on the grouping columns" behaviour.
stretched_parts = [
    stretch(pair_rows)
    for _, pair_rows in fx.sort_values(date_col).groupby(["base","quote"])
]
# No pairs at all: still write a valid, header-only file.
fx_long = (pd.concat(stretched_parts) if stretched_parts
           else pd.DataFrame(columns=["date","base","quote","rate"]))

# Overwrites the FX file in place with the stretched table.
fx_long.to_csv(FX, index=False)
print("FX stretched to:", dmax, "| rows:", len(fx_long))


FX stretched to: 2025-11-05 | rows: 7268


  .apply(stretch))


In [None]:
# --- FX coverage audit ---
import pandas as pd, numpy as np

# Files audited: the event panel and the daily FX table.
EV = "events_with_duration_no_end_clean_propagated_holiday_local_v2.csv"
FX = "fx_daily_rates.csv"

# Country -> local currency unit (LCU).
LCU = dict(
    AR="ARS", AU="AUD", BR="BRL", CA="CAD", CN="CNY", DE="EUR",
    FR="EUR", GB="GBP", JP="JPY", PL="PLN", TR="TRY", US="USD",
)

# Events: normalised codes, LCU per country, and the calendar date of each event.
ev = pd.read_csv(EV, parse_dates=["timestamp"])
for text_col in ("country", "currency"):
    ev[text_col] = ev[text_col].astype(str).str.upper().str.strip()
ev["LCU"] = ev["country"].map(LCU)
ev["event_date"] = ev["timestamp"].dt.date

# Rows priced in a currency other than the country's LCU need an FX rate.
is_foreign = ev["currency"] != ev["LCU"]
need = (ev.loc[is_foreign, ["currency","LCU"]]
          .drop_duplicates()
          .rename(columns={"currency":"base","LCU":"quote"}))
print("Pairs needed:", need.to_dict(orient="records"))

# FX table, normalised and de-duplicated per (date, base, quote).
fx = pd.read_csv(FX)
fx.columns = [c.lower() for c in fx.columns]
date_col = next((c for c in ("date", "day") if c in fx.columns), "timestamp")
fx[date_col] = pd.to_datetime(fx[date_col], errors="coerce").dt.date
for ccy_col in ("base", "quote"):
    fx[ccy_col] = fx[ccy_col].astype(str).str.upper().str.strip()
fx["rate"] = pd.to_numeric(fx["rate"], errors="coerce")
fx = fx.dropna(subset=[date_col,"rate"])
fx = fx.drop_duplicates([date_col,"base","quote"], keep="last")

# per-pair date span
span = fx.groupby(["base","quote"])[date_col].agg(["min","max","count"])
span = span.reset_index().sort_values(["base","quote"])
print(span)

# uncovered rows by exact (date, base, quote) match
sub = ev.loc[is_foreign, ["event_date","currency","LCU"]].copy()
k = pd.MultiIndex.from_frame(sub.rename(columns={"currency":"base","LCU":"quote"}))
fx_map = fx.set_index([date_col,"base","quote"])["rate"]
no_rate = fx_map.reindex(k).isna().to_numpy()
miss = sub.index[no_rate]
print("Exact-match missing rows:", len(miss))

# show where they are
if len(miss):
    audit = ev.loc[miss, ["country","currency","LCU","event_date"]].value_counts()
    audit = audit.reset_index(name="n").sort_values("n", ascending=False)
    print(audit.head(10))

Pairs needed: [{'base': 'USD', 'quote': 'ARS'}, {'base': 'USD', 'quote': 'TRY'}]
  base quote         min         max  count
0  USD   ARS  2015-11-25  2025-11-05   3634
1  USD   TRY  2015-11-25  2025-11-05   3634
Exact-match missing rows: 0


In [None]:
# ===================== PPP block (FX -> LCU -> PPP -> US parity) =====================
# Robust, suffix-free (no _x/_y), with AS-OF fallback for FX coverage.
# Comments in ENGLISH.

import pandas as pd
import numpy as np
from pathlib import Path

# -------- CONFIG --------
# Inputs: event panel (used when no in-memory ev_local exists), PPP table, daily FX table.
PATH_EV_LOCAL_FALLBACK = "events_with_duration_no_end_clean_propagated_holiday_local_v2.csv"
PATH_PPP = "PPP__LCU_per_int___-_12_Countries_-_Latest.csv"
PATH_FX = "fx_daily_rates.csv"

# Outputs: enriched panel and the report of rows that still lack an FX rate.
OUT_CSV = "events_with_duration_no_end_clean_propagated_holiday_local_ppp_parity.csv"
NEEDS_FX_REPORT = "rows_needing_fx.csv"

# Country -> Local Currency (LCU) for the 12 markets.
CCY_LCU = dict(
    AR="ARS", AU="AUD", BR="BRL", CA="CAD", CN="CNY", DE="EUR",
    FR="EUR", GB="GBP", JP="JPY", PL="PLN", TR="TRY", US="USD",
)

# -------- Helpers --------
def norm_cc(x: str) -> str:
    """Normalize a country name/code to the ISO2 code used in this notebook.

    The input is converted with ``str()``, trimmed and upper-cased, then resolved
    in this order:
      1. known aliases / full names (e.g. "UK", "UNITED KINGDOM", "EURO AREA"),
      2. known ISO3 codes (e.g. "DEU" -> "DE"),
      3. anything else is returned as-is, so genuine ISO2 codes pass through.

    Aliases are looked up first: a two-letter alias such as "UK" would otherwise
    be accepted as if it were an ISO2 code and never be mapped to "GB".

    Args:
        x: Country name, ISO2 code or ISO3 code (any case, optional padding).

    Returns:
        The ISO2 code when the input is recognised, otherwise the cleaned input.
    """
    code = str(x).strip().upper()
    iso3_to_iso2 = {"ARG":"AR","AUS":"AU","BRA":"BR","CAN":"CA","CHN":"CN","DEU":"DE",
                    "FRA":"FR","GBR":"GB","JPN":"JP","POL":"PL","TUR":"TR","USA":"US"}
    names = {
        "ARGENTINA":"AR","AUSTRALIA":"AU","BRAZIL":"BR","CANADA":"CA","CHINA":"CN",
        "GERMANY":"DE","FRANCE":"FR","UNITED KINGDOM":"GB","UK":"GB","JAPAN":"JP",
        "POLAND":"PL","TURKEY":"TR","TURKIYE":"TR","TÜRKIYE":"TR",
        "UNITED STATES":"US","UNITED STATES OF AMERICA":"US","USA":"US",
        "EURO AREA":"DE"  # fallback to the EUR anchor when the PPP file uses 'Euro Area'
    }
    if code in names:
        return names[code]
    if code in iso3_to_iso2:
        return iso3_to_iso2[code]
    return code

def load_ev_local() -> pd.DataFrame:
    """Return a normalised copy of the event panel.

    Source: a copy of the module-level ``ev_local`` when that name exists,
    otherwise the CSV at PATH_EV_LOCAL_FALLBACK. ``appid`` becomes a trimmed
    string, ``country`` and ``currency`` become trimmed upper-case strings and
    ``timestamp`` is converted to datetime when it is not one already.

    Raises:
        ValueError: if the frame has no 'currency' column.
    """
    module_vars = globals()
    if "ev_local" in module_vars:
        frame = module_vars["ev_local"].copy()
    else:
        frame = pd.read_csv(PATH_EV_LOCAL_FALLBACK, parse_dates=["timestamp"])

    # normalize minimal schema
    frame["appid"] = frame["appid"].astype(str).str.strip()
    frame["country"] = frame["country"].astype(str).str.upper().str.strip()
    if "currency" not in frame.columns:
        raise ValueError("`ev_local` must contain a 'currency' column for FX->LCU.")
    frame["currency"] = frame["currency"].astype(str).str.upper().str.strip()

    already_datetime = pd.api.types.is_datetime64_any_dtype(frame["timestamp"])
    if not already_datetime:
        frame["timestamp"] = pd.to_datetime(frame["timestamp"], errors="coerce")
    return frame

def load_ppp(path_ppp: str) -> pd.DataFrame:
    """Load the PPP table and keep one conversion factor per country.

    Column detection:
      * value column: first column whose lower-cased name contains "lcu" and
        "int", or "ppp" and "lcu"; otherwise the first numeric column that is
        not the country or year column.
      * country column: first match from a fixed list of common names.
      * year column (optional): first match from a fixed list of common names.

    Rows without a usable PPP value are removed BEFORE the latest year is
    picked, so a country whose newest row is empty keeps its latest non-missing
    value instead of disappearing.

    Args:
        path_ppp: Path of the PPP CSV file.

    Returns:
        DataFrame with columns ['country', 'ppp_lcu_per_int_dollar'].

    Raises:
        ValueError: if the value column or the country column cannot be found.
    """
    ppp_raw = pd.read_csv(path_ppp)

    # country column and optional year/period column
    cc_col = next((n for n in ["Country Code","country_code","ISO3","iso3","Code","code",
                               "Country","Country Name","name"] if n in ppp_raw.columns), None)
    year_col = next((n for n in ["Year","year","Time","time","Date","date","Period","period"]
                     if n in ppp_raw.columns), None)

    # PPP value column: by name first, then the first other numeric column
    def _looks_like_ppp(col) -> bool:
        label = str(col).lower()
        return ("lcu" in label and "int" in label) or ("ppp" in label and "lcu" in label)

    ppp_val_col = next((c for c in ppp_raw.columns if _looks_like_ppp(c)), None)
    if ppp_val_col is None:
        num_cols = [c for c in ppp_raw.columns
                    if c not in (cc_col, year_col)  # never mistake year/code for the PPP value
                    and pd.api.types.is_numeric_dtype(ppp_raw[c])]
        if not num_cols:
            raise ValueError("Cannot detect PPP numeric column.")
        ppp_val_col = num_cols[0]

    if cc_col is None:
        raise ValueError("Cannot detect PPP country column.")

    p = ppp_raw.copy()
    p["country"] = p[cc_col].map(norm_cc)
    p["ppp_lcu_per_int_dollar"] = pd.to_numeric(p[ppp_val_col], errors="coerce")
    if year_col is not None:
        p["_ppp_year"] = pd.to_numeric(p[year_col], errors="coerce")

    # drop unusable rows first, then keep the most recent remaining row per country
    p = p.dropna(subset=["country","ppp_lcu_per_int_dollar"])
    if year_col is not None:
        p = (p.sort_values(["country","_ppp_year"], ascending=[True,False])
               .drop_duplicates(subset=["country"], keep="first"))
    return p[["country","ppp_lcu_per_int_dollar"]]

def load_fx_map(path_fx: str):
    """
    Load the FX file as a lookup Series.

    Expected columns (case-insensitive): date (or day/timestamp), base, quote,
    rate (or px/value). Currency codes are trimmed and upper-cased, rows with
    an unparseable date or rate are dropped, and repeated (date, base, quote)
    keys are collapsed to the last row so the result can be safely used with
    ``reindex`` (which rejects duplicate labels).

    Args:
        path_fx: Path of the FX CSV file.

    Returns:
        Series indexed by (date_only, base, quote) -> rate, or None when the
        file is missing, empty, or has no data rows.

    Raises:
        ValueError: if one of the required columns is missing.
    """
    p = Path(path_fx)
    if (not p.exists()) or (p.stat().st_size == 0):
        return None
    try:
        fx = pd.read_csv(p)
    except pd.errors.EmptyDataError:
        # non-zero size but nothing to parse (e.g. only whitespace)
        return None
    if fx.empty:
        return None

    cols = {str(c).strip().lower(): c for c in fx.columns}
    date_col  = cols.get("date")  or cols.get("day") or cols.get("timestamp")
    base_col  = cols.get("base")
    quote_col = cols.get("quote")
    rate_col  = cols.get("rate")  or cols.get("px") or cols.get("value")
    if not all([date_col, base_col, quote_col, rate_col]):
        raise ValueError("FX file must have columns: date, base, quote, rate.")

    fx[date_col]  = pd.to_datetime(fx[date_col], errors="coerce")
    fx[base_col]  = fx[base_col].astype(str).str.upper().str.strip()
    fx[quote_col] = fx[quote_col].astype(str).str.upper().str.strip()
    fx[rate_col]  = pd.to_numeric(fx[rate_col], errors="coerce")
    fx = fx.dropna(subset=[date_col, rate_col]).copy()
    fx["date_only"] = fx[date_col].dt.date

    # one rate per key: later rows override earlier ones
    fx = fx.drop_duplicates(subset=["date_only", base_col, quote_col], keep="last")
    return fx.set_index(["date_only", base_col, quote_col])[rate_col]

def fx_to_lcu_vectorized(sub: pd.DataFrame, fx_map: pd.Series) -> np.ndarray:
    """Look up currency -> LCU rates for every row of ``sub`` in one pass.

    For each row the exact key (event_date, currency, LCU) is tried first; when
    it has no rate, the reversed pair (event_date, LCU, currency) is tried and
    its reciprocal is used. Rows matching neither get NaN.
    """
    def _rates_for(key_columns):
        # exact-match lookup; keys missing from fx_map come back as NaN
        keys = pd.MultiIndex.from_frame(sub[key_columns])
        return fx_map.reindex(keys).to_numpy()

    direct = _rates_for(["event_date","currency","LCU"])
    inverse = _rates_for(["event_date","LCU","currency"])

    via_inverse = np.where(pd.isna(inverse), np.nan, 1.0 / inverse)
    return np.where(pd.isna(direct), via_inverse, direct)

def asof_fill_fx(ev: pd.DataFrame, fx_path: str) -> None:
    """
    AS-OF fallback for rows that still have no FX rate.

    For every row with currency != LCU and a missing 'fx_to_lcu', take the most
    recent rate on or before the event date (backward as-of); if the event is
    earlier than the first available rate, take the next later rate (forward).
    When the FX file only holds the reversed pair, its reciprocal is used.

    Rates are written back by row label, so the result is correct regardless of
    the order of the events (merge_asof needs date-sorted input and returns
    date-sorted output). Rows whose event date cannot be parsed are skipped.

    Args:
        ev: Event frame with columns 'currency', 'LCU', 'event_date' and
            'fx_to_lcu'. Mutated in place on column 'fx_to_lcu'.
        fx_path: Path of the FX CSV (date/day/timestamp, base, quote,
            rate/px/value; header case-insensitive). A missing or empty file
            leaves ``ev`` untouched.

    Returns:
        None.
    """
    fx_file = Path(fx_path)
    if (not fx_file.exists()) or (fx_file.stat().st_size == 0):
        return
    fx = pd.read_csv(fx_file)
    if fx.empty:
        return
    fx.columns = [c.lower() for c in fx.columns]
    date_col  = "date" if "date" in fx.columns else ("day" if "day" in fx.columns else "timestamp")
    base_col, quote_col = "base", "quote"
    rate_col = "rate" if "rate" in fx.columns else ("px" if "px" in fx.columns else "value")

    fx[date_col]  = pd.to_datetime(fx[date_col], errors="coerce")
    fx[base_col]  = fx[base_col].astype(str).str.upper().str.strip()
    fx[quote_col] = fx[quote_col].astype(str).str.upper().str.strip()
    fx[rate_col]  = pd.to_numeric(fx[rate_col], errors="coerce")
    fx = fx.dropna(subset=[date_col, rate_col]).sort_values(date_col)

    need_mask = (~ev["currency"].eq(ev["LCU"])) & (ev["fx_to_lcu"].isna())
    if not need_mask.any():
        return

    need_idx = ev.index[need_mask]
    pairs = ev.loc[need_idx, ["currency","LCU"]].drop_duplicates().itertuples(index=False)

    for base_ccy, quote_ccy in pairs:
        in_pair = ((ev.loc[need_idx, "currency"] == base_ccy) &
                   (ev.loc[need_idx, "LCU"] == quote_ccy)).to_numpy()
        idx_pair = need_idx[in_pair]

        # rates for this pair: direct quote first, else reciprocal of the reversed pair
        fx_pair = fx.loc[(fx[base_col] == base_ccy) & (fx[quote_col] == quote_ccy),
                         [date_col, rate_col]].copy()
        if fx_pair.empty:
            fx_inv = fx.loc[(fx[base_col] == quote_ccy) & (fx[quote_col] == base_ccy),
                            [date_col, rate_col]].copy()
            if not fx_inv.empty:
                fx_inv[rate_col] = 1.0 / fx_inv[rate_col]
                fx_pair = fx_inv
        if fx_pair.empty:
            continue

        # private column names avoid any clash in the merge; both keys share one dtype
        fx_pair = fx_pair.rename(columns={date_col: "_fx_date", rate_col: "_fx_rate"})
        fx_pair["_fx_date"] = fx_pair["_fx_date"].astype("datetime64[ns]")
        fx_pair = fx_pair.sort_values("_fx_date")

        # keep each row's label next to its date so rates can be written back by label
        left = pd.DataFrame({
            "_row": idx_pair,
            "event_date": pd.to_datetime(ev.loc[idx_pair, "event_date"], errors="coerce").to_numpy(),
        })
        left["event_date"] = left["event_date"].astype("datetime64[ns]")
        left = left.dropna(subset=["event_date"]).sort_values("event_date")
        if left.empty:
            continue

        back = pd.merge_asof(left, fx_pair, left_on="event_date",
                             right_on="_fx_date", direction="backward")
        rates = back["_fx_rate"].to_numpy(dtype=float)

        if np.isnan(rates).any():
            # events earlier than the first known rate: use the next later rate
            fwd = pd.merge_asof(left, fx_pair, left_on="event_date",
                                right_on="_fx_date", direction="forward")
            rates = np.where(np.isnan(rates), fwd["_fx_rate"].to_numpy(dtype=float), rates)

        ev.loc[back["_row"].to_numpy(), "fx_to_lcu"] = rates

# -------- MAIN --------
# Pipeline: attach PPP factor -> resolve FX to LCU -> convert prices to LCU ->
# divide by PPP (international $) -> compare with the US price of the same app.
# A) Load base table + PPP (suffix-free)
# load_ev_local() copies an in-memory `ev_local` when one exists, else reads the fallback CSV.
ev_local = load_ev_local()
ppp_use  = load_ppp(PATH_PPP)  # ['country','ppp_lcu_per_int_dollar']

# Clean any lingering PPP columns from previous runs
ppp_cols_to_drop = [c for c in ev_local.columns if c.startswith("ppp_lcu_per_int_dollar")]
if ppp_cols_to_drop:
    ev_local = ev_local.drop(columns=ppp_cols_to_drop)

# Attach PPP by mapping (one-to-one by country)
# A dict lookup is used instead of a merge, so no _x/_y suffixed columns can appear.
ppp_map = (ppp_use
           .drop_duplicates(subset=["country"], keep="first")
           .set_index("country")["ppp_lcu_per_int_dollar"]
           .to_dict())
ev_local["ppp_lcu_per_int_dollar"] = ev_local["country"].map(ppp_map)

print(f"[PPP] attached via map; missing (no PPP): {int(ev_local['ppp_lcu_per_int_dollar'].isna().sum())}")

# B) LCU & date & price/regular hygiene
ev_local["LCU"] = ev_local["country"].map(CCY_LCU)
# event_date is the calendar date of `timestamp` as stored (no timezone conversion here).
ev_local["event_date"] = ev_local["timestamp"].dt.date
if "regular" not in ev_local.columns:
    ev_local["regular"] = np.nan

# C) FX exact-match first
# An existing fx_to_lcu column is kept (coerced to numeric); only its NaN entries are filled below.
if "fx_to_lcu" in ev_local.columns:
    ev_local["fx_to_lcu"] = pd.to_numeric(ev_local["fx_to_lcu"], errors="coerce")
else:
    ev_local["fx_to_lcu"] = np.nan

# Prices already quoted in the country's LCU need no conversion: rate 1.0.
same_ccy = ev_local["currency"].eq(ev_local["LCU"])
ev_local.loc[same_ccy, "fx_to_lcu"] = 1.0

fx_map = load_fx_map(PATH_FX)
need_fx_mask = (~same_ccy) & ev_local["fx_to_lcu"].isna()
if fx_map is not None and need_fx_mask.any():
    sub = ev_local.loc[need_fx_mask, ["event_date","currency","LCU"]]
    ev_local.loc[need_fx_mask, "fx_to_lcu"] = fx_to_lcu_vectorized(sub, fx_map)

# D) AS-OF fallback for remaining FX gaps (backward->forward)
asof_fill_fx(ev_local, PATH_FX)

# E) Compute LCU amounts for PPP
# Pre-existing *_lcu_for_ppp values are preserved; only NaN cells with a known rate are computed.
if "price_lcu_for_ppp" not in ev_local.columns:
    ev_local["price_lcu_for_ppp"] = np.nan
if "regular_lcu_for_ppp" not in ev_local.columns:
    ev_local["regular_lcu_for_ppp"] = np.nan

can_mul_p = ev_local["price_lcu_for_ppp"].isna() & ev_local["fx_to_lcu"].notna()
ev_local.loc[can_mul_p, "price_lcu_for_ppp"] = ev_local.loc[can_mul_p, "price"] * ev_local.loc[can_mul_p, "fx_to_lcu"]

can_mul_r = ev_local["regular_lcu_for_ppp"].isna() & ev_local["fx_to_lcu"].notna() & ev_local["regular"].notna()
ev_local.loc[can_mul_r, "regular_lcu_for_ppp"] = ev_local.loc[can_mul_r, "regular"] * ev_local.loc[can_mul_r, "fx_to_lcu"]

# F) PPP normalization (LCU -> international $)
# errstate silences numpy divide/invalid warnings for zero or missing denominators.
with np.errstate(divide="ignore", invalid="ignore"):
    ev_local["price_ppp_intd"]   = ev_local["price_lcu_for_ppp"]   / ev_local["ppp_lcu_per_int_dollar"]
    ev_local["regular_ppp_intd"] = ev_local["regular_lcu_for_ppp"] / ev_local["ppp_lcu_per_int_dollar"]
    ev_local["ppp_price_over_regular_intd"] = ev_local["price_ppp_intd"] / ev_local["regular_ppp_intd"]

# G) US parity (no merge, suffix-free)
#    1) drop lingering parity columns from previous runs
ev_local = ev_local.drop(columns=[c for c in ev_local.columns
                                  if c.startswith("price_ppp_intd_US")
                                  or c == "ppp_parity_vs_us"], errors="ignore")
#    2) US series indexed by (appid, timestamp)
#       The last US row wins when an (appid, timestamp) key repeats.
us_series = (ev_local.loc[ev_local["country"]=="US", ["appid","timestamp","price_ppp_intd"]]
             .dropna(subset=["price_ppp_intd"])
             .drop_duplicates(subset=["appid","timestamp"], keep="last")
             .set_index(["appid","timestamp"])["price_ppp_intd"])
#    3) reindex to all rows
#       Exact key match: a row gets a US price only if a US row exists for the
#       same appid at the identical timestamp; otherwise the parity is NaN.
keys = pd.MultiIndex.from_arrays([ev_local["appid"].astype(str), ev_local["timestamp"]])
ev_local["price_ppp_intd_US"] = us_series.reindex(keys).to_numpy()
with np.errstate(divide="ignore", invalid="ignore"):
    ev_local["ppp_parity_vs_us"] = ev_local["price_ppp_intd"] / ev_local["price_ppp_intd_US"]

# H) Diagnostics & optional export of missing-FX rows
needs_fx = (~same_ccy) & ev_local["fx_to_lcu"].isna()
print(f"[QC] Missing PPP rows (no PPP): {int(ev_local['ppp_lcu_per_int_dollar'].isna().sum())}")
print(f"[QC] Rows still needing FX (currency!=LCU but fx_to_lcu is NaN): {int(needs_fx.sum())}")

if needs_fx.any():
    cols = ["appid","country","timestamp","currency","LCU","price","regular"]
    ev_local.loc[needs_fx, cols].to_csv(NEEDS_FX_REPORT, index=False)
    print(f"[WARN] Exported rows needing FX to: {NEEDS_FX_REPORT}")

# I) Optional sanity checks
dups = ev_local.duplicated(["appid","country","timestamp"]).sum()
print("[QC] Duplicates (appid,country,timestamp):", int(dups))
# US rows are compared with their own price, so their parity is expected to be 1.
us_rows = ev_local["country"].eq("US") & ev_local["price_ppp_intd_US"].notna()
if us_rows.any():
    diff = np.nanmax(np.abs(ev_local.loc[us_rows, "ppp_parity_vs_us"] - 1.0))
    print(f"[QC] Max |US parity - 1|: {diff:.3e}")

# J) Save final CSV
ev_local.to_csv(OUT_CSV, index=False)
print("Saved:", OUT_CSV)
print("Added columns:",
      [c for c in ["LCU","event_date","fx_to_lcu","price_lcu_for_ppp","regular_lcu_for_ppp",
                   "ppp_lcu_per_int_dollar","price_ppp_intd","regular_ppp_intd",
                   "ppp_price_over_regular_intd","price_ppp_intd_US","ppp_parity_vs_us"]
       if c in ev_local.columns])
# ===================== END PPP block =====================



[PPP] attached via map; missing (no PPP): 0
[QC] Missing PPP rows (no PPP): 0
[QC] Rows still needing FX (currency!=LCU but fx_to_lcu is NaN): 0
[QC] Duplicates (appid,country,timestamp): 0
[QC] Max |US parity - 1|: 0.000e+00
Saved: events_with_duration_no_end_clean_propagated_holiday_local_ppp_parity.csv
Added columns: ['LCU', 'event_date', 'fx_to_lcu', 'price_lcu_for_ppp', 'regular_lcu_for_ppp', 'ppp_lcu_per_int_dollar', 'price_ppp_intd', 'regular_ppp_intd', 'ppp_price_over_regular_intd', 'price_ppp_intd_US', 'ppp_parity_vs_us']
