## M1 monetary aggregates

In [8]:
# ---------------------------------------------
# M1 monthly -> daily (log-linear interpolation)
# ---------------------------------------------
import pandas as pd
import numpy as np
from pathlib import Path

# Paths
base_dir = Path(r"C:\Users\luish\OneDrive - University of Leeds\9. DISERTATION\2. Development\code")
data_dir = base_dir / "1. data"
out_dir  = base_dir / "2. preprocesing"
out_dir.mkdir(parents=True, exist_ok=True)

# Window
RAW_START   = "2014-12-31"
FINAL_START = "2015-01-01"
END_DATE    = "2024-12-31"

def _clean_numeric(s):
    """
    Convert one raw cell value into a number.

    Missing values come back as NaN, Python ints/floats are returned as
    float, and anything else is treated as text: commas are removed
    (e.g. '1,239,868,918,894.70'), surrounding whitespace is stripped and the
    remainder is parsed with ``pd.to_numeric``; unparseable text yields NaN.
    """
    if pd.isna(s):
        return np.nan
    if not isinstance(s, (int, float)):
        # Textual input: drop thousands separators before parsing.
        text = str(s).replace(',', '').strip()
        return pd.to_numeric(text, errors='coerce')
    return float(s)

def monthly_stock_to_daily_loginterp(df_m, date_col, value_col):
    """
    Log-linearly interpolate a monthly stock series to one value per day.

    Parameters
    ----------
    df_m : DataFrame containing at least `date_col` and `value_col`.
    date_col : name of the observation-date column (parsed with
        ``pd.to_datetime``; unparseable dates are dropped).
    value_col : name of the level column; each cell is cleaned with
        ``_clean_numeric`` so strings with thousands separators are accepted.

    Returns
    -------
    DataFrame with columns ``date``, ``log_m1_daily``, ``m1_daily`` and
    ``m1_rebased_2015m1_100``, one row per calendar day from FINAL_START
    to END_DATE.
    """
    m = df_m[[date_col, value_col]].copy()
    m.columns = ["date", "m1"]
    m["date"] = pd.to_datetime(m["date"], errors="coerce")
    m["m1"]   = m["m1"].map(_clean_numeric)
    in_window = (m["date"] >= RAW_START) & (m["date"] <= END_DATE)
    # Stable sort so that, among rows sharing a date, file order is preserved.
    m = m[in_window].dropna().sort_values("date", kind="stable")

    # The log is only defined for strictly positive levels; a zero or negative
    # value would become -inf/NaN and contaminate the interpolated neighbours.
    m = m[m["m1"] > 0]
    # reindex() refuses an index with duplicate labels, so keep only the last
    # row reported for each date.
    m = m.drop_duplicates(subset="date", keep="last")
    m = m.set_index("date")

    # logs -> linear interpolation in log space (constant growth rate between
    # two consecutive observations)
    m["log_m1"] = np.log(m["m1"])
    daily_index = pd.date_range(start=RAW_START, end=END_DATE, freq="D")
    d = m[["log_m1"]].reindex(daily_index).interpolate(method="linear")
    d = d.loc[FINAL_START:].copy()  # drop the RAW_START buffer day(s)
    d["m1_daily"] = np.exp(d["log_m1"])

    # optional normalized index, 2015-01-01 = 100 (for charts only)
    base_val = d.loc[pd.Timestamp("2015-01-01"), "m1_daily"]
    d["m1_rebased_2015m1_100"] = (d["m1_daily"] / base_val) * 100.0

    d.index.name = "date"
    return d.reset_index().rename(columns={"log_m1": "log_m1_daily"})

# -----------------------------
# EURO AREA (ECB file)
# cols: "DATE","TIME PERIOD","Monetary aggregate M1 ... "
# value = third column (selected below as the last column)
# -----------------------------
m1_eu = pd.read_csv(data_dir / "M1_EU.csv")
m1_eu.columns = m1_eu.columns.str.strip()
eu_date_col   = "DATE"
eu_value_col  = m1_eu.columns[-1]  # last column, expected to be the numeric one
eu_daily = monthly_stock_to_daily_loginterp(m1_eu, eu_date_col, eu_value_col)
eu_daily.to_csv(out_dir / "M1_EU_daily.csv", index=False)
print("Saved:", out_dir / "M1_EU_daily.csv")

# -----------------------------
# USA (adjust the names to match your CSV)
# if the file's first two columns are [DATE, M1], those are used;
# when in doubt, the second column is taken as the value.
# -----------------------------
m1_us = pd.read_csv(data_dir / "M1_USA.csv")
m1_us.columns = m1_us.columns.str.strip()
us_date_col  = m1_us.columns[0]
us_value_col = m1_us.columns[1]  # adjust if your header is 'M1'/'M1SL'
us_daily = monthly_stock_to_daily_loginterp(m1_us, us_date_col, us_value_col)
us_daily.to_csv(out_dir / "M1_USA_daily.csv", index=False)
print("Saved:", out_dir / "M1_USA_daily.csv")

# -----------------------------
# PERU (semicolon, d/m/Y)
# -----------------------------
m1_pe = pd.read_csv(data_dir / "M1_PEN.csv", sep=';')
m1_pe.columns = m1_pe.columns.str.strip()
pe_date_col  = "date"
# First column whose name starts with "m1" (case-insensitive), e.g. 'M1_PEN'
pe_value_col = [c for c in m1_pe.columns if c.lower().startswith("m1")][0]  # 'M1_PEN'
# Parse d/m/Y dates up front so the helper receives real datetimes
m1_pe["date"] = pd.to_datetime(m1_pe["date"], dayfirst=True, errors="coerce")
pe_daily = monthly_stock_to_daily_loginterp(m1_pe, pe_date_col, pe_value_col)
pe_daily.to_csv(out_dir / "M1_PEN_daily.csv", index=False)
print("Saved:", out_dir / "M1_PEN_daily.csv")

# -----------------------------
# SOUTH AFRICA (semicolon; extra cols ZAR, DEX..., USD)
# the 'ZAR' column (local currency) is used; USD and the exchange rate are ignored.
# NOTE(review): unlike the Peru file, dates here are not parsed with
# dayfirst=True -- confirm M1_ZAR.csv does not use d/m/Y dates.
# -----------------------------
m1_za = pd.read_csv(data_dir / "M1_ZAR.csv", sep=';')
m1_za.columns = m1_za.columns.str.replace('\ufeff','', regex=False).str.strip()
# Some CSVs ship headers such as 'ZAR ' with stray spaces; strip them.
# (The headers were already stripped on the previous line, so this rename
# maps every name to itself.)
rename_map = {c: c.strip() for c in m1_za.columns}
m1_za = m1_za.rename(columns=rename_map)
za_date_col  = "date"
za_value_col = "ZAR"
za_daily = monthly_stock_to_daily_loginterp(m1_za, za_date_col, za_value_col)
za_daily.to_csv(out_dir / "M1_ZAR_daily.csv", index=False)
print("Saved:", out_dir / "M1_ZAR_daily.csv")





Saved: C:\Users\luish\OneDrive - University of Leeds\9. DISERTATION\2. Development\code\2. preprocesing\M1_EU_daily.csv
Saved: C:\Users\luish\OneDrive - University of Leeds\9. DISERTATION\2. Development\code\2. preprocesing\M1_USA_daily.csv
Saved: C:\Users\luish\OneDrive - University of Leeds\9. DISERTATION\2. Development\code\2. preprocesing\M1_PEN_daily.csv
Saved: C:\Users\luish\OneDrive - University of Leeds\9. DISERTATION\2. Development\code\2. preprocesing\M1_ZAR_daily.csv


## Interest rate - 3 months

In [9]:
# ---------------------------------------------
# 3M Interest Rate (monthly) -> Daily (step/ffill)
# ---------------------------------------------
import pandas as pd
from pathlib import Path

# Paths
base_dir = Path(r"C:\Users\luish\OneDrive - University of Leeds\9. DISERTATION\2. Development\code")
data_dir = base_dir / "1. data"
out_dir  = base_dir / "2. preprocesing"
out_dir.mkdir(parents=True, exist_ok=True)

# Date window
RAW_START   = "2014-12-31"   # buffer day
FINAL_START = "2015-01-01"
END_DATE    = "2024-12-31"

def _coerce_find_column(cols, target):
    """
    Find `target` in `cols` tolerating BOM/spaces/case diffs.

    An exact match wins. Otherwise names are compared after removing a BOM,
    surrounding whitespace, inner spaces and case; the match is accepted only
    if it is unique.

    Parameters
    ----------
    cols : iterable of column labels (e.g. ``df.columns``).
    target : the column name being looked for.

    Returns
    -------
    The matching label exactly as it appears in `cols`.

    Raises
    ------
    KeyError
        If nothing matches, or if several labels match after normalisation
        (the message says which of the two happened).
    """
    def _norm(label):
        # str() keeps non-string labels (e.g. integer headers) from crashing.
        return str(label).replace('\ufeff', '').strip().lower().replace(' ', '')

    names = list(cols)

    # exact
    for c in names:
        if c == target:
            return c

    # tolerant
    wanted = _norm(target)
    cands = [c for c in names if _norm(c) == wanted]
    if len(cands) == 1:
        return cands[0]
    if cands:
        # Several headers collapse to the same normalised name: do not guess.
        raise KeyError(
            f"Column '{target}' is ambiguous; candidates: {cands}. Available: {names}"
        )
    raise KeyError(f"Column '{target}' not found. Available: {names}")

def monthly_rate_to_daily_ffill(
    filepath: Path,
    date_col: str,
    rate_col: str,
    out_name: str,
    source_unit: str = "percent"  # 'percent' (e.g., 0.03 or 6.15) or 'decimal' (0.0003 or 0.0615)
) -> pd.DataFrame:
    """
    Convert a MONTHLY 3M interest rate series to DAILY by forward-fill within month.
    Saves both percent and decimal columns.

    Parameters
    ----------
    filepath : CSV file to read (default comma separator).
    date_col, rate_col : header names, resolved with ``_coerce_find_column``.
    out_name : file name written inside ``out_dir``.
    source_unit : unit of the raw values, 'percent' or 'decimal'.

    Returns
    -------
    DataFrame with columns ``date``, ``rate_percent``, ``rate_decimal``, one
    row per calendar day from FINAL_START to END_DATE. The same frame is
    written to ``out_dir / out_name``.

    Raises
    ------
    KeyError if a column cannot be resolved; ValueError for an unknown
    `source_unit`.
    """
    df = pd.read_csv(filepath)
    # Clean headers
    df.columns = df.columns.str.replace('\ufeff','', regex=False).str.strip()

    # Resolve column names robustly
    date_col = _coerce_find_column(df.columns, date_col)
    rate_col = _coerce_find_column(df.columns, rate_col)

    # Keep & rename
    m = df[[date_col, rate_col]].rename(columns={date_col: "date", rate_col: "rate_raw"}).copy()
    m["date"] = pd.to_datetime(m["date"], errors="coerce")
    m["rate_raw"] = pd.to_numeric(m["rate_raw"], errors="coerce")

    # Only the upper bound is applied here. Observations dated before
    # RAW_START are kept on purpose: a forward-fill must be seeded by the
    # latest observation at or before the first day of the window, otherwise
    # a series with no row exactly on the first day starts with NaNs.
    # (Rows with an unparseable date compare False and are dropped as well.)
    m = m[m["date"] <= END_DATE].sort_values("date", kind="stable")
    # reindex() refuses duplicate labels; keep the last row given for a date.
    m = m.drop_duplicates(subset="date", keep="last")

    # Units -> percent & decimal
    unit = source_unit.lower()
    if unit == "percent":
        # Examples: EU ~0.06 (0.06%), USA TB3MS ~0.03 (%), ZAR ~6.0 (%), PEN ~2.9 (%)
        m["rate_percent"] = m["rate_raw"]
        m["rate_decimal"] = m["rate_raw"] / 100.0
    elif unit == "decimal":
        m["rate_decimal"] = m["rate_raw"]
        m["rate_percent"] = m["rate_raw"] * 100.0
    else:
        raise ValueError("source_unit must be 'percent' or 'decimal'.")

    # Map to daily with forward-fill (step per month): fill on the union of
    # observation dates and calendar days, then keep the calendar days only.
    daily_index = pd.date_range(start=RAW_START, end=END_DATE, freq="D")
    obs = m.set_index("date")[["rate_percent", "rate_decimal"]]
    d = obs.reindex(obs.index.union(daily_index)).ffill().reindex(daily_index)
    d = d.loc[FINAL_START:].copy()
    d.index.name = "date"
    d = d.reset_index()

    # Save
    out_path = out_dir / out_name
    d.to_csv(out_path, index=False)
    print(f"Saved: {out_path}")
    return d

# Each call below reads one monthly CSV from data_dir, expands it to a daily
# step series and writes the result into out_dir. The calls are top-level
# expressions, so in a notebook only the DataFrame returned by the LAST call
# is shown as the cell output.

# -----------------------------
# Euro Area (OECD IR3TIB01EZM156N) -> in percent
# -----------------------------
monthly_rate_to_daily_ffill(
    filepath  = data_dir / "interest_rate_3m_EU.csv",
    date_col  = "observation_date",
    rate_col  = "IR3TIB01EZM156N",
    out_name  = "interest_rate_3m_EU_daily.csv",
    source_unit = "percent"
)

# -----------------------------
# USA (FRED TB3MS) -> in percent
# -----------------------------
monthly_rate_to_daily_ffill(
    filepath  = data_dir / "interest_rate_3m_USA.csv",
    date_col  = "observation_date",
    rate_col  = "TB3MS",
    out_name  = "interest_rate_3m_USA_daily.csv",
    source_unit = "percent"
)

# -----------------------------
# Peru (BCRP monthly average) -> in percent
# Note: 2015-03 is missing in the source; the daily forward-fill carries the
# previous month's value across that gap.
# -----------------------------
monthly_rate_to_daily_ffill(
    filepath  = data_dir / "interest_rate_3m_PEN.csv",
    date_col  = "date",
    rate_col  = "1_dia-3_meses_average",
    out_name  = "interest_rate_3m_PEN_daily.csv",
    source_unit = "percent"
)

# -----------------------------
# South Africa (OECD IR3TIB01ZAM156N) -> in percent
# -----------------------------
monthly_rate_to_daily_ffill(
    filepath  = data_dir / "interest_rate_3m_ZAR.csv",
    date_col  = "observation_date",
    rate_col  = "IR3TIB01ZAM156N",
    out_name  = "interest_rate_3m_ZAR_daily.csv",
    source_unit = "percent"
)


Saved: C:\Users\luish\OneDrive - University of Leeds\9. DISERTATION\2. Development\code\2. preprocesing\interest_rate_3m_EU_daily.csv
Saved: C:\Users\luish\OneDrive - University of Leeds\9. DISERTATION\2. Development\code\2. preprocesing\interest_rate_3m_USA_daily.csv
Saved: C:\Users\luish\OneDrive - University of Leeds\9. DISERTATION\2. Development\code\2. preprocesing\interest_rate_3m_PEN_daily.csv
Saved: C:\Users\luish\OneDrive - University of Leeds\9. DISERTATION\2. Development\code\2. preprocesing\interest_rate_3m_ZAR_daily.csv


Unnamed: 0,date,rate_percent,rate_decimal
0,2015-01-01,6.007619,0.060076
1,2015-01-02,6.007619,0.060076
2,2015-01-03,6.007619,0.060076
3,2015-01-04,6.007619,0.060076
4,2015-01-05,6.007619,0.060076
...,...,...,...
3648,2024-12-27,7.800000,0.078000
3649,2024-12-28,7.800000,0.078000
3650,2024-12-29,7.800000,0.078000
3651,2024-12-30,7.800000,0.078000


## Industrial production

In [13]:
# ------------------------------------------------------------
# Industrial Production -> Seasonally Adjusted -> Daily (log-linear)
# ------------------------------------------------------------
import pandas as pd
import numpy as np
from pathlib import Path
import warnings

# Paths
base_dir = Path(r"C:\Users\luish\OneDrive - University of Leeds\9. DISERTATION\2. Development\code")
data_dir = base_dir / "1. data"
out_dir  = base_dir / "2. preprocesing"
out_dir.mkdir(parents=True, exist_ok=True)

# Date window
RAW_START   = "2014-12-31"
FINAL_START = "2015-01-01"
END_DATE    = "2024-12-31"

# ------------- helpers -------------
def _clean_numeric(x):
    """
    Coerce a single cell to a number.

    NaN/None stay NaN, Python ints and floats become float, and any other
    value is stringified, has its commas removed and whitespace stripped, and
    is parsed with ``pd.to_numeric`` (NaN when it cannot be parsed).
    """
    if pd.isna(x):
        result = np.nan
    elif isinstance(x, (int, float)):
        result = float(x)
    else:
        without_commas = str(x).replace(',', '')
        result = pd.to_numeric(without_commas.strip(), errors="coerce")
    return result

def _ensure_monthly_contiguous(df, date_col, value_col):
    """
    Build a gap-free month-start series covering FINAL_START..END_DATE.

    Dates are parsed (bad ones dropped), values cleaned with
    ``_clean_numeric``, every date is snapped to the first day of its month
    and the last non-missing value per month is kept. Missing months are then
    filled by linear interpolation in both directions, so the result can be
    fed to a seasonal-adjustment routine.

    Returns a one-column DataFrame (``val``) indexed by month start, with the
    index named ``date``.
    """
    frame = df[[date_col, value_col]].copy()
    frame.columns = ["date", "val"]
    frame["date"] = pd.to_datetime(frame["date"], errors="coerce")
    frame["val"] = frame["val"].map(_clean_numeric)

    # Restrict to the analysis window (both bounds inclusive).
    not_too_early = frame["date"] >= FINAL_START
    not_too_late = frame["date"] <= END_DATE
    frame = frame[not_too_early & not_too_late].dropna(subset=["date"])

    # Snap every observation to the first day of its month.
    month_start = frame["date"].dt.to_period("M").dt.to_timestamp(how="S")
    frame["date"] = month_start
    per_month = frame.groupby("date", as_index=True)["val"].last().to_frame()

    # Lay the data onto the complete month-start calendar.
    full_span = pd.date_range(start=FINAL_START, end=END_DATE, freq="MS")
    per_month = per_month.reindex(full_span)

    # Patch isolated holes before any seasonal adjustment is attempted.
    per_month["val"] = per_month["val"].interpolate(
        method="linear", limit_direction="both"
    )
    per_month.index.name = "date"
    return per_month

def seasonally_adjust_monthly(df, date_col, value_col, already_sa=False):
    """
    Produce a monthly (month-start) frame with columns ['date', 'ip_sa'].

    The input is first regularised with ``_ensure_monthly_contiguous``.
    With ``already_sa=True`` the cleaned values are returned unchanged under
    the name ``ip_sa``. Otherwise X-13 is attempted; if importing or running
    it raises, a warning is issued and the series is adjusted with a robust
    STL decomposition of its logarithm (seasonal component removed, result
    exponentiated back to levels).
    """
    monthly = _ensure_monthly_contiguous(df, date_col, value_col)

    # Nothing to adjust: just standardise the column name.
    if already_sa:
        return monthly.rename(columns={"val": "ip_sa"}).reset_index()

    try:
        from statsmodels.tsa.x13 import x13_arima_analysis
        x13_result = x13_arima_analysis(monthly["val"].dropna(), freq='M')
        adjusted = x13_result.seasadj.reindex(monthly.index)
    except Exception as e:
        # Deliberate best-effort: any X-13 problem routes to the STL fallback.
        warnings.warn(f"X-13 unavailable or failed ({e}). Falling back to STL multiplicative.")
        from statsmodels.tsa.seasonal import STL
        decomposition = STL(np.log(monthly["val"]), period=12, robust=True).fit()
        # In logs, trend + remainder is the series with seasonality removed.
        adjusted = np.exp(decomposition.trend + decomposition.resid)

    return pd.DataFrame({"date": monthly.index, "ip_sa": adjusted.values})

def sa_monthly_to_daily_loginterp(df_sa, value_col="ip_sa"):
    """
    Monthly SA index -> DAILY via log-linear interpolation.

    Parameters
    ----------
    df_sa : DataFrame with a datetime ``date`` column and `value_col`.
    value_col : name of the seasonally adjusted level column.

    Returns
    -------
    DataFrame with columns, in this order: ``date``, ``log_ip_daily_sa``,
    ``ip_daily_sa``, ``ip_rebased_2015m1_100``,
    ``log_ip_rebased_2015m1_100``; one row per calendar day from FINAL_START
    to END_DATE.
    """
    m = df_sa.copy()
    m = m[(m["date"] >= FINAL_START) & (m["date"] <= END_DATE)].dropna(subset=[value_col])
    m = m.set_index("date").sort_index()

    m["log_ip"] = np.log(m[value_col])
    daily_index = pd.date_range(start=RAW_START, end=END_DATE, freq="D")
    d = m[["log_ip"]].reindex(daily_index).interpolate(method="linear")
    d = d.loc[FINAL_START:].copy()
    d["ip_daily_sa"] = np.exp(d["log_ip"])
    d.index.name = "date"

    # Rebase to the first VALID daily value in Jan-2015. That is the
    # 2015-01-01 value whenever that day exists and is not NaN. Testing only
    # whether the day is in the index is not enough: the day can be present
    # yet NaN, which would turn both rebased columns into NaN.
    jan2015 = d.loc["2015-01-01":"2015-01-31", "ip_daily_sa"].dropna()
    if jan2015.empty:
        import warnings  # local import keeps this function self-contained
        warnings.warn("No valid value in Jan-2015; rebased columns will be NaN.")
        base_val = np.nan
    else:
        base_val = jan2015.iloc[0]
    d["ip_rebased_2015m1_100"] = (d["ip_daily_sa"] / base_val) * 100.0
    d["log_ip_rebased_2015m1_100"] = np.log(d["ip_rebased_2015m1_100"])

    return d.reset_index().rename(columns={"log_ip": "log_ip_daily_sa"})

# Every country follows the same three steps: load the CSV, obtain a monthly
# seasonally adjusted series, interpolate it to daily and write it to out_dir.

# ------------- EURO AREA (already SA) -------------
eu = pd.read_csv(data_dir / "INDPRO_EU.csv")
eu.columns = eu.columns.str.replace('\ufeff','', regex=False).str.strip()
# already_sa=True: the series is only cleaned and standardised, not re-adjusted
eu_sa_m = seasonally_adjust_monthly(eu, date_col="date", value_col="ind_pro_sa_index_2021", already_sa=True)
eu_sa_d = sa_monthly_to_daily_loginterp(eu_sa_m, value_col="ip_sa")
eu_sa_d.to_csv(out_dir / "INDPRO_EU_daily_SA.csv", index=False)
print("Saved:", out_dir / "INDPRO_EU_daily_SA.csv")

# ------------- USA (likely NSA: IPB50001N) -------------
us = pd.read_csv(data_dir / "INDPRO_USA.csv")
us.columns = us.columns.str.strip()
us_sa_m = seasonally_adjust_monthly(us, date_col="observation_date", value_col="IPB50001N", already_sa=False)
us_sa_d = sa_monthly_to_daily_loginterp(us_sa_m, value_col="ip_sa")
us_sa_d.to_csv(out_dir / "INDPRO_USA_daily_SA.csv", index=False)
print("Saved:", out_dir / "INDPRO_USA_daily_SA.csv")

# ------------- SOUTH AFRICA (NSA; semicolon + dayfirst) -------------
za = pd.read_csv(data_dir / "INDPRO_ZAR.csv", sep=';')
za.columns = za.columns.str.replace('\ufeff','', regex=False).str.strip()
# Dates are day-first in this file, so parse them here before the helper does
za["observation_date"] = pd.to_datetime(za["observation_date"], dayfirst=True, errors="coerce")
za_sa_m = seasonally_adjust_monthly(za, date_col="observation_date", value_col="index_2019_NSA", already_sa=False)
za_sa_d = sa_monthly_to_daily_loginterp(za_sa_m, value_col="ip_sa")
za_sa_d.to_csv(out_dir / "INDPRO_ZAR_daily_SA.csv", index=False)
print("Saved:", out_dir / "INDPRO_ZAR_daily_SA.csv")

# ------------- PERU (NSA; manufacturing index) -------------
pe = pd.read_csv(data_dir / "INDPRO_PEN.csv")
pe.columns = pe.columns.str.replace('\ufeff','', regex=False).str.strip()
pe_sa_m = seasonally_adjust_monthly(pe, date_col="date", value_col="prod_manufacturera_index_2007_PN02079AM", already_sa=False)
pe_sa_d = sa_monthly_to_daily_loginterp(pe_sa_m, value_col="ip_sa")
pe_sa_d.to_csv(out_dir / "INDPRO_PEN_daily_SA.csv", index=False)
print("Saved:", out_dir / "INDPRO_PEN_daily_SA.csv")


Saved: C:\Users\luish\OneDrive - University of Leeds\9. DISERTATION\2. Development\code\2. preprocesing\INDPRO_EU_daily_SA.csv
Saved: C:\Users\luish\OneDrive - University of Leeds\9. DISERTATION\2. Development\code\2. preprocesing\INDPRO_USA_daily_SA.csv




Saved: C:\Users\luish\OneDrive - University of Leeds\9. DISERTATION\2. Development\code\2. preprocesing\INDPRO_ZAR_daily_SA.csv
Saved: C:\Users\luish\OneDrive - University of Leeds\9. DISERTATION\2. Development\code\2. preprocesing\INDPRO_PEN_daily_SA.csv




## CPI

In [6]:
# ---------------------------------------------
# CPI monthly -> daily (log-linear interpolation)
# ---------------------------------------------
import pandas as pd
import numpy as np
from pathlib import Path

# ---- Paths (edit if needed) ----
base_dir = Path(r"C:\Users\luish\OneDrive - University of Leeds\9. DISERTATION\2. Development\code")
data_dir = base_dir / "1. data"
out_dir  = base_dir / "2. preprocesing"
out_dir.mkdir(parents=True, exist_ok=True)

# ---- Date window ----
RAW_START   = "2014-12-31"   # buffer day for interpolation coverage
FINAL_START = "2015-01-01"
END_DATE    = "2024-12-31"

def monthly_cpi_to_daily(
    filepath: Path,
    date_col: str,
    value_col: str,
    out_name: str,
    rebase_2015m1: bool = True
) -> pd.DataFrame:
    """
    Load CPI monthly, produce DAILY CPI by log-linear interpolation.
    Optionally rebase to 2015-01-01 = 100 for convenient comparability.

    Parameters
    ----------
    filepath : CSV file to read (default comma separator).
    date_col, value_col : header names of the date and index-level columns;
        a BOM and surrounding whitespace are ignored when matching them.
    out_name : file name written inside ``out_dir``.
    rebase_2015m1 : when True, add the two rebased columns.

    Output columns:
      - date
      - log_cpi_daily, cpi_daily
      - (optional) cpi_rebased_2015m1_100, log_cpi_rebased_2015m1_100

    NOTE(review): days after the last monthly observation cannot be
    interpolated; the sample output of this notebook shows a flat value at
    the end of Dec-2024, i.e. the last observation is carried forward.
    """
    def _tidy(label):
        # Drop a BOM and surrounding whitespace from a column label.
        return str(label).replace('\ufeff', '').strip()

    # Load & basic clean (headers are tidied the same way on both sides so a
    # stray BOM/space in the file does not defeat the rename)
    raw = pd.read_csv(filepath)
    raw.columns = [_tidy(c) for c in raw.columns]
    m = raw.rename(columns={_tidy(date_col): "date", _tidy(value_col): "cpi"})
    m["date"] = pd.to_datetime(m["date"])

    # Values read as text (e.g. with thousands separators) are coerced to
    # numbers; anything unparseable becomes NaN and is dropped below.
    cpi = m["cpi"]
    if not pd.api.types.is_numeric_dtype(cpi):
        cpi = cpi.astype(str).str.replace(',', '', regex=False).str.strip()
    m["cpi"] = pd.to_numeric(cpi, errors="coerce")

    m = m[(m["date"] >= RAW_START) & (m["date"] <= END_DATE)]
    # The log needs strictly positive levels (this also removes NaN rows).
    m = m[m["cpi"] > 0]
    # reindex() refuses duplicate labels; keep the last row given for a date.
    m = m.sort_values("date", kind="stable").drop_duplicates(subset="date", keep="last")

    # Interpolate in log space (-> constant daily growth between monthly fixes)
    m = m.set_index("date")
    m["log_cpi"] = np.log(m["cpi"])

    daily_index = pd.date_range(start=RAW_START, end=END_DATE, freq="D")
    d = m[["log_cpi"]].reindex(daily_index).interpolate(method="linear")
    d = d.loc[FINAL_START:].copy()  # drop the RAW_START buffer day(s)
    d["cpi_daily"] = np.exp(d["log_cpi"])

    # Optional rebase to 2015-01-01 = 100
    if rebase_2015m1:
        base_val = d.loc[pd.Timestamp("2015-01-01"), "cpi_daily"]
        d["cpi_rebased_2015m1_100"] = (d["cpi_daily"] / base_val) * 100.0
        d["log_cpi_rebased_2015m1_100"] = np.log(d["cpi_rebased_2015m1_100"])

    # Final tidy & save
    d.index.name = "date"
    d = d.reset_index().rename(columns={"log_cpi": "log_cpi_daily"})
    d.to_csv(out_dir / out_name, index=False)
    print(f"Saved: {out_dir / out_name}")
    return d

# Each call below reads one monthly CPI file from data_dir and writes the
# daily, log-interpolated series into out_dir. The calls are top-level
# expressions, so in a notebook only the DataFrame returned by the LAST call
# is shown as the cell output.

# -----------------------------
# USA (FRED CPIAUCNS, monthly)
# -----------------------------
monthly_cpi_to_daily(
    filepath = data_dir / "CPI_AUCNS_USA.csv",
    date_col = "observation_date",
    value_col = "CPIAUCNS",
    out_name = "CPI_USA_daily.csv",
    rebase_2015m1 = True
)

# -----------------------------
# Euro Area HICP (CP0000EZ19M086NEST, monthly)
# -----------------------------
monthly_cpi_to_daily(
    filepath = data_dir / "CPI_HICP_EU.csv",
    date_col = "observation_date",
    value_col = "CP0000EZ19M086NEST",
    out_name = "CPI_EU_daily.csv",
    rebase_2015m1 = True
)

# -----------------------------
# Peru CPI (monthly; base Dec-2021=100 in the source file)
# -----------------------------
monthly_cpi_to_daily(
    filepath = data_dir / "CPI_PEN.csv",
    date_col = "date",
    value_col = "CPI_index_dic_2021_PN38705PM",
    out_name = "CPI_PEN_daily.csv",
    rebase_2015m1 = True
)

# -----------------------------
# South Africa CPI (clean monthly index)
# -----------------------------
monthly_cpi_to_daily(
    filepath = data_dir / "CPI_ZAR.csv",
    date_col = "date",
    value_col = "cpi_index_dec_2024_value",
    out_name = "CPI_ZAR_daily.csv",
    rebase_2015m1 = True
)


Saved: C:\Users\luish\OneDrive - University of Leeds\9. DISERTATION\2. Development\code\2. preprocesing\CPI_USA_daily.csv
Saved: C:\Users\luish\OneDrive - University of Leeds\9. DISERTATION\2. Development\code\2. preprocesing\CPI_EU_daily.csv
Saved: C:\Users\luish\OneDrive - University of Leeds\9. DISERTATION\2. Development\code\2. preprocesing\CPI_PEN_daily.csv
Saved: C:\Users\luish\OneDrive - University of Leeds\9. DISERTATION\2. Development\code\2. preprocesing\CPI_ZAR_daily.csv


Unnamed: 0,date,log_cpi_daily,cpi_daily,cpi_rebased_2015m1_100,log_cpi_rebased_2015m1_100
0,2015-01-01,4.120662,61.600000,100.000000,4.605170
1,2015-01-02,4.120871,61.612863,100.020881,4.605379
2,2015-01-03,4.121079,61.625728,100.041767,4.605588
3,2015-01-04,4.121288,61.638597,100.062657,4.605797
4,2015-01-05,4.121497,61.651468,100.083551,4.606005
...,...,...,...,...,...
3648,2024-12-27,4.605170,100.000000,162.337662,5.089679
3649,2024-12-28,4.605170,100.000000,162.337662,5.089679
3650,2024-12-29,4.605170,100.000000,162.337662,5.089679
3651,2024-12-30,4.605170,100.000000,162.337662,5.089679


## Inflation

In [7]:
# ---------------------------------------------
# Inflation (YoY monthly) -> Daily (step/ffill)
# ---------------------------------------------
import pandas as pd
from pathlib import Path

# Paths
base_dir = Path(r"C:\Users\luish\OneDrive - University of Leeds\9. DISERTATION\2. Development\code")
data_dir = base_dir / "1. data"
out_dir  = base_dir / "2. preprocesing"
out_dir.mkdir(parents=True, exist_ok=True)

# Date window
RAW_START   = "2014-12-31"   # buffer day
FINAL_START = "2015-01-01"
END_DATE    = "2024-12-31"

def _coerce_find_column(cols, target):
    """
    Find `target` in `cols` tolerating stray spaces/BOM/case differences.

    An exact match is returned immediately. Otherwise labels are compared
    after removing a BOM, surrounding whitespace, inner spaces and case, and
    the match is accepted only when exactly one label qualifies.

    Parameters
    ----------
    cols : iterable of column labels (e.g. ``df.columns``).
    target : the column name being looked for.

    Returns
    -------
    The matching label exactly as it appears in `cols`.

    Raises
    ------
    KeyError
        When no label matches, or when several labels match after
        normalisation (the message distinguishes the two cases).
    """
    def _norm(label):
        # str() lets non-string labels (e.g. integer headers) be compared too.
        return str(label).replace('\ufeff', '').strip().lower().replace(' ', '')

    available = list(cols)

    # exact first
    for c in available:
        if c == target:
            return c

    # tolerant
    target_n = _norm(target)
    candidates = [c for c in available if _norm(c) == target_n]
    if len(candidates) == 1:
        return candidates[0]
    if candidates:
        # More than one header normalises to the same name: refuse to guess.
        raise KeyError(
            f"Column '{target}' is ambiguous; candidates: {candidates}. Available: {available}"
        )
    raise KeyError(f"Column '{target}' not found. Available: {available}")

def monthly_inflation_to_daily(
    filepath: Path,
    date_col: str,
    rate_col: str,
    out_name: str,
    source_unit: str = "percent"   # "percent" or "decimal"
) -> pd.DataFrame:
    """
    Convert monthly YoY inflation to daily by forward-fill within month.
    Outputs both percent and decimal:
      - pi_yoy_percent
      - pi_yoy_decimal

    Parameters
    ----------
    filepath : CSV file to read (default comma separator).
    date_col, rate_col : header names, resolved with ``_coerce_find_column``.
    out_name : file name written inside ``out_dir``.
    source_unit : unit of the raw values, "percent" or "decimal".

    Returns
    -------
    DataFrame with columns ``date``, ``pi_yoy_percent``, ``pi_yoy_decimal``,
    one row per calendar day from FINAL_START to END_DATE. The same frame is
    written to ``out_dir / out_name``.

    Raises
    ------
    KeyError if a column cannot be resolved; ValueError for an unknown
    `source_unit`.
    """
    df = pd.read_csv(filepath)
    # Clean headers
    df.columns = df.columns.str.replace('\ufeff','', regex=False).str.strip()

    # Resolve column names robustly
    date_col  = _coerce_find_column(df.columns, date_col)
    rate_col  = _coerce_find_column(df.columns, rate_col)

    # Keep & rename
    m = df[[date_col, rate_col]].rename(columns={date_col: "date", rate_col: "pi_yoy_raw"}).copy()
    m["date"] = pd.to_datetime(m["date"], errors="coerce")
    m["pi_yoy_raw"] = pd.to_numeric(m["pi_yoy_raw"], errors="coerce")

    # Only the upper bound is applied here. Observations dated before
    # RAW_START are kept on purpose: a forward-fill must be seeded by the
    # latest observation at or before the first day of the window, otherwise
    # a series with no row exactly on the first day starts with NaNs.
    # (Rows with an unparseable date compare False and are dropped as well.)
    m = m[m["date"] <= END_DATE].sort_values("date", kind="stable")
    # reindex() refuses duplicate labels; keep the last row given for a date.
    m = m.drop_duplicates(subset="date", keep="last")

    # Units
    unit = source_unit.lower()
    if unit == "percent":
        m["pi_yoy_percent"] = m["pi_yoy_raw"]
        m["pi_yoy_decimal"] = m["pi_yoy_raw"] / 100.0
    elif unit == "decimal":
        m["pi_yoy_decimal"] = m["pi_yoy_raw"]
        m["pi_yoy_percent"] = m["pi_yoy_raw"] * 100.0
    else:
        raise ValueError("source_unit must be 'percent' or 'decimal'.")

    # Map to daily with ffill (step function per month): fill on the union of
    # observation dates and calendar days, then keep the calendar days only.
    daily_index = pd.date_range(start=RAW_START, end=END_DATE, freq="D")
    obs = m.set_index("date")[["pi_yoy_percent", "pi_yoy_decimal"]]
    d = obs.reindex(obs.index.union(daily_index)).ffill().reindex(daily_index)
    d = d.loc[FINAL_START:].copy()
    d.index.name = "date"
    d = d.reset_index()

    # Save
    out_path = out_dir / out_name
    d.to_csv(out_path, index=False)
    print(f"Saved: {out_path}")
    return d

# Each call below reads one monthly YoY-inflation file from data_dir and
# writes the daily step series into out_dir. The calls are top-level
# expressions, so in a notebook only the DataFrame returned by the LAST call
# is shown as the cell output.

# -----------------------------
# Euro Area (annual_change_rate_hicp) -> given in percent
# -----------------------------
monthly_inflation_to_daily(
    filepath = data_dir / "inflation_EU.csv",
    date_col = "date",
    rate_col = "annual_change_rate_hicp",
    out_name = "inflation_EU_daily.csv",
    source_unit = "percent"
)

# -----------------------------
# Peru (var%_12_months_inflation_value) -> given in percent
# -----------------------------
monthly_inflation_to_daily(
    filepath = data_dir / "inflation_PEN.csv",
    date_col = "date",
    rate_col = "var%_12_months_inflation_value",
    out_name = "inflation_PEN_daily.csv",
    source_unit = "percent"
)

# -----------------------------
# USA (from_CPI-U_12_months_ago) -> given in percent
# -----------------------------
monthly_inflation_to_daily(
    filepath = data_dir / "inflation_USA.csv",
    date_col = "date",
    rate_col = "from_CPI-U_12_months_ago",
    out_name = "inflation_USA_daily.csv",
    source_unit = "percent"
)

# -----------------------------
# South Africa (value_year_on_year) -> given in percent
# -----------------------------
monthly_inflation_to_daily(
    filepath = data_dir / "inflation_ZAR.csv",
    date_col = "date",
    rate_col = "value_year_on_year",
    out_name = "inflation_ZAR_daily.csv",
    source_unit = "percent"
)


Saved: C:\Users\luish\OneDrive - University of Leeds\9. DISERTATION\2. Development\code\2. preprocesing\inflation_EU_daily.csv
Saved: C:\Users\luish\OneDrive - University of Leeds\9. DISERTATION\2. Development\code\2. preprocesing\inflation_PEN_daily.csv
Saved: C:\Users\luish\OneDrive - University of Leeds\9. DISERTATION\2. Development\code\2. preprocesing\inflation_USA_daily.csv
Saved: C:\Users\luish\OneDrive - University of Leeds\9. DISERTATION\2. Development\code\2. preprocesing\inflation_ZAR_daily.csv


Unnamed: 0,date,pi_yoy_percent,pi_yoy_decimal
0,2015-01-01,4.4,0.044
1,2015-01-02,4.4,0.044
2,2015-01-03,4.4,0.044
3,2015-01-04,4.4,0.044
4,2015-01-05,4.4,0.044
...,...,...,...
3648,2024-12-27,3.0,0.030
3649,2024-12-28,3.0,0.030
3650,2024-12-29,3.0,0.030
3651,2024-12-30,3.0,0.030
