In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from data_import import load_data
from model_dfs import prepare_nig_inputs
from nig_gibbs import gibbs_sampler
from nig_em_paper import EM_algo, one_year_pd_timeseries

In [9]:
# Load Accenture dataset
ret_daily, bs = load_data(
    xlsx_path= None,
    verbose=True
)

print(ret_daily.head())
print("-"*40)
print(bs.head())
print("-"*40)

# Load ECB 1Y risk-free yield data

# If first time, call API to get data, otherwise:
path = Path.cwd() / "data/derived" / "ecb_riskfree_1y_daily.csv"
df_rf= pd.read_csv(path, parse_dates=["date"])
print(df_rf.head())
# Build NIG inputs and fill missing liabilities
df_nig_panel, nig_em_data = prepare_nig_inputs(ret_daily, bs, df_rf)
print(df_nig_panel.head())


      country_iso          isin       date                       company  \
41651         DEU  DE0005190003 2010-01-05  BAYERISCHE MOTOREN WERKE AKT   
41652         DEU  DE0005190003 2010-01-06  BAYERISCHE MOTOREN WERKE AKT   
41653         DEU  DE0005190003 2010-01-07  BAYERISCHE MOTOREN WERKE AKT   
41654         DEU  DE0005190003 2010-01-08  BAYERISCHE MOTOREN WERKE AKT   
41655         DEU  DE0005190003 2010-01-11  BAYERISCHE MOTOREN WERKE AKT   

        gvkey   shares_out   close  mcap_reported  shares_out_filled  \
41651  100022  601995196.0  32.310   1.945046e+10        601995196.0   
41652  100022  601995196.0  32.810   1.975146e+10        601995196.0   
41653  100022  601995196.0  33.100   1.992604e+10        601995196.0   
41654  100022  601995196.0  32.655   1.965815e+10        601995196.0   
41655  100022  601995196.0  32.170   1.936619e+10        601995196.0   

               mcap  bad_day  logret_close  logret_mcap  
41651  1.945046e+10    False      0.008080     0.008

In [10]:
# PD for X firm X date
TRADING_DAYS = 250  # paper convention uses 250; your earlier code used 252 but this file uses 250 internally

def firm_pd_on_date(
    df_panel: pd.DataFrame,
    gvkey: int,
    asof_date: str | pd.Timestamp,
    *,
    window_years: int = 5,
    start_params: dict = None,
    use_physical: bool = True,
):
    """
    Returns 1-year PD for one firm at a chosen 'as-of' date, where the as-of date
    is the end_date of the EM training window.
    """

    if start_params is None:
        # paper-style reasonable starting values; tune if needed
        start_params = {"alpha": 10.0, "beta1": 0.0, "delta": 1.0, "beta0": 0.0}

    asof_date = pd.to_datetime(asof_date)

    # 1) firm slice, sort, keep only data up to as-of date
    dff = df_panel.loc[df_panel["gvkey"] == gvkey].copy()
    dff["date"] = pd.to_datetime(dff["date"])
    dff = dff.sort_values("date")
    dff = dff.loc[dff["date"] <= asof_date]

    if dff.empty:
        raise ValueError("No observations for this firm up to asof_date.")

    # 2) define training window start/end (calendar-based; you can also do last N trading days)
    start_date = asof_date - pd.DateOffset(years=window_years)
    end_date = asof_date

    # 3) build the series required by EM_algo (must align 1:1)
    E_series  = dff["E"].to_numpy(dtype=float)     # equity market value (market cap)
    L_series  = dff["L"].to_numpy(dtype=float)     # liabilities proxy used as face value L
    rf_series = dff["r"].to_numpy(dtype=float)     # daily risk-free rate series
    dates     = dff["date"].to_numpy()             # numpy datetime64 array

    # 4) run EM on that window
    out = EM_algo(
        E_series=E_series,
        L_face_series=L_series,
        rf_series=rf_series,
        dates=dates,
        start_params=start_params,
        start_date=np.datetime64(start_date),
        end_date=np.datetime64(end_date),
        max_iter=10,
        min_iter=3,
        tol=1e-3,
    )

    # 5) compute PD series and pick the as-of row (last row should be end_date)
    pd_ts = one_year_pd_timeseries(out, L_face_series_full=L_series)

    # pick the row closest to the as-of date inside dates_win
    pd_ts["date"] = pd.to_datetime(pd_ts["date"])
    row = pd_ts.loc[pd_ts["date"] == asof_date]
    if row.empty:
        row = pd_ts.iloc[[-1]]  # fallback: last available date in window

    col = "PD_physical" if use_physical else "PD_risk_neutral"
    return float(row[col].iloc[0]), row.iloc[0].to_dict()



In [11]:
# PD for all firms on date D
def all_firms_pd_on_date(df_panel, asof_date, **kwargs):
    asof_date = pd.to_datetime(asof_date)
    out_rows = []

    for gvkey in sorted(df_panel["gvkey"].unique()):
        try:
            pd_val, _ = firm_pd_on_date(df_panel, gvkey, asof_date, **kwargs)
            out_rows.append({"gvkey": gvkey, "date": asof_date, "PD_1y": pd_val})
        except Exception as e:
            out_rows.append({"gvkey": gvkey, "date": asof_date, "PD_1y": np.nan, "error": str(e)})

    return pd.DataFrame(out_rows)



In [None]:
# USAGE
df = df_nig_panel.copy()
df["date"] = pd.to_datetime(df["date"])
df = df.sort_values(["gvkey", "date"])

gv = df["gvkey"].iloc[0]          # pick any firm id
g = df[df["gvkey"] == gv].copy().sort_values("date")

# Firm gv (randomly picked) on date D (2 years training vs 1 year training)
pd_value_1y, debug_1y = firm_pd_on_date(df, gvkey=gv, asof_date="2015-12-31", window_years=1)

print("train=1y:", pd_value_1y)

In [13]:
pd_value_2y, debug_2y = firm_pd_on_date(df, gvkey=gv, asof_date="2015-12-31", window_years=2)
print("train=2y:", pd_value_2y)

train=2y: 2.6622981541948273e-07


In [5]:
# create a set of 20 firms representative of the sample
def select_representative_firms(
    df_panel: pd.DataFrame,
    asof_date: str | pd.Timestamp,
    *,
    n_firms: int = 20,
    criterion: str = "lev_EplusL",   # or "lev_LoverE", "size_E"
    stratify_bins: int = 5,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Returns:
      df_subpanel: panel restricted to the selected firms (all dates)
      snapshot: firm-level snapshot on asof_date with ordering variable and bin label
    """

    asof_date = pd.to_datetime(asof_date)

    df = df_panel.copy()
    df["date"] = pd.to_datetime(df["date"])
    df = df.sort_values(["gvkey", "date"])

    # Firm snapshot at (or before) asof_date: last available row up to that date
    snap = (df[df["date"] <= asof_date]
            .groupby("gvkey", as_index=False)
            .tail(1)
            .copy())

    # economically meaningful ordering variables
    snap["lev_EplusL"] = snap["L"] / (snap["E"] + snap["L"])
    snap["lev_LoverE"] = snap["L"] / snap["E"]
    snap["size_E"] = snap["E"]

    if criterion not in snap.columns:
        raise ValueError(f"Unknown criterion '{criterion}'. Choose from: lev_EplusL, lev_LoverE, size_E")

    # Drop problematic values
    snap = snap.replace([np.inf, -np.inf], np.nan).dropna(subset=[criterion])

    # Stratify firms into bins across the criterion distribution
    snap["bin"] = pd.qcut(snap[criterion], q=stratify_bins, labels=False, duplicates="drop")

    # allocate ~equal number of firms per bin
    bins = sorted(snap["bin"].dropna().unique())
    per_bin = n_firms // len(bins)
    remainder = n_firms - per_bin * len(bins)

    selected = []
    rng = np.random.default_rng(42)

    for b in bins:
        firms_b = snap.loc[snap["bin"] == b, "gvkey"].tolist()
        if len(firms_b) == 0:
            continue

        k = min(per_bin + (1 if remainder > 0 else 0), len(firms_b))
        remainder = max(remainder - 1, 0)

        # choose within each bin: pick median-ish + extremes if you prefer deterministic;
        # here: random but reproducible
        chosen = rng.choice(firms_b, size=k, replace=False).tolist()
        selected.extend(chosen)

    selected = selected[:n_firms]

    # Order selected firms by the criterion (economically meaningful ordering)
    snap_sel = snap[snap["gvkey"].isin(selected)].copy()
    snap_sel = snap_sel.sort_values(criterion, ascending=True)

    # Filter the full panel to only those firms (keep all dates)
    df_sub = df[df["gvkey"].isin(selected)].copy()

    return df_sub, snap_sel


In [None]:
# filter sample accordingly and ispect
df = df_nig_panel.copy()
df["date"] = pd.to_datetime(df["date"])
df = df.sort_values(["gvkey", "date"])

df_20, snapshot_20 = select_representative_firms(
    df,
    asof_date="2015-12-31",
    n_firms=20,
    criterion="lev_EplusL",   # leverage proxy grounded in Merton logic
    stratify_bins=5
)

print(snapshot_20[["gvkey", "company", "country_iso", "E", "L", "lev_EplusL", "bin"]].head(20))


        gvkey                  company country_iso             E  \
68745  203053     HERMES INTERNATIONAL         FRA  3.291126e+10   
68756  245663  IND DE DISENO TEXTIL SA         ESP  9.876670e+10   
68720  100581                LOREAL SA         FRA  8.692372e+10   
68729  103487                   SAP SE         DEU  9.014764e+10   
68722  101202         L'AIR LIQUIDE SA         FRA  3.566948e+10   
68754  241637     ANHEUSER-BUSCH INBEV         BEL  1.839829e+11   
68741   17452                DANONE SA         FRA  4.079036e+10   
68718  100080                 BAYER AG         DEU  9.576056e+10   
68740   17436                  BASF SE         DEU  6.495481e+10   
68726  101336   SCHNEIDER ELECTRIC S E         FRA  3.088425e+10   
68753  241456         DEUTSCHE POST AG         DEU  3.147702e+10   
68743   19349               SIEMENS AG         DEU  7.918428e+10   
68747  220940                ORANGE SA         FRA  4.101799e+10   
68721  100957             IBERDROLA SA         E

In [16]:
# PD for 20 firms on date D (1 year training)
pds_20 = all_firms_pd_on_date(df_20, "2015-12-31", window_years=1)

In [17]:
print(pds_20.head(20))

     gvkey       date         PD_1y  \
0   100080 2015-12-31  2.976672e-08   
1   100312 2015-12-31  8.314583e-10   
2   100581 2015-12-31  2.515562e-21   
3   100957 2015-12-31  5.411404e-15   
4   101202 2015-12-31  2.106317e-11   
5   101336 2015-12-31  4.289951e-07   
6   103487 2015-12-31  1.266919e-24   
7    15532 2015-12-31  1.265506e-04   
8    15549 2015-12-31           NaN   
9    16348 2015-12-31  2.351386e-04   
10   17436 2015-12-31  3.961693e-08   
11   17452 2015-12-31  2.598451e-16   
12   19349 2015-12-31  3.963482e-08   
13  203053 2015-12-31  2.483952e-37   
14  220940 2015-12-31  2.279329e-06   
15  241456 2015-12-31  3.228206e-08   
16  241637 2015-12-31  3.169705e-14   
17  243774 2015-12-31  1.495575e-07   
18  245663 2015-12-31  3.028250e-37   
19   61616 2015-12-31  8.049185e-05   

                                                error  
0                                                 NaN  
1                                                 NaN  
2           

In [14]:
# look for BMW
df = df_nig_panel.copy()
df["date"] = pd.to_datetime(df["date"])
df = df.sort_values(["gvkey", "date"])

pd_value_1y, debug_1y = firm_pd_on_date(df, gvkey=100022, asof_date="2012-12-31", window_years=1)
print("BMW train=1y:", pd_value_1y)


ValueError: No observations for this firm up to asof_date.