In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from data_import import load_data
from model_dfs import prepare_nig_inputs
from nig_em_paper import EM_algo, one_year_pd_timeseries
from nig_gibbs import gibbs_sampler

In [3]:
# --- Accenture case dataset: daily return panel + balance-sheet frame ---
# NOTE(review): this is an absolute, user-specific path; the cell only runs on a
# machine with the same folder layout. Consider a configurable data directory.
accenture_xlsx = "C:/Users/afons/OneDrive/Desktop/ESE/FCS/Merton_NIGbayesian/Jan2025_Accenture_Dataset_ErasmusCase.xlsx"
ret_daily, bs = load_data(xlsx_path=accenture_xlsx, verbose=True)

# Preview each frame, followed by a 40-dash separator line.
separator = "-" * 40
for frame in (ret_daily, bs):
    print(frame.head())
    print(separator)

# --- ECB 1Y risk-free yield, daily ---
# The CSV is a local copy; on a first run the data has to be fetched from the
# API before this read can succeed.
rf_csv = "ecb_riskfree_1y_daily.csv"
df_rf = pd.read_csv(rf_csv, parse_dates=["date"])
print(df_rf.head())

      country_iso          isin       date                       company  \
41651         DEU  DE0005190003 2010-01-05  BAYERISCHE MOTOREN WERKE AKT   
41652         DEU  DE0005190003 2010-01-06  BAYERISCHE MOTOREN WERKE AKT   
41653         DEU  DE0005190003 2010-01-07  BAYERISCHE MOTOREN WERKE AKT   
41654         DEU  DE0005190003 2010-01-08  BAYERISCHE MOTOREN WERKE AKT   
41655         DEU  DE0005190003 2010-01-11  BAYERISCHE MOTOREN WERKE AKT   

        gvkey   shares_out   close  mcap_reported  shares_out_filled  \
41651  100022  601995196.0  32.310   1.945046e+10        601995196.0   
41652  100022  601995196.0  32.810   1.975146e+10        601995196.0   
41653  100022  601995196.0  33.100   1.992604e+10        601995196.0   
41654  100022  601995196.0  32.655   1.965815e+10        601995196.0   
41655  100022  601995196.0  32.170   1.936619e+10        601995196.0   

               mcap  bad_day  logret_close  logret_mcap  
41651  1.945046e+10    False      0.008080     0.008

In [4]:
# Build NIG inputs and fill missing liabilities
# prepare_nig_inputs (imported from model_dfs) is given the daily equity frame,
# the balance-sheet frame and the risk-free frame loaded above, and returns a
# pair. Only the first element (df_nig_panel) is used by the cells visible in
# this notebook; nig_em_data is kept but not referenced afterwards.
# NOTE(review): the printed head suggests one row per (gvkey, date) with columns
# E, r and L — confirm against model_dfs.
df_nig_panel, nig_em_data = prepare_nig_inputs(ret_daily, bs, df_rf)
print(df_nig_panel.head())


    gvkey       date             E          isin  \
0  100022 2010-01-05  1.945046e+10  DE0005190003   
1  100080 2010-01-05  4.578810e+10  DE000BAY0017   
2  100312 2010-01-05  1.765719e+09  DE0007030009   
3  100581 2010-01-05  4.701386e+10  FR0000120321   
4  100957 2010-01-05  3.539016e+10  ES0144580Y14   

                        company country_iso         r             L  
0  BAYERISCHE MOTOREN WERKE AKT         DEU  0.007934  8.576700e+10  
1                      BAYER AG         DEU  0.007934  3.254300e+10  
2                RHEINMETALL AG         DEU  0.007934  3.105000e+09  
3                     LOREAL SA         FRA  0.007934  9.178700e+09  
4                  IBERDROLA SA         ESP  0.007934  6.203788e+10  


In [None]:
# Order the panel firm by firm and chronologically within each firm, then
# renumber the rows from 0 so the index follows the sorted order.
panel_sort_keys = ["gvkey", "date"]
df_nig_panel = df_nig_panel.sort_values(by=panel_sort_keys, ignore_index=True)
print(df_nig_panel.head())

    gvkey       date             E          isin  \
0  100022 2010-01-05  1.945046e+10  DE0005190003   
1  100022 2010-01-06  1.975146e+10  DE0005190003   
2  100022 2010-01-07  1.992604e+10  DE0005190003   
3  100022 2010-01-08  1.965815e+10  DE0005190003   
4  100022 2010-01-11  1.936619e+10  DE0005190003   

                        company country_iso         r             L  
0  BAYERISCHE MOTOREN WERKE AKT         DEU  0.007934  8.576700e+10  
1  BAYERISCHE MOTOREN WERKE AKT         DEU  0.007782  8.576700e+10  
2  BAYERISCHE MOTOREN WERKE AKT         DEU  0.007491  8.576700e+10  
3  BAYERISCHE MOTOREN WERKE AKT         DEU  0.007417  8.576700e+10  
4  BAYERISCHE MOTOREN WERKE AKT         DEU  0.007056  8.576700e+10  


In [6]:
# Estimation window handed to EM_algo as start_date / end_date.
start_date = np.datetime64("2015-01-01")
end_date   = np.datetime64("2015-12-31")

# EM start params (your defaults)
start_params = {"alpha": 1.0, "beta1": 0.0, "delta": 1.0, "beta0": 0.0}

# how many firms to run (EM + PD) right now
n_firms = 5  # <-- change this

# Names of the parameters read from em_out["params"]; also the column order of
# the summary frame built after the loop.
em_param_names = ["alpha", "beta1", "delta", "beta0"]

errors = []       # one dict per firm whose EM / PD step raised
em_results = []   # one dict per firm with the fitted parameters
pd_panels = []    # one PD time-series frame per successfully processed firm

done = 0  # number of firms processed successfully so far

for gvkey, g in df_nig_panel.groupby("gvkey", sort=False):
    if done >= n_firms:
        break

    company = g["company"].iloc[0]

    try:
        # Normalise the dates to numpy datetime64[ns]. pd.to_datetime accepts
        # both datetime and non-datetime input, so no dtype branch is needed
        # (the previous if/else executed the same statement in both arms).
        dates_np = pd.to_datetime(g["date"].to_numpy()).to_numpy(dtype="datetime64[ns]")

        # --- EM ---
        # NOTE(review): max_iter=10 is a tight cap; the recorded output of this
        # cell shows firms stopping at n_iter=10 with EM_converged=False.
        em_out = EM_algo(
            E_series=g["E"].to_numpy(dtype=float),
            L_face_series=g["L"].to_numpy(dtype=float),
            rf_series=g["r"].to_numpy(dtype=float),
            dates=dates_np,
            start_params=start_params,
            start_date=start_date,
            end_date=end_date,
            max_iter=10,
            min_iter=3,
            tol=1e-3,
        )

        em_params = em_out["params"]

        print(
            f"{company} | EM_converged={em_out['converged']} (n_iter={em_out['n_iter']}) | "
            f"alpha={em_params['alpha']:.6f} | beta1={em_params['beta1']:.6f} | "
            f"delta={em_params['delta']:.6f} | beta0={em_params['beta0']:.6f}"
        )

        # store EM params as plain Python scalars
        em_results.append({
            "gvkey": gvkey,
            "company": company,
            "converged": bool(em_out["converged"]),
            "n_iter": int(em_out["n_iter"]),
            "alpha": float(em_params["alpha"]),
            "beta1": float(em_params["beta1"]),
            "delta": float(em_params["delta"]),
            "beta0": float(em_params["beta0"]),
        })

        # --- PD time series on training window ---
        pd_df = one_year_pd_timeseries(em_out, L_face_series_full=g["L"].to_numpy(dtype=float))
        pd_df["date"] = pd.to_datetime(pd_df["date"])
        pd_df["gvkey"] = gvkey
        pd_df["company"] = company
        pd_panels.append(pd_df)

        # Show the last five dates; PDs are printed in percent.
        tail = pd_df[["date", "PD_physical", "PD_risk_neutral"]].tail(5)
        for _, row in tail.iterrows():
            date_str = row["date"].strftime("%Y-%m-%d")
            print(
                f"  {date_str} | PD_P(%)={row['PD_physical']*100:.6f} | "
                f"PD_Q(%)={row['PD_risk_neutral']*100:.6f}"
            )

        done += 1  # <-- count only successful runs

    except Exception as e:
        # Best-effort batch run: record the failure and move on to the next firm.
        print(f"{company} | ERROR: {e}")
        errors.append({"gvkey": gvkey, "company": company, "error": str(e)})
        # (optional) still count failures toward n_firms:
        # done += 1

# Explicit columns keep the frames well-formed even when the lists are empty
# (e.g. every firm failed); without them the set_index("gvkey") below raised
# KeyError on an empty, column-less frame.
errors_df = pd.DataFrame(errors, columns=["gvkey", "company", "error"])
print("\nErrors:", len(errors_df))

em_results_df = pd.DataFrame(
    em_results,
    columns=["gvkey", "company", "converged", "n_iter", *em_param_names],
)
# gvkey -> {"alpha": ..., "beta1": ..., "delta": ..., "beta0": ...}; {} if no firm succeeded.
em_params_by_gvkey = (
    em_results_df
    .set_index("gvkey")[em_param_names]
    .to_dict(orient="index")
)

pd_panel_df = pd.concat(pd_panels, ignore_index=True) if pd_panels else pd.DataFrame()


BAYERISCHE MOTOREN WERKE AKT | EM_converged=True (n_iter=6) | alpha=365.143996 | beta1=-0.741226 | delta=3.737591 | beta0=0.042198
  2015-12-22 | PD_P(%)=0.000745 | PD_Q(%)=0.004824
  2015-12-23 | PD_P(%)=0.000531 | PD_Q(%)=0.003552
  2015-12-28 | PD_P(%)=0.000640 | PD_Q(%)=0.004201
  2015-12-29 | PD_P(%)=0.000530 | PD_Q(%)=0.003552
  2015-12-30 | PD_P(%)=0.000610 | PD_Q(%)=0.004053
BAYER AG | EM_converged=False (n_iter=10) | alpha=263.049651 | beta1=62.163825 | delta=10.189464 | beta0=-2.461600
  2015-12-22 | PD_P(%)=0.000006 | PD_Q(%)=0.000019
  2015-12-23 | PD_P(%)=0.000004 | PD_Q(%)=0.000012
  2015-12-28 | PD_P(%)=0.000005 | PD_Q(%)=0.000014
  2015-12-29 | PD_P(%)=0.000003 | PD_Q(%)=0.000009
  2015-12-30 | PD_P(%)=0.000004 | PD_Q(%)=0.000013
RHEINMETALL AG | EM_converged=False (n_iter=10) | alpha=150.656040 | beta1=6.645442 | delta=2.088616 | beta0=0.114841
  2015-12-22 | PD_P(%)=0.000000 | PD_Q(%)=0.002059
  2015-12-23 | PD_P(%)=0.000000 | PD_Q(%)=0.001735
  2015-12-28 | PD_P(%)=0

In [9]:
# gvkey -> EM estimates, built from em_results_df (produced by the EM cell).
# Rows with any missing parameter are dropped. An empty results frame (no firm
# got through EM) maps to {} instead of raising KeyError inside dropna/set_index.
em_param_cols = ["alpha", "beta1", "delta", "beta0"]
if em_results_df.empty:
    em_params_by_gvkey = {}
else:
    em_params_by_gvkey = (
        em_results_df
        .dropna(subset=em_param_cols)
        .set_index("gvkey")[em_param_cols]
        .to_dict(orient="index")
    )

n_firms = 5
start_date = np.datetime64("2015-01-01")
end_date   = np.datetime64("2015-12-31")

# One seeded generator shared by all firms, so a full run is reproducible.
rng = np.random.default_rng(12345)

gibbs_results = []
# NOTE: `errors` / `errors_df` are rebound here and replace the EM cell's values.
errors = []

# The EM cell counts only *successful* firms toward n_firms, so the firms that
# own EM parameters are not necessarily the first n_firms groups of the panel.
# Budget on firms that actually have EM parameters: firms without them are
# still logged as errors but no longer use up the budget.
n_target = min(n_firms, len(em_params_by_gvkey))
if n_target == 0:
    print("No EM parameters available; run the EM cell before the Gibbs sampler.")
n_attempted = 0  # firms with EM parameters handed to the sampler so far

for gvkey, g in df_nig_panel.groupby("gvkey", sort=False):
    if n_attempted >= n_target:
        break

    company = g["company"].iloc[0]

    try:
        # pull EM params (skip firm if missing)
        if gvkey not in em_params_by_gvkey:
            raise ValueError("No EM params stored for this gvkey (run EM first or fix results_df).")
        n_attempted += 1

        em_params = em_params_by_gvkey[gvkey]  # {'alpha':..., 'beta1':..., 'delta':..., 'beta0':...}

        dates_arr = pd.to_datetime(g["date"]).to_numpy(dtype="datetime64[ns]")
        E_arr  = g["E"].to_numpy(dtype=float)
        L_arr  = g["L"].to_numpy(dtype=float)
        rf_arr = g["r"].to_numpy(dtype=float)

        gibbs_out = gibbs_sampler(
            E_series=E_arr,
            L_series=L_arr,
            rf_series=rf_arr,
            dates=dates_arr,
            start_date=start_date,
            end_date=end_date,
            max_iter=100,
            burn_in=20,
            thin=2,
            trading_days=250,
            em_params=em_params,     # <-- used to center priors/hyperparams inside gibbs_sampler
            rng=rng,
        )

        meta = gibbs_out["meta"]
        print(f"{company} | Gibbs kept={meta['n_keep_actual']} | reject={meta['n_reject']}")

        # ---- print last 5 kept (thinned) parameter draws ----
        draws = np.asarray(gibbs_out["params_draws"], dtype=float)  # columns: [alpha, beta1, delta, beta0]
        if draws.shape[0] == 0:
            print("  (no kept draws)")
        else:
            last5 = draws[-5:, :]  # if fewer than 5, this just returns all
            # Label each row with its 1-based position among the kept draws.
            first_label = max(1, draws.shape[0] - last5.shape[0] + 1)
            for k_i, (a, b1, d, b0) in enumerate(last5, start=first_label):
                print(
                    f"  draw {k_i:>4d} | alpha={a:.6f} | beta1={b1:.6f} | delta={d:.6f} | beta0={b0:.6f}"
                )

        gibbs_results.append({"gvkey": gvkey, "company": company, "gibbs": gibbs_out})

    except Exception as e:
        # Best-effort batch run: record the failure and continue with the next firm.
        print(f"{company} | ERROR: {e}")
        errors.append({"gvkey": gvkey, "company": company, "error": str(e)})

# Explicit columns keep the frame's schema stable when nothing failed.
errors_df = pd.DataFrame(errors, columns=["gvkey", "company", "error"])

BAYERISCHE MOTOREN WERKE AKT | Gibbs kept=40 | reject=0
  draw   36 | alpha=281.228970 | beta1=0.640942 | delta=2.909185 | beta0=0.035513
  draw   37 | alpha=274.069835 | beta1=-3.599190 | delta=2.908148 | beta0=0.172146
  draw   38 | alpha=220.893710 | beta1=11.636202 | delta=2.387276 | beta0=0.109794
  draw   39 | alpha=242.755335 | beta1=3.069099 | delta=2.382807 | beta0=0.123524
  draw   40 | alpha=242.335257 | beta1=1.271418 | delta=2.332377 | beta0=-0.004897


KeyboardInterrupt: 

In [None]:
# Approach 2
# Single-series Gibbs run that summarises the posterior of the "pd" draws.
# NOTE(review): this cell reads `em_init`, `E`, `L` and `r`, none of which is
# defined in the visible cells — confirm where they come from before running
# on a fresh kernel.
# NOTE(review): the keyword arguments used here (equity, liabilities_L,
# r_series, maturity_T, n_iter, physical_measure) differ from the gibbs_sampler
# call in the per-firm Gibbs loop earlier in this notebook (E_series, L_series,
# rf_series, dates, max_iter, ...); presumably this targets another version of
# nig_gibbs.gibbs_sampler — verify before use.
from nig_gibbs import gibbs_sampler
import numpy as np

# Work on a copy so that adding "theta" does not alter em_init.params
# (presumably a dict — TODO confirm).
em_params = em_init.params.copy()
em_params["theta"] = float(em_init.theta_series[-1])  # starting theta for pricing

draws = gibbs_sampler(
    equity=E, liabilities_L=float(L), r_series=r,
    maturity_T=1.0,                 # PD horizon used inside sampler
    n_iter=5000, burn_in=1000,
    em_params=em_params,
    thin=20,
    physical_measure=True,
)

# Posterior summary of draws["pd"]: the mean plus the 5% and 95% quantiles,
# printed together as a 90% interval.
pd_mean = float(np.mean(draws["pd"]))
pd_p05, pd_p95 = np.quantile(draws["pd"], [0.05, 0.95])
print("Posterior mean PD:", pd_mean, "90% CI:", (pd_p05, pd_p95))
