In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from data_import import (
    load_data, load_ecb_1y_yield,
    fill_liabilities, drop_high_leverage_firms,
    prepare_merton_inputs
)
from merton_calibration import calibrate_merton_panel, add_physical_pd_from_implied_assets

print(Path.cwd())

c:\Users\afons\OneDrive\Desktop\ESE\FCS\Merton_NIGbayesian


In [2]:
# data loading and initial processing
ret_daily, bs, coverage = load_data(
    Path.cwd() / "Jan2025_Accenture_Dataset_ErasmusCase.xlsx",
    start_date="2012-01-01",
    end_date="2025-12-19",
    enforce_coverage=True,
    coverage_tol=0.95,
    liabilities_scale="auto",
    verbose=True,
)

df_rf = load_ecb_1y_yield(
    startPeriod="2010-01-01",
    endPeriod="2025-12-31",
    out_file="ecb_yc_1y_aaa.xml",
    verify_ssl=True,  # recommended if it works
)

df_cal = ret_daily[["date"]].drop_duplicates().sort_values("date").reset_index(drop=True)

debt_daily = fill_liabilities(bs, df_cal)

ret_filt, bs_filt, lev_by_firm, dropped = drop_high_leverage_firms(
    ret_daily,
    bs,
    df_calendar=df_cal,
    debt_daily=debt_daily,
    lev_threshold=8.0,
    lev_agg="median",
    verbose=True,
)

# keep debt panel consistent with filtered firms
keep = set(ret_filt["gvkey"].astype(str).unique())
debt_daily_filt = debt_daily[debt_daily["gvkey"].astype(str).isin(keep)].copy()

# Merton
merton_df = prepare_merton_inputs(ret_filt, bs_filt, df_rf, debt_daily=debt_daily_filt)

[load_data] Firms (ret_daily): 46
[load_data] Date range (ret_daily): 2012-01-03 .. 2025-12-19
[load_data] Coverage min/median/max: 0.999 / 1.000 / 1.000
[load_data] liabilities_scale_used: 1e+06
[load_data] QA mcap_reported<=0 rows (raw windowed mkt): 62
Data has been written to ecb_yc_1y_aaa.xml
[drop_high_leverage_firms] agg=median, threshold=8.0
[drop_high_leverage_firms] firms before: 46 | after: 36
[drop_high_leverage_firms] dropped firms: 10


In [3]:
# Check missingness of sigma_E
print("Missing sigma_E %:", merton_df["sigma_E"].isna().mean() * 100)
print(merton_df["sigma_E"].describe())

# Check missingness of B and r
print("Missing B %:", merton_df["B"].isna().mean() * 100)
print("Missing r %:", merton_df["r"].isna().mean() * 100)

Missing sigma_E %: 3.431054858754908
count    126655.000000
mean          0.254476
std           0.083942
min           0.117892
25%           0.195147
50%           0.238274
75%           0.292771
max           0.743236
Name: sigma_E, dtype: float64
Missing B %: 0.0
Missing r %: 0.0


In [4]:
# BUILDING THE CALIBRATION DATASET DROPPING ROWS WITH MISSING INPUTS
df = merton_df.copy()

# first date where B becomes available for each firm
first_B_date = (
    df.dropna(subset=["B"])
      .groupby("gvkey")["date"]
      .min()
      .rename("first_B_date")
)
# first date where sigma_E becomes available for each firm
first_sigma_date = (
    df.dropna(subset=["sigma_E"])
      .groupby("gvkey")["date"]
      .min()
      .rename("first_sigma_date")
)

starts = pd.concat([first_B_date, first_sigma_date], axis=1)
starts["calib_start"] = starts[["first_B_date","first_sigma_date"]].max(axis=1)

# attach and filter
df2 = df.merge(starts["calib_start"], on="gvkey", how="left")

calib = (
    df2[df2["date"] >= df2["calib_start"]]
      .dropna(subset=["E","B","r","sigma_E"])
      .query("E > 0 and B > 0")
      .copy()
      .rename(columns={"B":"B_drop"})
)

print("Rows before:", len(df), "Rows after firm-specific start + required inputs:", len(calib))
print("Dropped %:", (len(df)-len(calib))/len(df))
print("Missing values in calibration dataset:")
print(calib.isna().sum())

calib_drop = calib.copy()

Rows before: 131155 Rows after firm-specific start + required inputs: 126655
Dropped %: 0.03431054858754908
Missing values in calibration dataset:
gvkey            0
date             0
E                0
logret_mcap      0
isin             0
company          0
country_iso      0
r                0
B_drop           0
sigma_E_daily    0
sigma_E          0
calib_start      0
dtype: int64
