In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

# Read the two weekly CSV panels (Merton and NIG) from the derived-data folder.
merton = pd.read_csv("../data/derived/Merton_weekly.csv")
nig = pd.read_csv("../data/derived/NIG_weekly.csv")

# Print the column labels of each frame, Merton first and NIG second.
print(merton.keys(), nig.keys(), sep="\n")

Index(['gvkey', 'date', 'sigma_hat', 'mu_hat', 'V_0', 'V_used', 'B_used',
       'PD_Q', 'PD_P', 'train_end_date', 'training_end'],
      dtype='object')
Index(['gvkey', 'date', 'A_hat', 'L', 'alpha', 'beta', 'delta', 'mu', 'PD_1y'], dtype='object')


In [3]:
# Align both panels on (firm, date): parse the dates, keep only the PD columns,
# de-duplicate the keys, then outer-merge so rows present in just one panel survive.
MERGE_KEYS = ["gvkey", "date"]

# Dates are parsed in place on the source frames; unparseable values become NaT.
merton["date"] = pd.to_datetime(merton["date"], errors="coerce")
nig["date"] = pd.to_datetime(nig["date"], errors="coerce")

# Keep the keys plus the PD columns, renamed so each column states its model.
merton_keep = merton[["gvkey", "date", "PD_Q", "PD_P"]].rename(
    columns={"PD_Q": "PD_1y_Merton_Q", "PD_P": "PD_1y_Merton_P"}
)
nig_keep = nig[["gvkey", "date", "PD_1y"]].rename(columns={"PD_1y": "PD_1y_NIG"})

# Force the PD columns to numeric; anything non-numeric turns into NaN.
merton_pd_cols = ["PD_1y_Merton_Q", "PD_1y_Merton_P"]
merton_keep[merton_pd_cols] = merton_keep[merton_pd_cols].apply(pd.to_numeric, errors="coerce")
nig_keep["PD_1y_NIG"] = pd.to_numeric(nig_keep["PD_1y_NIG"], errors="coerce")

# Guard against duplicate keys: after sorting, keep the last row per (gvkey, date).
merton_keep = merton_keep.sort_values(MERGE_KEYS).drop_duplicates(subset=MERGE_KEYS, keep="last")
nig_keep = nig_keep.sort_values(MERGE_KEYS).drop_duplicates(subset=MERGE_KEYS, keep="last")

# Outer join keeps every (gvkey, date) seen in either panel; missing PDs show up as NaN.
merged = (
    pd.merge(merton_keep, nig_keep, on=MERGE_KEYS, how="outer")
    .sort_values(MERGE_KEYS)
    .reset_index(drop=True)
)

print(merged.columns)
print(merged.head())


Index(['gvkey', 'date', 'PD_1y_Merton_Q', 'PD_1y_Merton_P', 'PD_1y_NIG'], dtype='object')
   gvkey       date  PD_1y_Merton_Q  PD_1y_Merton_P     PD_1y_NIG
0  14447 2014-01-03             NaN             NaN  2.298318e-09
1  14447 2014-01-10             NaN             NaN  3.589342e-09
2  14447 2014-01-17             NaN             NaN  3.319471e-09
3  14447 2014-01-24             NaN             NaN  4.126655e-09
4  14447 2014-01-31             NaN             NaN  2.078421e-09


In [None]:
import pandas as pd

def compare_date_coverage(merged, gv):
    """Print and return the dates on which only one model has a PD for a firm.

    Parameters
    ----------
    merged : pandas.DataFrame
        Frame with columns ``gvkey``, ``date``, ``PD_1y_Merton_Q``,
        ``PD_1y_Merton_P`` and ``PD_1y_NIG``.
    gv
        Value compared against the ``gvkey`` column to select the firm's rows.

    Returns
    -------
    tuple[list, list]
        ``(only_nig, only_merton)``: sorted dates that have a NIG PD but no
        Merton PD, and sorted dates that have a Merton PD but no NIG PD.
    """
    firm_rows = merged.loc[merged["gvkey"] == gv].copy()

    # Work on a copy so the date conversion never touches the caller's frame.
    firm_rows["date"] = pd.to_datetime(firm_rows["date"])

    # A date counts as covered by Merton when either the Q or the P value is present.
    has_merton = firm_rows[["PD_1y_Merton_Q", "PD_1y_Merton_P"]].notna().any(axis=1)
    has_nig = firm_rows["PD_1y_NIG"].notna()

    dates_merton = set(firm_rows.loc[has_merton, "date"])
    dates_nig = set(firm_rows.loc[has_nig, "date"])

    nig_exclusive = sorted(dates_nig - dates_merton)
    merton_exclusive = sorted(dates_merton - dates_nig)

    print(f"Firm {gv}")
    print("Total Merton dates:", len(dates_merton))
    print("Total NIG dates:", len(dates_nig))
    print("Dates only in NIG:", len(nig_exclusive))
    print("Dates only in Merton:", len(merton_exclusive))

    return nig_exclusive, merton_exclusive

In [None]:
# Build one coverage record per firm: how many dates each model covers and how many
# dates are covered by exactly one of the two models.
coverage_summary = []

for gv in merged["gvkey"].unique():
    sub = merged.loc[merged["gvkey"] == gv]

    # Merton coverage means at least one of the Q / P values is present on that row.
    has_merton = sub[["PD_1y_Merton_Q", "PD_1y_Merton_P"]].notna().any(axis=1)
    has_nig = sub["PD_1y_NIG"].notna()

    dates_merton = set(sub.loc[has_merton, "date"])
    dates_nig = set(sub.loc[has_nig, "date"])

    record = {
        "gvkey": gv,
        "merton_dates": len(dates_merton),
        "nig_dates": len(dates_nig),
        "only_nig": len(dates_nig.difference(dates_merton)),
        "only_merton": len(dates_merton.difference(dates_nig)),
    }
    coverage_summary.append(record)

coverage_df = pd.DataFrame(coverage_summary)

# Show the firms with the most NIG-only dates first.
coverage_df.sort_values("only_nig", ascending=False).head()

Unnamed: 0,gvkey,merton_dates,nig_dates,only_nig,only_merton
0,14447,585,625,64,24
1,17436,585,625,64,24
20,101361,585,625,64,24
21,102296,585,625,64,24
22,103487,585,625,64,24


In [8]:
def gap_diagnostics(merged, gv):
    """Describe the dates on which only one model has a PD for a single firm.

    Prints a short summary (min, max, count, first ten dates) of the NIG-only
    and Merton-only dates, and counts those dates per calendar month so that
    clustering is easy to spot.

    Parameters
    ----------
    merged : pandas.DataFrame
        Frame with columns ``gvkey``, ``date``, ``PD_1y_Merton_Q``,
        ``PD_1y_Merton_P`` and ``PD_1y_NIG``.
    gv
        Value compared against the ``gvkey`` column to select the firm's rows.

    Returns
    -------
    tuple
        ``(only_n, only_m, on_month, om_month)``: the sorted NIG-only dates,
        the sorted Merton-only dates, and for each of them a Series of counts
        indexed by monthly period (empty when there are no such dates).
    """
    sub = merged[merged["gvkey"] == gv].copy()
    sub["date"] = pd.to_datetime(sub["date"])

    # Merton coverage means at least one of the Q / P values is present.
    m_ok = sub["PD_1y_Merton_Q"].notna() | sub["PD_1y_Merton_P"].notna()
    n_ok = sub["PD_1y_NIG"].notna()

    dates_m = set(sub.loc[m_ok, "date"])
    dates_n = set(sub.loc[n_ok, "date"])

    only_n = sorted(dates_n - dates_m)
    only_m = sorted(dates_m - dates_n)

    def summarize(ds):
        """Return min/max/count/first-ten of a date list, or None when it is empty."""
        if not ds:
            return None
        s = pd.Series(ds)
        return {
            "min": s.min(), "max": s.max(),
            "n": len(s),
            "first_10": list(s.head(10)),
        }

    def monthly_counts(ds):
        """Count the dates in ``ds`` per calendar month, ordered by month."""
        # A Series built from an empty list is not datetime-typed, so `.dt` would
        # raise AttributeError for a firm with no one-sided dates. Going through
        # DatetimeIndex keeps the datetime dtype even when the list is empty.
        as_dates = pd.Series(pd.DatetimeIndex(ds))
        return as_dates.dt.to_period("M").value_counts().sort_index()

    print("ONLY NIG summary:", summarize(only_n))
    print("ONLY Merton summary:", summarize(only_m))

    # Are they clustered?
    on_month = monthly_counts(only_n)
    om_month = monthly_counts(only_m)

    return only_n, only_m, on_month, om_month

# Run the gap diagnostics for firm 14447 and keep all four results.
(
    only_nig_dates,
    only_merton_dates,
    only_nig_by_month,
    only_merton_by_month,
) = gap_diagnostics(merged, 14447)

# Display the last twelve monthly counts for each side of the gap.
(only_nig_by_month.tail(12), only_merton_by_month.tail(12))

ONLY NIG summary: {'min': Timestamp('2014-01-03 00:00:00'), 'max': Timestamp('2025-12-19 00:00:00'), 'n': 64, 'first_10': [Timestamp('2014-01-03 00:00:00'), Timestamp('2014-01-10 00:00:00'), Timestamp('2014-01-17 00:00:00'), Timestamp('2014-01-24 00:00:00'), Timestamp('2014-01-31 00:00:00'), Timestamp('2014-02-07 00:00:00'), Timestamp('2014-02-14 00:00:00'), Timestamp('2014-02-21 00:00:00'), Timestamp('2014-02-28 00:00:00'), Timestamp('2014-03-07 00:00:00')]}
ONLY Merton summary: {'min': Timestamp('2014-03-31 00:00:00'), 'max': Timestamp('2024-12-31 00:00:00'), 'n': 24, 'first_10': [Timestamp('2014-03-31 00:00:00'), Timestamp('2014-06-30 00:00:00'), Timestamp('2014-09-30 00:00:00'), Timestamp('2014-12-31 00:00:00'), Timestamp('2015-03-31 00:00:00'), Timestamp('2015-06-30 00:00:00'), Timestamp('2015-09-30 00:00:00'), Timestamp('2015-12-31 00:00:00'), Timestamp('2016-03-31 00:00:00'), Timestamp('2016-06-30 00:00:00')]}


(2025-01    5
 2025-02    4
 2025-03    4
 2025-04    4
 2025-05    5
 2025-06    4
 2025-07    4
 2025-08    5
 2025-09    4
 2025-10    5
 2025-11    4
 2025-12    3
 Freq: M, Name: count, dtype: int64,
 2019-12    1
 2020-03    1
 2020-06    1
 2020-09    1
 2020-12    1
 2021-03    1
 2021-06    1
 2021-09    1
 2022-03    1
 2022-06    1
 2024-09    1
 2024-12    1
 Freq: M, Name: count, dtype: int64)