In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_curve, auc
from sklearn.metrics import roc_auc_score

In [2]:
df_mlpd = pd.read_csv('../data_visualization/mthly_backtest_results_5yrs.csv')

In [3]:
df_cripd = pd.read_csv(r'../../../data/input/PD_Default_1M12M_China.csv')
df_cripd['date'] = pd.to_datetime(df_cripd.YYYYMM, format='%Y%m')

In [4]:
df_cripd_mthly = df_cripd[df_cripd.is_Default_1M != -1]
df_cripd_yearly = df_cripd[df_cripd.is_Default_12M != -1]

In [17]:
df_cripd_mthly

Unnamed: 0,COMPANY_NO,YYYYMM,PD_1M,PD_12M,is_Default_1M,is_Default_12M,date
0,2725,199506.0,0.001464,0.020113,0,0,1995-06-01
1,2725,199507.0,0.001702,0.023391,0,0,1995-07-01
2,2725,199508.0,0.001311,0.018076,0,0,1995-08-01
3,2725,199509.0,0.001346,0.018378,0,0,1995-09-01
4,2725,199510.0,0.001424,0.019399,0,0,1995-10-01
...,...,...,...,...,...,...,...
753310,215550,202506.0,0.000399,0.005808,0,-1,2025-06-01
753312,215556,202506.0,0.000017,0.000367,0,-1,2025-06-01
753314,215569,202506.0,0.000101,0.001768,0,-1,2025-06-01
753316,215578,202506.0,0.000172,0.002660,0,-1,2025-06-01


In [16]:
df_cripd_mthly[df_cripd_mthly.Y==1]

AttributeError: 'DataFrame' object has no attribute 'Y'

In [66]:
date_range = list(range(2020, 2025-1))

In [67]:
date_range

[2020, 2021, 2022, 2023]

In [6]:
def compute_pd_metrics(df, type_time='monthly',start_year=2020, start_month=6, end_year=2025, end_month=6):
    if type_time == 'monthly':
        date_range = pd.date_range(start=f'{2020}-{6}', end=f'{2025}-{6}', freq='MS')
    else:
        date_range = list(range(start_year, end_year))
    results = []
    try:
        for date_ in date_range:
            if type_time == 'monthly':
                test_df = df[df.date == date_]
                test_y_true = test_df.is_Default_1M
                test_y_pred = test_df.PD_1M
            else:
                test_df = df[df['date'].apply(lambda x: x.year == date_)]
                test_y_true = test_df.is_Default_12M
                test_y_pred = test_df.PD_12M

            auc_score = roc_auc_score(test_y_true, test_y_pred)
            precision, recall, thresholds = precision_recall_curve(test_y_true, test_y_pred)
            pr_auc = auc(recall, precision)

            results.append({
                        'date': date_,
                        'auc_score': auc_score,
                        'pr_auc': pr_auc
            })
    except Exception as e:
        print(date_)
        print(e)

    results_df = pd.DataFrame(results)
    return results_df



In [7]:
compute_pd_metrics(df_cripd_mthly).to_csv('cripd_mly_5years.csv', index=False)



In [72]:
compute_pd_metrics(df_cripd_yearly, type_time='yearly', start_year=2015).to_csv('cripd_yearly_10years.csv', index=False)

In [74]:
compute_pd_metrics(df_cripd_yearly, type_time='yearly', start_year=2015)

Unnamed: 0,date,auc_score,pr_auc
0,2015,0.827729,0.018422
1,2016,0.688999,0.012908
2,2017,0.734673,0.030318
3,2018,0.729615,0.031178
4,2019,0.831452,0.069123
5,2020,0.872448,0.130328
6,2021,0.915567,0.208348
7,2022,0.876076,0.182777
8,2023,0.902099,0.168111
9,2024,0.970626,0.192469


In [8]:
def compute_pd_metrics_yearly(df, type_time='yearly',start_year=2015, start_month=1, end_year=2025, end_month=6):

    date_range = pd.date_range(start=f'{start_year}-{start_month}', end=f'{end_year}-{end_month}', freq='MS')
    results = []
    try:
        for date_ in date_range:
            # if type_time == 'monthly':
            #     test_df = df[df.date == date_]
            #     test_y_true = test_df.is_Default_1M
            #     test_y_pred = test_df.PD_1M
            # else:
            filter_range = date_ + pd.DateOffset(years=1)
            test_df = df[df['date'].apply(lambda x: x >= date_ and x < filter_range)]
            test_y_true = test_df.is_Default_12M
            test_y_pred = test_df.PD_12M

            auc_score = roc_auc_score(test_y_true, test_y_pred)
            precision, recall, thresholds = precision_recall_curve(test_y_true, test_y_pred)
            pr_auc = auc(recall, precision)

            results.append({
                        'date': date_,
                        'auc_score': auc_score,
                        'pr_auc': pr_auc
            })
    except Exception as e:
        print(date_)
        print(e)

    results_df = pd.DataFrame(results)
    return results_df

In [9]:
compute_pd_metrics_yearly(df_cripd_yearly).to_csv('cripd_yearly_10years.csv', index=False)



2025-05-01 00:00:00
Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.


In [11]:
df_ml_pd = pd.read_csv(r'/data/zhuanghao/MyGithub/MLOps-PDModel/src/modeldev/data_visualization/yearly_backtest_results_10yrs.csv')

In [13]:
df_ml_pd[df_ml_pd.year > '2021-01-01']

Unnamed: 0,year,auc_score,pr_auc,train_size,test_size,test_positives,test_samples
73,2021-02-01,0.926473,0.282988,460208,46542,65.0,46515
74,2021-03-01,0.928353,0.301885,463878,46978,64.0,46949
75,2021-04-01,0.923907,0.275325,467546,47413,63.0,47384
76,2021-05-01,0.921108,0.303937,471207,47854,73.0,47825
77,2021-06-01,0.913435,0.277117,474947,48266,76.0,48234
78,2021-07-01,0.901284,0.265542,478757,48658,76.0,48637
79,2021-08-01,0.903651,0.277552,482604,49091,80.0,49066
80,2021-09-01,0.905414,0.273212,486442,49517,81.0,49492
81,2021-10-01,0.892286,0.295462,490437,49861,80.0,49835
82,2021-11-01,0.894469,0.285444,494428,50203,76.0,50177
