### Computing AUC and PR-AUC on 1M and 12M per country-horizon for the Stats model

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_curve, auc
from sklearn.metrics import roc_auc_score

In [None]:
import os
import pandas as pd
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc
import warnings
# Suppress every Python warning from here on. NOTE(review): this also hides
# any warnings raised by the libraries used below — confirm that is intended.
warnings.filterwarnings('ignore')

# Folder where the CSV files are stored
input_folder = 'data 1/data/202508/'
# Folder that is created below to hold the output
output_folder = 'data 1/cripd_metrics/'

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Function to compute PD metrics
def compute_pd_metrics(df, type_time='monthly', start_year=2020, start_month=6, end_year=2025, end_month=6):
    """Compute ROC AUC and PR AUC of PD forecasts for each period in a range.

    With ``type_time='monthly'`` every month start from
    ``start_year-start_month`` to ``end_year-end_month`` (inclusive) is scored
    using ``is_Default_1M`` against ``PD_1M`` on the rows whose ``date`` equals
    that month start. With any other ``type_time`` every calendar year in
    ``range(start_year, end_year)`` (``end_year`` itself excluded) is scored
    using ``is_Default_12M`` against ``PD_12M``; the month arguments are
    ignored in that mode.

    A period that has no rows, or that scikit-learn rejects with a
    ``ValueError``, is reported on stdout and skipped. The remaining periods
    are still evaluated, so one unscorable period no longer discards every
    period that follows it.

    Args:
        df: DataFrame with a ``date`` column plus the label and score columns
            of the selected horizon.
        type_time: ``'monthly'`` for the 1M horizon; any other value selects
            the per-calendar-year 12M evaluation.
        start_year: First year of the evaluation range.
        start_month: First month of the range (monthly mode only).
        end_year: Last year of the range (exclusive in yearly mode).
        end_month: Last month of the range (monthly mode only).

    Returns:
        DataFrame with the columns ``date``, ``auc_score`` and ``pr_auc``,
        one row per scored period. The columns are present even when no
        period could be scored.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    columns = ['date', 'auc_score', 'pr_auc']

    if type_time == 'monthly':
        date_range = pd.date_range(start=f'{start_year}-{start_month}', end=f'{end_year}-{end_month}', freq='MS')
        label_col, score_col = 'is_Default_1M', 'PD_1M'
        period_key = df['date']
    else:
        date_range = range(start_year, end_year)
        label_col, score_col = 'is_Default_12M', 'PD_12M'
        # Extract the year once, vectorised, instead of a row-wise lambda on
        # every loop iteration.
        period_key = pd.to_datetime(df['date']).dt.year

    results = []
    for date_ in date_range:
        test_df = df[period_key == date_]
        if test_df.empty:
            print(f"Error processing date {date_}: no rows for this period")
            continue

        test_y_true = test_df[label_col]
        test_y_pred = test_df[score_col]

        # Only the metric calls are guarded, and per period, so a failure
        # skips this period alone.
        try:
            auc_score = roc_auc_score(test_y_true, test_y_pred)
            precision, recall, _ = precision_recall_curve(test_y_true, test_y_pred)
            pr_auc = auc(recall, precision)
        except ValueError as e:
            print(f"Error processing date {date_}: {e}")
            continue

        results.append({
            'date': date_,
            'auc_score': auc_score,
            'pr_auc': pr_auc
        })

    # Explicit columns keep the schema (and the CSV header) stable even when
    # nothing was scored.
    return pd.DataFrame(results, columns=columns)


def compute_pd_metrics_yearly(df, type_time='yearly', start_year=2015, start_month=1, end_year=2025, end_month=6):
    """Compute ROC AUC and PR AUC of 12M PD forecasts over rolling windows.

    For every month start from ``start_year-start_month`` to
    ``end_year-end_month`` (inclusive), the rows whose ``date`` lies in the
    half-open window ``[month_start, month_start + 1 year)`` are scored using
    ``is_Default_12M`` against ``PD_12M``.

    A window that has no rows, or that scikit-learn rejects with a
    ``ValueError``, is reported on stdout and skipped. The remaining windows
    are still evaluated, so one unscorable window no longer discards every
    window that follows it.

    Args:
        df: DataFrame with ``date``, ``is_Default_12M`` and ``PD_12M`` columns.
        type_time: Unused; kept so existing call sites keep working.
        start_year: Year of the first window start.
        start_month: Month of the first window start.
        end_year: Year of the last window start.
        end_month: Month of the last window start.

    Returns:
        DataFrame with the columns ``date`` (window start), ``auc_score`` and
        ``pr_auc``, one row per scored window. The columns are present even
        when no window could be scored.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    columns = ['date', 'auc_score', 'pr_auc']
    date_range = pd.date_range(start=f'{start_year}-{start_month}', end=f'{end_year}-{end_month}', freq='MS')
    dates = df['date']

    results = []
    for date_ in date_range:
        filter_range = date_ + pd.DateOffset(years=1)
        # Vectorised window mask instead of a row-wise lambda.
        test_df = df[(dates >= date_) & (dates < filter_range)]
        if test_df.empty:
            print(date_)
            print("no rows in this 12-month window")
            continue

        test_y_true = test_df['is_Default_12M']
        test_y_pred = test_df['PD_12M']

        # Only the metric calls are guarded, and per window, so a failure
        # skips this window alone.
        try:
            auc_score = roc_auc_score(test_y_true, test_y_pred)
            precision, recall, _ = precision_recall_curve(test_y_true, test_y_pred)
            pr_auc = auc(recall, precision)
        except ValueError as e:
            print(date_)
            print(e)
            continue

        results.append({
            'date': date_,
            'auc_score': auc_score,
            'pr_auc': pr_auc
        })

    # Explicit columns keep the schema (and the CSV header) stable even when
    # nothing was scored.
    return pd.DataFrame(results, columns=columns)

# Walk the input folder and score every CSV export found in it
for filename in os.listdir(input_folder):
    # Anything that is not a CSV file is ignored
    if not filename.endswith(".csv"):
        continue

    # Identifier used in the output names: the text between the last
    # underscore and the first dot after it,
    # e.g. 'PD_Default_1M12M_163.csv' -> '163'
    company_no = filename.rsplit('_', 1)[-1].partition('.')[0]

    # Load the file and derive a timestamp column from its YYYYMM column
    file_path = os.path.join(input_folder, filename)
    df_cripd = pd.read_csv(file_path)
    df_cripd['date'] = pd.to_datetime(df_cripd['YYYYMM'], format='%Y%m')

    # Rows whose label equals -1 are left out of the corresponding horizon
    df_cripd_mthly = df_cripd[df_cripd['is_Default_1M'] != -1]
    df_cripd_yearly = df_cripd[df_cripd['is_Default_12M'] != -1]

    # Score both horizons
    monthly_results = compute_pd_metrics(df_cripd_mthly)
    yearly_results = compute_pd_metrics_yearly(df_cripd_yearly, type_time='yearly', start_year=2015)

    # Write one CSV per horizon into the output folder
    monthly_path = os.path.join(output_folder, f'cripd_mly_1M_{company_no}.csv')
    yearly_path = os.path.join(output_folder, f'cripd_yearly_12M_{company_no}.csv')
    monthly_results.to_csv(monthly_path, index=False)
    yearly_results.to_csv(yearly_path, index=False)

    print(f"Processed {filename} and saved results.")


Processed PD_Default_1M12M_1.csv and saved results.
Processed PD_Default_1M12M_10.csv and saved results.
Processed PD_Default_1M12M_100.csv and saved results.
Processed PD_Default_1M12M_102.csv and saved results.
Processed PD_Default_1M12M_103.csv and saved results.
Processed PD_Default_1M12M_107.csv and saved results.
Processed PD_Default_1M12M_11.csv and saved results.
Processed PD_Default_1M12M_12.csv and saved results.
Processed PD_Default_1M12M_15.csv and saved results.
Processed PD_Default_1M12M_16.csv and saved results.
Error processing date 2020-06-01 00:00:00: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.
2020-01-01 00:00:00
Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.
Processed PD_Default_1M12M_163.csv and saved results.
Processed PD_Default_1M12M_17.csv and saved results.
Processed PD_Default_1M12M_18.csv and saved results.
Processed PD_Default_1M12M_19.csv and saved results.
Processed PD_Default_1M12M_2.csv and sa