In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from tqdm import tqdm

import ctypes
from scipy.interpolate import CubicSpline
import os

# from merge_bond_treasury_redcode import *
from NEW_MERGE_cds_bond import *
# from merge_cds_bond import *
# from process_final_product import *

%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [42]:
# Folder that holds every input file, given as a relative path.
# (An alternative lookup, `config("DATA_DIR")`, is currently disabled.)
DATA_DIR = "../../../FS-project_files"

# --- First-stage inputs --------------------------------------------------
# Treasury files
TREASURY_MONTHLY_FILE_NAME = 'monthly_ts_data_mod.parquet'
TREASURY_ISSUE_FILE_NAME = 'issue_data_mod.parquet'
# Corporate bond panel and the RED-code / ISIN mapping
CORPORATES_MONTHLY_FILE_NAME = 'corporate_bond_returns.parquet'
RED_CODE_FILE_NAME = 'RED_and_ISIN_mapping.parquet'

# --- Second-stage inputs -------------------------------------------------
# Only the CDS file is active; BOND_RED_CODE_FILE_NAME
# ("merged_bond_treasuries_redcode.parquet") and FINAL_ANALYSIS_FILE_NAME
# ("final_data.parquet") are intentionally left undefined here.
CDS_FILE_NAME = 'cds_final.pkl'


In [43]:
# Load every raw input table from DATA_DIR; the file names are the constants
# defined in the configuration cell.
treasury_monthly_data = pd.read_parquet(f"{DATA_DIR}/{TREASURY_MONTHLY_FILE_NAME}")
treasury_issue_data = pd.read_parquet(f"{DATA_DIR}/{TREASURY_ISSUE_FILE_NAME}")
corp_data = pd.read_parquet(f"{DATA_DIR}/{CORPORATES_MONTHLY_FILE_NAME}")
red_data = pd.read_parquet(f"{DATA_DIR}/{RED_CODE_FILE_NAME}")
# NOTE(review): read_pickle deserializes with pickle, which can execute
# arbitrary code -- only load this file when it comes from a trusted source.
cds_df = pd.read_pickle(f"{DATA_DIR}/{CDS_FILE_NAME}")

In [4]:
treasury_monthly_data.head()

Unnamed: 0,kycrspid,kytreasno,mcaldt,tmpubout,tmduratn,tmyld
0,19610622.8,200001.0,1960-10-31,,234.0,7.6e-05
1,19610622.8,200001.0,1960-11-30,,204.0,7.9e-05
2,19610622.8,200001.0,1960-12-30,,174.0,6.7e-05
3,19610622.8,200001.0,1961-01-31,,142.0,6.8e-05
4,19610622.8,200001.0,1961-02-28,,114.0,7.4e-05


In [5]:
treasury_issue_data.head()

Unnamed: 0,kycrspid,kytreasno,tmatdt
0,19610622.8,200001.0,1961-06-22
1,19610623.4,200002.0,1961-06-23
2,19610629.4,200003.0,1961-06-29
3,19610706.4,200004.0,1961-07-06
4,19610713.4,200005.0,1961-07-13


In [16]:
treasury_data_combined = merge_treasury_data(treasury_issue_data, treasury_monthly_data)

In [17]:
treasury_data_combined.head()

Unnamed: 0,kycrspid,kytreasno,mcaldt,tmpubout,tmduratn,tmatdt,treas_yld
0,19610622.8,200001.0,1960-10-31,,234.0,1961-06-22,0.028055
1,19610622.8,200001.0,1960-11-30,,204.0,1961-06-22,0.029241
2,19610622.8,200001.0,1960-12-30,,174.0,1961-06-22,0.024777
3,19610622.8,200001.0,1961-01-31,,142.0,1961-06-22,0.025169
4,19610622.8,200001.0,1961-02-28,,114.0,1961-06-22,0.027243


In [6]:
# Corporate bond panel used by the cells below.
# NOTE(review): this reads the same parquet file that the main load cell
# already read into `corp_data` -- confirm whether both copies are needed.
bond_data = pd.read_parquet(f"{DATA_DIR}/{CORPORATES_MONTHLY_FILE_NAME}")

In [19]:
t_data = treasury_data_combined.copy()

In [13]:
bond_data.columns

Index(['date', 'cusip', 'issuer_cusip', 'permno', 'exretn_t+1',
       'exretnc_bns_t+1', 'exretnc_t+1', 'exretnc_dur_t+1', 'bond_ret_t+1',
       'bond_ret', 'exretn', 'exretnc_bns', 'exretnc', 'exretnc_dur', 'rating',
       'cs', 'cs_6m_delta', 'bond_yield', 'bond_amount_out', 'offering_amt',
       'bondprc', 'perc_par', 'tmt', 'duration', 'ind_num_17', 'sic_code',
       'mom6_1', 'ltrev48_12', 'BOND_RET', 'ILLIQ', 'var95', 'n_trades_month',
       'size_ig', 'size_jk', 'zcb', 'conv', 'BOND_YIELD', 'CS', 'BONDPRC',
       'PRFULL', 'DURATION', 'CONVEXITY', 'CS_6M_DELTA', 'bond_value',
       'BOND_VALUE', 'coupon', 'bond_type', 'principal_amt', 'bondpar_mil'],
      dtype='object')

In [32]:
bond_data[['date', 'issuer_cusip', 'CS', 'size_ig', 'size_jk', 'tmt']].head(10)

Unnamed: 0,date,issuer_cusip,CS,size_ig,size_jk,tmt
0,2002-08-31,336,0.032526,0.0,1.0,70.033333
1,2002-09-30,336,0.049044,0.0,1.0,69.033333
2,2002-10-31,336,0.045408,0.0,1.0,68.0
3,2002-11-30,336,0.039786,0.0,1.0,67.0
4,2002-12-31,336,0.079508,0.0,1.0,65.966667
5,2003-01-31,336,0.068708,0.0,1.0,64.933333
6,2003-02-28,336,,,,
7,2003-03-31,336,0.090516,0.0,1.0,62.966667
8,2003-04-30,336,0.068205,0.0,1.0,61.966667
9,2003-05-31,336,0.060632,0.0,1.0,60.933333


In [29]:
# Every distinct (size_jk, size_ig) flag combination found in the bond panel;
# the first occurrence of each pair is the one kept.
unique_pairs_df = bond_data.loc[:, ['size_jk', 'size_ig']].drop_duplicates(keep='first')
print(unique_pairs_df)


     size_jk  size_ig
0        1.0      0.0
6        NaN      NaN
68       0.0      1.0
112      1.0      1.0


In [28]:
# Number of distinct bond CUSIPs for each (size_jk, size_ig) combination.
# groupby leaves out rows whose key is NaN, so the all-NaN flag pair is not listed.
unique_counts = (
    bond_data.groupby(['size_jk', 'size_ig'])['cusip']
    .nunique()
    .rename('unique_cusip_count')
    .reset_index()
)

print(unique_counts)


   size_jk  size_ig  unique_cusip_count
0      0.0      1.0                2402
1      1.0      0.0                9134
2      1.0      1.0               19146


In [31]:
bond_data.columns

Index(['date', 'cusip', 'issuer_cusip', 'permno', 'exretn_t+1',
       'exretnc_bns_t+1', 'exretnc_t+1', 'exretnc_dur_t+1', 'bond_ret_t+1',
       'bond_ret', 'exretn', 'exretnc_bns', 'exretnc', 'exretnc_dur', 'rating',
       'cs', 'cs_6m_delta', 'bond_yield', 'bond_amount_out', 'offering_amt',
       'bondprc', 'perc_par', 'tmt', 'duration', 'ind_num_17', 'sic_code',
       'mom6_1', 'ltrev48_12', 'BOND_RET', 'ILLIQ', 'var95', 'n_trades_month',
       'size_ig', 'size_jk', 'zcb', 'conv', 'BOND_YIELD', 'CS', 'BONDPRC',
       'PRFULL', 'DURATION', 'CONVEXITY', 'CS_6M_DELTA', 'bond_value',
       'BOND_VALUE', 'coupon', 'bond_type', 'principal_amt', 'bondpar_mil'],
      dtype='object')

In [39]:
# Attach issuer RED codes to the trimmed bond panel (helper imported from NEW_MERGE_cds_bond).
# NOTE(review): the selection below carries neither 'cusip' nor 'mat_days', yet
# merge_cds_into_bonds (defined later in this notebook) reads both columns --
# confirm that merge_red_code_into_bond_treas adds/renames them (e.g. from 'tmt').
bond_red_df = merge_red_code_into_bond_treas(bond_data[['date', 'issuer_cusip', 'CS', 'size_ig', 'size_jk', 'tmt']], red_data)

In [None]:
# Reload the CDS panel from the pickle named by CDS_FILE_NAME (the same file the
# main load cell reads). An f-string replacement field may not be empty, so the
# file-name constant is spelled out here.
# NOTE(review): read_pickle can execute arbitrary code -- trusted files only.
cds_df = pd.read_pickle(f"{DATA_DIR}/{CDS_FILE_NAME}")

In [None]:
def merge_cds_into_bonds(bond_red_df, cds_df):
    '''
    Attach an interpolated CDS par spread to every bond observation.

    For each (redcode, date) pair a cubic spline is fitted through the CDS par
    spreads across tenors (tenor converted to days). The spline is then evaluated
    at each bond's days to maturity. Bond rows for which no curve could be built
    (or whose mat_days is missing) are dropped.

    bond_red_df: dataframe of bond observations with the redcode already added.
        Columns read by this function:
        cusip, -- unique bond tag
        date, -- date when data was collected
        CS, -- Credit Spread we replace Z-spread with
        size_ig, -- 0 if no ig bonds in portfolio, 1 if yes
        size_jk, -- 0 if no junk bonds in portfolio, 1 if yes
        mat_days, -- time to maturity in days
        redcode -- redcode is issuer specific, used to merge CDS values
        The caller's frame is not modified.

    cds_df: dataframe containing cds_data. Columns read by this function:
        date, -- date of report
        redcode, -- redcode of issuer
        parspread, -- parspread
        tenor -- one of "1Y", "3Y", "5Y", "7Y", "10Y"; other tenors are ignored
        Any further columns (ticker, tier, country, year, ...) are ignored.

    output: dataframe, de-duplicated, with the columns
        'cusip', 'date', 'mat_days', 'CS', 'size_ig', 'size_jk',
        'par_spread' -- par spread of the CDS, backed out by the cubic spline (float)

    raises: KeyError if either input lacks one of the columns listed above.

    NOTE(review): scipy's CubicSpline extrapolates outside the fitted tenor range
    by default, so bonds with maturity below the shortest or above the longest
    available tenor receive extrapolated values -- confirm this is intended.
    '''
    import warnings  # local import: only used to report curves that could not be fitted

    bond_columns = ['cusip', 'date', 'mat_days', 'CS', 'size_ig', 'size_jk', 'redcode']
    cds_columns = ['date', 'parspread', 'tenor', 'redcode']
    output_columns = ['cusip', 'date', 'mat_days', 'CS', 'size_ig', 'size_jk', 'par_spread']

    # Fail early with a message that names the missing column(s) instead of an
    # opaque KeyError from deep inside the pipeline.
    for label, frame, needed in (('bond_red_df', bond_red_df, bond_columns),
                                 ('cds_df', cds_df, cds_columns)):
        missing = [col for col in needed if col not in frame.columns]
        if missing:
            raise KeyError(f"{label} is missing required column(s): {missing}")

    # my mapping to convert tenor to days to get a rough approximation of a daily spline
    tenor_to_days = {
        "1Y": 365,
        "3Y": 3 * 365,
        "5Y": 5 * 365,
        "7Y": 7 * 365,
        "10Y": 10 * 365
    }

    # Only CDS quotes on dates that appear in the bond panel can ever be used.
    cds_clean = cds_df[cds_df['date'].isin(bond_red_df['date'].unique())].dropna(subset=cds_columns)

    # Tenors outside the mapping would become NaN knots, so drop them BEFORE
    # counting how many points each curve has.
    cds_clean = cds_clean.assign(tenor_days=cds_clean['tenor'].map(tenor_to_days))
    cds_clean = cds_clean.dropna(subset=['tenor_days'])

    # par spread values are roughly consistent for each tenor: collapse every
    # (redcode, date, tenor) to its median. Grouping on exactly these keys
    # guarantees one point per tenor (strictly increasing x for the spline) and
    # avoids losing rows because of NaN in unrelated columns.
    curve_points = (
        cds_clean
        .groupby(['redcode', 'date', 'tenor_days'], as_index=False)
        .agg({'parspread': 'median'})
    )

    # Dictionary to store cubic splines for each (redcode, date) pair
    cubic_splines = {}
    skipped_curves = 0
    for key, curve in curve_points.groupby(['redcode', 'date']):
        # need at least 2 distinct tenors for a spline
        if len(curve) < 2:
            continue
        curve = curve.sort_values('tenor_days')
        try:
            cubic_splines[key] = CubicSpline(
                curve['tenor_days'].to_numpy(dtype=float),
                curve['parspread'].to_numpy(dtype=float),
            )
        except ValueError:
            # e.g. non-finite par spreads; skip this curve but keep going
            skipped_curves += 1

    if skipped_curves:
        warnings.warn(
            f"merge_cds_into_bonds: skipped {skipped_curves} (redcode, date) curve(s) "
            "that could not be fitted"
        )

    # Filter the bond dataframe to issuers that have at least one curve; copy so
    # the caller's frame is never written to.
    curve_redcodes = {redcode for redcode, _ in cubic_splines}
    bonds = bond_red_df[bond_red_df['redcode'].isin(curve_redcodes)].copy()

    # Evaluate each spline once per (redcode, date) group on the whole vector of
    # maturities. A positional (0..n-1) index is used so that a non-unique bond
    # index cannot misalign the results.
    lookup = bonds[['redcode', 'date', 'mat_days']].reset_index(drop=True)
    par_values = np.full(len(lookup), np.nan)
    for key, rows in lookup.groupby(['redcode', 'date']):
        spline = cubic_splines.get(key)
        if spline is not None:
            par_values[rows.index.to_numpy()] = spline(rows['mat_days'].to_numpy(dtype=float))
    bonds['par_spread'] = par_values

    # keep only the important columns, and only rows that received a par spread
    par_df = bonds.dropna(subset=['par_spread'])[output_columns].copy()

    def safe_convert(x):
        """Convert lists and arrays to tuples while keeping other data types unchanged."""
        if isinstance(x, list):
            return tuple(x)
        elif isinstance(x, np.ndarray):
            return tuple(x.tolist()) if x.ndim > 0 else x.item()  # Convert array to tuple if not scalar
        else:
            return x

    # Only object columns can hold lists/arrays; make them hashable so that
    # drop_duplicates cannot fail on them.
    for col in par_df.select_dtypes(include='object').columns:
        par_df[col] = par_df[col].map(safe_convert)

    return par_df.drop_duplicates()

In [45]:
merge_cds_into_bonds(bond_red_df, cds_df)

KeyError: 'maturity'