In [11]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from tqdm import tqdm

import ctypes
from scipy.interpolate import CubicSpline
import os

from merge_bond_treasury_redcode import *
# from merge_cds_bond import *
# from process_final_product import *

%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# DATA_DIR = config("DATA_DIR")

DATA_DIR = r'../../../FS-project_files'

# Initial Pull and analysis
TREASURY_ISSUE_FILE_NAME = "issue_data_mod.parquet"
TREASURY_MONTHLY_FILE_NAME = "monthly_ts_data_mod.parquet"
CORPORATES_MONTHLY_FILE_NAME = "wrds_bond_mod.parquet"
# RED_CODE_FILE_NAME = "RED_and_ISIN_mapping.parquet"

# # Secondary Pull and final analysis
# BOND_RED_CODE_FILE_NAME = "merged_bond_treasuries_redcode.parquet"
# CDS_FILE_NAME = "cds_final.parquet"
# FINAL_ANALYSIS_FILE_NAME = "final_data.parquet"


In [13]:
treasury_monthly_data = pd.read_parquet(f"{DATA_DIR}/{TREASURY_MONTHLY_FILE_NAME}")
treasury_issue_data = pd.read_parquet(f"{DATA_DIR}/{TREASURY_ISSUE_FILE_NAME}")

In [14]:
treasury_monthly_data.head()

Unnamed: 0,kycrspid,kytreasno,mcaldt,tmpubout,tmduratn,tmyld
0,19610622.8,200001.0,1960-10-31,,234.0,7.6e-05
1,19610622.8,200001.0,1960-11-30,,204.0,7.9e-05
2,19610622.8,200001.0,1960-12-30,,174.0,6.7e-05
3,19610622.8,200001.0,1961-01-31,,142.0,6.8e-05
4,19610622.8,200001.0,1961-02-28,,114.0,7.4e-05


In [15]:
treasury_issue_data.head()

Unnamed: 0,kycrspid,kytreasno,tmatdt
0,19610622.8,200001.0,1961-06-22
1,19610623.4,200002.0,1961-06-23
2,19610629.4,200003.0,1961-06-29
3,19610706.4,200004.0,1961-07-06
4,19610713.4,200005.0,1961-07-13


In [16]:
treasury_data_combined = merge_treasury_data(treasury_issue_data, treasury_monthly_data)

In [17]:
treasury_data_combined.head()

Unnamed: 0,kycrspid,kytreasno,mcaldt,tmpubout,tmduratn,tmatdt,treas_yld
0,19610622.8,200001.0,1960-10-31,,234.0,1961-06-22,0.028055
1,19610622.8,200001.0,1960-11-30,,204.0,1961-06-22,0.029241
2,19610622.8,200001.0,1960-12-30,,174.0,1961-06-22,0.024777
3,19610622.8,200001.0,1961-01-31,,142.0,1961-06-22,0.025169
4,19610622.8,200001.0,1961-02-28,,114.0,1961-06-22,0.027243


In [18]:
bond_data = pd.read_parquet(f"{DATA_DIR}/{CORPORATES_MONTHLY_FILE_NAME}")

In [19]:
t_data = treasury_data_combined.copy()

In [20]:
bond_data.head()

Unnamed: 0_level_0,offering_date,company_symbol,maturity,amount_outstanding,security_level,yield,cusip,isin,conv,offering_price,price_eom,t_spread,principal_amt,duration,maturity_time_frame,rating
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2002-07-31,1994-03-23,T,2004-04-01,400000.0,SEN,0.0856,001957AM1,US001957AM13,0.0,99.265,97.213129,0.014847,1000.0,1.507683,10,1
2002-08-31,1994-03-23,T,2004-04-01,400000.0,SEN,0.062781,001957AM1,US001957AM13,0.0,99.265,100.684813,0.011224,1000.0,1.444238,10,1
2002-09-30,1994-03-23,T,2004-04-01,400000.0,SEN,0.06696,001957AM1,US001957AM13,0.0,99.265,100.066504,0.007308,1000.0,1.361385,10,1
2002-10-31,1994-03-23,T,2004-04-01,400000.0,SEN,0.06667,001957AM1,US001957AM13,0.0,99.265,100.112053,0.007682,1000.0,1.323898,10,0
2002-11-30,1994-03-23,T,2004-04-01,123856.0,SEN,0.055671,001957AM1,US001957AM13,0.0,99.265,101.5,0.008068,1000.0,1.25141,10,0


In [29]:
b_mat_df = bond_data[['isin', 'maturity']].drop_duplicates()
t_mat_df = t_data[['kycrspid', 'tmatdt']].drop_duplicates()

In [40]:
bond_data['year_m'] = pd.to_datetime(bond_data.index).to_period('M').astype(str)
t_data['year_m'] = pd.to_datetime(t_data['mcaldt']).dt.to_period('M').astype(str)

In [37]:
bond_treas_dict = {} # combined maturity date then kycrspid

for mat in set(b_mat_df['maturity']):
    t_mat_df['mat_dist'] = (pd.to_datetime(t_mat_df['tmatdt']) - pd.to_datetime(mat)).dt.days.abs()
    t_mask_df = t_mat_df[t_mat_df['mat_dist'] < 185] # under 6 month filter for maturity
    if len(t_mask_df) == 0:
        print(mat)
        print('Empty df')
    bond_treas_dict[mat] = set(t_mask_df['kycrspid'])

In [None]:
from tqdm import tqdm

matched_t_rows = []

# Use tqdm to wrap the iterator
for idx, bond_row in tqdm(bond_data.iterrows(), total=len(bond_data), desc="Matching t_data rows"):
    bond_maturity = bond_row['maturity']
    bond_year_m = bond_row['year_m']
    bond_duration = bond_row['duration']
    
    allowed_kycrspids = bond_treas_dict.get(bond_maturity, set())
    
    if not allowed_kycrspids:
        matched_t_rows.append(None)
        continue
    
    candidates = t_data[
        (t_data['year_m'] == bond_year_m) &
        (t_data['kycrspid'].isin(allowed_kycrspids))
    ]
    
    if candidates.empty:
        matched_t_rows.append(None)
        continue

    # Avoid SettingWithCopyWarning
    candidates = candidates.copy()
    candidates['duration_diff'] = (candidates['tmduratn'] - bond_duration).abs()
    
    best_match = candidates.loc[candidates['duration_diff'].idxmin()]
    matched_t_rows.append(best_match)

# Create final merged DataFrame
matched_t_df = pd.DataFrame(matched_t_rows)
final_df = bond_data.reset_index(drop=True).join(matched_t_df.reset_index(drop=True), rsuffix='_t')

# takes 6 hours??


Matching t_data rows:   0%|          | 3235/1052364 [01:09<6:18:20, 46.22it/s]


KeyboardInterrupt: 