In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import tqdm
import sys
import time
import datetime
import scipy.stats
from Py_Files import metric_inventory
from Py_Files import aws_rds
from Py_Files import credentials
from Py_Files import data_exploration


# show which interpreter the kernel is running on
print(sys.executable)


# storage roots: local working directory and the S3 bucket used for uploads
data_dir = '/Users/joeybortfeld/Documents/QML Solutions Data/'
s3_dir = 's3://qml-solutions-new-york/'

# assemble the full list of ratios by concatenating the per-category lists
# from metric_inventory.ratio_dict, in this fixed category order
ratio_categories = ['size', 'leverage', 'coverage', 'profitability', 'liquidity', 'volatility']
metric_list = metric_inventory.ratio_dict[ratio_categories[0]]
for category in ratio_categories[1:]:
    metric_list = metric_list + metric_inventory.ratio_dict[category]
print('ratio count:', len(metric_list))


In [None]:
# load the modeling dataset (with bankruptcy labels) from the local data directory
modeling_dataset_path = data_dir + f'qml_modeling_data/modeling_dataset_with_bankruptcy_labels_us_nonfin_100m_20250212.csv'
df = pd.read_csv(modeling_dataset_path)

# fiscal_end_date is read as text from the csv; convert it to datetime
df['fiscal_end_date'] = pd.to_datetime(df['fiscal_end_date'])

# 0. Generate Quantile Distribution for Box Plots and Table Summary

In [None]:
# output switches: write each summary table to local disk and/or S3
write_to_local = True
write_to_s3 = True

# build quantile summaries for each ratio, grouped by sector
# (the quantile grid is denser in both tails: 1%-5% and 95%-99%)
groupby = 'factset_econ_sector'
quantile_list = [0, 0.01, .02, .03, .04, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95,0.96,0.97,0.98, 0.99, 1]
start = time.time()

for m in tqdm.tqdm(metric_list):

    # one csv per ratio, same file name at both destinations
    local_path = data_dir + f'exploratory_data/ratio_quantile_summaries/quantile_summary_table_{m}.csv'
    s3_path = s3_dir + f'qml-dashboard-tools/exploratory-data/ratio-quantile-summaries/quantile_summary_table_{m}.csv'

    temp = data_exploration.quantile_analysis(df, metric=m, quantile_list=quantile_list, groupby=groupby)

    if write_to_local:
        temp.to_csv(local_path, index=False)

    if write_to_s3:
        temp.to_csv(s3_path, index=False, storage_options=credentials.aws_s3_credentials)

elapsed = time.time() - start
print('done in', elapsed)
print('--', datetime.datetime.now())

# 1. Generate Quantile Distribution for Box Plots to Compare Bankruptcy vs Non-Bankruptcy
* This generates the box-plot data (25th, 50th, and 75th percentiles, among others) for observations, conditional on the firm entering bankruptcy 1, 2, 3, 4, or 5 years out

In [None]:
# output switches: write each summary table to local disk and/or S3
write_to_local = True
write_to_s3 = True

sector_groupby = 'factset_econ_sector'
collection = []  # NOTE(review): assigned but never filled in this cell
start = time.time()

for this_metric in tqdm.tqdm(metric_list):

    # one csv per ratio, same file name at both destinations
    local_path = data_dir + f'exploratory_data/ratio_quantile_summaries_by_default_class/quantile_summary_table_{this_metric}.csv'
    s3_path = s3_dir + f'qml-dashboard-tools/exploratory-data/ratio-quantile-summaries-by-default-class/quantile_summary_table_{this_metric}.csv'

    temp = data_exploration.quantile_analysis_by_default_class(df, this_metric, sector_groupby)

    if write_to_local:
        temp.to_csv(local_path, index=False)

    if write_to_s3:
        temp.to_csv(s3_path, index=False, storage_options=credentials.aws_s3_credentials)


elapsed = time.time() - start
print('done in', elapsed)
print('--', datetime.datetime.now())


# 2. Generate Realized Default Rates by Ratio Deciles


In [None]:
# output switches: write each table to local disk and/or S3
write_to_local = True
write_to_s3 = True
groupby = 'factset_econ_sector'
start = time.time()

for this_metric in tqdm.tqdm(metric_list):

    # one csv per ratio, same file name at both destinations
    local_path = data_dir + f'exploratory_data/ratio_default_rates_by_decile/decile_default_rate_{this_metric}.csv'
    s3_path = s3_dir + f'qml-dashboard-tools/exploratory-data/ratio-default-rates-by-decile/decile_default_rate_{this_metric}.csv'

    temp = data_exploration.default_rate_by_ratio_decile(data=df, metric=this_metric, groupby=groupby)

    if write_to_local:
        temp.to_csv(local_path, index=False)

    if write_to_s3:
        temp.to_csv(s3_path, index=False, storage_options=credentials.aws_s3_credentials)

elapsed = time.time() - start
print('done in', elapsed)
print('--', datetime.datetime.now())

# 3. Generate Histogram Data for Ratio Histograms

In [None]:
# build histogram data for every ratio; optionally persist to disk, S3 and RDS
write_to_local = True
write_to_s3 = True
write_to_rds = True
groupby = 'factset_econ_sector'
# NOTE(review): quantile_list is assigned here but not used anywhere in this cell
quantile_list = [0, 0.01, .02, .03, .04, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95,0.96,0.97,0.98, 0.99, 1]
start = time.time()

# the same Postgres table is written and then indexed below
rds_table_name = 'ratio_histogram_summary_table'

collection = []
for this_metric in tqdm.tqdm(metric_list):

    # one csv per ratio, same file name at both destinations
    local_path = data_dir + f'exploratory_data/ratio_histograms/ratio_histogram_summary_table_{this_metric}.csv'
    s3_path = s3_dir + f'qml-dashboard-tools/exploratory-data/ratio-histograms/ratio_histogram_summary_table_{this_metric}.csv'

    temp = data_exploration.generate_histogram_data(df, this_metric, quantiles=(.01, .99), groupby=groupby)
    collection.append(temp)

    if write_to_local:
        temp.to_csv(local_path, index=False)

    if write_to_s3:
        temp.to_csv(s3_path, index=False, storage_options=credentials.aws_s3_credentials)


if write_to_rds:
    print('writing to rds')

    # stack the per-ratio tables and replace the RDS table in a single write
    collection = pd.concat(collection, axis=0)
    sqlalchemy_engine = aws_rds.sqlalchemy_connect_to_rds(credentials.aws_rds_credentials)
    collection.to_sql(rds_table_name, sqlalchemy_engine, if_exists='replace', index=False)
    print('done in ', time.time() - start)

    # set indices in postgres database table
    psycopg2_connection = aws_rds.psycopg2_connect_to_rds(credentials.aws_rds_credentials)
    aws_rds.create_index_on_rds(
        table_name=rds_table_name,
        index_name='idx_metric_sector_lower_clip',
        columns_to_index=['metric', 'sector', 'lower_clip'],
        conn=psycopg2_connection,
    )


print('done in ', time.time() - start)
print('--', datetime.datetime.now())


# 4. Bankruptcy Diagnostics

In [None]:

temp = data_exploration.build_default_diagnostics(df)

# for each horizon (1-5 years) count the rows whose fund_count_{i} equals 3
# (presumably: all three of assets / ebitda / cf are available - confirm in build_default_diagnostics)
for i in range(1, 6):
    complete_count = (temp[f'fund_count_{i}'] == 3).sum()
    print(f'{i}Y defaults with assets/ebitda/cf:', complete_count)


# 5. Univariate Regressions

In [None]:
# output switches: write each results table to local disk and/or S3
write_to_local = True
write_to_s3 = True


def _write_regression_results(table, local_path, s3_path):
    '''Write one regression results table to the enabled destinations (local first, then S3).'''
    if write_to_local:
        table.to_csv(local_path, index=False)
    if write_to_s3:
        table.to_csv(s3_path, index=False, storage_options=credentials.aws_s3_credentials)


# apply percentile transformations to ratios using a specified subset for determining the percentile distribution
# NOTE(review): only the split version (df_model_split) is used by the regressions below;
# the un-split regression runs were previously commented out
df_model, pct_vars_dict = data_exploration.model_df_prep(df, metric_list=metric_list, test_split_date=None)
df_model_split, pct_vars_dict_split = data_exploration.model_df_prep(df, metric_list=metric_list, test_split_date='2018-01-01')


for n in range(1,6):

    horizon = n

    # raw ratios (pct transformation = No), run on the split dataset
    # NOTE(review): test_split_date is None here but '2018-01-01' for the pct run - confirm this is intended
    results = data_exploration.univariate_reg(df_model_split, var_list=metric_list, horizon=n, pct='No', test_split_date=None)
    _write_regression_results(
        results,
        data_dir + f'exploratory_data/default_model_univariate_regressions/univariate_reg_{n}y_split.csv',
        s3_dir + f'qml-dashboard-tools/exploratory-data/default-model-univariate-regressions/univariate_reg_{n}y_split.csv',
    )

    # percentile-transformed ratios (pct transformation = Yes), with train/test split
    results = data_exploration.univariate_reg(df_model_split, var_list=metric_list, horizon=n, pct='Yes', test_split_date='2018-01-01')
    _write_regression_results(
        results,
        data_dir + f'exploratory_data/default_model_univariate_regressions/univariate_reg_{n}y-pct_split.csv',
        s3_dir + f'qml-dashboard-tools/exploratory-data/default-model-univariate-regressions/univariate_reg_{n}y-pct_split.csv',
    )

In [None]:
# coverage statistics: non-null observation counts per report date, both series on one plot
obs_by_report_date = df.groupby(by='report_date')
obs_by_report_date['ff_assets_in_usd'].count().plot()
obs_by_report_date['market_leverage'].count().plot()


In [None]:
temp = df[['fsym_id', 'fiscal_end_date', 'default_1', 'default_5', 'bankruptcy_date', 'ff_assets_in_usd', 'total_equity_to_assets', 'net_income_to_sales']].copy()

# carry the last known assets value forward within each company, at most 4 rows
temp['ff_assets_in_usd'] = temp.groupby('fsym_id')['ff_assets_in_usd'].ffill(limit=4)

# bucket each ratio and shift the labels to start at 1
# (q=100 means these are percentile buckets, despite the "_decile" column names)
bucket_sources = {
    'ff_assets_in_usd_decile': 'ff_assets_in_usd',
    'total_equity_to_assets_decile': 'total_equity_to_assets',
    'net_income_to_sales_decile': 'net_income_to_sales',
}
for m, source_column in bucket_sources.items():
    temp[m] = pd.qcut(temp[source_column], q=100, labels=False) + 1

# adjustments
# -- we want small values to be bad and large values to be good (nothing is applied yet)

# interaction terms: product of the size bucket with the leverage / profitability bucket
size_bucket = temp['ff_assets_in_usd_decile']
temp['size_x_leverage'] = size_bucket * temp['total_equity_to_assets_decile']
temp['size_x_profitability'] = size_bucket * temp['net_income_to_sales_decile']

# re-bucket the interaction terms
temp['size_x_profitability_decile'] = pd.qcut(temp['size_x_profitability'], q=100, labels=False)
temp['size_x_leverage_decile'] = pd.qcut(temp['size_x_leverage'], q=100, labels=False)

# mean of default_1 per interaction bucket: profitability on the left, leverage on the right
fig, ax = plt.subplots(1,2, figsize=(10,5))
profitability_rates = temp.groupby(by='size_x_profitability_decile')['default_1'].mean()
leverage_rates = temp.groupby(by='size_x_leverage_decile')['default_1'].mean()
profitability_rates.plot(kind='bar', ax=ax[0])
leverage_rates.plot(kind='bar', ax=ax[1])


In [None]:
temp = df[['fsym_id', 'fiscal_end_date', 'default_1', 'default_2', 'default_3', 'default_4', 'default_5', 'bankruptcy_date', 'ff_assets_in_usd', 'total_equity_to_assets', 'net_income_to_sales']].copy()

# fill forward where missing (we didn't download quarterly assets in usd)
temp['ff_assets_in_usd'] = temp.groupby('fsym_id')['ff_assets_in_usd'].ffill(limit=4)

# convert to decile, labelled 1 (lowest) through 10 (highest)
decile_sources = {
    'ff_assets_in_usd_decile': 'ff_assets_in_usd',
    'total_equity_to_assets_decile': 'total_equity_to_assets',
    'net_income_to_sales_decile': 'net_income_to_sales',
}
for m, source_column in decile_sources.items():
    temp[m] = pd.qcut(temp[source_column], q=10, labels=False) + 1

# only rows with a usable 1-year label enter the grid (non-null and not -1)
default_columns = ['default_1', 'default_2', 'default_3', 'default_4', 'default_5']
has_label = temp['default_1'].notnull() & (temp['default_1'] != -1)

# one record per (size decile, profitability decile) cell: mean of each default column plus row count
collection = []
for i in range(1,11):
    for j in range(1,11):

        in_cell = (temp['ff_assets_in_usd_decile'] == i) & (temp['net_income_to_sales_decile'] == j)
        temp2 = temp[in_cell & has_label]

        cell_means = [temp2[column].mean() for column in default_columns]
        collection.append([i, j] + cell_means + [temp2.shape[0]])

grid_columns = ['ff_assets_in_usd_decile', 'net_income_to_sales_decile'] + default_columns + ['count']
temp = pd.DataFrame(collection, columns=grid_columns)

# rows = profitability decile, columns = size decile; only default_1 is kept
temp = temp.pivot(index='net_income_to_sales_decile', columns='ff_assets_in_usd_decile', values=['default_1'])


# add heatmap to table
# temp.style.background_gradient(cmap='RdYlGn')
temp.to_csv('/Users/joeybortfeld/Downloads/size_x_profitability_default_rates.csv', index=True)
temp

In [None]:
# NOTE(review): on a top-to-bottom run `temp` is the pivot table from the previous cell,
# which has none of these columns; they are created by the size-x-leverage cell further up
columns_to_inspect = ['ff_assets_in_usd', 'total_equity_to_assets', 'ff_assets_in_usd_decile', 'total_equity_to_assets_decile', 'size_x_leverage_decile']
temp[columns_to_inspect]

In [None]:
import scipy.stats

this_var = 'net_income_to_sales'
temp = df[df[this_var].notnull()]

# non-null values of the ratio per sector, plus the pooled sample under 'All'
values_dict = {
    sector: temp.loc[temp['factset_econ_sector'] == sector, this_var].values
    for sector in temp['factset_econ_sector'].unique()
}
values_dict['All'] = temp[this_var].values


# Run Mood's median test on two sectors (Consumer Services vs Distribution Services)
stat, p, med, table = scipy.stats.median_test(
    values_dict['Consumer Services'],
    values_dict['Distribution Services'],
)
print(f"Mood's Median Test statistic: {stat}, p-value: {p}")

# 5% significance level
verdict = (
    "Reject the null hypothesis: medians are different across sectors."
    if p < 0.05
    else "Fail to reject the null hypothesis: medians are not significantly different."
)
print(verdict)


# contingency table returned by the test
table


In [None]:
# scratch: bootstrapping simulations to get confidence intervals for the median
# of total_equity_to_assets
n_simulations = 10_000    # number of bootstrap resamples
resample_size = 200_000   # observations drawn (with replacement) per resample

temp = df[['fsym_id', 'fiscal_end_date', 'factset_econ_sector', 'total_equity_to_assets']].copy()

# keep only non-null, finite values of the ratio
temp = temp[temp['total_equity_to_assets'].notnull()]
temp = temp[temp['total_equity_to_assets'] != np.inf]
temp = temp[temp['total_equity_to_assets'] != -np.inf]
temp = temp.sort_values(by='total_equity_to_assets', ascending=False)
temp = temp.reset_index(drop=True)
print(temp.shape[0])

print('original median:', temp['total_equity_to_assets'].median())
# the original line here was `print('quintiles': )`, a SyntaxError that stopped the cell from running
print('quintiles:')
print(temp['total_equity_to_assets'].quantile([0.2, 0.4, 0.6, 0.8]))
print()
print('original mean:', temp['total_equity_to_assets'].mean())
print('original sd:', temp['total_equity_to_assets'].std())
print()

# bootstrap: resample with replacement and record the median of each resample.
# This loop was commented out, which left medians_list undefined for the code below.
# Only the ratio column is resampled, so whole rows are not copied on every draw.
ratio_values = temp['total_equity_to_assets']
medians_list = []
for _ in tqdm.tqdm(range(n_simulations)):
    resample = ratio_values.sample(n=resample_size, replace=True)
    medians_list.append(resample.median())

print('bootstrapped median:', np.median(medians_list))
print('bootstrap median absolute deviation:', scipy.stats.median_abs_deviation(medians_list))
print('approx 95% confidence interval:', np.percentile(medians_list, [2.5, 97.5]))

# plot the histogram of the medians
plt.hist(medians_list, bins=20)
plt.show()


In [None]:


# read back the table that the histogram cell writes to RDS.
# `engine` was never defined in this notebook, so the engine is created here
# with the same helper the histogram cell uses.
engine = aws_rds.sqlalchemy_connect_to_rds(credentials.aws_rds_credentials)
query = '''SELECT * FROM ratio_histogram_summary_table;'''
temp = pd.read_sql_query(query, engine)
temp.shape

In [None]:
def obs_count_by_two_groups(data:pd.DataFrame, groupby1:str, groupby2:str, pct:bool=False):
    '''
    Cross-tabulate observation counts for two grouping columns.

    The result has one row per value of groupby1 and one column per value of
    groupby2. With pct=True every column is divided by the total number of
    observations for that groupby2 value, so each cell is a share of its column.
    Combinations that never occur are left as NaN.
    '''
    counts = data.groupby([groupby1, groupby2]).size().unstack()
    if not pct:
        return counts

    # the Series index (groupby2 values) aligns with the table's columns
    column_totals = data.groupby(groupby2).size()
    return counts / column_totals

def obs_count_by_group(data:pd.DataFrame, groupby:str, pct:bool=False):
    '''
    Count observations per value of one grouping column.

    With pct=True the counts are divided by the total number of rows in data.
    '''
    group_sizes = data.groupby(groupby).size()
    return group_sizes / data.shape[0] if pct else group_sizes

# share of each fiscal year's observations that falls in each sector
# (rows = sector, columns = fiscal year)
temp = obs_count_by_two_groups(data=df, groupby1='factset_econ_sector', groupby2='fiscal_year', pct=True)
# obs_count_by_group(df, 'factset_econ_sector', pct=True)
temp

In [None]:
# transpose so the table's column labels become the x-axis, one line per row label
temp.transpose().plot()

In [None]:
# number of observations per fiscal year
df.groupby(by='fiscal_year').size()

In [None]:
# scratch: sampling
# draw a random success probability, then look at the spread of the sample mean
# over 10,000 samples of 100 Bernoulli draws each
this_rand = np.random.uniform(0, 1)
results = [
    np.random.binomial(n=1, p=this_rand, size=100).mean()
    for _ in range(10_000)
]

plt.hist(results, bins=20)
plt.show()


# the probability the sample means should be centred on
print(this_rand)

In [None]:
from sqlalchemy import create_engine

# rds connection (Postgres), not S3.
# The original built the connection URL twice from aws_rds_user / aws_rds_password,
# which are never defined in this notebook, so the cell failed on a fresh kernel.
# The engine now comes from the same helper the histogram upload cell uses.
# NOTE(review): assumes the helper targets the instance that hosts `modeling_dataset`
# (previously modeling-dataset.ci6paxfsercw.us-east-1.rds.amazonaws.com:5432/postgres) - confirm.
sqlalchemy_engine = aws_rds.sqlalchemy_connect_to_rds(credentials.aws_rds_credentials)

# time how long it takes to pull three columns of the modeling dataset
query = '''SELECT fsym_id, fiscal_end_date, net_debt_to_ebitda FROM modeling_dataset '''
start = time.time()
df3 = pd.read_sql_query(query, sqlalchemy_engine)
print(time.time() - start)


In [None]:
# (rows, columns) of the frame returned by the RDS query in the previous cell
df3.shape