In [None]:
%load_ext autoreload
%autoreload 2

import requests
import json
import pandas as pd
import numpy as np
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
import time
import datetime
import re
import tqdm
import os
import boto3
from Py_Files import credentials
from Py_Files import factset_api
from Py_Files import factset_fields
from Py_Files import qml_ratios
from Py_Files import qml_equity_ratios
import matplotlib.pyplot as plt

# Root folder of the local QML data tree. Defaults to the original workstation path but can be
# redirected by setting the QML_DATA_DIR environment variable before starting the kernel.
# os.path.join(..., '') guarantees the trailing separator that every `data_dir + '...'`
# concatenation in this notebook relies on.
data_dir = os.path.join(os.environ.get('QML_DATA_DIR', '/Users/joeybortfeld/Documents/QML Solutions Data/'), '')

# S3 location of the research data bucket.
s3_dir = 's3://qml-research-data/'

In [None]:
# Load the universe membership table. data_dir already ends with '/', so the relative part
# must not start with another slash (this matches how the same file is read later on).
factset_universe = pd.read_csv(data_dir + 'universe_and_traits/qml_universe_ids.csv')

# universe_dict is indexed by universe name (e.g. universe_dict[selected_universe]) to obtain
# the list of fsym_ids to process.
universe_dict = factset_api.load_universe_dict(factset_universe)

In [6]:
# define the universe to use
# This is the key looked up in universe_dict to obtain the fsym_id list, and it is also
# embedded in the file names of the consolidated / modeling CSVs written by later cells.
selected_universe = 'us_nonfin_100m'

# 0. Consolidate all fundamental data into a single dataframe

In [None]:
# Toggle: True rebuilds the consolidated CSVs from the per-fsym source files,
# False reloads the previously written consolidated CSVs.
build_from_source_files = True

# Folder that receives (and later serves) the consolidated per-universe CSVs.
consolidated_dir = data_dir + 'factset_data/factset_consolidated/'


def _consolidate_and_save(fsym_list, source_subdir, output_name):
    """Consolidate, preprocess and persist one family of per-fsym FactSet files.

    Parameters
    ----------
    fsym_list
        Identifiers forwarded to ``qml_ratios.consolidate_selected_files``.
    source_subdir : str
        Folder below ``data_dir + 'factset_data/'`` that holds the source files,
        e.g. ``'factset_fundamentals/annual'``.
    output_name : str
        File-name stem; the result is written to
        ``consolidated_dir + f'{output_name}_{selected_universe}.csv'``.

    Returns
    -------
    tuple
        ``(dataframe, error_list)`` -- the preprocessed dataframe and the second value
        returned by ``qml_ratios.consolidate_selected_files`` (presumably the inputs that
        failed to load; confirm in qml_ratios).
    """
    df_out, error_list = qml_ratios.consolidate_selected_files(
        fsym_list=fsym_list,
        folder_path=data_dir + f'factset_data/{source_subdir}/',
    )
    df_out = qml_ratios.preprocess_factset_fundamentals(df_out, verbose=True)
    df_out.to_csv(consolidated_dir + f'{output_name}_{selected_universe}.csv', index=False)
    return df_out, error_list


if build_from_source_files:

    print('building from source files')

    fsym_list = universe_dict[selected_universe]

    # FUNDAMENTALS (annual, quarterly, semi-annual)
    df_annual, error_list_annual = _consolidate_and_save(
        fsym_list, 'factset_fundamentals/annual', 'annual_fundamentals_combined')
    df_quarterly, error_list_quarterly = _consolidate_and_save(
        fsym_list, 'factset_fundamentals/quarterly', 'quarterly_fundamentals_combined')
    df_semi_annual, error_list_semi_annual = _consolidate_and_save(
        fsym_list, 'factset_fundamentals/semi_annual', 'semi_annual_fundamentals_combined')

    # check for any columns that are not in the flow or stock variable lists
    temp = [c for c in df_annual.columns if c not in factset_fields.flow_var_list + factset_fields.stock_var_list]
    print('data validation:')
    print('unexpected columns:', temp)
    print()

    # COLLECT ASSETS IN USD DATA
    # The error lists get their own names so they no longer overwrite the fundamentals
    # error lists collected above.
    df_annual_assets_in_usd, error_list_annual_assets_in_usd = _consolidate_and_save(
        fsym_list, 'factset_assets_in_usd/annual', 'annual_assets_in_usd')
    df_semi_annual_assets_in_usd, error_list_semi_annual_assets_in_usd = _consolidate_and_save(
        fsym_list, 'factset_assets_in_usd/semi_annual', 'semi_annual_assets_in_usd')
    df_quarterly_assets_in_usd, error_list_quarterly_assets_in_usd = _consolidate_and_save(
        fsym_list, 'factset_assets_in_usd/quarterly', 'quarterly_assets_in_usd')

else:

    # Reload the consolidated files written by a previous build of the same universe.
    df_annual = pd.read_csv(consolidated_dir + f'annual_fundamentals_combined_{selected_universe}.csv')
    df_quarterly = pd.read_csv(consolidated_dir + f'quarterly_fundamentals_combined_{selected_universe}.csv')
    df_semi_annual = pd.read_csv(consolidated_dir + f'semi_annual_fundamentals_combined_{selected_universe}.csv')
    df_annual_assets_in_usd = pd.read_csv(consolidated_dir + f'annual_assets_in_usd_{selected_universe}.csv')
    df_semi_annual_assets_in_usd = pd.read_csv(consolidated_dir + f'semi_annual_assets_in_usd_{selected_universe}.csv')
    df_quarterly_assets_in_usd = pd.read_csv(consolidated_dir + f'quarterly_assets_in_usd_{selected_universe}.csv')


In [None]:
# process the fundamental data
# - get quarterly, semi-annual, annual data
# - combine into a single dataframe
# - construct ratios
#
# Result: `df`, the fundamental modeling dataset keyed by (fsym_id, fiscal_end_date).

# Toggle: True recomputes everything from the consolidated frames loaded above,
# False reloads a previously written snapshot from disk.
build_from_source_files = True
if build_from_source_files:
    df_annual_formatted = qml_ratios.format_annual_data(df_annual,
                                                        flow_vars=factset_fields.flow_var_list,
                                                        stock_vars=factset_fields.stock_var_list,
                                                        verbose=True)

    df_quarterly_formatted = qml_ratios.format_quarterly_data(df_quarterly,
                                                              flow_vars=factset_fields.flow_var_list,
                                                              stock_vars=factset_fields.stock_var_list,
                                                              verbose=True)

    df_semi_annual_formatted = qml_ratios.format_semi_annual_data(df_semi_annual,
                                                                  flow_vars=factset_fields.flow_var_list,
                                                                  stock_vars=factset_fields.stock_var_list,
                                                                  verbose=True)

    df_merged = qml_ratios.merge_quarterly_semi_and_annual(quarterly=df_quarterly_formatted,
                                                           semi_annual=df_semi_annual_formatted,
                                                           annual=df_annual_formatted,
                                                           flow_vars=factset_fields.flow_var_list,
                                                           stock_vars=factset_fields.stock_var_list,
                                                           cleanup=True)

    # attach the assets-in-USD series to the merged fundamentals
    df_assets_in_usd_formatted = qml_ratios.format_assets_in_usd_data(data_annual=df_annual_assets_in_usd,
                                                                      data_semi_annual=df_semi_annual_assets_in_usd,
                                                                      data_quarterly=df_quarterly_assets_in_usd,
                                                                      cleanup=True)
    df_merged = df_merged.merge(df_assets_in_usd_formatted, on=['fsym_id', 'fiscal_end_date'], how='left')

    # construct ratios
    df = qml_ratios.build_qml_model_ratios(df_merged, verbose=True)

    # earnings volatility: computed separately on quarterly ('qf') and semi-annual ('saf') data;
    # the quarterly value is preferred and the semi-annual value only fills its gaps
    earnings_volatility_qf = qml_ratios.calculate_earnings_volatility(df_quarterly_formatted, freq='qf')
    earnings_volatility_saf = qml_ratios.calculate_earnings_volatility(df_semi_annual_formatted, freq='saf')
    df = df.merge(earnings_volatility_qf, on=['fsym_id', 'fiscal_end_date'], how='left')
    df = df.merge(earnings_volatility_saf, on=['fsym_id', 'fiscal_end_date'], how='left')
    for var in ['net_income_vol', 'ebitda_vol', 'ebit_vol', 'sales_vol']:
        df[var] = df[f'{var}_qf'].fillna(df[f'{var}_saf'])
    print('done')

    # persist a dated snapshot for the selected universe
    todays_date = datetime.datetime.now().strftime('%Y%m%d')
    df.to_csv(data_dir + f'qml_modeling_data/fundamental_dataset_{selected_universe}_{todays_date}.csv', index=False)

else:
    # Reload the most recent snapshot written by the build branch for the selected universe.
    # Snapshot names end in a YYYYMMDD stamp, so a lexicographic sort is chronological.
    modeling_dir = data_dir + 'qml_modeling_data/'
    snapshot_pattern = rf'fundamental_dataset_{re.escape(selected_universe)}_\d{{8}}\.csv'
    snapshots = sorted(name for name in os.listdir(modeling_dir) if re.fullmatch(snapshot_pattern, name))

    # fall back to the legacy (pre-universe-naming) snapshot when no dated snapshot exists
    snapshot_name = snapshots[-1] if snapshots else 'fundamental_dataset_20250115.csv'
    print('loading fundamental snapshot:', snapshot_name)

    df = pd.read_csv(modeling_dir + snapshot_name)
    df['fiscal_end_date'] = pd.to_datetime(df['fiscal_end_date'])


# 1. Consolidate equity data

In [None]:
# collect equity benchmark data
# (passed to qml_equity_ratios.build_equity_ratios in the next cell)
df_benchmarks = qml_equity_ratios.build_benchmark_data()

# build a set of dictionaries that help us map from fsym_id to benchmark index
# Lookup chain, going by the names: fsym_id -> country -> region -> benchmark.
# NOTE(review): the exact key/value formats are defined in qml_equity_ratios -- confirm there.
fsym_to_country_dict = qml_equity_ratios.build_fsym_to_country_dict()
country_to_region_dict = qml_equity_ratios.build_country_to_region_dict()
region_to_benchmark_dict = qml_equity_ratios.build_region_to_benchmark_dict()


In [None]:
# iterate over the fsyms and calculate the equity ratios. write a csv for each fsym with combined equity data/ratios

# Toggle: set to True to (re)compute equity ratios for fsyms that have no processed file yet.
build_from_source_files = False


def _fsyms_in_folder(folder, sep):
    """Return the fsym prefix (text before the first `sep`) of every visible file in `folder`.

    Hidden files (names starting with '.', e.g. macOS '.DS_Store') are skipped so they do not
    turn into bogus identifiers such as '' or '.DS'.
    """
    return [name.split(sep)[0] for name in os.listdir(folder) if not name.startswith('.')]


if build_from_source_files:

    equity_dir = data_dir + 'factset_data/factset_equity/'

    # excel add-in data (file names start with '<fsym>_')
    excel_fsyms = _fsyms_in_folder(equity_dir + 'excel_addin_download/', '_')
    print(len(excel_fsyms))

    # api data (file names are '<fsym>.<ext>')
    split_fsyms = _fsyms_in_folder(equity_dir + 'prices SPLIT/', '.')
    print(len(split_fsyms))

    # combined equity fsym universe
    equity_fsyms = list(set(excel_fsyms) | set(split_fsyms))
    print('total fsyms', len(equity_fsyms))

    # filter out fsyms that have already been processed
    completed_fsyms = _fsyms_in_folder(equity_dir + 'processed/', '.')
    completed_lookup = set(completed_fsyms)  # set membership instead of scanning the list per fsym

    equity_fsyms = [f for f in equity_fsyms if f not in completed_lookup]
    print('--remaining', len(equity_fsyms))

    # calculate equity ratios; results are written into the processed folder
    status = qml_equity_ratios.build_equity_ratios(equity_fsyms,
                                                   df_benchmarks,
                                                   fsym_to_country_dict,
                                                   country_to_region_dict,
                                                   region_to_benchmark_dict,
                                                   output_dir=equity_dir + 'processed/')

In [None]:
# collect the processed equity data for the selected universe and consolidate the results

# Toggle: True rebuilds the combined equity file from the per-fsym processed CSVs,
# False reloads the previously written combined file.
build_from_source_files = True
if build_from_source_files:

    processed_dir = data_dir + 'factset_data/factset_equity/processed/'

    # fsyms that have a processed file. Hidden files (e.g. macOS '.DS_Store') are skipped here
    # rather than removed afterwards with list.remove(''), which raises ValueError when no
    # such file is present.
    completed_fsyms = [name.split('.')[0] for name in os.listdir(processed_dir) if not name.startswith('.')]
    completed_lookup = set(completed_fsyms)

    # restrict to the selected universe, keeping only fsyms that have actually been processed
    fsym_list = [f for f in universe_dict[selected_universe] if f in completed_lookup]
    print('universe fsyms with processed equity data:', len(fsym_list))

    # read only the universe members, so the universe-named output file really is universe-specific
    collection = []
    for f in tqdm.tqdm(fsym_list):
        collection.append(pd.read_csv(processed_dir + f'{f}.csv'))

    df_equity_ratios = pd.concat(collection, axis=0)
    df_equity_ratios.to_csv(data_dir + f'factset_data/factset_consolidated/equity_ratios_combined_{selected_universe}.csv', index=False)

else:
    df_equity_ratios = pd.read_csv(data_dir + f'factset_data/factset_consolidated/equity_ratios_combined_{selected_universe}.csv')
    df_equity_ratios['date'] = pd.to_datetime(df_equity_ratios['date'])
    

# 2. Combine fundamental data/ratios with equity data/ratios

In [None]:
# combine fundamental data/ratios with equity data/ratios
#
# Steps:
#   1. load the fundamental dataset and derive a month-end report_date per observation
#   2. expand to a monthly (fsym_id, report_date) grid and forward-fill fundamentals into it
#   3. merge the equity ratios on (fsym_id, report_year, report_month)
#   4. compute market leverage

####################################################################
# fundamental data
# The snapshot date is pinned explicitly; the universe comes from selected_universe so the
# input always matches the universe used to label the outputs.
fundamental_snapshot_date = '20250212'
df = pd.read_csv(data_dir + f'qml_modeling_data/fundamental_dataset_{selected_universe}_{fundamental_snapshot_date}.csv')

for c in ['fiscal_end_date', 'eps_report_date_qf', 'eps_report_date_saf', 'eps_report_date_af']:
    df[c] = pd.to_datetime(df[c])

# construct report date: prefer quarterly, then semi-annual, then annual EPS report date
df['report_date'] = df['eps_report_date_qf'].fillna(df['eps_report_date_saf']).fillna(df['eps_report_date_af'])

# if no report date is available, use the fiscal date and offset by 90 days
mask = df['report_date'].isnull()
df.loc[mask, 'report_date'] = df.loc[mask, 'fiscal_end_date'] + pd.Timedelta(days=90)

# reset the report date to the last day of the month
df['report_date'] = df['report_date'] + pd.offsets.MonthEnd(0)
df['report_month'] = df['report_date'].dt.month
df['report_year'] = df['report_date'].dt.year

####################################################################
# convert from quarterly/semi-annual/annual observations to monthly observations
# first/last report date per fsym, one row per fsym
temp = df[['fsym_id', 'report_date']].copy()
temp['max_report_date'] = temp.groupby(by='fsym_id')['report_date'].transform('max')
temp['min_report_date'] = temp.groupby(by='fsym_id')['report_date'].transform('min')
temp = temp.drop_duplicates(subset='fsym_id', keep='first')
temp = temp[['fsym_id', 'min_report_date', 'max_report_date']]

print('build monthly template')
all_fsyms = temp['fsym_id'].unique()
all_dates = pd.date_range(start=df['report_date'].min(), end=df['report_date'].max(), freq='ME')

# every (fsym_id, month-end) combination, fsym-major / date-minor, built without a Python loop
fsym_date_df = (
    pd.MultiIndex.from_product([all_fsyms, all_dates], names=['fsym_id', 'report_date'])
    .to_frame(index=False)
)

fsym_date_df = fsym_date_df.merge(temp, on='fsym_id', how='left')

# trim the template to the range of report dates for each fsym
print('--pre-trim:', fsym_date_df.shape)
fsym_date_df = fsym_date_df[fsym_date_df['report_date'] >= fsym_date_df['min_report_date']]
fsym_date_df = fsym_date_df[fsym_date_df['report_date'] <= fsym_date_df['max_report_date']]
print('--post-trim:', fsym_date_df.shape)

df = fsym_date_df.merge(df, on=['fsym_id', 'report_date'], how='outer')
df = df.sort_values(by=['fsym_id', 'report_date'])

# fill forward the fundamental data into the monthly template
# (at most 14 consecutive months are filled after the last observation)
fund_cols = [c for c in df.columns]
print('fill forward fundamentals into monthly template')
for c in tqdm.tqdm(fund_cols):
    df[c] = df.groupby('fsym_id')[c].ffill(limit=14)

# recalc report year and month in case of gaps from forward fill
df['report_month'] = df['report_date'].dt.month
df['report_year'] = df['report_date'].dt.year

####################################################################
# merge equity data
print('merge equity data')
df_equity_ratios['date'] = pd.to_datetime(df_equity_ratios['date'])
df_equity_ratios['report_month'] = df_equity_ratios['date'].dt.month
df_equity_ratios['report_year'] = df_equity_ratios['date'].dt.year

# outer merge: months present on only one side (fundamentals or equity) are kept
df = df.merge(df_equity_ratios, on=['fsym_id', 'report_month', 'report_year'], how='outer')
df = df.sort_values(by=['fsym_id', 'report_year', 'report_month'])

########################################################
# market leverage - the only combination of fundamental and equity data
# market cap is in millions, ff_liabs is in millions
df['market_leverage'] = df['ff_liabs'] / (df['market_cap'] + df['ff_liabs'])


# 3. Add Descriptive Data and Bankruptcy Labels

In [None]:

# 1. get coverage data
df_coverage = pd.read_csv(data_dir + 'universe_and_traits/qml_universe_ids.csv')
df_coverage = df_coverage[['fsym_id', 'name1', 'name2', 'factset_econ_sector', 'factset_industry',
                           'entity_country_hq', 'exchange_country',
                           'max_assets_in_usd', 'factset_entity_id', 'ultimate_parent_id']]
df_coverage = df_coverage[df_coverage['fsym_id'] != '@NA']

print('coverage data shape:', df_coverage.shape)
print(df_coverage['factset_econ_sector'].value_counts())
print()

# merge company descriptive data
df = df.merge(df_coverage, on='fsym_id', how='left')

# 2. get company default data
df_defaults = pd.read_csv(data_dir + 'universe_and_traits/bankruptcy_data.csv')
df_defaults['bankruptcy_date'] = pd.to_datetime(df_defaults['bankruptcy_date'])
df_defaults = df_defaults[['fsym_id', 'bankruptcy_date']]

# keep only rows with a usable identifier and a bankruptcy date
df_defaults = df_defaults[df_defaults['bankruptcy_date'].notnull()]
df_defaults = df_defaults[df_defaults['fsym_id'] != '@NA']
df_defaults = df_defaults[df_defaults['fsym_id'] != '']
df_defaults = df_defaults[df_defaults['fsym_id'].notnull()]
validation = df_defaults.duplicated(subset='fsym_id', keep='first').sum()
if validation > 0:
    print('ALERT: bankruptcy duplicates found')
    print('bankruptcy duplicates:', validation)

# when an fsym has several bankruptcy records, keep the earliest date
df_defaults = df_defaults.sort_values(by=['fsym_id', 'bankruptcy_date'])
df_defaults = df_defaults.drop_duplicates(subset='fsym_id', keep='first')

df = df.merge(df_defaults, on='fsym_id', how='left')

# 3. drop financial companies (banks, insurance, finance)
# Finance-sector rows are dropped unless their industry is 'Real Estate Development'.
mask1 = df['factset_econ_sector'] == 'Finance'
mask2 = df['factset_industry'] != 'Real Estate Development'
df = df[~(mask1 & mask2)]

print('fsym_ids with bankruptcy:', df[df['bankruptcy_date'].notnull()]['fsym_id'].nunique())

# label forward defaults over 1,2,3,4,5 years
# For horizon i (years) and d = days from report_date to bankruptcy_date:
#    1  if 365*i - 182.5 <= d < 365*i + 182.5   (bankruptcy falls in the one-year window centred on i years ahead)
#   -1  if d < 365*i - 182.5                    (bankruptcy earlier than the window, including already past)
#    0  otherwise (later than the window, or no bankruptcy date / report date)
print('labeling defaults')
days_to_bankruptcy = (df['bankruptcy_date'] - df['report_date']).dt.days  # loop-invariant, computed once
half_window = 365*0.5
for i in tqdm.tqdm([1,2,3,4,5]):

    window_start = 365*i - half_window
    window_end = 365*i + half_window

    df[f'default_{i}'] = 0
    in_window = (days_to_bankruptcy >= window_start) & (days_to_bankruptcy < window_end)
    df.loc[in_window, f'default_{i}'] = 1

    # flag -1 defaults
    df.loc[days_to_bankruptcy < window_start, f'default_{i}'] = -1

print('write to csv')
todays_date = datetime.datetime.now().strftime('%Y%m%d')
df.to_csv(data_dir + f'qml_modeling_data/modeling_dataset_with_bankruptcy_labels_{selected_universe}_{todays_date}.csv', index=False)
print('done all')



In [None]:
# simple diagnostics
# Headline counts for the labelled modeling dataset; rows whose 1-year label is -1 are
# left out of the default-rate average.
print('fsym count:', df['fsym_id'].nunique())
print('default rate:', df.loc[df['default_1'] != -1, 'default_1'].mean())
print('defaulted fsym count:', df.loc[df['default_1'] == 1, 'fsym_id'].nunique())
print('total obs count:', len(df))


In [None]:
# company-specific diagnostics
# Plot a 3x3 grid of fundamental and equity series for a single fsym_id, then show its last rows.
temp = df[df['fsym_id'] == 'MH33D6-R'].copy()

# (x-axis column, plotted column) for each panel, listed row by row.
# Each title is taken from the plotted column so label and data cannot drift apart.
panels = [
    ('date', 'ff_assets_in_usd'),
    ('fiscal_end_date', 'net_income_to_sales'),
    ('fiscal_end_date', 'ebitda_to_interest_expense'),

    ('date', 'capm_idio_vol_365'),
    ('date', 'return_12'),
    ('date', 'ulcer_index_128'),

    ('date', 'price'),
    ('date', 'market_cap'),
    ('date', 'market_leverage'),
]

fig, ax = plt.subplots(figsize=(10, 12), ncols=3, nrows=3)
# ax.flat walks the axes grid row by row, matching the order of `panels`
for panel_ax, (x_col, y_col) in zip(ax.flat, panels):
    temp.set_index(x_col)[y_col].plot(ax=panel_ax, title=y_col)

plt.tight_layout()

temp[['fsym_id', 'fiscal_end_date', 'date', 'net_income_to_sales', 'capm_idio_vol_365']].tail(20)