In [None]:
%load_ext autoreload
%autoreload 2

import requests
import json
import pandas as pd
import numpy as np
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
import time
import datetime
import re
import tqdm
import os
import boto3
from Py_Files import credentials
from Py_Files import factset_api
from Py_Files import factset_fields
from Py_Files import qml_ratios

# Root of the local data tree. Defaults to the original workstation path; set the
# QML_DATA_DIR environment variable to run this notebook on another machine.
# os.path.join(..., '') guarantees a trailing separator, which matters because
# downstream paths are built by plain string concatenation (data_dir + '...').
data_dir = os.path.join(
    os.environ.get('QML_DATA_DIR', '/Users/joeybortfeld/Documents/QML Solutions Data/'),
    '',
)
# S3 prefix for the research data (not referenced by the cells visible here).
s3_dir = 's3://qml-research-data/'

# 0. Consolidate all data into a single dataframe 

In [44]:
# Toggle: True rebuilds the consolidated CSVs from the per-file source data and
# writes them to disk; False loads the previously written consolidated CSVs.
build_from_source_files = False

# Directory layout under data_dir, spelled once so that the paths written in the
# build branch and the paths read in the load branch cannot drift apart.
fundamentals_dir = data_dir + 'factset_data/factset_fundamentals/'
consolidated_dir = data_dir + 'factset_data/factset_consolidated/'

if build_from_source_files:

    print('building from source files')

    # --- Fundamentals: consolidate, preprocess and persist each reporting frequency ---
    df_annual, error_list_annual = qml_ratios.consolidate_local_data(fundamentals_dir + 'annual/')
    df_annual = qml_ratios.preprocess_factset_fundamentals(df_annual, verbose=True)
    # FIX: this path used to start with a stray space (' factset_data/...'), so the
    # annual file was written to a different location than the one the load branch
    # reads ('factset_data/factset_consolidated/annual_fundamentals_combined.csv').
    df_annual.to_csv(consolidated_dir + 'annual_fundamentals_combined.csv', index=False)

    df_quarterly, error_list_quarterly = qml_ratios.consolidate_local_data(fundamentals_dir + 'quarterly/')
    df_quarterly = qml_ratios.preprocess_factset_fundamentals(df_quarterly, verbose=True)
    df_quarterly.to_csv(consolidated_dir + 'quarterly_fundamentals_combined.csv', index=False)

    df_semi_annual, error_list_semi_annual = qml_ratios.consolidate_local_data(fundamentals_dir + 'semi_annual/')
    df_semi_annual = qml_ratios.preprocess_factset_fundamentals(df_semi_annual, verbose=True)
    df_semi_annual.to_csv(consolidated_dir + 'semi_annual_fundamentals_combined.csv', index=False)

    # check for any columns that are not in the flow or stock variable lists
    # (the combined collection is built once instead of once per column;
    #  assumes the field lists hold hashable column names -- TODO confirm)
    expected_columns = set(factset_fields.flow_var_list + factset_fields.stock_var_list)
    temp = [c for c in df_annual.columns if c not in expected_columns]
    print('data validation:')
    print('unexpected columns:', temp)
    print()

    # COLLECT ASSETS IN USD DATA

    # Annual Assets in USD
    # iterate specifically over the set of fsym_ids in the fundamentals data
    # (we downloaded all 100k fsyms for assets in USD, but we may only have downloaded a subset of all fsyms for the full fundamental data)
    # NOTE(review): file_list below points at the *fundamentals* directory, not at a
    # separate assets-in-USD download directory as the comment above suggests --
    # confirm the intended source directory. Left unchanged here.
    # NOTE(review): error_list_annual is reassigned below, replacing the error list
    # returned by the fundamentals consolidation above.
    fsym_list = os.listdir(fundamentals_dir + 'annual/')
    file_list = [fundamentals_dir + 'annual/' + f for f in fsym_list]
    df_annual_assets_in_usd, error_list_annual = qml_ratios.consolidate_selected_files(file_list)

    print('annual error count:', len(error_list_annual))
    df_annual_assets_in_usd = qml_ratios.preprocess_factset_fundamentals(df_annual_assets_in_usd, verbose=True)
    df_annual_assets_in_usd.to_csv(consolidated_dir + 'annual_assets_in_usd.csv', index=False)

    # Semi-Annual Assets in USD (same NOTE(review) about the source directory applies)
    fsym_list = os.listdir(fundamentals_dir + 'semi_annual/')
    file_list = [fundamentals_dir + 'semi_annual/' + f for f in fsym_list]
    df_semi_annual_assets_in_usd, error_list_semi_annual = qml_ratios.consolidate_selected_files(file_list)

    print('semi-annual error count:', len(error_list_semi_annual))
    df_semi_annual_assets_in_usd = qml_ratios.preprocess_factset_fundamentals(df_semi_annual_assets_in_usd, verbose=True)
    df_semi_annual_assets_in_usd.to_csv(consolidated_dir + 'semi_annual_assets_in_usd.csv', index=False)

else:

    # Load the consolidated CSVs written by a previous run of the build branch.
    df_annual = pd.read_csv(consolidated_dir + 'annual_fundamentals_combined.csv')
    df_quarterly = pd.read_csv(consolidated_dir + 'quarterly_fundamentals_combined.csv')
    df_semi_annual = pd.read_csv(consolidated_dir + 'semi_annual_fundamentals_combined.csv')
    df_annual_assets_in_usd = pd.read_csv(consolidated_dir + 'annual_assets_in_usd.csv')
    df_semi_annual_assets_in_usd = pd.read_csv(consolidated_dir + 'semi_annual_assets_in_usd.csv')


In [None]:
# Every formatter and the frequency merge take the same flow/stock variable
# lists, so they are bundled once and unpacked into each call.
var_lists = {
    'flow_vars': factset_fields.flow_var_list,
    'stock_vars': factset_fields.stock_var_list,
}
# Key columns used for every left-merge in this cell.
merge_keys = ['fsym_id', 'fiscal_end_date']

# format each reporting frequency
df_annual_formatted = qml_ratios.format_annual_data(df_annual, verbose=True, **var_lists)
df_quarterly_formatted = qml_ratios.format_quarterly_data(df_quarterly, verbose=True, **var_lists)
df_semi_annual_formatted = qml_ratios.format_semi_annual_data(df_semi_annual, verbose=True, **var_lists)

# combine the three formatted frequencies into one frame
df_merged = qml_ratios.merge_quarterly_semi_and_annual(
    quarterly=df_quarterly_formatted,
    semi_annual=df_semi_annual_formatted,
    annual=df_annual_formatted,
    cleanup=True,
    **var_lists,
)

# attach the assets-in-USD data
df_assets_in_usd_formatted = qml_ratios.format_assets_in_usd_data(
    data_annual=df_annual_assets_in_usd,
    data_semi_annual=df_semi_annual_assets_in_usd,
    cleanup=True,
)
df_merged = df_merged.merge(df_assets_in_usd_formatted, on=merge_keys, how='left')

# construct ratios
df = qml_ratios.build_qml_model_ratios(df_merged, verbose=True)

# earnings volatility is computed separately from the quarterly ('qf') and
# semi-annual ('saf') frames, then both results are left-merged onto the ratios
earnings_volatility_qf = qml_ratios.calculate_earnings_volatility(df_quarterly_formatted, freq='qf')
earnings_volatility_saf = qml_ratios.calculate_earnings_volatility(df_semi_annual_formatted, freq='saf')
df = df.merge(earnings_volatility_qf, on=merge_keys, how='left')
df = df.merge(earnings_volatility_saf, on=merge_keys, how='left')

# take the '_qf' value where present and fall back to the '_saf' value otherwise
for var in ('net_income_vol', 'ebitda_vol', 'ebit_vol', 'sales_vol'):
    qf_values = df[f'{var}_qf']
    saf_values = df[f'{var}_saf']
    df[var] = qf_values.fillna(saf_values)

print('done')


In [64]:
# Write the dataset to CSV without the index column; the date stamp is part of
# the file name.
output_path = data_dir + 'qml_modeling_data/fundamental_dataset_20250115.csv'
df.to_csv(output_path, index=False)