In [1]:
%load_ext autoreload
%autoreload 2

import requests
import json
import pandas as pd
import numpy as np
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
import time
import datetime
import re
import tqdm
import os
import boto3
from Py_Files import credentials
from Py_Files import factset_api
from Py_Files import factset_fields

# Root locations for research data: a local directory and an S3 prefix.
# NOTE(review): neither name is referenced again in the visible cells, which
# hard-code their own absolute paths — confirm whether these are still needed.
data_dir = '/Users/joeybortfeld/Documents/QML Solutions Data/'
s3_dir = 's3://qml-research-data/'

# 0. Load the Factset Universe (All Fsym IDS) into Dictionary

In [None]:
# Read the universe id file from the local Downloads folder.
factset_universe = pd.read_csv('/Users/joeybortfeld/Downloads/qml_universe_ids.csv')

# Build the lookup that the download cells below index with keys such as
# '$1B' and 'full'.
universe_dict = factset_api.load_universe_dict(factset_universe)

# 1. Download Assets in USD using the Factset Fundamentals API

In [None]:
# Download FF_ASSETS in USD for every fsym_id in the '$1B' universe.
# periodicity_list carries three separate entries; adjacent string literals
# with no comma between them would be joined by Python into a single value.
# The return value is kept as `error_list` (presumably the ids that failed —
# confirm against factset_api.batch_fundamental_download).
error_list = factset_api.batch_fundamental_download(fsym_list=universe_dict['$1B'],
                               field_list=['FF_ASSETS'],
                               currency='USD',
                               periodicity_list=['annual', 'quarterly', 'semi_annual'],
                               start_date='1990-01-01',
                               end_date='2024-12-31',
                               skip_if_done=True,
                               output_folder='/Users/joeybortfeld/Documents/CreditGradients Data/Factset Data/factset_assets_in_usd/',
                               factset_api_authorization=credentials.factset_api_authorization)
    

# 2. Download All Metrics in Local Currency using the Factset Fundamentals API

In [None]:
# Download every field in factset_fields.fundamental_fields, in local currency,
# for the full universe.
# periodicity_list carries three separate entries; adjacent string literals
# with no comma between them would be joined by Python into a single value.
error_list = factset_api.batch_fundamental_download(fsym_list=universe_dict['full'],
                               field_list=factset_fields.fundamental_fields,
                               currency='LOCAL',
                               periodicity_list=['annual', 'quarterly', 'semi_annual'],
                               start_date='1990-01-01',
                               end_date='2024-12-31',
                               skip_if_done=True,
                               output_folder='/Users/joeybortfeld/Documents/CreditGradients Data/Factset Data/factset_fundamentals/',
                               factset_api_authorization=credentials.factset_api_authorization)
    

# 3. Review Downloaded Data on Local Storage and Upload to S3

In [None]:
# List how many files each local download folder currently holds.
# NOTE(review): there is no 'factset_assets_in_usd/quarterly/' entry — confirm
# whether that folder is expected to exist.

folder_list = [
    '/Users/joeybortfeld/Documents/CreditGradients Data/Factset Data/factset_fundamentals/annual/',
    '/Users/joeybortfeld/Documents/CreditGradients Data/Factset Data/factset_fundamentals/quarterly/',
    '/Users/joeybortfeld/Documents/CreditGradients Data/Factset Data/factset_fundamentals/semi_annual/',

    '/Users/joeybortfeld/Documents/CreditGradients Data/Factset Data/factset_assets_in_usd/annual/',
    '/Users/joeybortfeld/Documents/CreditGradients Data/Factset Data/factset_assets_in_usd/semi_annual/',
]

for this_folder in folder_list:
    file_list = os.listdir(this_folder)

    # one line per folder: path followed by the number of directory entries
    print(this_folder, len(file_list))

    # Disabled upload, kept for reference only (the live upload loop is in the next cell):
    # for this_file in file_list:
    #     aws_s3.copy_file_to_s3(local_file_path=this_folder + this_file, 
    #                             s3_bucket='qml-solutions-new-york', 
    #                             s3_key='factset-api-fundamentals/', 
    #                             aws_access_key_id=credentials.aws_access_key_id, 
    #                             aws_secret_access_key=credentials.aws_secret_access_key,
    #                             verbose=True)
    


In [None]:
# transfer local files to s3
#
# NOTE(review): `aws_s3` is not imported in any visible cell, so this cell raises
# NameError on a fresh kernel — confirm which module provides copy_file_to_s3.
# NOTE(review): s3_key below is a redacted placeholder and every file from every
# folder is sent with the same key — confirm the intended key per folder.


folder_list = [
    '/Users/joeybortfeld/Documents/CreditGradients Data/Factset Data/factset_fundamentals/annual/',
    '/Users/joeybortfeld/Documents/CreditGradients Data/Factset Data/factset_fundamentals/quarterly/',
    '/Users/joeybortfeld/Documents/CreditGradients Data/Factset Data/factset_fundamentals/semi_annual/',

    '/Users/joeybortfeld/Documents/CreditGradients Data/Factset Data/factset_assets_in_usd/annual/',
    '/Users/joeybortfeld/Documents/CreditGradients Data/Factset Data/factset_assets_in_usd/semi_annual/',
]

for this_folder in folder_list:
    file_list = os.listdir(this_folder)
    
    # report the folder size before starting its upload
    print(this_folder, len(file_list))

    # upload file by file; folder paths end in '/', so plain concatenation yields the full path
    for this_file in tqdm.tqdm(file_list):
        aws_s3.copy_file_to_s3(local_file_path=this_folder + this_file, 
                                s3_bucket='qml-solutions-new-york', 
                                s3_key='XXXXXXXXXXXXXXX',
                                aws_access_key_id=credentials.aws_access_key_id, 
                                aws_secret_access_key=credentials.aws_secret_access_key,
                                verbose=True)
        


# 0. Get the Factset Universe

In [None]:
# Optional one-off rebuild of the per-fsym_id maximum of the `value` column
# across the per-company USD asset files. Off by default because it reads every
# file in the folder.
build_max_assets_in_usd = False

if build_max_assets_in_usd:

    # ASSEMBLE ASSETS IN USD ACROSS THE ENTIRE FSYM_ID UNIVERSE
    output_folder = '/Users/joeybortfeld/Documents/CreditGradients Data/factset_api_fundamentals_annual_assets_in_usd/'
    output_files = os.listdir(output_folder)
    fsym_id_list = [f.replace('.csv', '') for f in output_files]
    print('file count:', len(fsym_id_list))

    # NOTE(review): a later cell reads 'fundamentals_df_combined.csv' from this same
    # folder; if it is present it is treated here as one more fsym_id — confirm.
    results = []
    for fsym_id in tqdm.tqdm(fsym_id_list):

        # skip hidden files (names beginning with '.'); startswith is also safe on an empty name
        if fsym_id.startswith('.'):
            print('error:', fsym_id)
            continue

        df = pd.read_csv(output_folder + f'{fsym_id}.csv')
        results.append([fsym_id, df['value'].max()])

    # one row per file: the id and the largest reported value
    max_assets_in_usd_df = pd.DataFrame(results, columns=['fsym_id', 'max_assets_in_usd'])
    max_assets_in_usd_df.to_csv('/Users/joeybortfeld/Downloads/max_assets_in_usd_by_fsym_id.csv', index=False)


In [None]:
# Get the full factset universe of companies (primary issues) and the S&P500 subset
#
# This cell produces `factset_universe` (renamed, sorted, with an `sp500` flag),
# `universe_dict` (label -> list of fsym_ids) and `full_list`.

# the first 4 rows of the sheet are skipped before the header row is read
factset_universe = pd.read_excel('/Users/joeybortfeld/Documents/CreditGradients Data/factset_primary_issue_universe.xlsx', skiprows=4)

# Map spreadsheet headers to internal column names.
# NOTE(review): 'factst_industry' and 'Exchange Couuntry Name' look misspelled.
# rename() leaves unmatched keys untouched by default, so a key that does not
# match the sheet header is silently skipped — confirm against the spreadsheet.
rename_dict = {'Sec is Primary Issue': 'ff_iscomp',
               'Company': 'ff_co_name',
               'FactSet Econ Sector': 'factset_econ_sector',
               'FactSet Ind': 'factst_industry',
               'Bus Desc': 'ff_bus_desc_ext',
               'Entity ID': 'factset_entity_id',
               'Entity Country HQ': 'factset_hq_country',
               'Perm. Sec. ID': 'fsym_id',
               'Max Assets (USD)': 'max_assets_in_usd',
               'Exchange Couuntry Name': 'exchange_country',
               'Exchange Ticker': 'exchange_ticker',
               'Ult Parent ID': 'ultimate_parent_id',
               'Primary Equity Listing': 'primary_equity_listing',
               'P_SYMBOL': 'ticker'}

factset_universe = factset_universe.rename(columns=rename_dict)
# drop rows whose security id is the '@NA' placeholder
factset_universe = factset_universe[factset_universe['fsym_id'] != '@NA']

# sort largest to smallest by total assets (USD)
factset_universe = factset_universe.sort_values(by='max_assets_in_usd', ascending=False)
factset_universe = factset_universe.reset_index(drop=True)

print('Total primary issue universe size:', len(factset_universe))

# The labels paired with these cut-offs imply max_assets_in_usd is expressed in
# USD millions — TODO confirm the unit of the source column.
print('--count with max assets > $10B:', len(factset_universe[factset_universe['max_assets_in_usd'] > 10_000]))
print('--count with max assets > $5B:', len(factset_universe[factset_universe['max_assets_in_usd'] > 5_000]))
print('--count with max assets > $1B:', len(factset_universe[factset_universe['max_assets_in_usd'] > 1_000]))
print('--count with max assets > $500M:', len(factset_universe[factset_universe['max_assets_in_usd'] > 500]))
print('--count with max assets > $100M:', len(factset_universe[factset_universe['max_assets_in_usd'] > 100]))

print()

################################################################################
# get the S&P 500
sp500_universe = pd.read_csv('/Users/joeybortfeld/Documents/CreditGradients Data/sp500_constituents.csv')
sp500_universe['sp500'] = 1
sp500_universe = sp500_universe.rename(columns={'Symbol': 'ticker'})
sp500_universe = sp500_universe[['ticker', 'sp500']]

# merge the two
# Left join keeps every universe row; unmatched rows get sp500 = 0 below.
# NOTE(review): the join key is the bare ticker. If a ticker is repeated in
# either frame, rows are duplicated or flagged unintentionally — confirm uniqueness.
factset_universe = factset_universe.merge(sp500_universe, on='ticker', how='left')
factset_universe['sp500'] = factset_universe['sp500'].fillna(0).astype(int)
print('--sp500 count:', len(factset_universe[factset_universe['sp500'] == 1]))

################################################################################
# BUILD THE UNIVERSE SET AT VARIOUS ASSET THRESHOLDS
universe_dict = {}

# save the sp500 fsym_ids as a list
# (sorted but not de-duplicated, unlike the lists built further down)
temp = factset_universe[factset_universe['sp500'] == 1].copy()
temp = temp[temp['fsym_id'] != '@NA']
sp500_list = list(temp['fsym_id'])
sp500_list = sorted(sp500_list)
universe_dict['sp500'] = sp500_list

# save the full universe fsym_ids as a list
# (unique values in the current row order, i.e. largest max assets first)
temp = factset_universe.copy()
temp = temp[temp['fsym_id'] != '@NA']
full_list = list(temp['fsym_id'].unique())
universe_dict['full'] = full_list

# save fsym_id universe at various asset thresholds
# each tuple is (dictionary label, strict lower bound on max_assets_in_usd)
for this_threshold in [('$10B', 10_000), ('$5B', 5_000), ('$1B', 1_000), ('$500M', 500), ('$250M', 250), ('$100M', 100)]:
    temp = factset_universe[factset_universe['max_assets_in_usd'] > this_threshold[1]].copy()
    temp = temp[temp['fsym_id'] != '@NA']
    temp = temp['fsym_id'].unique()
    temp = sorted(temp)
    universe_dict[this_threshold[0]] = temp

# size report; printed before the 'semi_annual' entry is added below
for k,v in universe_dict.items():
    print(k,':', len(v))

# save the semi-annual fsym_ids
# (this file is written by the semi-annual branch of the "DOWNLOAD ASSETS IN USD" cell)
df_semi_annual = pd.read_csv('/Users/joeybortfeld/Documents/CreditGradients Data/factset_semi_annual_fsym_id_list.csv')
universe_dict['semi_annual'] = list(df_semi_annual['fsym_id'])

# factset_universe.to_csv('/Users/joeybortfeld/Documents/CreditGradients Data/factset_universe_processed.csv', index=False)

print('done all')

# 1. Get the list of metrics in the Fundamentals API

In [None]:
# Pull the catalogue of available metrics from the Fundamentals API, one request
# per category, and stack the results into a single frame tagged with the category.
metric_categories = [
    'INCOME_STATEMENT',
    'BALANCE_SHEET',
    'CASH_FLOW',
    'PENSION_AND_POSTRETIREMENT',
    'MARKET_DATA',
    'MISCELLANEOUS',
    'DATES'
]
metrics_endpoint = 'https://api.factset.com/content/factset-fundamentals/v2/metrics'

# NOTE(review): `authorization` is not defined in any visible cell — confirm it is
# the same credential as credentials.factset_api_authorization.
# NOTE(review): verify=False disables TLS certificate checks for these requests.
headers = {'Accept': 'application/json','Content-Type': 'application/json'}

collection = []
for this_category in metric_categories:

    print(this_category)

    # This is a GET request, so the category filter is sent as a query-string
    # parameter rather than as a JSON body.
    metrics_request = {"category": this_category}
    metrics_response = requests.get(url=metrics_endpoint,
                                    params=metrics_request,
                                    auth=authorization,
                                    headers=headers,
                                    verify=False)
    print('HTTP Status: {}'.format(metrics_response.status_code))

    # an error payload has no usable 'data' list; report it and move on
    if metrics_response.status_code != 200:
        print('--skipping category, response was:', metrics_response.text)
        continue

    # create a dataframe from the response, show its shape
    metrics_data = json.loads(metrics_response.text)
    metrics_df = pd.DataFrame(metrics_data['data'])
    metrics_df['metric_category'] = this_category
    print(metrics_df.shape)

    collection.append(metrics_df)

# pd.concat raises on an empty list, so fall back to an empty frame if every request failed
metrics_df = pd.concat(collection, ignore_index=True) if collection else pd.DataFrame()
# metrics_df.to_csv('/Users/joeybortfeld/Documents/CreditGradients Data/all_fundamental_api_metrics_list.csv')
metrics_df


# 2. Fundamentals API Download - Assets in USD

# 3. Fundamentals API Download - All Metrics in Local Currency


In [None]:
# Download FF_ASSETS for the full universe.
# periodicity_list carries three separate entries; adjacent string literals
# with no comma between them would be joined by Python into a single value.
# NOTE(review): currency is 'LOCAL' but the output folder is 'factset_assets_in_usd/',
# the same folder the USD download writes to — confirm which of the two is intended.
error_list = factset_api.batch_fundamental_download(fsym_list=universe_dict['full'],
                               field_list=['FF_ASSETS'],
                               currency='LOCAL',
                               periodicity_list=['annual', 'quarterly', 'semi_annual'],
                               start_date='1990-01-01',
                               end_date='2024-12-31',
                               skip_if_done=True,
                               output_folder='/Users/joeybortfeld/Documents/CreditGradients Data/Factset Data/factset_assets_in_usd/',
                               factset_api_authorization=credentials.factset_api_authorization)
    

In [None]:
# INDIVIDUAL FSYM_ID DOWNLOAD METHOD
#
# Downloads fundamentals one fsym_id at a time and writes one CSV per id.
# NOTE(review): `utilities`, `fundamentals_var_list` and `authorization` are not
# defined in any visible cell — confirm where they come from.

# download parameters
# first element = periodicity code passed to the API; the second element is not used in this cell
download_type_dict = {'annual': ['ANN', 20], 
                      'quarterly': ['QTR', 10],
                      'semi_annual': ['SEMI', 15]}

download_type = 'semi_annual'
output_folder = f'/Users/joeybortfeld/Documents/CreditGradients Data/factset_api_fundamentals_{download_type}/'
start_date = '1990-01-01'
end_date = '2024-12-31'
skip_if_done = True

################################################################################
# define the company set
# Filter missing ids into a NEW list so the shared universe_dict entry is not mutated.
company_set = [e for e in universe_dict['$1B'] if pd.notna(e)]

if download_type == 'semi_annual':
    # set membership keeps the filter linear in the universe size
    semi_annual_set = set(universe_dict['semi_annual'])
    company_set = [e for e in company_set if e in semi_annual_set]

print('company set size:', len(company_set))

# prescreen to see if the fsym_id is already in the output folder
if skip_if_done:
    print('--skipping files that are already done')
    output_files = os.listdir(output_folder)
    print('--file count in output folder:', len(output_files))
    done_ids = {e.replace('.csv', '') for e in output_files}
    company_set = [e for e in company_set if e not in done_ids]

    print('--new company set size when skipping:', len(company_set))
################################################################################


# loop through fsym_ids
error_list = []
collection = []
start_time = time.time()
for this_fsym in tqdm.tqdm(company_set):

    try:
        result = utilities.download_fundamentals(id_list=[this_fsym],
                                    field_list=fundamentals_var_list,
                                    # field_list=['FF_ASSETS',],
                                    periodicity=download_type_dict[download_type][0],
                                    start_date=start_date,
                                    end_date=end_date,
                                    currency='LOCAL',
                                    verbose=False,
                                    authorization=authorization)
        response_code, fundamentals_df = result

        if response_code != 200:
            error_list.append(this_fsym)
            print('error:', this_fsym)
        else:

            fundamentals_df.to_csv(output_folder + f'{this_fsym}.csv', index=False)
            collection.append(fundamentals_df)

    # Record the failure and keep going. `Exception` (not a bare except) lets
    # Ctrl-C / kernel interrupt stop a long download.
    except Exception as exc:
        error_list.append(this_fsym)
        print('error:', this_fsym, repr(exc))

print('done in {}m'.format((time.time() - start_time) / 60))

if len(error_list) > 0:
    print('errors in download:', len(error_list))

else:
    print('No errors')


In [None]:
# DOWNLOAD ASSETS IN USD
#
# Downloads FF_ASSETS in USD in batches of 30 ids, writes one CSV per batch plus a
# combined CSV, and (for semi-annual runs) saves the list of ids that reported data.
# NOTE(review): `batch_a_list`, `utilities` and `authorization` are not defined in
# any visible cell; `full_list` comes from the universe-building cell.


# first element = periodicity code passed to the API; the second element is not used in this cell
download_type_dict = {'annual': ['ANN', 20], 
                      'quarterly': ['QTR', 10],
                      'semi_annual': ['SEMI', 15]}

download_type = 'semi_annual'
start_date = '1990-01-01'
end_date = '2024-12-31'

id_batches = batch_a_list(full_list, 30)
error_list = []
collection = []
start_time = time.time()
print('batch count:', len(id_batches))
for batch_index, this_batch in enumerate(id_batches):

    # progress line every 250 batches (0-based index)
    if batch_index % 250 == 0:
        print(batch_index, datetime.datetime.now().strftime("%H:%M:%S"))

    # batch files and error entries are numbered from 1
    counter = batch_index + 1

    result = utilities.download_fundamentals(id_list=this_batch,
                                    field_list=['FF_ASSETS'],
                                    periodicity=download_type_dict[download_type][0],
                                    start_date=start_date,
                                    end_date=end_date,
                                    currency='USD',
                                    verbose=False,
                                    authorization=authorization)
    response_code, fundamentals_df = result

    if response_code != 200:
        error_list.append([counter,this_batch])
    else:
        fundamentals_df.to_csv(f'/Users/joeybortfeld/Documents/CreditGradients Data/factset_api_fundamentals_{download_type}_assets_in_usd/fundamentals_df_{counter}.csv', index=False)
        collection.append(fundamentals_df)

print('done in {}m'.format((time.time() - start_time) / 60))
if len(error_list) > 0:
    print('errors in download:')
    print(error_list)
else:
    print('No errors')

# combine results into one dataframe
# (pd.concat raises on an empty list, so only combine when something was downloaded)
if collection:
    df = pd.concat(collection, ignore_index=True)
    df.to_csv(f'/Users/joeybortfeld/Documents/CreditGradients Data/factset_api_fundamentals_{download_type}_assets_in_usd/fundamentals_df_combined.csv', index=False)

    if download_type == 'semi_annual':

        # save fsym_ids that report semi-annual data:
        # ids with at least one non-null value, one row per id
        temp = df[df['value'].notnull()]
        temp = temp[temp['fsymId'] != '@NA']
        temp = temp.drop_duplicates(subset=['fsymId'])
        temp = temp[['fsymId']]
        temp = temp.reset_index(drop=True)
        temp.columns=['fsym_id']
        temp.to_csv('/Users/joeybortfeld/Documents/CreditGradients Data/factset_semi_annual_fsym_id_list.csv', index=False)
else:
    print('no batches were downloaded; nothing to combine')


In [None]:
# DATA PROCESSING
#
# Loads the combined annual / quarterly / semi-annual downloads, preprocesses and
# formats each, merges the three and builds model ratios into `df`.
# NOTE(review): `utilities` is not imported in any visible cell — confirm the module.

# load and preprocess each periodicity from its combined CSV
df_annual = pd.read_csv('/Users/joeybortfeld/Documents/CreditGradients Data/factset_api_fundamentals_annual/fundamentals_df_combined.csv')
df_annual = utilities.preprocess_factset_fundamentals(df_annual, verbose=True)

df_quarterly = pd.read_csv('/Users/joeybortfeld/Documents/CreditGradients Data/factset_api_fundamentals_quarterly/fundamentals_df_combined.csv')
df_quarterly = utilities.preprocess_factset_fundamentals(df_quarterly, verbose=True)

df_semi_annual = pd.read_csv('/Users/joeybortfeld/Documents/CreditGradients Data/factset_api_fundamentals_semi_annual/fundamentals_df_combined.csv')
df_semi_annual = utilities.preprocess_factset_fundamentals(df_semi_annual, verbose=True)

# check for any columns that are not in the flow or stock variable lists
# (only the annual frame is inspected; the result is printed, not enforced)
temp = [c for c in df_annual.columns if c not in utilities.flow_var_list + utilities.stock_var_list]    
print('data validation:')
print('non flow/stock vars:', temp)
print()

# format each periodicity with the same flow/stock variable lists
df_annual_formatted = utilities.format_annual_data(df_annual, 
                                         flow_vars=utilities.flow_var_list, 
                                         stock_vars=utilities.stock_var_list, 
                                         verbose=True)

df_quarterly_formatted = utilities.format_quarterly_data(df_quarterly, 
                                              flow_vars=utilities.flow_var_list, 
                                              stock_vars=utilities.stock_var_list, 
                                              verbose=True) 

df_semi_annual_formatted = utilities.format_semi_annual_data(df_semi_annual, 
                                              flow_vars=utilities.flow_var_list, 
                                              stock_vars=utilities.stock_var_list, 
                                              verbose=True) 

# combine the three formatted frames into one
df_merged = utilities.merge_quarterly_semi_and_annual(quarterly=df_quarterly_formatted, 
                                             semi_annual=df_semi_annual_formatted, 
                                             annual=df_annual_formatted, 
                                             flow_vars=utilities.flow_var_list, 
                                             stock_vars=utilities.stock_var_list, 
                                             cleanup=True)


# construct ratios
df = utilities.build_qml_model_ratios(df_merged, verbose=True)

print('done')


In [203]:
# Collapse the raw default list to one row per unique
# (description, defaulttype1, defaultdate) and attach the identifiers seen for it.
default_key = ['description', 'defaulttype1', 'defaultdate']

temp = pd.read_csv('/Users/joeybortfeld/Downloads/Default list.csv')

# collect all isins (plus cusips and tickers) for each default, as sets
temp1 = (
    temp.groupby(default_key)[['isin', 'cusip', 'ticker' ]]
    .agg(lambda x: set(x))
    .reset_index()
)

# One row per default, ordered by each description's earliest default date.
# NOTE(review): 'defaultdate' is not parsed to a datetime here, so 'min' and the
# sort compare the raw column values — confirm the date format sorts correctly.
temp = (
    temp.drop_duplicates(subset=default_key)
    .assign(first_default_date=lambda frame: frame.groupby(['description'])['defaultdate'].transform('min'))
    .sort_values(by=['first_default_date', 'description'])
    .reset_index(drop=True)
)

# keep the descriptive columns (the helper sort column is dropped) and join the identifier sets
temp = (
    temp[['description', 'defaulttype1', 'defaultdate', 'SectorLevel3', 'SectorLevel4', 'Commodity Sector']]
    .merge(temp1, on=default_key, how='left')
)

temp.to_csv('/Users/joeybortfeld/Documents/CreditGradients Data/default_list_unique.csv', index=False)


In [None]:
# Download company profile data
#
# NOTE(review): as written this is a RETRY pass. The loop iterates `error_list`
# (left over from an earlier cell), not `company_set`, and appends to a
# `collection` that must already exist because its reset is commented out.
# On a fresh kernel this cell fails with NameError.
# NOTE(review): `utilities` and `authorization` are not defined in any visible cell.

company_set = universe_dict['full']
# NOTE(review): the np.NaN alias was removed in NumPy 2.0 (np.nan works everywhere),
# and .remove() mutates the list stored in universe_dict in place.
if np.NaN in company_set:
    company_set.remove(np.NaN)

# error_list = []
# collection = []
error_list2 = []
for this_fsym in tqdm.tqdm(error_list):

    # one request per id; non-200 responses are queued in error_list2
    result = utilities.download_company_profile(id_list=[this_fsym,], authorization=authorization, verbose=False)
    response_code, df = result
    if response_code != 200:
        error_list2.append(this_fsym)
    else:
        collection.append(df)

if len(error_list2) > 0:
    print('errors in download:', len(error_list2))

else:
    print('No errors')

    # the combined file is only written when this pass finished without errors
    df = pd.concat(collection, ignore_index=True)
    df.to_csv('/Users/joeybortfeld/Documents/CreditGradients Data/factset_company_profiles.csv', index=False)



In [None]:
# Size of `error_list` as left by whichever download cell ran last (the profile
# cell above records its own failures in `error_list2`, not here).
print('errors in download:', len(error_list))

In [None]:
# Download fundamentals descriptive data
#
# Requests each descriptive field separately for two sample tickers and stacks the
# pivoted responses.
# NOTE(review): `authorization` is not defined in any visible cell.

descriptive_var_list = [
    'FF_CO_NAME',

    'FF_COUNTRY',

    'FF_BUS_DESC_ABBREV',
    'FF_BUS_DESC_EXT',
    
    'FF_ISCOMP',
    'FF_SECACT',
    'FF_ACQ_DATE',

    'FF_GEN_IND', 
    'FF_MAJOR_IND',
    'FF_MAJOR_SUBIND',

  'FF_MAJOR_IND_NAME',
  'FF_MAJOR_SUBIND_NAME']
collection = []
# each metric is wrapped in its own single-element list, i.e. one request per metric
for this_metric in [[e] for e in descriptive_var_list]:

  fundamentals_endpoint = 'https://api.factset.com/content/factset-fundamentals/v2/fundamentals'
  fundamentals_request_2={
    "data": {
      "ids": ['AAPL-US', 'MSFT-US'],
      "metrics": this_metric
    }
  }
  headers = {'Accept': 'application/json','Content-Type': 'application/json'}

  #create a post request
  fundamentals_post = json.dumps(fundamentals_request_2)
  # print('POST request:')
  # print(fundamentals_endpoint)
  # print(fundamentals_post)
  # print()

  # NOTE(review): verify=False disables TLS certificate checks for this request
  fundamentals_response = requests.post(url = fundamentals_endpoint, 
                                        data=fundamentals_post, 
                                        auth = authorization, 
                                        headers = headers, 
                                        verify= False )
  # print('HTTP Status: {}'.format(fundamentals_response.status_code))

  # create a dataframe from POST request, show dataframe properties
  # (the body is parsed before the status check, so a non-JSON error body raises here)
  fundamentals_data = json.loads(fundamentals_response.text)
  if fundamentals_response.status_code != 200:
    print(fundamentals_data)
  else:
    df = pd.DataFrame(fundamentals_data['data'])
    # print('RECORDS:',len(fundamentals_df))
    # print(fundamentals_df[['metric', 'value']].head(20))
    # long -> wide: one row per (requestId, fsymId), one column per metric in this response
    df = df.pivot(index=['requestId', 'fsymId'], columns='metric', values='value')
    df.reset_index(inplace=True)
    collection.append(df)

# Frames are stacked row-wise, so each id appears once per successful metric with
# the other metric columns empty.
# NOTE(review): confirm a row-wise stack (rather than a join on the ids) is intended.
df = pd.concat(collection, ignore_index=True)
df

In [None]:
# Fetch company profiles for two sample tickers. The ids travel in the URL query
# string of this GET request, so no request body is built or sent.
# NOTE(review): `authorization` is not defined in any visible cell.
# NOTE(review): verify=False disables TLS certificate checks for this request.
endpoint = 'https://api.factset.com/content/factset-fundamentals/v2/company-reports/profile?ids=AAPL-US,MSFT-US'

headers = {'Accept': 'application/json','Content-Type': 'application/json'}
response = requests.get(url = endpoint, auth = authorization, headers = headers, verify= False )

# the profile records sit under the 'data' key of the JSON payload
temp = pd.DataFrame(json.loads(response.text)['data'])

temp

In [None]:
# Same profile request as a reusable pattern: the ids are joined with commas and
# appended to the endpoint's query string.
# NOTE(review): `authorization` is not defined in any visible cell.
id_list = ['AAPL-US', 'MSFT-US']
endpoint = 'https://api.factset.com/content/factset-fundamentals/v2/company-reports/profile?ids=' + ','.join(id_list)
headers = {'Accept': 'application/json','Content-Type': 'application/json'}
response = requests.get(url = endpoint, auth = authorization, headers = headers, verify= False )
# the profile records sit under the 'data' key of the JSON payload
temp = pd.DataFrame(json.loads(response.text)['data'])
temp


In [None]:

# POST a prepared fundamentals request and inspect the response as a dataframe.
# NOTE(review): `fundamentals_request_3` and `authorization` are not defined in any
# visible cell, and `fundamentals_endpoint` is only set inside the descriptive-data
# cell — this cell fails on a fresh kernel until those are provided.
headers = {'Accept': 'application/json','Content-Type': 'application/json'}

#create a post request
fundamentals_post_3 = json.dumps(fundamentals_request_3)
fundamentals_response_3 = requests.post(url = fundamentals_endpoint, data=fundamentals_post_3, auth = authorization, headers = headers, verify= False )
print('HTTP Status: {}'.format(fundamentals_response_3.status_code))

#create a dataframe from POST request, show dataframe properties
fundamentals_data_3 = json.loads(fundamentals_response_3.text)
# fundamentals_df_3 = json_normalize(fundamentals_data_3['data'])
fundamentals_df_3 = pd.DataFrame(fundamentals_data_3['data'])
print('COLUMNS:')
print('')
print(fundamentals_df_3.dtypes)
print('')
print('RECORDS:',len(fundamentals_df_3))

#show the last 5 records for select columns
# (each column is listed once; repeating a name in the selection duplicates that column in the output)
fundamentals_df_3[['requestId','fsymId','metric','periodicity','fiscalPeriod','fiscalYear','fiscalPeriodLength','fiscalEndDate','reportDate','epsReportDate','updateType','currency','value']].tail()


In [None]:


# Spot-check: read one batch file straight from S3 into a dataframe.
# NOTE(review): `aws_credentials` is not defined in any visible cell — confirm
# where the storage options come from.
file = 's3://qml-solutions-new-york/factset-api-fundamentals-annual/fundamentals_df_0.csv'
temp = pd.read_csv(file, storage_options=aws_credentials)
temp

# Housekeeping

In [None]:

# CONVERSION FROM BATCHES TO SINGLE FSYM_ID PER FILE
#
# Reads the numbered batch files fundamentals_df_1 .. fundamentals_df_3274 and
# rewrites their rows as one CSV per fsymId in the destination folder.
# NOTE(review): the batch count (3274) is hard-coded — confirm it matches the folder.

for counter, i in enumerate(range(1,3275)):

    # progress line every 250 files
    if counter % 250 == 0:
        print(counter, datetime.datetime.now().strftime("%H:%M:%S"))

    file = f'/Users/joeybortfeld/Documents/CreditGradients Data/factset_api_fundamentals_semi_annual_assets_in_usdX/fundamentals_df_{i}.csv'
    df = pd.read_csv(file)

    # Single grouping pass per file instead of one boolean filter per id. Rows with
    # a missing fsymId are skipped by groupby, so no empty 'nan.csv' is written.
    # An id that appears in several batch files is overwritten by the latest one.
    for fsym_id, fsym_rows in df.groupby('fsymId', sort=False):
        fsym_rows.to_csv(f'/Users/joeybortfeld/Documents/CreditGradients Data/factset_api_fundamentals_semi_annual_assets_in_usd/{fsym_id}.csv', index=False)

print('done')

In [None]:
# COLLECT ASSETS IN USD
df = pd.read_csv('/Users/joeybortfeld/Documents/CreditGradients Data/factset_api_fundamentals_annual_assets_in_usd/fundamentals_df_combined.csv')
print(df.columns)
print(df['fsymId'].nunique())

# The export below is opt-in, following the same flag pattern as
# `build_max_assets_in_usd`, so the cell runs cleanly instead of being halted
# by an exception.
export_largest_assets_snapshot = False

if export_largest_assets_snapshot:

    # keep, for each fsymId, the single row with the largest value,
    # then order ids from largest to smallest
    df = df[df['fsymId'] != '@NA']
    df = df[df['value'].notnull()]
    df = df.sort_values(by=['fsymId', 'value'], ascending=[True, False])
    df = df.drop_duplicates(subset=['fsymId'], keep='first')
    df = df.sort_values(by='value', ascending=False)
    df = df.reset_index(drop=True)

    df = df[['fsymId', 'currency', 'fiscalEndDate', 'value']]
    df.to_csv('/Users/joeybortfeld/Downloads/temp.csv', index=False)




# 2. EQUITY DOWNLOAD

In [None]:
# INDIVIDUAL FSYM_ID DOWNLOAD METHOD
#
# Downloads daily stock prices or returns one fsym_id at a time and writes one
# CSV per id.
# NOTE(review): `utilities` and `authorization` are not defined in any visible cell.

metric = 'returns' # either 'prices' or 'returns'
split = 'SPLIT'
start_date = '2006-01-03'
end_date = '2024-11-12'
skip_if_done = True

assert metric in ['prices', 'returns'], 'error: metric must be either price or return'

# prices go to a folder named after the split setting; returns have a single folder
if metric == 'prices':
    output_folder = f'/Users/joeybortfeld/Documents/CreditGradients Data/factset_api_stock_prices_{split.lower()}/'
else:
    output_folder = '/Users/joeybortfeld/Documents/CreditGradients Data/factset_api_stock_returns/'
print('writing to', output_folder)

################################################################################
# define the company set
# Filter missing ids into a NEW list so the shared universe_dict entry is not mutated.
company_set = [e for e in universe_dict['$1B'] if pd.notna(e)]
print('company set size:', len(company_set))

# prescreen to see if the fsym_id is already in the output folder
if skip_if_done:
    print('--skipping files that are already done')
    output_files = os.listdir(output_folder)
    print('--file count in output folder:', len(output_files))
    # set membership keeps the filter linear in the universe size
    done_ids = {e.replace('.csv', '') for e in output_files}
    company_set = [e for e in company_set if e not in done_ids]

    print('--new company set size when skipping:', len(company_set))
################################################################################

# loop through fsym_ids
error_list = []
collection = []
start_time = time.time()
for this_fsym in tqdm.tqdm(company_set):

    try:
        if metric == 'prices':
            result = utilities.get_stock_prices(id_list=[this_fsym],
                                        field_list=["price", "volume", "tradeCount"],
                                        start_date=start_date,
                                        end_date=end_date,
                                        frequency='D',
                                        verbose=False, 
                                        split=split,
                                        authorization=authorization)
        else:
            result = utilities.get_stock_returns(id_list=[this_fsym],
                                        start_date=start_date,
                                        end_date=end_date,
                                        frequency='D',
                                        verbose=False, 
                                        authorization=authorization)
        response_code, this_df = result

        if response_code != 200:
            error_list.append(this_fsym)
            # print('error:', this_fsym)
        else:

            this_df.to_csv(output_folder + f'{this_fsym}.csv', index=False)
            # collection.append(this_df)

    # Record the failure and keep going. `Exception` (not a bare except) lets
    # Ctrl-C / kernel interrupt stop a long download.
    except Exception as exc:
        error_list.append(this_fsym)
        print('error:', this_fsym, repr(exc))

print('done in {}m'.format((time.time() - start_time) / 60))

if len(error_list) > 0:
    print('errors in download:')
    print(len(error_list))
    print(error_list)
else:
    print('No errors')

# df = pd.concat(collection, ignore_index=True)



In [None]:
# Spot-check: request daily split-adjusted price, volume and trade count for a
# single id over Jan-Oct 2024 and display the raw result.
# NOTE(review): `utilities` and `authorization` are not defined in any visible cell.
result = utilities.get_stock_prices(id_list=['MH33D6-R'],
                                        field_list=["price", "volume", "tradeCount"],
                                        start_date='2024-01-01',
                                        end_date='2024-10-31',
                                        frequency='D',
                                        verbose=False, 
                                        split='SPLIT',
                                        authorization=authorization)
result

In [None]:
# INDIVIDUAL FSYM_ID DOWNLOAD METHOD - SHARES OUTSTANDING
#
# Downloads FF_COM_SHS_OUT one fsym_id at a time and writes one CSV per id.
# NOTE(review): `utilities` and `authorization` are not defined in any visible cell.

# download parameters
# first element = periodicity code passed to the API; the second element is not used in this cell
download_type_dict = {'annual': ['ANN', 20], 
                      'quarterly': ['QTR', 10],
                      'semi_annual': ['SEMI', 15]}

download_type = 'quarterly'
output_folder = f'/Users/joeybortfeld/Documents/CreditGradients Data/factset_api_fundamentals_shares_outstanding_{download_type}/'
start_date = '1990-01-01'
end_date = '2024-12-31'
skip_if_done = True

print('writing to', output_folder)


################################################################################
# define the company set
# Filter missing ids into a NEW list so the shared universe_dict entry is not mutated.
company_set = [e for e in universe_dict['$1B'] if pd.notna(e)]

if download_type == 'semi_annual':
    # set membership keeps the filter linear in the universe size
    semi_annual_set = set(universe_dict['semi_annual'])
    company_set = [e for e in company_set if e in semi_annual_set]

print('company set size:', len(company_set))

# prescreen to see if the fsym_id is already in the output folder
if skip_if_done:
    print('--skipping files that are already done')
    output_files = os.listdir(output_folder)
    print('--file count in output folder:', len(output_files))
    done_ids = {e.replace('.csv', '') for e in output_files}
    company_set = [e for e in company_set if e not in done_ids]

    print('--new company set size when skipping:', len(company_set))
################################################################################


# loop through fsym_ids
error_list = []
collection = []
start_time = time.time()
print('start time:', datetime.datetime.now().strftime("%H:%M:%S"))
for this_fsym in tqdm.tqdm(company_set):

    try:
        result = utilities.download_fundamentals(id_list=[this_fsym],
                                    field_list=['FF_COM_SHS_OUT',],
                                    periodicity=download_type_dict[download_type][0],
                                    start_date=start_date,
                                    end_date=end_date,
                                    currency='LOCAL',
                                    verbose=False,
                                    authorization=authorization)
        response_code, fundamentals_df = result

        if response_code != 200:
            error_list.append(this_fsym)
            print('error:', this_fsym)
        else:

            fundamentals_df.to_csv(output_folder + f'{this_fsym}.csv', index=False)
            collection.append(fundamentals_df)

    # Record the failure and keep going. `Exception` (not a bare except) lets
    # Ctrl-C / kernel interrupt stop a long download.
    except Exception as exc:
        error_list.append(this_fsym)
        print('error:', this_fsym, repr(exc))

print('done in {}m'.format((time.time() - start_time) / 60))

if len(error_list) > 0:
    print('errors in download:')
    print(error_list)
else:
    print('No errors')
print('end time:', datetime.datetime.now().strftime("%H:%M:%S"))

In [None]:
# DEFINE THE PUBLIC UNIVERSE
# these are companies where we have a price

# NOTE: despite its name, output_dir is only read in this cell.
output_dir = '/Users/joeybortfeld/Documents/CreditGradients Data/factset_api_stock_prices_split/'
file_list = [f for f in os.listdir(output_dir) if f.endswith('.csv')]

print(f'There are {len(file_list)} files in the directory')

# one row per price file: [fsym_id, first priced date, last priced date]
collection = []
for f in tqdm.tqdm(file_list):
    # read from output_dir itself so the listing and the reads cannot drift apart
    prices = pd.read_csv(os.path.join(output_dir, f))
    prices['date'] = pd.to_datetime(prices['date'])
    prices = prices[['fsymId', 'date', 'price']]
    # only dates that actually carry a price count towards the range
    prices = prices[prices['price'].notnull()]
    # the file name (minus '.csv') is used as the fsym_id
    collection.append([f.replace('.csv', ''), prices['date'].min(), prices['date'].max()])

df = pd.DataFrame(collection, columns=['fsym_id', 'min_date', 'max_date'])
df['min_date'] = pd.to_datetime(df['min_date'])
df['max_date'] = pd.to_datetime(df['max_date'])

df.to_csv('/Users/joeybortfeld/Documents/CreditGradients Data/factset_public_universe_with_assets_greater_than_1_billion_usd.csv', index=False)

In [None]:
# BUILD MONTH END STOCK PRICES HISTORY
# For every price file in input_dir, keep the last priced row of each calendar
# month and write the result to output_dir under the same file name.

input_dir = '/Users/joeybortfeld/Documents/CreditGradients Data/factset_api_stock_prices_split/'
output_dir = '/Users/joeybortfeld/Documents/CreditGradients Data/factset_api_stock_prices_split_month_end/'
# make sure the destination exists before the first to_csv call
os.makedirs(output_dir, exist_ok=True)

file_list = [f for f in os.listdir(input_dir) if f.endswith('.csv')]

print(f'There are {len(file_list)} files in the directory')

for f in tqdm.tqdm(file_list):
    # paths are derived from input_dir / output_dir so they cannot drift from the listing
    df = pd.read_csv(os.path.join(input_dir, f))
    df['date'] = pd.to_datetime(df['date'])

    # drop unpriced rows; .copy() so the column assignments below act on an independent frame
    df = df[df['price'].notnull()].copy()
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month

    # only keep the month end prices (not exactly month end, if a stock only trades on the 10th of the month, it will be included)
    df['max_date'] = df.groupby(['fsymId', 'year', 'month'])['date'].transform('max')
    df = df[df['date'] == df['max_date']]

    df = df[['fsymId', 'currency', 'year', 'month', 'date', 'price']]

    # save to output_dir (factset_api_stock_prices_split_month_end)
    df.to_csv(os.path.join(output_dir, f), index=False)

In [None]:
# Spot check: run utilities.build_market_cap for a single fsym id with market_cap_type='daily'.
# NOTE(review): `utilities` is not among the imports in the notebook's first cell -- confirm an earlier cell imports it.
utilities.build_market_cap(fsym_id='MH33D6-R', market_cap_type='daily')

In [None]:
# CALCULATE MARKET CAP
# combine price data (daily or month end) with shares outstanding data for one fsym id

# market cap parameters
market_cap_type = 'daily'
# maximum number of consecutive rows over which shares outstanding are carried forward
ffill_limit_dict = {'monthly': 16, 'daily': 375}
assert market_cap_type in ['monthly', 'daily'], 'error: market_cap_type must be either monthly or daily'
# other id used while testing: 'MH33D6-R'
this_fsym = 'HTM0LK-R'

factset_dir = '/Users/joeybortfeld/Documents/CreditGradients Data/Factset Data/'


def _load_shares_outstanding(csv_path, value_column):
    """
    Read one shares-outstanding CSV and return it keyed by report date.

    Parameters:
    - csv_path: str, path of a CSV with columns fsymId, fiscalEndDate, epsReportDate, value
    - value_column: str, name given to the 'value' column in the result

    Returns:
    - DataFrame with columns ['fsymId', 'date', value_column]; 'date' is the
      epsReportDate, or fiscalEndDate + 90 days where the report date is missing
    """
    shares = pd.read_csv(csv_path)
    shares['fiscalEndDate'] = pd.to_datetime(shares['fiscalEndDate'])
    shares['epsReportDate'] = pd.to_datetime(shares['epsReportDate'])
    shares['epsReportDate'] = shares['epsReportDate'].fillna(shares['fiscalEndDate'] + pd.Timedelta(days=90))
    shares = shares[['fsymId', 'epsReportDate', 'value']]
    shares.columns = ['fsymId', 'date', value_column]
    return shares


# build the set of fsym_ids with semi annual share data
semi_annual_share_dir = factset_dir + 'factset_api_fundamentals_shares_outstanding_semi_annual/'
semi_annual_share_list = [f.replace('.csv', '') for f in os.listdir(semi_annual_share_dir) if f.endswith('.csv')]


# 0. get price data (month end file for 'monthly', full history otherwise)
if market_cap_type == 'monthly':
    price_folder = 'factset_api_stock_prices_split_month_end'
else:
    price_folder = 'factset_api_stock_prices_split'
df1 = pd.read_csv(f'{factset_dir}{price_folder}/{this_fsym}.csv')
df1['date'] = pd.to_datetime(df1['date'])
df1 = df1[['fsymId', 'date', 'price', 'currency']]

# 1. get shares outstanding
df2 = _load_shares_outstanding(
    f'{factset_dir}factset_api_fundamentals_shares_outstanding_quarterly/{this_fsym}.csv',
    'shares_outstanding_quarterly')
df3 = _load_shares_outstanding(
    f'{factset_dir}factset_api_fundamentals_shares_outstanding_annual/{this_fsym}.csv',
    'shares_outstanding_annual')

# combine all shares outstanding data
df4 = df2.merge(df3, how='outer', on=['fsymId', 'date'])

# check if semi annual data exists
if this_fsym in semi_annual_share_list:
    df5 = _load_shares_outstanding(
        f'{semi_annual_share_dir}{this_fsym}.csv',
        'shares_outstanding_semi_annual')

    df4 = df4.merge(df5, how='outer', on=['fsymId', 'date'])

# 2. merge shares outstanding with prices
df = df1.merge(df4, how='outer', on=['fsymId', 'date'])

# 3. data cleaning
# get the date of the first observation with non null price
first_date = df[df['price'].notnull()]['date'].min()
df = df[df['date'] >= first_date]

# fill forward shares outstanding
df = df.sort_values(by=['date'])
# preference order on any given date: quarterly, then semi annual (when that
# file exists for this id), then annual
df['shares_outstanding'] = df['shares_outstanding_quarterly']
if 'shares_outstanding_semi_annual' in df.columns:
    df['shares_outstanding'] = df['shares_outstanding'].fillna(df['shares_outstanding_semi_annual'])
df['shares_outstanding'] = df['shares_outstanding'].fillna(df['shares_outstanding_annual'])
# monthly: allow 12 months of fill forward (12 rows) plus 4 more rows for quarterly financial filings
df['shares_outstanding'] = df['shares_outstanding'].ffill(limit=ffill_limit_dict[market_cap_type])

# 4. market cap calculation (millions, per the original note -- the unit follows the price and share inputs; TODO confirm)
df['market_cap'] = df['price'] * df['shares_outstanding']

# 5. cleanup
df = df[df['price'].notnull()]
df = df[['fsymId', 'date', 'market_cap', 'price', 'shares_outstanding']]
df.columns = ['fsym_id', 'market_cap_date', 'market_cap', 'price', 'shares_outstanding']
df = df.sort_values(by=['fsym_id', 'market_cap_date'])
df = df.reset_index(drop=True)

df['year'] = df['market_cap_date'].dt.year
df['month'] = df['market_cap_date'].dt.month

df.set_index('market_cap_date')['market_cap'].plot()



df.tail(50)


In [None]:
# Display df1 (the price data read in the market cap cell above).
df1

# Copy Files from Local Directory to AWS S3

In [None]:
import os
import boto3



def s3_check_file_exists(bucket_name:str='qml-solutions-new-york', 
                         file_key:str='/factset-api-global-prices/B01DPB-R.csv', 
                         aws_access_key_id:str=None, 
                         aws_secret_access_key:str=None):
    """
    Report whether an object exists in an S3 bucket.

    Parameters:
    - bucket_name: str, name of the S3 bucket
    - file_key: str, key of the object to look for
    - aws_access_key_id / aws_secret_access_key: str, credentials handed to boto3.client

    Returns:
    - True when head_object succeeds, False when S3 answers "not found"

    Raises:
    - the client's ClientError for any other failure (access denied, throttling, ...),
      so that such failures are not mistaken for a missing file
    """
    s3 = boto3.client(
        's3',
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )
    try:
        s3.head_object(Bucket=bucket_name, Key=file_key)
        return True
    except s3.exceptions.ClientError as error:
        # only a "not found" answer means the object is absent
        error_code = error.response.get('Error', {}).get('Code')
        if error_code in ('404', 'NoSuchKey', 'NotFound'):
            return False
        raise

def copy_file_to_s3(local_file_path:str=None, 
                     s3_bucket:str='qml-solutions-new-york', 
                     s3_key:str='factset-api-global-prices/', 
                     aws_access_key_id:str=None, 
                     aws_secret_access_key:str=None,
                     verbose:bool=False):
    """
    Upload one local file to S3.

    Parameters:
    - local_file_path: str, path of the file to upload
    - s3_bucket: str, destination bucket
    - s3_key: str, destination key inside the bucket
    - aws_access_key_id / aws_secret_access_key: str, credentials handed to boto3.client
    - verbose: bool, print a line for each successful upload

    Returns:
    - True when the upload succeeded, False when it raised an error
      (the reason is printed)
    """
    s3 = boto3.client(
        's3',
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )
    try:
        s3.upload_file(local_file_path, s3_bucket, s3_key)
    except Exception as error:
        # Exception (not a bare except) so Ctrl-C can still stop a long batch;
        # the reason is printed instead of being silently discarded.
        print(f'--failed to upload {local_file_path} to {s3_bucket}/{s3_key}: {error!r}')
        return False

    if verbose:
        print(f'--uploaded {local_file_path} to {s3_bucket}/{s3_key}')
    return True


# loop through files in local folder
# Transfer file to AWS
# Each local folder is mirrored to an S3 prefix of the same name with '_'
# replaced by '-'. Files that already exist in the bucket are skipped; files
# that could not be checked or uploaded are collected per folder in error_dict.

s3_bucket = 'qml-solutions-new-york'


local_dir = '/Users/joeybortfeld/Documents/CreditGradients Data/Factset Data'
error_dict = {}
for source_folder in [
    # 'factset_api_fundamentals_quarterly', 
    # 'factset_api_fundamentals_annual',

    'factset_api_fundamentals_semi_annual',
    'factset_api_stock_prices_split',
    'factset_api_stock_returns',
    'factset_api_fundamentals_shares_outstanding_quarterly',
    'factset_api_fundamentals_shares_outstanding_annual',
    'factset_api_fundamentals_shares_outstanding_semi_annual',
    'factset_api_fundamentals_annual_assets_in_usd',
    'factset_api_fundamentals_semi_annual_assets_in_usd',

    ]:

    print(source_folder)
    target_folder = source_folder.replace('_', '-')
    # keep only the csv data files: os.listdir also returns hidden files and sub-directories
    file_list = [f for f in os.listdir(os.path.join(local_dir, source_folder)) if f.endswith('.csv')]
    error_list = []
    print('file count:', len(file_list))
    print('start at:', datetime.datetime.now().strftime("%H:%M:%S"))

    for file in tqdm.tqdm(file_list):

        try:
            does_exist = s3_check_file_exists(bucket_name=s3_bucket, 
                                file_key=f'{target_folder}/{file}', 
                                aws_access_key_id=aws_credentials['key'], 
                                aws_secret_access_key=aws_credentials['secret'])

            # upload only what is not in the bucket yet
            uploaded = does_exist or copy_file_to_s3(local_file_path=f'{local_dir}/{source_folder}/{file}', 
                                s3_bucket=s3_bucket, 
                                s3_key=f'{target_folder}/{file}', 
                                aws_access_key_id=aws_credentials['key'], 
                                aws_secret_access_key=aws_credentials['secret'],
                                verbose=False)
        except Exception as error:
            # one bad file must not abort the whole batch; record it and move on
            print(f'--error while transferring {source_folder}/{file}: {error!r}')
            uploaded = False

        if not uploaded:
            error_list.append(file)
    
    print('error count:', len(error_list))
    print()
    error_dict[source_folder] = error_list

In [None]:
# MULTITHREAD BULK UPLOAD FROM LOCAL TO S3

from concurrent.futures import ThreadPoolExecutor
from botocore.exceptions import BotoCoreError, ClientError


def upload_file_to_s3(local_file_path, bucket_name, s3_key, s3_client):
    """
    Uploads a single file to S3.

    Parameters:
    - local_file_path: str, path of the file to upload
    - bucket_name: str, destination bucket
    - s3_key: str, destination key inside the bucket
    - s3_client: client object exposing upload_file(path, bucket, key)

    Returns:
    - True when the upload succeeded, False when it failed (the error is printed)
    """
    try:
        s3_client.upload_file(local_file_path, bucket_name, s3_key)
        return True
    # boto3's upload_file reports failed transfers as S3UploadFailedError, which is
    # neither a BotoCoreError nor a ClientError; OSError covers unreadable local files.
    except (BotoCoreError, ClientError, boto3.exceptions.S3UploadFailedError, OSError) as e:
        print(f"Error uploading {local_file_path} to {bucket_name}/{s3_key}: {e}")
        return False

def bulk_upload_to_s3(local_dir, local_folder, bucket_name, aws_access_key_id, aws_secret_access_key, num_threads=8):
    """
    Uploads every entry of local_dir/local_folder to the specified S3 bucket in parallel.

    Each file is stored under the key '<local_folder with "_" replaced by "-">/<file name>'.
    Uploads that fail in the threaded pass are retried once, sequentially.

    Parameters:
    - local_dir: str, parent directory on the local machine
    - local_folder: str, folder inside local_dir whose entries are uploaded
    - bucket_name: str, destination bucket
    - aws_access_key_id / aws_secret_access_key: str, credentials handed to boto3.client
    - num_threads: int, number of worker threads used for the first pass

    Returns:
    - (True, []) when every upload eventually succeeded, otherwise
      (False, list of local paths that failed both attempts)
    """
    # Initialize S3 client (one client shared by all worker threads)
    s3_client = boto3.client(
        's3',
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key
    )

    # Collect a list of local file names to transfer (e.g. 'MH33D6-R.csv', 'XQCWLZ-R.csv')
    target_folder = local_folder.replace('_', '-')
    local_file_list = os.listdir(local_dir + '/' + local_folder + '/')

    local_path_list = [f'{local_dir}/{local_folder}/{f}' for f in local_file_list]
    s3_key_list = [f'{target_folder}/{f}' for f in local_file_list]

    from_to_list = list(zip(local_path_list, s3_key_list))

    # diagnostics
    print(f'transfer files from {local_dir}/{local_folder}')
    print(f'transfer to s3 {bucket_name}/{target_folder}')
    print('files to transfer:', len(from_to_list))

    # Use ThreadPoolExecutor for parallel uploads
    error_list = []
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # map every future back to its (local_path, s3_key) job so that a failed
        # upload can be identified and retried
        future_to_job = {
            executor.submit(upload_file_to_s3, local_path, bucket_name, s3_key, s3_client): (local_path, s3_key)
            for local_path, s3_key in from_to_list
        }

        # Track progress with tqdm (futures are visited in submission order)
        for future in tqdm.tqdm(future_to_job, desc="Uploading files"):
            try:
                succeeded = future.result()
            except Exception as e:
                print(f"Unexpected error: {e}")
                succeeded = False
            if not succeeded:
                # Add failed uploads to the list
                error_list.append(future_to_job[future])

    # collect and retry errors
    print(f"Failed uploads: {len(error_list)}")
    final_error_list = []
    if error_list:
        print("Retrying failed uploads...")
        for local_path, s3_key in error_list:
            try:
                success = upload_file_to_s3(local_path, bucket_name, s3_key, s3_client)
            except Exception as e:
                # a retry that raises counts as a final failure instead of aborting the run
                print(f"Unexpected error: {e}")
                success = False
            if not success:
                print(f"Final failure for {local_path}")
                final_error_list.append(local_path)

    print("Upload process complete.")
    if len(final_error_list) == 0:
        return True, []
    else:
        return False, final_error_list

# Example usage: push each local Factset folder to the bucket, one folder at a time

local_dir = '/Users/joeybortfeld/Documents/CreditGradients Data/Factset Data'
bucket_name = 'qml-solutions-new-york'

# folders to transfer (commented entries are skipped)
folders_to_upload = (
    # 'factset_api_fundamentals_annual',
    # 'factset_api_fundamentals_quarterly',
    'factset_api_fundamentals_semi_annual',

    'factset_api_fundamentals_annual_assets_in_usd',
    'factset_api_fundamentals_semi_annual_assets_in_usd',

    'factset_api_fundamentals_shares_outstanding_annual',
    'factset_api_fundamentals_shares_outstanding_quarterly',
    'factset_api_fundamentals_shares_outstanding_semi_annual',

    'factset_api_stock_prices_split',
    'factset_api_stock_returns',
)

for local_folder in folders_to_upload:

    # num_threads: adjust based on your system's capabilities
    upload_outcome = bulk_upload_to_s3(
        local_dir,
        local_folder,
        bucket_name,
        aws_access_key_id=aws_credentials['key'],
        aws_secret_access_key=aws_credentials['secret'],
        num_threads=8,
    )
    success, error_list = upload_outcome

    # overall flag and the paths that still failed after the retry
    print(success, error_list)
    print()



In [None]:
def s3_check_file_exists(bucket_name:str='qml-solutions-new-york', 
                         file_key:str='/factset-api-global-prices/B01DPB-R.csv', 
                         aws_access_key_id:str=None, 
                         aws_secret_access_key:str=None):
    """
    Report whether an object exists in an S3 bucket.

    Parameters:
    - bucket_name: str, name of the S3 bucket
    - file_key: str, key of the object to look for
    - aws_access_key_id / aws_secret_access_key: str, credentials handed to boto3.client

    Returns:
    - True when head_object succeeds, False when S3 answers "not found"

    Raises:
    - the client's ClientError for any other failure (access denied, throttling, ...),
      so that such failures are not mistaken for a missing file
    """
    s3 = boto3.client(
        's3',
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )
    try:
        s3.head_object(Bucket=bucket_name, Key=file_key)
        return True
    except s3.exceptions.ClientError as error:
        # only a "not found" answer means the object is absent
        error_code = error.response.get('Error', {}).get('Code')
        if error_code in ('404', 'NoSuchKey', 'NotFound'):
            return False
        raise


# Spot check: look for one price file in the bucket; the boolean result is the cell output.
# NOTE(review): aws_credentials is not defined in this cell -- confirm an earlier cell defines it.
s3_check_file_exists(bucket_name='qml-solutions-new-york', 
                         file_key='factset-api-stock-prices-split/MH33D6-R.csv', 
                         aws_access_key_id=aws_credentials['key'], 
                         aws_secret_access_key=aws_credentials['secret'])

In [None]:

# Read one CSV straight from S3 with pandas; aws_credentials is passed through as storage_options.
# NOTE(review): aws_credentials is not defined in this cell -- confirm an earlier cell defines it.
temp = pd.read_csv('s3://qml-solutions-new-york/factset-api-global-prices/B01DPB-R.csv',
                   storage_options=aws_credentials)
# display the loaded frame as the cell output
temp

In [None]:
import boto3

# Transfer file to AWS
# NOTE(review): local_folder is assigned here but is not referenced by the rest of this cell.
local_folder = '/Users/joeybortfeld/Documents/CreditGradients Data/factset_api_stock_prices/'
# bucket passed to the existence check at the end of this cell
s3_bucket = 'qml-solutions-new-york'



def s3_check_file_exists(bucket_name:str='qml-solutions-new-york', 
                         file_key:str='/factset-api-global-prices/B01DPB-R.csv', 
                         aws_access_key_id:str=None, 
                         aws_secret_access_key:str=None):
    """
    Report whether an object exists in an S3 bucket.

    Parameters:
    - bucket_name: str, name of the S3 bucket
    - file_key: str, key of the object to look for
    - aws_access_key_id / aws_secret_access_key: str, credentials handed to boto3.client

    Returns:
    - True when head_object succeeds, False when S3 answers "not found"

    Raises:
    - the client's ClientError for any other failure (access denied, throttling, ...),
      so that such failures are not mistaken for a missing file
    """
    s3 = boto3.client(
        's3',
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )
    try:
        s3.head_object(Bucket=bucket_name, Key=file_key)
        return True
    except s3.exceptions.ClientError as error:
        # only a "not found" answer means the object is absent
        error_code = error.response.get('Error', {}).get('Code')
        if error_code in ('404', 'NoSuchKey', 'NotFound'):
            return False
        raise
    
# CHECK IF FILE EXISTS IN S3
# The boolean result is kept in `res` and shown as the cell output.
# NOTE(review): aws_credentials is not defined in this cell -- confirm an earlier cell defines it.
res = s3_check_file_exists(bucket_name=s3_bucket, 
                     file_key='factset-api-global-prices/B01DPB-R.csv', 
                     aws_access_key_id=aws_credentials['key'], 
                     aws_secret_access_key=aws_credentials['secret'])
res

In [None]:
import boto3

def list_s3_bucket_contents(bucket_name, prefix='', aws_access_key_id=None, aws_secret_access_key=None):
    """
    Collect the keys of all objects found under a prefix of an S3 bucket.

    Parameters:
    - bucket_name: str, name of the S3 bucket
    - prefix: str, key prefix (folder path) to list; the default '' applies no prefix filter
    - aws_access_key_id / aws_secret_access_key: str, credentials handed to boto3.client

    Returns:
    - List of object keys, in the order the 'list_objects_v2' paginator yields them
    """
    client = boto3.client(
        's3',
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )
    pages = client.get_paginator('list_objects_v2').paginate(Bucket=bucket_name, Prefix=prefix)

    # pages without a 'Contents' entry contribute nothing
    return [
        entry['Key']
        for page in pages
        for entry in page.get('Contents', [])
    ]



# Usage example: count the objects under one prefix and preview the first keys
bucket_name = 'qml-solutions-new-york'
folder_path = 'factset-api-fundamentals-annual/'  # Optional
# NOTE(review): this cell reads aws_credentials2 while the other S3 cells read aws_credentials -- confirm which is intended.
file_list = list_s3_bucket_contents(
    bucket_name=bucket_name,
    prefix=folder_path,
    aws_access_key_id=aws_credentials2['key'],
    aws_secret_access_key=aws_credentials2['secret'],
)
print(len(file_list))
print(file_list[:15])


In [None]:
# Ad-hoc check: request the FF_BUS_DESC_ABBREV field for a single fsym id.
# Only id_list and field_list are passed; the commented-out keywords below are
# left to download_fundamentals' defaults. The return value is the cell output.
utilities.download_fundamentals(id_list=['MH33D6-R'],
                                    field_list=['FF_BUS_DESC_ABBREV'],
                                    # periodicity=download_type_dict[download_type][0],
                                    # start_date=start_date,
                                    # end_date=end_date,
                                    # currency='USD',
                                    # verbose=False,
                                    # authorization=authorization
                                    )

