In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from openai import OpenAI
import ast
import tqdm
import os
from Py_Files import credentials
from Py_Files import factset_api
import matplotlib.pyplot as plt
import plotly.express as px


In [None]:
# Load the four ICE index constituent files, keep a common set of columns,
# stack them into one bond-level frame `df`, and derive rating / FX helper columns.
collection = []
for this_index in ['C0A0', 'H0A0', 'ER00', 'HE00']:

    # skiprows=1: the first row of each workbook is skipped before the header is read
    temp = pd.read_excel(f'/Users/joeybortfeld/Documents/QML Solutions Data/ice/ICE Jan 15/{this_index}-01152025.xlsx', skiprows=1)
    # drop rows whose Description is an empty string or missing
    temp = temp[temp['Description'] != '']
    temp = temp[temp['Description'].notnull()]

    # rename columns to snake_case: lowercase, spaces -> underscores,
    # then strip '.' characters and the '_(sa)' suffix
    temp.columns = [col.lower().replace(' ', '_') for col in temp.columns]
    temp.columns = [col.replace('.', '') for col in temp.columns]
    temp.columns = [col.replace('_(sa)', '') for col in temp.columns]




    # tag every row with the index it came from, then keep only the columns used downstream
    temp['index'] = this_index
    temp = temp[['cusip', 'isin_number', 'description', 'ticker', 'index', 'par_wtd_coupon', 'maturity_date', 'rating', 'iso_currency_code', 'iso_country_code', 
                 'industry_lvl_2_desc', 'industry_lvl_3_desc', 'industry_lvl_4_desc', 
                 'face_value_loc', 'price', 'oas', 'subordination_type', 'mod_dur_to_worst']]

    collection.append(temp)

df = pd.concat(collection, axis=0)
# helper column of ones; summing it within a group yields a row count
df['constant'] = 1

# rating preprocessing
# notch-level rating -> ordinal score (higher = better credit quality)
rating_to_num_dict = {'AAA': 21, 'AA1': 20, 'AA2': 19, 'AA3': 18, 
                      'A1': 17, 'A2': 16, 'A3': 15, 
                      'BBB1': 14, 'BBB2': 13, 'BBB3': 12, 
                      'BB1': 11, 'BB2': 10, 'BB3': 9, 
                      'B1': 8, 'B2': 7, 'B3': 6, 
                      'CCC1': 5, 'CCC2': 4, 'CCC3': 3, 
                      'CC': 2, 'C': 1, 'D': 0}
num_to_rating_dict = {v: k for k, v in rating_to_num_dict.items()}
# dict[x] (rather than .map(dict)) raises KeyError on any rating not listed above, including missing values
df['rating_num'] = df['rating'].map(lambda x: rating_to_num_dict[x])

# notch-level rating -> broad letter bucket
rating_to_broad_rating_dict = {'AAA': '>=AA', 'AA1': '>=AA', 'AA2': '>=AA', 'AA3': '>=AA', 
                      'A1': 'A', 'A2': 'A', 'A3': 'A', 
                      'BBB1': 'BBB', 'BBB2': 'BBB', 'BBB3': 'BBB', 
                      'BB1': 'BB', 'BB2': 'BB', 'BB3': 'BB', 
                      'B1': 'B', 'B2': 'B', 'B3': 'B', 
                      'CCC1': '<=CCC', 'CCC2': '<=CCC', 'CCC3': '<=CCC', 
                      'CC': '<=CCC', 'C': '<=CCC', 'D': '<=CCC'}
df['broad_rating'] = df['rating'].map(lambda x: rating_to_broad_rating_dict[x])

# broad bucket -> ordinal score (higher = better credit quality)
broad_rating_to_num_dict = {'>=AA': 6, 'A': 5, 'BBB': 4, 'BB': 3, 'B': 2, '<=CCC': 1,}
df['broad_rating_num'] = df['broad_rating'].map(lambda x: broad_rating_to_num_dict[x])

# calculate face value in USD
# NOTE(review): EUR rate is a hard-coded constant; any other currency code raises KeyError below
exchange_rate_to_usd_dict = {'USD': 1.0, 'EUR': 1.03} 
df['exchange_rate_to_usd'] = df['iso_currency_code'].map(lambda x: exchange_rate_to_usd_dict[x])
df['face_value_in_usd'] = df['face_value_loc'] * df['exchange_rate_to_usd']

# descriptive statistics
print('ICE indices:', df['index'].unique())
print('unique ticker count:', df['ticker'].nunique())
print('unique description count:', df['description'].nunique())
print('min maturity date:', df['maturity_date'].min())
print('max maturity date:', df['maturity_date'].max())
print('min oas:', df['oas'].min())
print('max oas:', df['oas'].max())
print('unique rating count:', df['rating'].nunique())
print('unique subordination type count:', df['subordination_type'].nunique())
print('unique industry lvl 2 desc count:', df['industry_lvl_2_desc'].nunique())
print('unique industry lvl 3 desc count:', df['industry_lvl_3_desc'].nunique())
print('unique industry lvl 4 desc count:', df['industry_lvl_4_desc'].nunique())
print('unique iso country code count:', df['iso_country_code'].nunique())






In [None]:
# BAML data exploration
# For each region and broad rating bucket: median OAS, a bootstrap confidence interval
# of that median, and the 5%-95% range of observed OAS, restricted to a duration window.

# set random seed
# (Series.sample below is called without random_state, so it draws from this global NumPy state)
np.random.seed(1234)
# percentiles of the bootstrap distribution used as the confidence interval of the median
lower_ci_bound = 2.5
upper_ci_bound = 97.5

# quantiles of the observed OAS used to describe the range of the raw data
lower_oas_bound = .05
upper_oas_bound = .95

# modified-duration-to-worst window (inclusive on both ends)
lower_dur_bound = 4.0
upper_dur_bound = 6.0



# region label -> ICE index codes that make up the region
region_dict = {'USD': ['C0A0', 'H0A0'], 'EUR': ['ER00', 'HE00'], 'All': ['C0A0', 'H0A0', 'ER00', 'HE00']}

# median OAS by rating
rating_var = 'broad_rating_num'

collection = []
for this_region in ['USD', 'EUR', 'All']:

    # subset for the specified region
    dff = df[df['index'].isin(region_dict[this_region])].copy()
    dff = dff[dff['mod_dur_to_worst'] >= lower_dur_bound]
    dff = dff[dff['mod_dur_to_worst'] <= upper_dur_bound]

    # iterate over each distinct rating
    for this_rating in dff[rating_var].unique():
        
        df1 = dff[dff[rating_var] == this_rating].copy()

        # bond count
        bond_count = len(df1)

        # calculate the median oas
        median_oas = df1['oas'].median()

        # bootstrap to get a confidence interval of the median
        # (1,000 resamples with replacement, each the same size as the bucket)
        sample_list = []
        for _ in range(1_000):
            sample = df1['oas'].sample(n=len(df1), replace=True)
            sample_list.append(sample.median())
        lower_est = np.percentile(sample_list, lower_ci_bound)
        upper_est = np.percentile(sample_list, upper_ci_bound)

        # calculate the range of the observed data
        lower_oas = df1['oas'].quantile(lower_oas_bound)
        upper_oas = df1['oas'].quantile(upper_oas_bound)

        # append to collection
        collection.append([this_region, this_rating, bond_count, lower_oas, lower_est, median_oas, upper_est, upper_oas])




# `collection` is rebound from a list of rows to the summary DataFrame
collection = pd.DataFrame(collection, columns=['region', 'rating', 'bond_count', 'lower_oas', 'lower_est', 'median_oas', 'upper_est', 'upper_oas'])
collection = collection.sort_values(by=['region', 'rating'])
collection


In [None]:
# Ad-hoc inspection: displays a tuple of three names that must already exist in the kernel
# NOTE(review): these are loop variables of the bootstrap cell, so this shows only the last bucket processed
lower_est, median_oas, upper_est, 

In [181]:
def baml_industry_aggregations(data:pd.DataFrame, industry='industry_lvl_3_desc'):
    '''
    Assign each description its majority industry, measured by USD face value.

    Face value is summed per (description, industry) pair and, for every description,
    the industry with the largest total is kept. Ties are broken deterministically by
    taking the alphabetically last industry name.

    Parameters
    ----------
    data : pd.DataFrame
        Bond-level data with columns 'description', 'face_value_in_usd' and `industry`.
    industry : str
        Name of the industry classification column to aggregate on.

    Returns
    -------
    pd.DataFrame
        One row per description with columns ['description', f'majority_{industry}'].
    '''

    totals = data.groupby(['description', industry], as_index=False)['face_value_in_usd'].sum()

    # Order each description's industries by total face value so the largest comes last.
    # The previous version sorted and de-duplicated on the groupby keys themselves, which
    # are already unique, so it returned every (description, industry) pair.
    totals = totals.sort_values(by=['description', 'face_value_in_usd', industry], ascending=[True, True, True])
    majority = totals.drop_duplicates(subset=['description'], keep='last')

    return majority[['description', industry]].rename(columns={industry: f'majority_{industry}'})

# Build the description -> industry lookup at classification levels 3 and 4,
# join the two on description, and save the combined table to CSV.
description_industry_lvl_3 = baml_industry_aggregations(df, industry='industry_lvl_3_desc')
description_industry_lvl_4 = baml_industry_aggregations(df, industry='industry_lvl_4_desc')
description_industry_lvl_3_4 = description_industry_lvl_3.merge(description_industry_lvl_4, on='description', how='left')
description_industry_lvl_3_4.to_csv('/Users/joeybortfeld/Downloads/description_industry_lvl_3_4.csv', index=False)



In [183]:
def baml_oas_aggregations(data:pd.DataFrame, lower_duration:float=4.0, upper_duration:float=6.0):
    '''
    Median OAS per (description, ticker, rating) for bonds inside a duration window.

    Only rows whose 'mod_dur_to_worst' lies in [lower_duration, upper_duration]
    (both ends inclusive) are used. The input frame is not modified.

    Returns a DataFrame with columns ['description', 'ticker', 'rating', 'oas'].
    '''

    # boolean mask for the inclusive duration window
    in_window = data['mod_dur_to_worst'].between(lower_duration, upper_duration)
    bonds_in_window = data.loc[in_window]

    median_oas = bonds_in_window.groupby(by=['description', 'ticker', 'rating'])['oas'].median()
    return median_oas.reset_index()


# Median OAS per (description, ticker, rating) using the function's default duration window, saved to CSV
temp = baml_oas_aggregations(df)
temp.to_csv('/Users/joeybortfeld/Downloads/baml_oas_aggregations.csv', index=False)


In [None]:
# GENERATE DESCRIPTION-RATING-TYPE AGGREGATIONS

def baml_rating_aggregations(data:pd.DataFrame):
    '''
    Collapse bond-level rows into one row per issuer, keyed by (description, ticker),
    with one rating per subordination type plus a single representative rating.

    Steps:
      0. Count bonds and sum USD face value per
         (description, ticker, subordination_type, rating) and keep one row per combination.
      1. Attach, per issuer, a list of "(subordination_type, rating, bond_count, face_value)"
         summary strings and the number of distinct type/rating combinations.
      2. For every subordination type, take the minimum (worst) rating across the issuer's
         bonds of that type and record the notch gap between best and worst as the split.
      3. Pick the representative rating from the first available type in the order
         senior, senior_non_preferred, tier_1, tier_2, secured, subordinated, junior_subordinated.

    Requires columns 'description', 'ticker', 'subordination_type', 'rating', 'rating_num',
    'constant' and 'face_value_in_usd', and the module-level `num_to_rating_dict`.

    NOTE(review): 'bond_count' and 'total_face_value_in_usd' in the returned frame belong to
    the first (subordination_type, rating) combination of each issuer after sorting, not to
    the issuer as a whole - confirm that this is what callers expect.
    '''

    df = data.copy()

    # 0. generate aggregations to produce a dataset where rows are unique combinations of
    # ['description', 'ticker', 'subordination_type', 'rating']
    groupby = ['description', 'ticker', 'subordination_type', 'rating']
    df['bond_count'] = df.groupby(groupby)['constant'].transform('sum')
    df['total_face_value_in_usd'] = df.groupby(groupby)['face_value_in_usd'].transform('sum')

    df = df.drop_duplicates(subset=groupby, keep='first')
    df = df.sort_values(by=groupby)
    df = df.reset_index(drop=True)

    # generate a summary string (subordination_type, rating, bond_count, total_face_value_in_usd)
    # for each row. Columns are read by name: the earlier positional lookup had drifted after
    # 'ticker' was added and emitted ticker / bond_count in place of bond_count / face value.
    df['all_rating_info'] = [
        f'({sub_type}, {rating}, {int(bond_count)}, {int(face_value)})'
        for sub_type, rating, bond_count, face_value in zip(
            df['subordination_type'], df['rating'], df['bond_count'], df['total_face_value_in_usd'])
    ]
    # sorted() makes the per-issuer list order reproducible between runs
    rating_summary_tuple = df.groupby(by=['description', 'ticker'])['all_rating_info'].apply(lambda x: sorted(set(x)))
    rating_summary_tuple = rating_summary_tuple.reset_index(drop=False)

    df = df.drop(['all_rating_info'], axis=1)
    df = df.merge(rating_summary_tuple, on=['description', 'ticker'], how='left')

    # number of distinct (subordination_type, rating) combinations per issuer
    df['rating_type_count'] = df.groupby(by=['description', 'ticker'])['constant'].transform('sum')

    # generate a single rating for each subordination type
    for this_type in ['Senior', 'Secured', 'Subordinated', 'Senior Non Preferred', 'Tier 2', 'Alternative Tier 1', 'Tier 1', 'Junior subordinated', 'Preferred', 'Upper tier 2']:

        label = this_type.lower().replace(' ','_')

        dff = df[df['subordination_type'] == this_type].copy()

        # flag if split rating (notch distance between best and worst rating of this type)
        by_issuer = dff.groupby(by=['description', 'ticker'])['rating_num']
        dff['max_rating_num'] = by_issuer.transform('max')
        dff['min_rating_num'] = by_issuer.transform('min')
        dff[f'{label}_split_rating'] = dff['max_rating_num'] - dff['min_rating_num']

        # generate a single rating for each subordination type as the minimum rating
        dff[f'{label}_rating_num'] = dff['min_rating_num']

        dff[f'{label}_rating'] = dff[f'{label}_rating_num'].map(num_to_rating_dict)
        dff = dff[['description', 'ticker', f'{label}_rating', f'{label}_split_rating']]

        # The values are constant within an issuer, so keep one row per issuer; otherwise every
        # split-rated type multiplies the rows of `df` on the merge below.
        dff = dff.drop_duplicates(subset=['description', 'ticker'], keep='first')

        df = df.merge(dff, on=['description', 'ticker'], how='left')

        df[f'{label}_rating'] = df[f'{label}_rating'].fillna('NA')

    # generate a single representative rating for each description
    df['single_rating'] = ''
    df['single_rating_type'] = ''
    df['single_split_rating'] = np.nan  # np.NaN alias no longer exists in NumPy 2.0
    for this_type in ['senior', 'senior_non_preferred', 'tier_1', 'tier_2', 'secured', 'subordinated', 'junior_subordinated',]:
        # only fill issuers that have no representative rating yet and do have this type
        mask1 = df['single_rating'] == ''
        mask2 = df[f'{this_type}_rating'] != 'NA'

        df.loc[mask1 & mask2, 'single_rating'] = df.loc[mask1 & mask2, f'{this_type}_rating']
        df.loc[mask1 & mask2, 'single_rating_type'] = this_type
        df.loc[mask1 & mask2, 'single_split_rating'] = df.loc[mask1 & mask2, f'{this_type}_split_rating']

    df = df.drop_duplicates(subset=['description', 'ticker'], keep='first')
    return df[['description', 'ticker', 
               'bond_count', 'total_face_value_in_usd', 'all_rating_info', 'rating_type_count',
               'single_rating', 'single_rating_type', 'single_split_rating',
               'senior_rating', 'senior_split_rating', 
               'secured_rating', 'secured_split_rating', 
               'subordinated_rating', 'subordinated_split_rating',
               'senior_non_preferred_rating', 'senior_non_preferred_split_rating',
                'tier_2_rating', 'tier_2_split_rating', 
                'alternative_tier_1_rating', 'alternative_tier_1_split_rating',
                'tier_1_rating', 'tier_1_split_rating',
                'junior_subordinated_rating', 'junior_subordinated_split_rating',
                'preferred_rating', 'preferred_split_rating',
                'upper_tier_2_rating', 'upper_tier_2_split_rating']]


# Largest non-financial issuers per broad rating bucket:
# issuer-level ratings -> add broad bucket -> attach majority industries -> drop financials
# -> keep the 70 biggest descriptions (by face value) in each bucket -> save to CSV.
issuer_columns = ['description', 'ticker', 'single_rating', 'single_rating_bucket', 'total_face_value_in_usd']
financial_industries = ['Financial Services', 'Banking', 'Insurance']

temp = (
    baml_rating_aggregations(df)
    .assign(single_rating_bucket=lambda frame: frame['single_rating'].map(rating_to_broad_rating_dict))
    .loc[:, issuer_columns]
    .merge(description_industry_lvl_3_4, on='description', how='left')
)

# rows with a missing industry are kept, exactly as with the chained != comparisons
temp = temp[~temp['majority_industry_lvl_3_desc'].isin(financial_industries)]

temp = (
    temp
    .sort_values(by=['single_rating_bucket', 'total_face_value_in_usd'], ascending=[True, False])
    .drop_duplicates(subset=['description'], keep='first')
    .groupby(by='single_rating_bucket')
    .head(70)
)
temp.to_csv('/Users/joeybortfeld/Downloads/largest_issuers_by_rating_bucke3.csv', index=False)



In [None]:
# Log of the median OAS per broad rating bucket for bonds with modified duration to worst in [5, 9].
# NOTE(review): the original first line filtered on rating_num >= 5 but was immediately
# overwritten by `temp = df`, so that filter never applied. The dead assignment is removed
# here; re-add the filter explicitly if excluding ratings below CCC1 was intended.
temp = df[df['mod_dur_to_worst'].between(5.0, 9.0)]
np.log(temp.groupby('broad_rating_num')['oas'].median()).plot()

In [None]:
# Issuer-level comparison of OAS and 5-year cumulative PD.
# NOTE(review): this rebinds `df`, which earlier cells use for the bond-level ICE data.
df = pd.read_csv('/Users/joeybortfeld/Downloads/largest_issuers_by_rating_bucket_w_fsym_id.csv', skiprows=2, encoding='latin1')

# build rating fields
df['single_rating_num'] = df['single_rating'].map(rating_to_num_dict)
df['single_rating_bucket'] = df['single_rating'].map(rating_to_broad_rating_dict)
df['single_rating_bucket_num'] = df['single_rating_bucket'].map(broad_rating_to_num_dict)

# filter for issuers with fsym_id
df = df[df['fsym_id'].notnull()]
df = df[df['fsym_id'] != 'NA']

# attach the PD data by fsym_id
temp = pd.read_csv('/Users/joeybortfeld/Downloads/most_recent_pd_by_fsym.csv')
df = df.merge(temp, on='fsym_id', how='left')

# work in log space for both measures
# (a bare column-selection expression that used to sit here displayed nothing mid-cell and was removed)
df['cumulative_pd_5'] = np.log(df['cumulative_pd_5'])
df['oas_4_6'] = np.log(df['oas_4_6'])

# charts - each figure is shown; previously the first was overwritten before being displayed
fig_oas_by_bucket = px.strip(df, x='single_rating_bucket_num', y='oas_4_6', color="single_rating_bucket").update_traces(jitter = 1)
fig_oas_by_bucket.show()

fig = px.strip(df, x='oas_4_6', y='cumulative_pd_5',).update_traces(jitter = 1)
fig.show()

print('correlation between oas and pd:')
print(df[['cumulative_pd_5', 'oas_4_6']].corr())

# df['cumulative_pd_5'] *= (1-.40)
# df['cumulative_pd_5'] *= 10_000

# df['oas_4_6'] = df['oas_4_6'].apply(lambda x: np.log(x))
# df['cumulative_pd_5'] = df['cumulative_pd_5'].apply(lambda x: np.log(x))

# fig, ax = plt.subplots(1,1, figsize=(10,5))
# df.groupby(by='single_rating_bucket_num')['cumulative_pd_5'].median().plot(label='PD', kind='bar', ax=ax, title='Median 5Y PD by Rating Bucket')
# df.groupby(by='single_rating_bucket_num')['cumulative_pd_5'].count().plot(label='PD', kind='bar', ax=ax, secondary_y=True, title='Counts by Rating Bucket', color='red')
# df.groupby(by='single_rating_bucket_num')['oas_4_6'].median().plot(label='oas', ax=ax, secondary_y=True, color='red')
# ax[0].legend()
# ax[1].legend()




In [None]:
# Bar chart of the median cumulative_pd_5 and oas_4_6 per rating bucket
# (uses whatever values these columns currently hold in `df`)
df.groupby(by='single_rating_bucket_num')[['cumulative_pd_5', 'oas_4_6']].median().plot(kind='bar')

In [None]:
# These two imports repeat the ones in the notebook's first cell.
from Py_Files import factset_api
from Py_Files import credentials

# Single-id call to the company profile download helper
factset_api.download_company_profile(id_list=['AAPL-US',], authorization=credentials.factset_api_authorization)

In [38]:
import json 
import requests

def get_stock_prices(id_list:list=['MH33D6-R'], 
                     field_list:list=["price", "volume", "tradeCount"], 
                     start_date:str='2006-01-03', 
                     end_date:str='2024-12-31', 
                     frequency:str='D',
                     split:str='UNSPLIT',
                     verbose:bool=False,
                     authorization=None,
                     verify_ssl:bool=False):

    '''
    Get stock prices for a list of ids from the FactSet Global Prices `prices` endpoint.

    Split is either SPLIT, SPLIT_SPINOFF or UNSPLIT. For the purpose of constructing historical market capitalization use
    UNSPLIT to be on a like-for-like basis with shares outstanding as reported in the financial statements. 

    Parameters
    ----------
    id_list : list
        Security identifiers sent as "ids".
    field_list : list
        Fields sent as "fields".
    start_date, end_date : str
        Sent as "startDate" / "endDate".
    frequency : str
        Sent as "frequency".
    split : str
        Sent as "adjust".
    verbose : bool
        Print the request, the HTTP status and the raw response text.
    authorization :
        Passed straight through to `requests.post(auth=...)`.
    verify_ssl : bool
        Passed to `requests.post(verify=...)`. Defaults to False to keep the existing behaviour.
        NOTE(review): with False the TLS certificate is not checked while credentials are
        being sent; pass True unless the local network setup makes that impossible.

    Returns
    -------
    list
        [status_code, DataFrame built from the response's 'data' field] on HTTP 200,
        [status_code, None] otherwise.
    '''

    prices_endpoint = 'https://api.factset.com/content/factset-global-prices/v1/prices'

    prices_request = {
        "ids": id_list,
        "fields": field_list,
        "startDate": start_date,
        "endDate": end_date,
        "frequency": frequency,
        "adjust": split,
    }

    headers = {'Accept': 'application/json','Content-Type': 'application/json'}

    #create a post request
    prices_post = json.dumps(prices_request)

    if verbose:
        print('post request:')
        print(prices_endpoint)
        print(prices_post)
        print()

    prices_response = requests.post(url=prices_endpoint, data=prices_post, auth=authorization, headers=headers, verify=verify_ssl)

    if verbose:
        print('HTTP Status: {}'.format(prices_response.status_code))
        print(prices_response.text)

    if prices_response.status_code != 200:
        if verbose:
            print('error: failed to get stock prices')
        return [prices_response.status_code,None]

    prices_data = json.loads(prices_response.text)
    prices_df = pd.DataFrame(prices_data['data'])
    return [prices_response.status_code, prices_df]

def get_stock_returns(id_list:list=['MH33D6-R'], 
                     start_date:str='2006-01-03', 
                     end_date:str='2024-12-31', 
                     frequency:str='D',
                     verbose:bool=False,
                     authorization=None,
                     verify_ssl:bool=False):

    '''
    Get stock returns for a list of ids from the FactSet Global Prices `returns` endpoint.

    Parameters
    ----------
    id_list : list
        Security identifiers sent as "ids".
    start_date, end_date : str
        Sent as "startDate" / "endDate".
    frequency : str
        Sent as "frequency". (Previously this argument was ignored and "D" was always sent.)
    verbose : bool
        Print the request, the HTTP status and the raw response text.
    authorization :
        Passed straight through to `requests.post(auth=...)`.
    verify_ssl : bool
        Passed to `requests.post(verify=...)`. Defaults to False to keep the existing behaviour.
        NOTE(review): with False the TLS certificate is not checked while credentials are
        being sent; pass True unless the local network setup makes that impossible.

    Returns
    -------
    list
        [status_code, DataFrame built from the response's 'data' field] on HTTP 200,
        [status_code, None] otherwise.
    '''

    returns_endpoint = 'https://api.factset.com/content/factset-global-prices/v1/returns'

    returns_request = {
        "ids": id_list,
        "startDate": start_date,
        "endDate": end_date,
        "frequency": frequency,
        # dividend adjustment is fixed to EXDATE for every request
        "dividendAdjust": "EXDATE"
    }

    headers = {'Accept': 'application/json','Content-Type': 'application/json'}

    #create a post request
    returns_post = json.dumps(returns_request)

    if verbose:
        print('post request:')
        print(returns_endpoint)
        print(returns_post)
        print()

    returns_response = requests.post(url=returns_endpoint, data=returns_post, auth=authorization, headers=headers, verify=verify_ssl)

    if verbose:
        print('HTTP Status: {}'.format(returns_response.status_code))
        print(returns_response.text)

    if returns_response.status_code != 200:
        if verbose:
            print('error: failed to get stock returns')
        return [returns_response.status_code,None]

    returns_data = json.loads(returns_response.text)
    returns_df = pd.DataFrame(returns_data['data'])
    return [returns_response.status_code, returns_df]



def get_shares_outanding(id_list:list=['MH33D6-R'], 
                     start_date:str='2006-01-03', 
                     end_date:str='2024-12-31', 
                     frequency:str='D',
                     verbose:bool=False,
                     authorization=None,
                     verify_ssl:bool=False):

    '''
    Get shares outstanding for a list of ids from the FactSet Global Prices
    `security-shares` endpoint.

    Parameters
    ----------
    id_list : list
        Security identifiers sent as "ids".
    start_date, end_date : str
        Sent as "startDate" / "endDate".
    frequency : str
        Sent as "frequency".
    verbose : bool
        Print the request, the HTTP status and the raw response text.
    authorization :
        Passed straight through to `requests.post(auth=...)`.
    verify_ssl : bool
        Passed to `requests.post(verify=...)`. Defaults to False to keep the existing behaviour.
        NOTE(review): with False the TLS certificate is not checked while credentials are
        being sent; pass True unless the local network setup makes that impossible.

    Returns
    -------
    list
        [status_code, DataFrame built from the response's 'data' field] on HTTP 200,
        [status_code, None] otherwise.
    '''

    shares_endpoint = 'https://api.factset.com/content/factset-global-prices/v1/security-shares'

    # unlike the prices / returns requests, this payload is nested under a top-level 'data' key
    shares_request = {
        'data': {
            "ids": id_list,
            "startDate": start_date,
            "endDate": end_date,
            "frequency": frequency,
            "calendar": 'FIVEDAY',
            "batch": "N"
        }
    }

    headers = {'Accept': 'application/json','Content-Type': 'application/json'}

    #create a post request
    shares_post = json.dumps(shares_request)

    if verbose:
        print('post request:')
        print(shares_endpoint)
        print(shares_post)
        print()

    shares_response = requests.post(url=shares_endpoint, 
                                    data=shares_post, 
                                    auth=authorization, 
                                    headers=headers, 
                                    verify=verify_ssl)

    if verbose:
        print('HTTP Status: {}'.format(shares_response.status_code))
        print(shares_response.text)

    if shares_response.status_code != 200:
        if verbose:
            print('error: failed to get shares outstanding')
        return [shares_response.status_code,None]

    shares_data = json.loads(shares_response.text)
    shares_df = pd.DataFrame(shares_data['data'])
    return [shares_response.status_code, shares_df]


In [None]:
def bulk_get_stock_returns(id_list:list=['MH33D6-R'], 
                     start_date:str='2006-01-03', 
                     end_date:str='2024-12-31', 
                     frequency:str='D',
                     verbose:bool=False,
                     authorization=None):
    '''
    Download returns for `id_list` in a single `get_stock_returns` request.

    The previous body ignored every argument (it always requested 'DBNXVB-R' with the
    notebook credentials) and discarded the result. The arguments are now forwarded and
    the [status_code, DataFrame-or-None] pair from `get_stock_returns` is returned.

    When `authorization` is None the notebook-level `credentials.factset_api_authorization`
    is used, which is what the previous body always did.

    NOTE(review): no per-id batching or retry is implemented - confirm what "bulk" should cover.
    '''

    if authorization is None:
        authorization = credentials.factset_api_authorization

    return get_stock_returns(id_list=id_list, 
                        start_date=start_date, 
                        end_date=end_date, 
                        frequency=frequency,
                        verbose=verbose,
                        authorization=authorization)

# displays whatever `df` currently holds in the kernel; the function above is not called in this cell
df


In [None]:
# Download daily prices for one id under two adjustment settings and plot them together.
# If either request returns a non-200 status the corresponding frame is None and the merge below fails.
status, df_split = get_stock_prices(id_list=['DBNXVB-R'], 
                     field_list=["price", "volume", "tradeCount"], 
                     start_date='2006-01-03', 
                     end_date='2024-12-31', 
                     frequency='D',
                     split='SPLIT',
                     verbose=False,
                     authorization=credentials.factset_api_authorization)

# NOTE(review): this request uses split='SPLIT_SPINOFF', yet the result is named
# `df_unsplit` / 'price_unsplit' - confirm whether 'UNSPLIT' was intended or the labels are stale.
status, df_unsplit = get_stock_prices(id_list=['DBNXVB-R'], 
                     field_list=["price", "volume", "tradeCount"], 
                     start_date='2006-01-03', 
                     end_date='2024-12-31', 
                     frequency='D',
                     split='SPLIT_SPINOFF',
                     verbose=False,
                     authorization=credentials.factset_api_authorization)

# align the two price series on date (dates come from the first request)
df = df_split[['date', 'price']].merge(df_unsplit[['date', 'price']], on='date', how='left')
df.columns = ['date', 'price_split', 'price_unsplit']
df.set_index('date')[['price_split', 'price_unsplit']].plot()


df.head(10)

In [None]:
# Single-id shares outstanding request with verbose logging (prints the request and raw response)
status, temp = get_shares_outanding(id_list=['DBNXVB-R'], 
                     start_date='2006-01-01', 
                     end_date='2024-12-31', 
                     frequency='D',
                     verbose=True,
                     authorization=credentials.factset_api_authorization)
print(status)
# `temp` is None when the status is not 200, in which case the next line raises AttributeError
print(temp.columns)
temp

In [None]:
def get_market_cap(id_list:list=['MH33D6-R'], 
                     start_date:str='2006-01-03', 
                     end_date:str='2024-12-31', 
                     frequency:str='M',
                     verbose:bool=False,
                     authorization=None):
    '''
    Build a market capitalization series as price x shares outstanding.

    Prices are requested with split='SPLIT' via `get_stock_prices` and shares via
    `get_shares_outanding`; the two are joined on 'date' (price dates are kept) and
    'market_cap' = price * totalOutstanding * 1,000,000.

    Returns
    -------
    pd.DataFrame
        Columns ['date', 'price', 'totalOutstanding', 'market_cap'].

    Raises
    ------
    RuntimeError
        If either download returns a non-200 status. Previously this surfaced as a
        TypeError from subscripting None.

    NOTE(review): the join key is 'date' only, so passing more than one id would mix
    securities - presumably intended for a single id; confirm.
    NOTE(review): the `get_stock_prices` docstring recommends UNSPLIT prices for market
    cap, while SPLIT is requested here - confirm which matches the shares series.
    '''

    status1, df1 = get_stock_prices(id_list=id_list, 
                     field_list=["price", "volume", "tradeCount"], 
                     start_date=start_date, 
                     end_date=end_date, 
                     frequency=frequency,
                     split='SPLIT',
                     verbose=verbose,
                     authorization=authorization)
    if df1 is None:
        raise RuntimeError(f'failed to download prices for {id_list} (HTTP status {status1})')

    status2, df2 = get_shares_outanding(id_list=id_list, 
                     start_date=start_date, 
                     end_date=end_date, 
                     frequency=frequency,
                     verbose=verbose,
                     authorization=authorization)
    if df2 is None:
        raise RuntimeError(f'failed to download shares outstanding for {id_list} (HTTP status {status2})')

    df = df1[['date', 'price']].merge(df2[['date', 'totalOutstanding']], on='date', how='left')
    df['market_cap'] = df['price'] * df['totalOutstanding']*1_000_000  # shares are in millions

    return df

# Monthly market cap series for a single id; rebinds the notebook-level `df`
df = get_market_cap(id_list=['DBNXVB-R'], 
                    start_date='2006-01-03', 
                    end_date='2024-12-31', 
                    frequency='M',
                    verbose=False,
                    authorization=credentials.factset_api_authorization)


In [None]:
# Plot the market cap series, then preview the latest values expressed in billions.
df.set_index('date')['market_cap'].plot()
# Scale a copy instead of dividing `df` in place, so re-running this cell
# does not shrink the stored values by another factor of 1e9 each time.
df.assign(market_cap=df['market_cap'] / 1_000_000_000).tail(10)

In [None]:
# Load the universe of ids, largest max_assets_in_usd first
factset_universe = pd.read_csv('/Users/joeybortfeld/Downloads/qml_universe_ids.csv')
factset_universe = factset_universe.sort_values(by='max_assets_in_usd', ascending=False)

# presumably groups the ids into named size buckets (a later cell reads the '$1B' key) - confirm in factset_api
universe_dict = factset_api.load_universe_dict(factset_universe)

In [None]:
# Batch-download daily prices for the '$1B' universe, then write a per-id download status report.
response = factset_api.batch_get_stock_data(metric='prices', 
                                #   fsym_list=['DBNXVB-R', 'MH33D6-R', 'P8R3C2-R', 'NNKD2Y-R', 'K4GK55-R'], 
                                fsym_list=universe_dict['$1B'], 
                                  start_date='2006-01-03', 
                                  end_date='2024-12-31', 
                                  frequency='D',
                                  verbose=True,
                                  authorization=credentials.factset_api_authorization,
                                  skip_if_done=True,
                                  output_folder='/Users/joeybortfeld/Documents/QML Solutions Data/factset_data/factset_equity/prices/')


# universe metadata for the report
# NOTE(review): the 1_000 threshold presumably matches the '$1B' bucket with assets in USD millions - confirm
temp = factset_universe[['fsym_id', 'name1', 'name2', 'factset_sector', 'factset_industry', 'max_assets_in_usd']].copy()
temp = temp[temp['max_assets_in_usd'] > 1_000]

# fails
# assumes `response` is the list of fsym ids whose download failed - TODO confirm against factset_api
temp1 = pd.DataFrame(response, columns=['fsym_id'])
temp1['status'] = 'fail'

# successes
# every file in the output folder counts as a success; the id is the file name up to the first '.'
successes = [i.split('.')[0] for i in os.listdir('/Users/joeybortfeld/Documents/QML Solutions Data/factset_data/factset_equity/prices/')]
temp2 = pd.DataFrame(successes, columns=['fsym_id'])
temp2['status'] = 'success'

temp3 = pd.concat([temp1, temp2], axis=0)

# ids that appear in neither list end up with a missing status
temp = temp.merge(temp3, on='fsym_id', how='left')

# get max pds
temp4 = pd.read_csv('/Users/joeybortfeld/Downloads/max_pd_by_fsym.csv')
temp = temp.merge(temp4, on='fsym_id', how='left')
temp.to_csv('/Users/joeybortfeld/Downloads/download_status.csv', index=False)

In [None]:
# The two inputs of the batch request below (`prices_files`, `prices_starts_dict`) used to be
# built by code that had been commented out, so this cell only worked while those names
# happened to be alive in the kernel. The code is restored so the cell runs on a fresh kernel.
# NOTE: step 0 reads every price file once and can take a while.
prices_folder = '/Users/joeybortfeld/Documents/QML Solutions Data/factset_data/factset_equity/prices/'

# only .csv files are considered, so hidden files in the folder do not produce empty ids
prices_files = [i.split('.')[0] for i in os.listdir(prices_folder) if i.endswith('.csv')]

# 0. get start price date per each fsym (first date with a non-null, non-zero price)
prices_starts_dict = {}
for f in tqdm.tqdm(prices_files):
    prices = pd.read_csv(f'{prices_folder}{f}.csv')
    prices = prices[prices['price'].notnull()]
    prices = prices[prices['price'] != 0]
    prices_starts_dict[f] = prices['date'].min()

print(len(prices_starts_dict))

# 1. get shares data
response = factset_api.batch_get_shares_outanding(fsym_list=prices_files, 
                                  end_date='2024-12-31', 
                                  start_date_dict=prices_starts_dict,
                                  frequency='M',
                                  verbose=True,
                                  authorization=credentials.factset_api_authorization,
                                  skip_if_done=True,
                                  output_folder='/Users/joeybortfeld/Documents/QML Solutions Data/factset_data/factset_equity/shares/')


In [None]:
# display the value returned by the most recently executed batch request
response

In [None]:
# Re-download prices for every id that already has a file in the 'prices UNSPLIT' folder,
# writing the results to the 'prices SPLIT' folder.
price_fsyms = [i.split('.')[0] for i in os.listdir('/Users/joeybortfeld/Documents/QML Solutions Data/factset_data/factset_equity/prices UNSPLIT/')]



# NOTE(review): no split/adjust argument is passed here, so the adjustment applied
# depends on the default inside factset_api.batch_get_stock_data - confirm it is SPLIT.
response = factset_api.batch_get_stock_data(metric='prices', 
                                fsym_list=price_fsyms, 
                                  start_date='2006-01-03', 
                                  end_date='2024-12-31', 
                                  frequency='D',
                                  verbose=True,
                                  authorization=credentials.factset_api_authorization,
                                  skip_if_done=True,
                                  output_folder='/Users/joeybortfeld/Documents/QML Solutions Data/factset_data/factset_equity/prices SPLIT/')


In [None]:
# spot-check the stored start date for one id (requires `prices_starts_dict` to exist in the kernel)
prices_starts_dict['PHS9MZ-R']

In [None]:
# identify files in returns that are not in prices and vice versa
# Count the ids that have a returns file but no prices file, and the other way round.
# An id is the file name up to its first '.'.
returns_files = []
for file_name in os.listdir('/Users/joeybortfeld/Documents/QML Solutions Data/factset_data/factset_equity/returns/'):
    returns_files.append(file_name.split('.')[0])

prices_files = []
for file_name in os.listdir('/Users/joeybortfeld/Documents/QML Solutions Data/factset_data/factset_equity/prices/'):
    prices_files.append(file_name.split('.')[0])

only_in_returns = set(returns_files).difference(prices_files)
only_in_prices = set(prices_files).difference(returns_files)

print(len(only_in_returns))
print(len(only_in_prices))


In [None]:
# Single-id shares request through the factset_api module version of the helper;
# `response` receives the first returned element and `temp` the second
response, temp = factset_api.get_shares_outanding(id_list=['K5JZYK-R'], 
                     start_date='2006-03-31', 
                     end_date='2024-12-31', 
                     frequency='D',
                     verbose=False,
                     authorization=credentials.factset_api_authorization)

print(response)
# show only the rows where totalOutstanding is populated
temp[temp['totalOutstanding'].notnull()]


In [None]:
# earliest non-null publicationDate in the current `temp` frame
temp[temp['publicationDate'].notnull()]['publicationDate'].min()

In [None]:
# Single-id SPLIT price request for 2023-2024; rebinds the notebook-level `df`
status, df = get_stock_prices(id_list=['CGF31Z-R'], 
                     field_list=["price", "volume", "tradeCount"], 
                     start_date='2023-01-03', 
                     end_date='2024-12-31', 
                     frequency='D',
                     split='SPLIT',
                     verbose=False,
                     authorization=credentials.factset_api_authorization)

print(status)
# `df` is None when the status is not 200, in which case the next line raises AttributeError
print(df.columns)
df


In [None]:
# Single-id returns request over the full history; rebinds the notebook-level `df`
status, df = get_stock_returns(id_list=['CGF31Z-R'], 
                    
                     start_date='2006-01-03', 
                     end_date='2024-12-31', 
                     frequency='D',
                     verbose=False,
                     authorization=credentials.factset_api_authorization)
print(status)
df


In [None]:
# Per-id coverage summary (first date, last date, observation count) for the downloaded
# returns and prices files, joined on fsym_id with the returns ids as the base.
returns_folder = '/Users/joeybortfeld/Documents/QML Solutions Data/factset_data/factset_equity/returns/'
prices_folder = '/Users/joeybortfeld/Documents/QML Solutions Data/factset_data/factset_equity/prices/'


def _list_file_ids(folder):
    '''Return the file-name stems (text before the first '.') in `folder`, skipping empty stems.

    Hidden files such as '.DS_Store' produce an empty stem. The earlier
    `returns_files.remove("")` raised ValueError when no such file existed, removed only
    one occurrence, and was never applied to the prices list.
    '''
    stems = [name.split('.')[0] for name in os.listdir(folder)]
    return [stem for stem in stems if stem != '']


def _summarize_coverage(folder, file_ids, value_col, drop_null=False):
    '''Return [id, min date, max date, non-null count of `value_col`] for each id's CSV.

    Rows where `value_col` equals 0 are always excluded; rows where it is null are excluded
    from the date range only when `drop_null` is True (the count never includes nulls).
    '''
    rows = []
    for f in tqdm.tqdm(file_ids):
        data = pd.read_csv(f'{folder}{f}.csv')
        data = data[data[value_col] != 0]
        if drop_null:
            data = data[data[value_col].notnull()]
        rows.append([f, data['date'].min(), data['date'].max(), data[value_col].count()])
    return rows


returns_files = _list_file_ids(returns_folder)
prices_files = _list_file_ids(prices_folder)

# collect returns data (null returns are kept when computing the date range, as before)
df1 = pd.DataFrame(_summarize_coverage(returns_folder, returns_files, 'totalReturn'),
                   columns=['fsym_id', 'min_date_returns', 'max_date_returns', 'count_returns'])

# collect prices data (null prices are dropped before computing the date range, as before)
df2 = pd.DataFrame(_summarize_coverage(prices_folder, prices_files, 'price', drop_null=True),
                   columns=['fsym_id', 'min_date_prices', 'max_date_prices', 'count_prices'])

df = df1.merge(df2, on='fsym_id', how='left')
df
