In [None]:
%load_ext autoreload
%autoreload 2

import requests
import json
import pandas as pd
import numpy as np
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
import time
import datetime
import re
import tqdm
import os
import boto3
from Py_Files import credentials
from Py_Files import factset_api
from Py_Files import factset_fields
from Py_Files import aws_s3

# Root of the local data tree. It ends with a trailing slash, and the download
# cells build their output folders by appending a relative sub-path to it.
data_dir = '/Users/joeybortfeld/Documents/QML Solutions Data/'
# S3 URI prefix for research data.
# NOTE(review): no other cell visible in this notebook references s3_dir — confirm it is still needed.
s3_dir = 's3://qml-research-data/'

# 0. Load the FactSet Universe (All Fsym IDs) into a Dictionary

In [None]:
# Read the universe id file from the local data directory.
# os.path.join is used because data_dir already ends with '/', so concatenating
# it with a path that starts with '/' produced a doubled slash.
factset_universe = pd.read_csv(
    os.path.join(data_dir, 'universe_and_traits', 'qml_universe_ids.csv')
)

# Dictionary of fsym lists keyed by universe name; the download cells below
# index it with keys such as 'us_nonfin_1m'.
universe_dict = factset_api.load_universe_dict(factset_universe)


# 1. Download Assets in USD using the Factset Fundamentals API

In [None]:
# Request the FF_ASSETS field in USD for every fsym in the 'us_nonfin_1m' list.
# Only the quarterly periodicity is active; the other two stay commented out.
# The call's return value is kept in error_list.
error_list = factset_api.batch_fundamental_download(
    fsym_list=universe_dict['us_nonfin_1m'],
    field_list=['FF_ASSETS'],
    currency='USD',
    periodicity_list=[
        'quarterly',
        # 'annual',
        # 'semi_annual'
    ],
    start_date='1990-01-01',
    end_date='2024-12-31',
    skip_if_done=True,
    output_folder=data_dir + 'factset_data/factset_assets_in_usd/',
    factset_api_authorization=credentials.factset_api_authorization,
)
    

# 2. Download All Metrics in Local Currency using the Factset Fundamentals API

In [None]:
# The field names listed in factset_fields are upper-cased before being requested.
fund_fields_to_download = [
    field_name.upper() for field_name in factset_fields.fundamental_fields
]

# Request every field in local currency, for annual and quarterly periodicities.
# NOTE(review): skip_if_done is False here while the other download cells pass
# True — confirm the difference is deliberate.
error_list = factset_api.batch_fundamental_download(
    fsym_list=universe_dict['us_nonfin_1m'],
    field_list=fund_fields_to_download,
    currency='LOCAL',
    periodicity_list=[
        'annual',
        'quarterly',
        # 'semi_annual'
    ],
    start_date='1990-01-01',
    end_date='2024-12-31',
    skip_if_done=False,
    output_folder=data_dir + 'factset_data/factset_fundamentals/',
    factset_api_authorization=credentials.factset_api_authorization,
)
    

# 3. Download Equity Price, Shares and Returns using the Factset Global Prices API

In [None]:
# The prices and returns requests differ only in the metric name and the
# destination folder, so both run through one loop (prices first, then returns).
# After the loop, `response` holds the result of the returns request.
for _metric, _folder in [
    ('prices', '/Users/joeybortfeld/Documents/QML Solutions Data/factset_data/factset_equity/prices SPLIT/'),
    ('returns', '/Users/joeybortfeld/Documents/QML Solutions Data/factset_data/factset_equity/returns/'),
]:
    response = factset_api.batch_get_stock_data(
        metric=_metric,
        fsym_list=universe_dict['us_nonfin_1m'],
        start_date='2006-01-03',
        end_date='2024-12-31',
        frequency='D',
        verbose=True,
        authorization=credentials.factset_api_authorization,
        skip_if_done=True,
        output_folder=_folder,
    )



In [None]:
# Folder holding one price CSV per fsym (written by the prices download cell).
prices_dir = '/Users/joeybortfeld/Documents/QML Solutions Data/factset_data/factset_equity/prices SPLIT/'

# fsyms that have a downloaded price file (file name without the extension).
# Non-CSV directory entries are skipped so they cannot produce bogus ids.
prices_files = [i.split('.')[0] for i in os.listdir(prices_dir) if i.endswith('.csv')]

# Sets make the membership tests below O(1) instead of scanning a list per id.
prices_set = set(prices_files)
big_list = universe_dict['us_nonfin_1b']
big_set = set(big_list)

# ids in the 'us_nonfin_1m' list that are not in 'us_nonfin_1b' and have price data
small_list = universe_dict['us_nonfin_1m']
small_list = [i for i in small_list if i not in big_set]
small_list = [i for i in small_list if i in prices_set]

# 0. get start price date per each fsym
# (the shares outstanding API is finicky: if you specify a start date prior to available data it will fail.
# therefore we need to estimate the first available date for each fsym by using the first price date for each fsym.
# using that date, repeatedly try subsequent month end dates until the API returns data. )

# build a dictionary of start dates for each fsym: the earliest date with a
# non-null, non-zero price
prices_starts_dict = {}
for f in tqdm.tqdm(prices_files):
    df = pd.read_csv(f'{prices_dir}{f}.csv')
    df = df[df['price'].notnull()]
    df = df[df['price'] != 0]
    start_date = df['date'].min()
    prices_starts_dict[f] = start_date

print(len(prices_starts_dict))

# NOTE(review): small_list is computed above, but the request below targets a
# single hard-coded id — presumably a test leftover; confirm before a full run.
response = factset_api.batch_get_shares_outanding(
    fsym_list=['DG71WP-R'],
    end_date='2024-12-31',
    start_date_dict=prices_starts_dict,
    frequency='M',
    verbose=True,
    authorization=credentials.factset_api_authorization,
    skip_if_done=True,
    output_folder='/Users/joeybortfeld/Documents/QML Solutions Data/factset_data/factset_equity/shares/',
)



In [None]:
# Single-id request against the shares-outstanding helper, used to inspect what it returns.
temp = factset_api.get_shares_outanding(
    id_list=['DG71WP-R'],
    start_date='2013-01-03',
    end_date='2024-12-31',
    frequency='M',
    verbose=True,
    authorization=credentials.factset_api_authorization,
)

# The result is indexed positionally: element 0 is printed, and element 1
# is an object exposing .columns, shown as the cell output.
print(temp[0])
temp[1].columns

# 4. Review Downloaded Data on Local Storage and Upload to S3

In [None]:
# Print the number of files held in each local folder, then copy every file to S3.
# NOTE(review): every upload passes the same s3_key ('factset-api-fundamentals/')
# whatever the source folder — confirm how aws_s3.copy_file_to_s3 derives the object key.
# NOTE(review): the list has '.../factset_equity/prices/' while the download cell writes
# to '.../prices SPLIT/', and it has no 'factset_assets_in_usd/quarterly/' entry — confirm.

folder_list = [
    '/Users/joeybortfeld/Documents/QML Solutions Data/factset_data/factset_fundamentals/annual/',
    '/Users/joeybortfeld/Documents/QML Solutions Data/factset_data/factset_fundamentals/quarterly/',
    '/Users/joeybortfeld/Documents/QML Solutions Data/factset_data/factset_fundamentals/semi_annual/',

    '/Users/joeybortfeld/Documents/QML Solutions Data/factset_data/factset_assets_in_usd/annual/',
    '/Users/joeybortfeld/Documents/QML Solutions Data/factset_data/factset_assets_in_usd/semi_annual/',

    '/Users/joeybortfeld/Documents/QML Solutions Data/factset_data/factset_equity/shares/',
    '/Users/joeybortfeld/Documents/QML Solutions Data/factset_data/factset_equity/prices/',
    '/Users/joeybortfeld/Documents/QML Solutions Data/factset_data/factset_equity/returns/',
]

for this_folder in folder_list:
    file_list = os.listdir(this_folder)

    # folder path followed by its file count
    print(f'{this_folder} {len(file_list)}')

    for this_file in tqdm.tqdm(file_list):
        aws_s3.copy_file_to_s3(
            local_file_path=os.path.join(this_folder, this_file),
            s3_bucket='qml-solutions-new-york',
            s3_key='factset-api-fundamentals/',
            aws_access_key_id=credentials.aws_access_key_id,
            aws_secret_access_key=credentials.aws_secret_access_key,
            verbose=True,
        )
    


In [None]:
# Copy every file found in the listed local folders to S3, printing each
# folder's file count first.
# NOTE(review): s3_key is the placeholder 'XXXXXXXXXXXXXXX' — set the real
# destination key before running this cell.

folder_list = [
    '/Users/joeybortfeld/Documents/CreditGradients Data/Factset Data/factset_fundamentals/annual/',
    '/Users/joeybortfeld/Documents/CreditGradients Data/Factset Data/factset_fundamentals/quarterly/',
    '/Users/joeybortfeld/Documents/CreditGradients Data/Factset Data/factset_fundamentals/semi_annual/',

    '/Users/joeybortfeld/Documents/CreditGradients Data/Factset Data/factset_assets_in_usd/annual/',
    '/Users/joeybortfeld/Documents/CreditGradients Data/Factset Data/factset_assets_in_usd/semi_annual/',
]

for this_folder in folder_list:
    file_list = os.listdir(this_folder)

    # folder path followed by its file count
    print(f'{this_folder} {len(file_list)}')

    for this_file in tqdm.tqdm(file_list):
        aws_s3.copy_file_to_s3(
            local_file_path=os.path.join(this_folder, this_file),
            s3_bucket='qml-solutions-new-york',
            s3_key='XXXXXXXXXXXXXXX',
            aws_access_key_id=credentials.aws_access_key_id,
            aws_secret_access_key=credentials.aws_secret_access_key,
            verbose=True,
        )
        


# DONE

# Housekeeping

# Copy Files from Local Directory to AWS S3

In [None]:
# MULTITHREAD BULK UPLOAD FROM LOCAL TO S3

from concurrent.futures import ThreadPoolExecutor
from botocore.exceptions import BotoCoreError, ClientError


def upload_file_to_s3(local_file_path, bucket_name, s3_key, s3_client):
    """
    Upload a single local file to S3 and report success as a boolean.

    Parameters:
    - local_file_path: str, path of the file to upload
    - bucket_name: str, destination bucket
    - s3_key: str, destination object key
    - s3_client: object exposing upload_file(path, bucket, key), e.g. a boto3 S3 client

    Returns:
    - True when upload_file returns normally; False when it raises one of the
      handled boto errors (the error is printed).
    """
    try:
        s3_client.upload_file(local_file_path, bucket_name, s3_key)
    # boto3's managed upload re-raises client errors as S3UploadFailedError,
    # which derives from neither BotoCoreError nor ClientError, so it has to be
    # listed explicitly for a failed upload to be reported as False.
    except (BotoCoreError, ClientError, boto3.exceptions.S3UploadFailedError) as e:
        print(f"Error uploading {local_file_path} to {bucket_name}/{s3_key}: {e}")
        return False
    return True

def bulk_upload_to_s3(local_dir, local_folder, bucket_name, aws_access_key_id, aws_secret_access_key, num_threads=8):
    """
    Upload every entry of ``local_dir/local_folder`` to an S3 bucket using a thread pool.

    Each file is stored under the key ``<target_folder>/<file name>``, where
    target_folder is local_folder with '_' replaced by '-'. Uploads that fail in
    the pooled pass are retried once, sequentially.

    Parameters:
    - local_dir: str, parent directory on the local disk
    - local_folder: str, sub-folder of local_dir whose entries are uploaded
    - bucket_name: str, destination S3 bucket
    - aws_access_key_id / aws_secret_access_key: credentials passed to boto3.client
    - num_threads: int, maximum number of worker threads

    Returns:
    - (True, []) when every upload succeeded (possibly after the retry),
      otherwise (False, list of local paths that still failed).
    """
    # Initialize S3 client
    s3_client = boto3.client(
        's3',
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key
    )

    # Collect the local file names to transfer (e.g. 'MH33D6-R.csv', 'XQCWLZ-R.csv')
    target_folder = local_folder.replace('_', '-')
    local_file_list = os.listdir(local_dir + '/' + local_folder + '/')

    # (local path, destination key) pairs
    from_to_list = [
        (f'{local_dir}/{local_folder}/{f}', f'{target_folder}/{f}')
        for f in local_file_list
    ]

    # diagnostics
    print(f'transfer files from {local_dir}/{local_folder}')
    print(f'transfer to s3 {bucket_name}/{target_folder}')
    print('files to transfer:', len(from_to_list))

    # Use ThreadPoolExecutor for parallel uploads
    error_list = []
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Map each future back to its (local path, key) pair. The futures used
        # to live in a plain list that was then indexed with a Future object,
        # which raised TypeError as soon as any upload failed.
        future_to_pair = {
            executor.submit(upload_file_to_s3, local_path, bucket_name, s3_key, s3_client): (local_path, s3_key)
            for local_path, s3_key in from_to_list
        }

        # Track progress with tqdm (futures are visited in submission order)
        for future, pair in tqdm.tqdm(future_to_pair.items(), desc="Uploading files"):
            try:
                uploaded = future.result()
            except Exception as e:
                print(f"Unexpected error: {e}")
                uploaded = False
            if not uploaded:
                # remember the failed upload for the retry pass
                error_list.append(pair)

    # collect and retry errors
    print(f"Failed uploads: {len(error_list)}")
    final_error_list = []
    if error_list:
        print("Retrying failed uploads...")
        for local_path, s3_key in error_list:
            try:
                success = upload_file_to_s3(local_path, bucket_name, s3_key, s3_client)
            except Exception as e:
                # an unexpected error during the retry is a final failure, not a crash
                print(f"Unexpected error: {e}")
                success = False
            if not success:
                print(f"Final failure for {local_path}")
                final_error_list.append(local_path)

    print("Upload process complete.")
    if len(final_error_list) == 0:
        return True, []
    else:
        return False, final_error_list

# Example usage

    
# Local parent directory and destination bucket for the bulk transfer.
local_dir = '/Users/joeybortfeld/Documents/CreditGradients Data/Factset Data'
bucket_name = 'qml-solutions-new-york'

# Upload each listed sub-folder in turn; the commented-out entries are skipped.
for local_folder in [

    # 'factset_api_fundamentals_annual',
    # 'factset_api_fundamentals_quarterly',
    'factset_api_fundamentals_semi_annual',

    'factset_api_fundamentals_annual_assets_in_usd',
    'factset_api_fundamentals_semi_annual_assets_in_usd',

    'factset_api_fundamentals_shares_outstanding_annual',
    'factset_api_fundamentals_shares_outstanding_quarterly',
    'factset_api_fundamentals_shares_outstanding_semi_annual',

    'factset_api_stock_prices_split',
    'factset_api_stock_returns',
    ]:

    # The keys come from the imported credentials module, as in the other
    # upload cells; `aws_credentials` was never defined in this notebook, so
    # the cell raised NameError on a fresh kernel.
    success, error_list = bulk_upload_to_s3(
        local_dir=local_dir,
        local_folder=local_folder,
        bucket_name=bucket_name,
        aws_access_key_id=credentials.aws_access_key_id,
        aws_secret_access_key=credentials.aws_secret_access_key,
        num_threads=8  # Adjust number of threads based on your system's capabilities
    )

    print(success, error_list)
    print()



In [None]:
def s3_check_file_exists(bucket_name:str='qml-solutions-new-york', 
                         file_key:str='/factset-api-global-prices/B01DPB-R.csv', 
                         aws_access_key_id:str=None, 
                         aws_secret_access_key:str=None):
    """
    Report whether a HEAD request for an S3 object succeeds.

    Parameters:
    - bucket_name: str, bucket to query
    - file_key: str, object key to look up
    - aws_access_key_id / aws_secret_access_key: credentials passed to boto3.client

    Returns:
    - True when head_object returns normally, False when it raises the
      client's ClientError (whatever the underlying error code).
    """
    client = boto3.client(
        's3',
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )

    try:
        client.head_object(Bucket=bucket_name, Key=file_key)
    except client.exceptions.ClientError:
        found = False
    else:
        found = True
    return found


# Check one uploaded price file. The keys come from the imported credentials
# module; `aws_credentials` was never defined in this notebook (NameError on a
# fresh kernel).
s3_check_file_exists(bucket_name='qml-solutions-new-york', 
                         file_key='factset-api-stock-prices-split/MH33D6-R.csv', 
                         aws_access_key_id=credentials.aws_access_key_id, 
                         aws_secret_access_key=credentials.aws_secret_access_key)

In [None]:

# Read one price file straight from S3. storage_options is built from the
# imported credentials module using the 'key' / 'secret' entries the original
# code indexed; `aws_credentials` was never defined in this notebook
# (NameError on a fresh kernel).
temp = pd.read_csv('s3://qml-solutions-new-york/factset-api-global-prices/B01DPB-R.csv',
                   storage_options={'key': credentials.aws_access_key_id,
                                    'secret': credentials.aws_secret_access_key})
temp

In [None]:
import boto3

# Transfer file to AWS
# NOTE(review): local_folder is assigned here but not read by the rest of this cell — confirm it is still needed.
local_folder = '/Users/joeybortfeld/Documents/CreditGradients Data/factset_api_stock_prices/'
# Bucket queried by the existence check further down in this cell.
s3_bucket = 'qml-solutions-new-york'



def s3_check_file_exists(bucket_name:str='qml-solutions-new-york', 
                         file_key:str='/factset-api-global-prices/B01DPB-R.csv', 
                         aws_access_key_id:str=None, 
                         aws_secret_access_key:str=None):
    """
    Return True when a HEAD request for the given S3 object succeeds, else False.

    NOTE(review): this redefines the function of the same name from an earlier
    cell of this notebook; the later definition shadows the earlier one.

    Parameters:
    - bucket_name: str, bucket to query
    - file_key: str, object key to look up
    - aws_access_key_id / aws_secret_access_key: credentials passed to boto3.client
    """
    s3_client = boto3.client(
        's3',
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )

    # Any ClientError raised by the HEAD request is reported as "does not exist".
    try:
        s3_client.head_object(Bucket=bucket_name, Key=file_key)
    except s3_client.exceptions.ClientError:
        return False
    else:
        return True
    
# CHECK IF FILE EXISTS IN S3
# The keys come from the imported credentials module; `aws_credentials` was
# never defined in this notebook (NameError on a fresh kernel).
res = s3_check_file_exists(bucket_name=s3_bucket, 
                     file_key='factset-api-global-prices/B01DPB-R.csv', 
                     aws_access_key_id=credentials.aws_access_key_id, 
                     aws_secret_access_key=credentials.aws_secret_access_key)
res

In [None]:
import boto3

def list_s3_bucket_contents(bucket_name, prefix='', aws_access_key_id=None, aws_secret_access_key=None):
    """
    Collect the keys of all objects in an S3 bucket whose key starts with a prefix.

    Parameters:
    - bucket_name: str, name of the S3 bucket
    - prefix: str, key prefix (folder path) to restrict the listing to (optional)
    - aws_access_key_id / aws_secret_access_key: credentials passed to boto3.client

    Returns:
    - List of object keys, gathered across every page of the listing
    """
    client = boto3.client(
        's3',
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )
    pages = client.get_paginator('list_objects_v2').paginate(Bucket=bucket_name, Prefix=prefix)

    # A page without a 'Contents' entry contributes no keys.
    return [
        entry['Key']
        for page in pages
        for entry in page.get('Contents', [])
    ]



# Usage example:
# List the keys under one prefix, then print how many there are and the first 15.
bucket_name = 'qml-solutions-new-york'
folder_path = 'factset-api-fundamentals-annual/'  # Optional
# NOTE(review): aws_credentials2 is not defined anywhere in this notebook, so this
# line raises NameError on a fresh kernel — confirm which credentials were intended.
file_list = list_s3_bucket_contents(bucket_name, folder_path, aws_access_key_id=aws_credentials2['key'], aws_secret_access_key=aws_credentials2['secret'])
print(len(file_list))
print(file_list[:15])


In [None]:
# Single-id request for the FF_BUS_DESC_ABBREV field; every other argument is commented out.
# NOTE(review): `utilities` is not imported anywhere in this notebook, so this cell
# raises NameError on a fresh kernel — confirm which module provides download_fundamentals.
utilities.download_fundamentals(id_list=['MH33D6-R'],
                                    field_list=['FF_BUS_DESC_ABBREV'],
                                    # periodicity=download_type_dict[download_type][0],
                                    # start_date=start_date,
                                    # end_date=end_date,
                                    # currency='USD',
                                    # verbose=False,
                                    # authorization=authorization
                                    )

