# Download & Store Data

## Imports & Settings

In [133]:
import warnings
warnings.filterwarnings('ignore')

In [134]:
from pathlib import Path
import requests
#from io import BytesIO
#from zipfile import ZipFile, BadZipFile

import numpy as np
import pandas as pd
import pandas_datareader.data as web
#from sklearn.datasets import fetch_openml

pd.set_option('display.expand_frame_repr', False)

import os
from dotenv import load_dotenv
import pandas as pd
#import matplotlib.pyplot as plt
#import seaborn as sns
import numpy as np
import requests
from random import sample

In [135]:
# load API keys from .env file
# load_dotenv() is called with no arguments, so it uses python-dotenv's default .env lookup.
# os.getenv returns None when a variable is missing, so an unset key is not
# detected here; it only surfaces later when the token is sent to the API.
load_dotenv()
TRADIER_TOKEN = os.getenv('TRADIER_TOKEN')
EOD_TOKEN = os.getenv('EOD_TOKEN')

### Set Data Store Path

In [136]:
# set data store path variable
# modify path to store the data elsewhere
DATA_STORE = Path('data_store_38.h5')

## Get Tickers

### Function

In [137]:
def get_tickers(timeout=30):
    """Return ticker codes from the EOD Historical Data US symbol list.

    Downloads the US exchange symbol list and keeps the rows whose ``Type``
    is 'Common Stock' and whose ``Exchange`` is 'NYSE ARCA' or 'NASDAQ'.

    NOTE(review): the previous comment on this function said it pulls *ETF*
    tickers, but the filter selects 'Common Stock' rows. The filter is left
    as it was; confirm which of the two is intended.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    list
        The ``Code`` value of every matching row, in the order returned by
        the API.

    Raises
    ------
    requests.HTTPError
        If the API answers with a non-success status (e.g. a bad token).
    """
    with requests.get(
        'https://eodhistoricaldata.com/api/exchange-symbol-list/US',
        params={'api_token': EOD_TOKEN, 'fmt': 'json'},
        timeout=timeout,
    ) as r:
        # Fail here with a clear HTTP error instead of a confusing JSON or
        # attribute error further down.
        r.raise_for_status()
        data = r.json()

    df = pd.DataFrame(data)
    wanted = (df.Type == 'Common Stock') & df.Exchange.isin(['NYSE ARCA', 'NASDAQ'])

    return df.loc[wanted, 'Code'].tolist()

In [138]:
# # get etf tickers
# tickers = get_tickers()

In [139]:
# # save ticker list to csv
# df = pd.DataFrame(tickers)
# df.to_csv('tickers.csv')

### CSV

In [140]:
# pull in s&p 500 tickers
# The file is read without a header row (header=None); only its first column
# is kept and exposed as a plain Python list.
stock_tickers = list(
    pd.read_csv(
        'ticker_list.csv',
        header=None,
        usecols=[0],
        names=['symbols'],
    )['symbols']
)

In [141]:
# Read the ETF symbols from the first column of the CSV. With header=0 the
# file's first row is treated as a header line and replaced by 'symbols'.
etf_tickers = list(
    pd.read_csv(
        'etf_ticker_list.csv',
        header=0,
        usecols=[0],
        names=['symbols'],
    )['symbols']
)

## Get Price Data

In [142]:
# from tqdm.notebook import tqdm
from progressbar import ProgressBar

def get_historical_price(tickers, data_type, timeout=30):
    """Download per-ticker historical records and stack them into one frame.

    For every ticker, requests
    ``https://eodhistoricaldata.com/api/<data_type>/<ticker>.US`` as JSON,
    builds a DataFrame from the returned records and indexes it by ``date``.

    Parameters
    ----------
    tickers : sequence of str
        Symbols without the ``.US`` suffix. Pass a sized collection (e.g. a
        list) so the progress bar can show real progress.
    data_type : str
        API path segment to query (the notebook uses ``'eod'``).
    timeout : float, optional
        Seconds to wait for the server on each request.

    Returns
    -------
    pandas.DataFrame
        All per-ticker frames concatenated row-wise under a two-level index
        of (ticker, date).

    Raises
    ------
    requests.HTTPError
        If any request returns a non-success status.
    ValueError
        If ``tickers`` is empty (``pd.concat`` has nothing to concatenate).
    """
    frames = {}
    pbar = ProgressBar()

    # Iterate the tickers directly: wrapping them in enumerate() hid their
    # length from the progress bar, and the index was never used.
    for ticker in pbar(tickers):
        with requests.get(
            'https://eodhistoricaldata.com/api/{}/{}.US'.format(data_type, ticker),
            params={'api_token': EOD_TOKEN, 'fmt': 'json'},
            timeout=timeout,
        ) as r:
            # Surface HTTP failures (bad token, unknown symbol, rate limit)
            # here rather than as an opaque error when parsing the payload.
            r.raise_for_status()
            data = r.json()

        frames[ticker] = pd.DataFrame.from_records(data).set_index('date')

    # Passing the mapping uses its keys as the outer index level.
    return pd.concat(frames, axis=0)

### Stocks

In [143]:
# get daily stock prices df
df = get_historical_price(stock_tickers, 'eod')

/ |#                                                  | 0 Elapsed Time: 0:00:00
- |        #                                          | 1 Elapsed Time: 0:00:00
\ |                #                                  | 2 Elapsed Time: 0:00:01
| |                      #                            | 3 Elapsed Time: 0:00:02
/ |                           #                       | 4 Elapsed Time: 0:00:02
- |                                 #                 | 5 Elapsed Time: 0:00:03
\ |                                       #           | 6 Elapsed Time: 0:00:03
| |                                               #   | 7 Elapsed Time: 0:00:04
/ |                                                 # | 8 Elapsed Time: 0:00:05
- |                                           #       | 9 Elapsed Time: 0:00:05
\ |                                  #               | 10 Elapsed Time: 0:00:06
| |                         #                        | 11 Elapsed Time: 0:00:07
/ |                  #                  

In [144]:
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3998903 entries, ('AAPL', '1980-12-12') to ('NWS', '2023-03-08')
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   open            float64
 1   high            float64
 2   low             float64
 3   close           float64
 4   adjusted_close  float64
 5   volume          int64  
dtypes: float64(5), int64(1)
memory usage: 198.9+ MB


In [145]:
# Persist the stock price frame in the HDF5 store under 'stocks/prices/daily'.
with pd.HDFStore(DATA_STORE) as store:
    store.put('stocks/prices/daily', df)

### ETFs

In [146]:
# get daily etf prices df
df = get_historical_price(etf_tickers, 'eod')

/ |#                                                  | 0 Elapsed Time: 0:00:00
- |      #                                            | 1 Elapsed Time: 0:00:00
\ |            #                                      | 2 Elapsed Time: 0:00:01
| |                  #                                | 3 Elapsed Time: 0:00:01
/ |                       #                           | 4 Elapsed Time: 0:00:02
- |                             #                     | 5 Elapsed Time: 0:00:02
\ |                                   #               | 6 Elapsed Time: 0:00:03
| |                                         #         | 7 Elapsed Time: 0:00:04
/ |                                               #   | 8 Elapsed Time: 0:00:04
- |                                                #  | 9 Elapsed Time: 0:00:05
\ |                                         #        | 10 Elapsed Time: 0:00:05
| |                                   #              | 11 Elapsed Time: 0:00:06
/ |                              #      

In [147]:
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 473881 entries, ('SPY', '1993-01-29') to ('SPXS', '2023-03-08')
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   open            473881 non-null  float64
 1   high            473881 non-null  float64
 2   low             473881 non-null  float64
 3   close           473881 non-null  float64
 4   adjusted_close  473881 non-null  float64
 5   volume          473881 non-null  int64  
dtypes: float64(5), int64(1)
memory usage: 23.4+ MB


In [148]:
# Persist the ETF price frame in the HDF5 store under 'etfs/prices/daily'.
with pd.HDFStore(DATA_STORE) as store:
    store.put('etfs/prices/daily', df)

## S&P 500 Price Data

In [149]:
# Load the S&P 500 daily history from spx_d.csv (presumably a Stooq export,
# going by the variable name), keep 1950 through 2019 and lower-case the
# column names.
sp500_stooq = (
    pd.read_csv('spx_d.csv', index_col=0, parse_dates=True)
    .loc['1950':'2019']
    .rename(columns=str.lower)
)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
sp500_stooq.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 17700 entries, 1950-01-03 to 2019-12-31
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   open    17700 non-null  float64
 1   high    17700 non-null  float64
 2   low     17700 non-null  float64
 3   close   17700 non-null  float64
 4   volume  17700 non-null  float64
dtypes: float64(5)
memory usage: 829.7 KB
None


In [150]:
# Persist the S&P 500 index history in the HDF5 store under 'sp500/stooq'.
with pd.HDFStore(DATA_STORE) as store:
    store.put('sp500/stooq', sp500_stooq)

## S&P 500 Constituents

In [151]:
# Scrape the S&P 500 constituents from Wikipedia: read_html returns every
# table found on the page and [0] keeps the first one.
# NOTE(review): this assumes the constituents table is the first table on the
# page; verify if the page layout changes.
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
df = pd.read_html(url, header=0)[0]

# Columns are renamed by position, so the table must have exactly these eight
# columns in this order; a different column count makes the assignment fail.
df.columns = ['ticker', 'name', 'gics_sector', 'gics_sub_industry',
              'location', 'first_added', 'cik', 'founded']

In [152]:
# Persist the constituents table in the HDF5 store under 'sp500/stocks',
# then print a summary of its columns and dtypes.
with pd.HDFStore(DATA_STORE) as store:
    store.put('sp500/stocks', df)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503 entries, 0 to 502
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   ticker             503 non-null    object
 1   name               503 non-null    object
 2   gics_sector        503 non-null    object
 3   gics_sub_industry  503 non-null    object
 4   location           503 non-null    object
 5   first_added        499 non-null    object
 6   cik                503 non-null    int64 
 7   founded            503 non-null    object
dtypes: int64(1), object(7)
memory usage: 31.6+ KB


## Fundamental Data

### Stocks

In [153]:
from progressbar import ProgressBar

def get_stock_fundmentals(tickers, timeout=30):
    """Download stock fundamentals and flatten them into one Series.

    For every ticker, requests
    ``https://eodhistoricaldata.com/api/fundamentals/<ticker>.US`` as JSON
    and keeps the 'General', 'Highlights', 'Valuation', 'SharesStats',
    'Technicals', 'SplitsDividends' and 'AnalystRatings' sections.

    The API token is read from the module-level ``EOD_TOKEN`` (loaded from
    the environment) instead of being embedded in the source, so the secret
    is not committed with the notebook.

    Parameters
    ----------
    tickers : sequence of str
        Symbols without the ``.US`` suffix.
    timeout : float, optional
        Seconds to wait for the server on each request.

    Returns
    -------
    pandas.Series
        Values indexed by (ticker, section, field).

    Raises
    ------
    requests.HTTPError
        If any request returns a non-success status.
    KeyError
        If a payload lacks one of the expected sections.
    ValueError
        If ``tickers`` is empty (``pd.concat`` has nothing to concatenate).
    """
    # Sections of the payload that are kept, in output order.
    sections = ['General', 'Highlights', 'Valuation', 'SharesStats',
                'Technicals', 'SplitsDividends', 'AnalystRatings']

    # Entries removed from their section before it becomes a Series; the
    # previous implementation popped and discarded the same entries.
    dropped_fields = {
        'General': ('Officers', 'Listings', 'AddressData'),
        'SplitsDividends': ('NumberDividendsByYear',),
    }

    multi_ticker_dict = {}
    pbar = ProgressBar()

    # Iterate the tickers directly so the progress bar can see their length.
    for ticker in pbar(tickers):
        with requests.get(
            'https://eodhistoricaldata.com/api/fundamentals/' + ticker + '.US',
            params={'api_token': EOD_TOKEN, 'fmt': 'json'},
            timeout=timeout,
        ) as r:
            r.raise_for_status()
            data = r.json()

        for section, fields in dropped_fields.items():
            for field in fields:
                data[section].pop(field, None)

        # Only the flattened Series is kept; the raw payload is not retained
        # once it has been processed.
        multi_ticker_dict[ticker] = pd.concat(
            {section: pd.Series(data[section]) for section in sections}
        )

    return pd.concat(multi_ticker_dict)

In [154]:
# get stock fundamentals (a Series indexed by ticker, section and field)
df = get_stock_fundmentals(stock_tickers)

/ |#                                                  | 0 Elapsed Time: 0:00:00
- |    #                                              | 1 Elapsed Time: 0:00:00
\ |        #                                          | 2 Elapsed Time: 0:00:00
| |             #                                     | 3 Elapsed Time: 0:00:01
/ |                 #                                 | 4 Elapsed Time: 0:00:01
- |                      #                            | 5 Elapsed Time: 0:00:02
\ |                           #                       | 6 Elapsed Time: 0:00:02
| |                               #                   | 7 Elapsed Time: 0:00:03
/ |                                   #               | 8 Elapsed Time: 0:00:03
- |                                        #          | 9 Elapsed Time: 0:00:04
\ |                                            #     | 10 Elapsed Time: 0:00:05
| |                                   #              | 11 Elapsed Time: 0:00:06
/ |                               #     

In [155]:
df

AAPL  General         Code                    AAPL
                      Type            Common Stock
                      Name               Apple Inc
                      Exchange              NASDAQ
                      CurrencyCode             USD
                                          ...     
NWS   AnalystRatings  StrongBuy                3.0
                      Buy                      2.0
                      Hold                     2.0
                      Sell                     0.0
                      StrongSell               0.0
Length: 48791, dtype: object

In [156]:
# Persist the stock fundamentals in the HDF5 store.
# NOTE(review): the key is spelled 'base_fundementals'; any reader of this
# store must use the same spelling, so it is left unchanged here.
with pd.HDFStore(DATA_STORE) as store:
    store.put('stocks/base_fundementals', df)

### ETFs

In [157]:
from progressbar import ProgressBar

def get_etf_fundementals(tickers, timeout=30):
    """Download ETF fundamentals and flatten them into one Series.

    For every ticker, requests
    ``https://eodhistoricaldata.com/api/fundamentals/<ticker>.US`` as JSON
    and keeps:

    * the 'General' and 'Technicals' sections,
    * a fixed list of scalar fields from 'ETF_Data' (under 'ETF_Data'),
    * the 'Market_Capitalisation', 'MorningStar' and 'Performance' entries
      of 'ETF_Data', each as its own section.

    'Top_10_Holdings' is not included in the result.

    The API token is read from the module-level ``EOD_TOKEN`` (loaded from
    the environment) instead of being embedded in the source, so the secret
    is not committed with the notebook.

    Parameters
    ----------
    tickers : sequence of str
        Symbols without the ``.US`` suffix.
    timeout : float, optional
        Seconds to wait for the server on each request.

    Returns
    -------
    pandas.Series
        Values indexed by (ticker, section, field).

    Raises
    ------
    requests.HTTPError
        If any request returns a non-success status.
    KeyError
        If a payload lacks one of the expected sections or fields.
    ValueError
        If ``tickers`` is empty (``pd.concat`` has nothing to concatenate).
    """
    # Fields copied from the payload's 'ETF_Data' section, in output order.
    columns = ['ISIN', 'Company_Name', 'Company_URL', 'ETF_URL', 'Domicile',
        'Index_Name', 'Yield', 'Dividend_Paying_Frequency', 'Inception_Date',
        'Max_Annual_Mgmt_Charge', 'Ongoing_Charge', 'Date_Ongoing_Charge',
        'NetExpenseRatio', 'AnnualHoldingsTurnover', 'TotalAssets', 'Holdings_Count',
        'Average_Mkt_Cap_Mil']

    # Entries of 'ETF_Data' that are promoted to sections of their own.
    etf_subsections = ['Market_Capitalisation', 'MorningStar', 'Performance']

    multi_ticker_dict = {}
    pbar = ProgressBar()

    # Iterate the tickers directly so the progress bar can see their length.
    for ticker in pbar(tickers):
        with requests.get(
            'https://eodhistoricaldata.com/api/fundamentals/' + ticker + '.US',
            params={'api_token': EOD_TOKEN, 'fmt': 'json'},
            timeout=timeout,
        ) as r:
            r.raise_for_status()
            data = r.json()

        etf_data = data['ETF_Data']

        single_ticker_dict = {
            'General': pd.Series(data['General']),
            'Technicals': pd.Series(data['Technicals']),
            'ETF_Data': pd.Series([etf_data[name] for name in columns], index=columns),
        }
        for subsection in etf_subsections:
            single_ticker_dict[subsection] = pd.Series(etf_data[subsection])

        multi_ticker_dict[ticker] = pd.concat(single_ticker_dict)

    return pd.concat(multi_ticker_dict)

In [158]:
# get ETF fundamentals (a Series indexed by ticker, section and field)
df = get_etf_fundementals(etf_tickers)

/ |#                                                  | 0 Elapsed Time: 0:00:00
- |      #                                            | 1 Elapsed Time: 0:00:00
\ |         #                                         | 2 Elapsed Time: 0:00:00
| |             #                                     | 3 Elapsed Time: 0:00:01
/ |                #                                  | 4 Elapsed Time: 0:00:01
- |                    #                              | 5 Elapsed Time: 0:00:02
\ |                        #                          | 6 Elapsed Time: 0:00:02
| |                           #                       | 7 Elapsed Time: 0:00:02
/ |                               #                   | 8 Elapsed Time: 0:00:03
- |                                    #              | 9 Elapsed Time: 0:00:03
\ |                                        #         | 10 Elapsed Time: 0:00:04
| |                                             #    | 11 Elapsed Time: 0:00:04
/ |                                     

In [159]:
# Persist the ETF fundamentals in the HDF5 store.
# NOTE(review): the key is spelled 'base_fundementals'; any reader of this
# store must use the same spelling, so it is left unchanged here.
with pd.HDFStore(DATA_STORE) as store:
    store.put('etfs/base_fundementals', df)

## Bond Price Indexes

In [160]:
# FRED series id -> readable column label.
securities = {
    'BAMLCC0A0CMTRIV': 'US Corp Master TRI',
    'BAMLHYH0A0HYM2TRIV': 'US High Yield TRI',
    'BAMLEMCBPITRIV': 'Emerging Markets Corporate Plus TRI',
    # 'GOLDAMGBD228NLBM': 'Gold (London, USD)',
    'DGS10': '10-Year Treasury CMR',
}

# Download every series from FRED starting in 2000, relabel the columns, drop
# dates on which all series are missing, then resample to business-day
# frequency, averaging the values within each bin.
df = (
    web.DataReader(name=list(securities), data_source='fred', start=2000)
    .rename(columns=securities)
    .dropna(how='all')
    .resample('B')
    .mean()
)

# Persist the result in the HDF5 store under 'fred/assets'.
with pd.HDFStore(DATA_STORE) as store:
    store.put('fred/assets', df)