## Data Importation, Cleaning, and Transformation

Data Source: https://eodhistoricaldata.com/

In [1]:
# import dependencies
import numpy as np
import scipy as sp
import dask.dataframe as dd
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib as jl
import datetime as dt
import requests

from icecream import ic

import os
from dotenv import load_dotenv

from ta import add_all_ta_features
from ta.trend import MACD
from ta.volatility import BollingerBands
from ta.volume import VolumeWeightedAveragePrice
from ta.momentum import StochRSIIndicator

In [2]:
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()
# Presumably the eodhistoricaldata.com API token — TODO confirm the variable name used in .env.
TOKEN = os.getenv('TOKEN')

In [3]:
def get_etf_tickers():
    """Return the ticker codes of all ETFs listed on NYSE ARCA or NASDAQ.

    Downloads the US exchange symbol list and keeps the rows whose ``Type``
    is ``'ETF'`` and whose ``Exchange`` is ``'NYSE ARCA'`` or ``'NASDAQ'``.

    Returns
    -------
    list of str
        The ``Code`` value of every matching symbol, in API order.

    Raises
    ------
    RuntimeError
        If the ``TOKEN`` environment variable is not set.
    requests.HTTPError
        If the API answers with an error status.
    """
    # The token is read from the environment so it never lives in the notebook.
    api_token = os.getenv('TOKEN')
    if not api_token:
        raise RuntimeError("TOKEN environment variable is not set (expected in .env)")

    # The context manager closes the connection even if decoding the body fails.
    with requests.get(
        'https://eodhistoricaldata.com/api/exchange-symbol-list/US',
        params={'api_token': api_token, 'fmt': 'json'},
        timeout=30,
    ) as r:
        r.raise_for_status()
        data = r.json()

    df = pd.DataFrame(data)
    is_etf = df.Type == 'ETF'
    on_target_exchange = df.Exchange.isin(['NYSE ARCA', 'NASDAQ'])
    return df.loc[is_etf & on_target_exchange, 'Code'].tolist()

In [4]:
# get etf tickers
# tickers = get_etf_tickers()

In [5]:
# save ticker list to csv
#df = pd.DataFrame(tickers)
#df.to_csv('data/tickers.csv')

In [2]:
def _add_indicators(frame):
    """Append MACD, Bollinger Band, VWAP and Stochastic RSI columns to ``frame`` in place."""
    indicator_macd = MACD(close=frame['close'], window_slow=26, window_fast=12, window_sign=9, fillna=True)
    frame['macd'] = indicator_macd.macd()
    frame['macd_diff'] = indicator_macd.macd_diff()
    frame['macd_signal'] = indicator_macd.macd_signal()

    indicator_bb = BollingerBands(close=frame['close'], window=20, window_dev=2, fillna=True)
    frame['bb_mavg'] = indicator_bb.bollinger_mavg()
    frame['bb_hband'] = indicator_bb.bollinger_hband()
    frame['bb_lband'] = indicator_bb.bollinger_lband()
    frame['bb_hband_ind'] = indicator_bb.bollinger_hband_indicator()
    frame['bb_lband_ind'] = indicator_bb.bollinger_lband_indicator()

    indicator_vwap = VolumeWeightedAveragePrice(
        high=frame['high'],
        low=frame['low'],
        close=frame['close'],
        volume=frame['volume'],
        window=14, fillna=True)
    frame['vwap'] = indicator_vwap.volume_weighted_average_price()

    indicator_stochrsi = StochRSIIndicator(close=frame['close'], window=14, smooth1=3, smooth2=3, fillna=True)
    frame['stoch_rsi'] = indicator_stochrsi.stochrsi()
    frame['stochrsi_d'] = indicator_stochrsi.stochrsi_d()
    frame['stochrsi_k'] = indicator_stochrsi.stochrsi_k()
    return frame


def get_historical_price(tickers, data_type):
    """Download historical OHLC prices and volume per ticker and add indicators.

    Parameters
    ----------
    tickers : sequence of str
        Ticker codes without exchange suffix; ``'.US'`` is appended per request.
    data_type : str
        API path segment selecting the endpoint; the notebook uses ``'eod'``
        for daily and ``'intraday'`` for intraday prices.

    Returns
    -------
    dict
        Maps each ticker to a DataFrame built from *that ticker's* records,
        extended with the MACD, Bollinger Band, VWAP and Stochastic RSI columns.

    Raises
    ------
    RuntimeError
        If the ``TOKEN`` environment variable is not set.
    requests.HTTPError
        If the API answers with an error status.
    """
    # The token is read from the environment so it never lives in the notebook.
    api_token = os.getenv('TOKEN')
    if not api_token:
        raise RuntimeError("TOKEN environment variable is not set (expected in .env)")

    d = {}

    # TODO: add a progress bar
    for symbol in tickers:
        # The context manager closes the connection even if decoding the body fails.
        with requests.get(
            'https://eodhistoricaldata.com/api' + '/' + data_type + '/' + symbol + '.US',
            params={'api_token': api_token, 'fmt': 'json'},
            timeout=30,
        ) as r:
            r.raise_for_status()
            data = r.json()

        # Store the payload under the ticker it was requested for.  The previous
        # version re-assigned each payload to every ticker, so all entries ended
        # up holding the data of the last ticker in the list.
        d[symbol] = _add_indicators(pd.DataFrame.from_records(data))

    return d

In [3]:
# Placeholder tickers used while the pipeline is being developed.
tickers = ['SPXL', 'SPXS']

# Intraday prices: dict mapping each ticker to a DataFrame with indicator columns.
intraday_data = get_historical_price(tickers, 'intraday')

# Daily (end-of-day) prices, in the same dict-of-DataFrames layout.
daily_data = get_historical_price(tickers, 'eod')

In [10]:
def _combine_by_ticker(frames_by_ticker):
    """Place the per-ticker frames side by side, keyed by ticker on the top column level."""
    return pd.concat(
        list(frames_by_ticker.values()),
        axis=1,
        keys=list(frames_by_ticker.keys()),
    )

# NOTE(review): the frames keep their default positional index here, so rows
# are lined up by position rather than by date — confirm that is intended.

# Intraday: one wide pandas frame, then its dask counterpart.
intraday_df = _combine_by_ticker(intraday_data)
intraday_dask_df = dd.from_pandas(intraday_df, npartitions=6)

# Daily: same treatment.
daily_df = _combine_by_ticker(daily_data)
daily_dask_df = dd.from_pandas(daily_df, npartitions=6)

#daily_df.to_csv('data/test_daily_df.csv')

In [11]:
# Preview the first rows of the combined daily frame.
# NOTE(review): the saved output below is keyed by SPY/AAPL rather than the
# SPXL/SPXS placeholders set earlier — it looks stale; re-run to refresh.
daily_df.head()

Unnamed: 0_level_0,SPY,SPY,SPY,SPY,SPY,SPY,SPY,SPY,SPY,SPY,...,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL,AAPL
Unnamed: 0_level_1,date,open,high,low,close,adjusted_close,volume,macd,macd_diff,macd_signal,...,macd_signal,bb_mavg,bb_hband,bb_lband,bb_hband_ind,bb_lband_ind,vwap,stoch_rsi,stochrsi_d,stochrsi_k
0,1980-12-12,28.7392,28.8736,28.7392,28.7392,0.0999,469033600,0.0,0.0,0.0,...,0.0,28.7392,28.7392,28.7392,0.0,0.0,28.784,0.0,0.0,0.0
1,1980-12-15,27.3728,27.3728,27.2608,27.2608,0.0947,175884800,-0.117935,-0.094348,-0.023587,...,-0.023587,28.0,29.4784,26.5216,0.0,0.0,28.378768,0.0,0.0,0.0
2,1980-12-16,25.3792,25.3792,25.2448,25.2448,0.0877,105728000,-0.369811,-0.276979,-0.092832,...,-0.092832,27.0816,29.945999,24.217201,0.0,0.0,27.943661,0.0,0.0,0.0
3,1980-12-17,25.872,26.0064,25.872,25.872,0.0899,86441600,-0.512902,-0.336056,-0.176846,...,-0.176846,26.7792,29.471956,24.086444,0.0,0.0,27.734358,0.0,0.0,0.0
4,1980-12-18,26.6336,26.7456,26.6336,26.6336,0.0925,73449600,-0.558411,-0.305252,-0.253159,...,-0.253159,26.75008,29.161369,24.338791,0.0,0.0,27.648576,0.0,0.0,0.0


### Current Dev
- Next job: implement `get_fundementals` and organize its data output.
- Then map out the analysis flow on a whiteboard, using the paper as a reference.

In [76]:
def get_fundementals(tickers):
    """Download the raw fundamentals payload for each ticker.

    Parameters
    ----------
    tickers : sequence of str
        Ticker codes without exchange suffix; ``'.US'`` is appended per request.

    Returns
    -------
    dict
        Maps each ticker to the decoded JSON payload returned by the
        fundamentals endpoint.  The payload is returned as-is; no wrangling
        into DataFrames happens here yet.

    Raises
    ------
    RuntimeError
        If the ``TOKEN`` environment variable is not set.
    requests.HTTPError
        If the API answers with an error status.
    """
    # The token is read from the environment so it never lives in the notebook.
    api_token = os.getenv('TOKEN')
    if not api_token:
        raise RuntimeError("TOKEN environment variable is not set (expected in .env)")

    d = {}

    for symbol in tickers:
        # The context manager closes the connection even if decoding the body fails.
        with requests.get(
            'https://eodhistoricaldata.com/api/fundamentals/' + symbol + '.US',
            params={'api_token': api_token, 'fmt': 'json'},
            timeout=30,
        ) as r:
            r.raise_for_status()
            d[symbol] = r.json()

    return d

In [77]:
# Placeholder tickers for the fundamentals request.
tickers = ['QQQ', 'SPY']

# Raw fundamentals payloads: dict keyed by ticker.
fundemental_data = get_fundementals(tickers)

In [78]:
# Inspect the payloads returned by get_fundementals (this displays the whole nested dict).
fundemental_data

{'QQQ': {'General': {'Code': 'QQQ',
   'Type': 'ETF',
   'Name': 'Invesco QQQ Trust',
   'Exchange': 'NASDAQ',
   'CurrencyCode': 'USD',
   'CurrencyName': 'US Dollar',
   'CurrencySymbol': '$',
   'CountryName': 'USA',
   'CountryISO': 'US',
   'Description': 'To maintain the correspondence between the composition and weights of the securities in the trust (the securities) and the stocks in the NASDAQ-100 Index®, the adviser adjusts the securities from time to time to conform to periodic changes in the identity and/or relative weights of index securities. The composition and weighting of the securities portion of a portfolio deposit are also adjusted to conform to changes in the index.',
   'Category': 'Large Growth',
   'UpdatedAt': '2023-02-07'},
  'Technicals': {'Beta': 1.08,
   '52WeekHigh': 369.5817,
   '52WeekLow': 253.6524,
   '50DayMA': 281.4056,
   '200DayMA': 291.2847},
  'ETF_Data': {'ISIN': 'US46090E1038',
   'Company_Name': 'Invesco',
   'Company_URL': 'http://www.invesco

In [None]:
# Scalar ETF_Data fields to show for each ticker.
columns = ['ISIN', 'Company_Name', 'Company_URL', 'ETF_URL', 'Domicile',
       'Index_Name', 'Yield', 'Dividend_Paying_Frequency', 'Inception_Date',
       'Max_Annual_Mgmt_Charge', 'Ongoing_Charge', 'Date_Ongoing_Charge',
       'NetExpenseRatio', 'AnnualHoldingsTurnover', 'TotalAssets',
       'Average_Mkt_Cap_Mil', 'Market_Capitalisation']

# Print one Series per ticker, labelled by field name.  This reads from
# `fundemental_data` (the dict keyed by ticker) instead of `d` / `i`, which are
# locals of the fetch functions and are not defined at notebook level when the
# cells are run top to bottom.
for ticker in tickers:
    etf_data = fundemental_data[ticker]['ETF_Data']
    print(pd.Series({name: etf_data[name] for name in columns}, name=ticker))

In [37]:
def test(tickers): 

    d = {}

    for i in range(len(tickers)): 

        r = requests.get('https://eodhistoricaldata.com/api/fundamentals/' + tickers[i] + '.US', 
            params={'api_token': '63dc0e2f4efc43.34327983', 'fmt': 'json'}
            )
        data = r.json()
        
        r.close()
        return data

In [61]:
# Fetch one payload with the scratch helper; it returns after the first ticker (SPY).
# NOTE: this rebinds `fundemental_data` to a single raw payload rather than a dict keyed by ticker.
tickers = ['SPY', 'QQQ']
fundemental_data = test(tickers)

In [62]:
# List the field names available in the ETF_Data section of the payload.
#fundemental_data.keys()
fundemental_data['ETF_Data'].keys()

dict_keys(['ISIN', 'Company_Name', 'Company_URL', 'ETF_URL', 'Domicile', 'Index_Name', 'Yield', 'Dividend_Paying_Frequency', 'Inception_Date', 'Max_Annual_Mgmt_Charge', 'Ongoing_Charge', 'Date_Ongoing_Charge', 'NetExpenseRatio', 'AnnualHoldingsTurnover', 'TotalAssets', 'Average_Mkt_Cap_Mil', 'Market_Capitalisation', 'Asset_Allocation', 'World_Regions', 'Sector_Weights', 'Fixed_Income', 'Holdings_Count', 'Top_10_Holdings', 'Holdings', 'Valuations_Growth', 'MorningStar', 'Performance'])

In [40]:
# Scalar ETF_Data fields copied into the summary frame, in display order.
columns = ['ISIN', 'Company_Name', 'Company_URL', 'ETF_URL', 'Domicile',
       'Index_Name', 'Yield', 'Dividend_Paying_Frequency', 'Inception_Date',
       'Max_Annual_Mgmt_Charge', 'Ongoing_Charge', 'Date_Ongoing_Charge',
       'NetExpenseRatio', 'AnnualHoldingsTurnover', 'TotalAssets',
       'Average_Mkt_Cap_Mil', 'Market_Capitalisation']

etf_data = fundemental_data['ETF_Data']

# Holdings_Count is a scalar too; it follows the fields listed in `columns`.
scalar_fields = columns + ['Holdings_Count']

# One long, single-column frame: General, the scalar ETF fields, MorningStar,
# Performance and Technicals stacked on top of each other, indexed by field name.
# Building it from `columns` also brings in Market_Capitalisation, which the
# earlier hand-written version created but never added to the result, and it
# avoids the single-letter globals (d, e, i, r, ...) that shadowed other names.
fundemental_df = pd.concat([
    pd.Series(fundemental_data['General']),
    pd.Series({name: etf_data[name] for name in scalar_fields}),
    pd.Series(etf_data['MorningStar']),
    pd.Series(etf_data['Performance']),
    pd.Series(fundemental_data['Technicals']),
]).to_frame(name='data')

In [41]:
# Break the nested ETF_Data sections out into one frame each.
etf_sections = fundemental_data['ETF_Data']


def _frame_from_entries(entries):
    """Build a frame with one row per key of ``entries``, taking each value as the row."""
    return pd.DataFrame(entries.values(), index=entries.keys())


asset_allocation_df = pd.DataFrame(etf_sections['Asset_Allocation'])
region_weights_df = pd.DataFrame(etf_sections['World_Regions'])
sector_weights_df = pd.DataFrame(etf_sections['Sector_Weights'])
fixed_income_df = pd.DataFrame(etf_sections['Fixed_Income'])
top_10_holdings_df = _frame_from_entries(etf_sections['Top_10_Holdings'])
holdings_df = _frame_from_entries(etf_sections['Holdings'])
valuations_growth_df = pd.DataFrame(etf_sections['Valuations_Growth']).transpose()

# Region and sector weights side by side, labelled on the top column level.
ab = {
    'region_weights': region_weights_df,
    'sector_weights': sector_weights_df,
}
weights_df = pd.concat(ab.values(), axis=1, keys=ab.keys())

In [42]:
# Show the stacked single-column summary frame.
fundemental_df

Unnamed: 0,data
Code,SPY
Type,ETF
Name,SPDR S&P 500 ETF Trust
Exchange,NYSE ARCA
CurrencyCode,USD
CurrencyName,US Dollar
CurrencySymbol,$
CountryName,USA
CountryISO,US
Description,The Trust seeks to achieve its investment obje...


In [17]:
# Preview the frame built from the Asset_Allocation section.
asset_allocation_df.head()

Unnamed: 0,Cash,NotClassified,Stock non-US,Other,Stock US,Bond
Long_%,0.06721,0,1.07485,0,98.85794,0
Short_%,0.0,0,0.0,0,0.0,0
Net_Assets_%,0.06721,0,1.07485,0,98.85794,0


In [18]:
# Preview the frame built from the Fixed_Income section.
fixed_income_df.head()

Unnamed: 0,EffectiveDuration,ModifiedDuration,EffectiveMaturity,CreditQuality,Coupon,Price,YieldToMaturity
Fund_%,0.0,0,0.0,0,0.0,0,0.0
Relative_to_Category,1.24,0,0.39667,0,2.93891,0,1.76


In [19]:
# Preview the frame built from the Top_10_Holdings section (one row per holding key).
top_10_holdings_df.head()

Unnamed: 0,Code,Exchange,Name,Sector,Industry,Country,Region,Assets_%
AAPL.US,AAPL,US,Apple Inc,Technology,Consumer Electronics,United States,North America,6.66998
MSFT.US,MSFT,US,Microsoft Corporation,Technology,Software-Infrastructure,United States,North America,5.56
AMZN.US,AMZN,US,Amazon.com Inc,Consumer Cyclical,Internet Retail,United States,North America,2.64922
GOOGL.US,GOOGL,US,Alphabet Inc Class A,Communication Services,Internet Content & Information,United States,North America,1.80685
GOOG.US,GOOG,US,Alphabet Inc Class C,Communication Services,Internet Content & Information,United States,North America,1.60841


In [20]:
# Preview the frame built from the Holdings section (one row per holding key).
holdings_df.head()

Unnamed: 0,Code,Exchange,Name,Sector,Industry,Country,Region,Assets_%
AAPL.US,AAPL,US,Apple Inc,Technology,Consumer Electronics,United States,North America,6.66998
MSFT.US,MSFT,US,Microsoft Corporation,Technology,Software-Infrastructure,United States,North America,5.56
AMZN.US,AMZN,US,Amazon.com Inc,Consumer Cyclical,Internet Retail,United States,North America,2.64922
GOOGL.US,GOOGL,US,Alphabet Inc Class A,Communication Services,Internet Content & Information,United States,North America,1.80685
GOOG.US,GOOG,US,Alphabet Inc Class C,Communication Services,Internet Content & Information,United States,North America,1.60841


In [21]:
# Preview the transposed Valuations_Growth frame.
valuations_growth_df.head()

Unnamed: 0,Price/Prospective Earnings,Price/Book,Price/Sales,Price/Cash Flow,Dividend-Yield Factor,Long-Term Projected Earnings Growth,Historical Earnings Growth,Sales Growth,Cash-Flow Growth,Book-Value Growth
Valuations_Rates_Portfolio,18.47341,3.4481,2.10993,10.68378,1.7921,,,,,
Valuations_Rates_To_Category,17.41992,3.80028,2.17691,11.131,1.94524,,,,,
Growth_Rates_Portfolio,,,,,,11.2708,22.15673,11.41032,8.06469,4.58624
Growth_Rates_To_Category,,,,,,10.43816,30.68086,-5.13758,-28.83055,-2.58379


In [22]:
# Preview the combined region/sector weights frame.
weights_df.head()

Unnamed: 0_level_0,region_weights,region_weights,region_weights,region_weights,region_weights,region_weights,region_weights,region_weights,region_weights,region_weights,...,sector_weights,sector_weights,sector_weights,sector_weights,sector_weights,sector_weights,sector_weights,sector_weights,sector_weights,sector_weights
Unnamed: 0_level_1,North America,United Kingdom,Europe Developed,Europe Emerging,Africa/Middle East,Japan,Australasia,Asia Developed,Asia Emerging,Latin America,...,Consumer Cyclicals,Financial Services,Real Estate,Communication Services,Energy,Industrials,Technology,Consumer Defensive,Healthcare,Utilities
Equity_%,98.924,0.54202,0.493,0.0,0.0,0.0,0.0,0.04,0.0,0.0,...,10.41088,13.85424,2.85615,8.16384,4.758,8.7201,24.49575,6.99121,14.46348,2.83699
Relative_to_Category,97.429,0.92916,1.181,0.001,0.035,0.07,0.008,0.185,0.088,0.072,...,9.82157,14.27109,2.4105,7.09048,4.38005,10.4354,22.4632,7.42991,16.04951,2.79934
