## Data Importation, Cleaning, and Transformation

Data Source: https://eodhistoricaldata.com/

In [49]:
# import dependencies
import numpy as np
import scipy as sp
import dask.dataframe as dd
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib as jl
import datetime as dt
import requests

from icecream import ic

import os
from dotenv import load_dotenv

from ta import add_all_ta_features
from ta.trend import MACD
from ta.volatility import BollingerBands
from ta.volume import VolumeWeightedAveragePrice
from ta.momentum import StochRSIIndicator

In [50]:
# Load key/value pairs from a local .env file into the process environment,
# then read the data-provider API token from the TOKEN variable.
# os.getenv returns None when TOKEN is not set.
load_dotenv()
TOKEN = os.getenv('TOKEN')

In [51]:
def get_etf_tickers(api_token=None):
    """Return the ticker codes of all ETFs listed on NYSE ARCA or NASDAQ.

    Downloads the US symbol list from the EOD Historical Data
    ``exchange-symbol-list`` endpoint and keeps the rows whose ``Type`` is
    ``'ETF'`` and whose ``Exchange`` is ``'NYSE ARCA'`` or ``'NASDAQ'``.

    Args:
        api_token: API token to authenticate with. When omitted, the value
            of the ``TOKEN`` environment variable is used.

    Returns:
        A list of ticker codes, in the order the API returned them. Empty
        when the API returns no symbols.

    Raises:
        requests.HTTPError: If the API responds with an error status.
    """
    # Credentials must not live in the notebook source; read from the environment.
    token = api_token if api_token is not None else os.getenv('TOKEN')

    # The context manager releases the connection even if decoding fails.
    with requests.get(
        'https://eodhistoricaldata.com/api/exchange-symbol-list/US',
        params={'api_token': token, 'fmt': 'json'},
        timeout=30,
    ) as r:
        r.raise_for_status()
        data = r.json()

    df = pd.DataFrame(data)
    if df.empty:
        # No rows means no 'Type'/'Exchange' columns to filter on.
        return []

    is_etf = df['Type'] == 'ETF'
    on_exchange = df['Exchange'].isin(['NYSE ARCA', 'NASDAQ'])
    return df.loc[is_etf & on_exchange, 'Code'].tolist()

In [52]:
# get etf tickers
# tickers = get_etf_tickers()

In [53]:
# save ticker list to csv
#df = pd.DataFrame(tickers)
#df.to_csv('data/tickers.csv')

In [54]:
def _add_indicators(prices):
    """Add MACD, Bollinger Band, VWAP and Stochastic RSI columns to ``prices`` in place."""
    close = prices['close']

    indicator_macd = MACD(close=close, window_slow=26, window_fast=12, window_sign=9, fillna=True)
    prices['macd'] = indicator_macd.macd()
    prices['macd_diff'] = indicator_macd.macd_diff()
    prices['macd_signal'] = indicator_macd.macd_signal()

    indicator_bb = BollingerBands(close=close, window=20, window_dev=2, fillna=True)
    prices['bb_mavg'] = indicator_bb.bollinger_mavg()
    prices['bb_hband'] = indicator_bb.bollinger_hband()
    prices['bb_lband'] = indicator_bb.bollinger_lband()
    prices['bb_hband_ind'] = indicator_bb.bollinger_hband_indicator()
    prices['bb_lband_ind'] = indicator_bb.bollinger_lband_indicator()

    indicator_vwap = VolumeWeightedAveragePrice(
        high=prices['high'],
        low=prices['low'],
        close=close,
        volume=prices['volume'],
        window=14,
        fillna=True,
    )
    prices['vwap'] = indicator_vwap.volume_weighted_average_price()

    indicator_stochrsi = StochRSIIndicator(close=close, window=14, smooth1=3, smooth2=3, fillna=True)
    prices['stoch_rsi'] = indicator_stochrsi.stochrsi()
    prices['stochrsi_d'] = indicator_stochrsi.stochrsi_d()
    prices['stochrsi_k'] = indicator_stochrsi.stochrsi_k()

    return prices


def get_historical_price(tickers, data_type, api_token=None):
    """Download OHLC price/volume history per ticker and add technical indicators.

    Args:
        tickers: Iterable of ticker codes without the exchange suffix;
            ``.US`` is appended to each when building the request URL.
        data_type: API path segment selecting the dataset; the notebook
            calls this with ``'intraday'`` and ``'eod'``.
        api_token: API token to authenticate with. When omitted, the value
            of the ``TOKEN`` environment variable is used.

    Returns:
        A dict mapping each ticker to a DataFrame built from that ticker's
        own response, with the indicator columns appended. Empty when
        ``tickers`` is empty.

    Raises:
        requests.HTTPError: If the API responds with an error status.
    """
    # Credentials must not live in the notebook source; read from the environment.
    token = api_token if api_token is not None else os.getenv('TOKEN')

    d = {}

    # TODO: add a progress bar
    for symbol in tickers:
        with requests.get(
            f'https://eodhistoricaldata.com/api/{data_type}/{symbol}.US',
            params={'api_token': token, 'fmt': 'json'},
            timeout=30,
        ) as r:
            r.raise_for_status()
            data = r.json()

        # Store the response only under the ticker it was requested for.
        # Writing it to every ticker would leave all entries holding the
        # data of whichever ticker was fetched last.
        d[symbol] = _add_indicators(pd.DataFrame.from_records(data))

    return d

In [55]:
# Placeholder tickers used while the pipeline is being developed.
tickers = ['SPY', 'QQQ']

# Intraday prices: dict keyed by ticker, each value a DataFrame of prices
# plus the indicator columns added by get_historical_price.
intraday_data = get_historical_price(tickers, 'intraday')

# Daily (end-of-day) prices: same structure as intraday_data.
daily_data = get_historical_price(tickers, 'eod')

In [56]:
# Join the per-ticker intraday frames side by side (axis=1); `keys` labels each
# ticker's block of columns with its ticker. Then wrap the result in a
# 6-partition dask DataFrame.
intraday_df = pd.concat(intraday_data.values(), axis=1, keys=intraday_data.keys())
intraday_dask_df = dd.from_pandas(intraday_df, npartitions=6)

# Same treatment for the daily frames.
daily_df = pd.concat(daily_data.values(), axis=1, keys=daily_data.keys())
daily_dask_df = dd.from_pandas(daily_df, npartitions=6)

### Current Dev
- Next job: implement `get_fundementals` and organize its data output.
- Then develop the analysis flow on a whiteboard, using paper as a resource.

In [57]:
def get_fundementals(tickers, api_token=None):
    """Download the fundamentals payload for each ticker.

    Args:
        tickers: Iterable of ticker codes without the exchange suffix;
            ``.US`` is appended to each when building the request URL.
        api_token: API token to authenticate with. When omitted, the value
            of the ``TOKEN`` environment variable is used.

    Returns:
        A dict mapping each ticker to the decoded JSON payload returned for
        that ticker. Empty when ``tickers`` is empty. Wrangling the payload
        into DataFrames is left to the caller.

    Raises:
        requests.HTTPError: If the API responds with an error status.
    """
    # Credentials must not live in the notebook source; read from the environment.
    token = api_token if api_token is not None else os.getenv('TOKEN')

    # Named `payloads` rather than `d` so no temporary inside the loop can
    # rebind the result dict.
    payloads = {}

    for symbol in tickers:
        with requests.get(
            f'https://eodhistoricaldata.com/api/fundamentals/{symbol}.US',
            params={'api_token': token, 'fmt': 'json'},
            timeout=30,
        ) as r:
            r.raise_for_status()
            # Store each response only under the ticker it was requested for.
            payloads[symbol] = r.json()

    return payloads

In [58]:
# # placeholder tickers
# tickers = ['QQQ']

# # get fundemental data
# fundemental_data = get_fundementals(tickers)

In [59]:
def test(tickers, api_token=None):
    """Download the fundamentals payload for the first ticker only.

    Scratch helper for exploring the shape of a single response: no ticker
    after the first is ever requested.

    Args:
        tickers: Sequence of ticker codes without the exchange suffix;
            ``.US`` is appended when building the request URL.
        api_token: API token to authenticate with. When omitted, the value
            of the ``TOKEN`` environment variable is used.

    Returns:
        The decoded JSON payload for ``tickers[0]``, or ``None`` when
        ``tickers`` is empty.

    Raises:
        requests.HTTPError: If the API responds with an error status.
    """
    if len(tickers) == 0:
        return None

    # Credentials must not live in the notebook source; read from the environment.
    token = api_token if api_token is not None else os.getenv('TOKEN')

    with requests.get(
        f'https://eodhistoricaldata.com/api/fundamentals/{tickers[0]}.US',
        params={'api_token': token, 'fmt': 'json'},
        timeout=30,
    ) as r:
        r.raise_for_status()
        return r.json()

In [60]:
tickers = ['SPY', 'QQQ']
fundemental_data = test(tickers)

In [61]:
#fundemental_data.keys()
#fundemental_data['ETF_Data'].keys()

In [62]:
# Flatten one fundamentals payload into a single-column DataFrame
# (`fundemental_df`, column 'data') indexed by field name.

# Scalar field names under 'ETF_Data'.
# NOTE(review): `columns` is not referenced again in this cell — confirm
# whether it is still needed.
columns = ['ISIN', 'Company_Name', 'Company_URL', 'ETF_URL', 'Domicile',
       'Index_Name', 'Yield', 'Dividend_Paying_Frequency', 'Inception_Date',
       'Max_Annual_Mgmt_Charge', 'Ongoing_Charge', 'Date_Ongoing_Charge',
       'NetExpenseRatio', 'AnnualHoldingsTurnover', 'TotalAssets',
       'Average_Mkt_Cap_Mil', 'Market_Capitalisation']

# 'General' section as a Series (presumably a dict of field -> value).
zz = pd.Series(fundemental_data['General'])
# One single-element Series per 'ETF_Data' field, labelled with the field name.
a = pd.Series(fundemental_data['ETF_Data']['ISIN'], index = ['ISIN'])
b = pd.Series(fundemental_data['ETF_Data']['Company_Name'], index = ['Company_Name'])
c = pd.Series(fundemental_data['ETF_Data']['Company_URL'], index = ['Company_URL'])
d = pd.Series(fundemental_data['ETF_Data']['ETF_URL'], index = ['ETF_URL'])
e = pd.Series(fundemental_data['ETF_Data']['Domicile'], index = ['Domicile'])
f = pd.Series(fundemental_data['ETF_Data']['Index_Name'], index = ['Index_Name'])
g = pd.Series(fundemental_data['ETF_Data']['Yield'], index = ['Yield'])
h = pd.Series(fundemental_data['ETF_Data']['Dividend_Paying_Frequency'], index = ['Dividend_Paying_Frequency'])
i = pd.Series(fundemental_data['ETF_Data']['Inception_Date'], index = ['Inception_Date'])
j = pd.Series(fundemental_data['ETF_Data']['Max_Annual_Mgmt_Charge'], index = ['Max_Annual_Mgmt_Charge'])
k = pd.Series(fundemental_data['ETF_Data']['Ongoing_Charge'], index = ['Ongoing_Charge'])
l = pd.Series(fundemental_data['ETF_Data']['Date_Ongoing_Charge'], index = ['Date_Ongoing_Charge'])
m = pd.Series(fundemental_data['ETF_Data']['NetExpenseRatio'], index = ['NetExpenseRatio'])
n = pd.Series(fundemental_data['ETF_Data']['AnnualHoldingsTurnover'], index = ['AnnualHoldingsTurnover'])
o = pd.Series(fundemental_data['ETF_Data']['TotalAssets'], index = ['TotalAssets'])
p = pd.Series(fundemental_data['ETF_Data']['Average_Mkt_Cap_Mil'], index = ['Average_Mkt_Cap_Mil'])
# NOTE(review): `q` is built here but is not included in the concat below —
# confirm whether Market_Capitalisation was left out on purpose.
q = pd.Series(fundemental_data['ETF_Data']['Market_Capitalisation'], index = ['Market_Capitalisation'])
r = pd.Series(fundemental_data['ETF_Data']['Holdings_Count'], index = ['Holdings_Count'])
# 'MorningStar' section as a Series (presumably a dict — verify against the payload).
s = pd.Series(fundemental_data['ETF_Data']['MorningStar'])
# (key, value) pairs become columns 0 and 1; column 0 repeats the keys already
# used as the index, so it is dropped and only the values (column 1) remain.
aa = pd.DataFrame(fundemental_data['ETF_Data']['Performance'].items(), index = fundemental_data['ETF_Data']['Performance'].keys()).drop(0, axis=1)
# 'Technicals' section as a Series (presumably a dict — verify against the payload).
ad = pd.Series(fundemental_data['Technicals'])

# Stack all pieces end to end into one Series and expose it as the 'data' column.
fundemental_df = pd.DataFrame(pd.concat([zz, a, b, c, d, e, f, g, h, i, j, k, l, m ,n, o, p, r, s, aa[1], ad]), columns = ['data'])

In [63]:
# Build one DataFrame per nested section of the payload's 'ETF_Data' entry.
asset_allocation_df = pd.DataFrame(fundemental_data['ETF_Data']['Asset_Allocation'])
region_weights_df = pd.DataFrame(fundemental_data['ETF_Data']['World_Regions'])
sector_weights_df = pd.DataFrame(fundemental_data['ETF_Data']['Sector_Weights'])
fixed_income_df = pd.DataFrame(fundemental_data['ETF_Data']['Fixed_Income'])
# Holdings are keyed by symbol: one row per holding, indexed by that key.
top_10_holdings_df = pd.DataFrame(fundemental_data['ETF_Data']['Top_10_Holdings'].values(), index = fundemental_data['ETF_Data']['Top_10_Holdings'].keys())
holdings_df = pd.DataFrame(fundemental_data['ETF_Data']['Holdings'].values(), index = fundemental_data['ETF_Data']['Holdings'].keys())
# Transposed so the top-level keys of 'Valuations_Growth' become the rows.
valuations_growth_df = pd.DataFrame(fundemental_data['ETF_Data']['Valuations_Growth']).T

# Join region and sector weights side by side (axis=1), labelling each block
# of columns with 'region_weights' / 'sector_weights'.
ab = {}
ab['region_weights'] = region_weights_df
ab['sector_weights'] = sector_weights_df
weights_df = pd.concat(ab.values(), axis=1, keys=ab.keys())

In [64]:
fundemental_df

Unnamed: 0,data
Code,SPY
Type,ETF
Name,SPDR S&P 500 ETF Trust
Exchange,NYSE ARCA
CurrencyCode,USD
CurrencyName,US Dollar
CurrencySymbol,$
CountryName,USA
CountryISO,US
Description,The Trust seeks to achieve its investment obje...


In [65]:
asset_allocation_df.head()

Unnamed: 0,Cash,NotClassified,Stock non-US,Other,Stock US,Bond
Long_%,0.05379,0,1.09685,0,98.84936,0
Short_%,0.0,0,0.0,0,0.0,0
Net_Assets_%,0.05379,0,1.09685,0,98.84936,0


In [66]:
fixed_income_df.head()

Unnamed: 0,EffectiveDuration,ModifiedDuration,EffectiveMaturity,CreditQuality,Coupon,Price,YieldToMaturity
Fund_%,0.0,0,0.0,0,0.0,0,0.0
Relative_to_Category,1.24,0,0.39667,0,2.93891,0,1.76


In [67]:
top_10_holdings_df.head()

Unnamed: 0,Code,Exchange,Name,Sector,Industry,Country,Region,Assets_%
AAPL.US,AAPL,US,Apple Inc,Technology,Consumer Electronics,United States,North America,6.32853
MSFT.US,MSFT,US,Microsoft Corporation,Technology,Software-Infrastructure,United States,North America,5.35457
AMZN.US,AMZN,US,Amazon.com Inc,Consumer Cyclical,Internet Retail,United States,North America,2.64115
GOOGL.US,GOOGL,US,Alphabet Inc Class A,Communication Services,Internet Content & Information,United States,North America,1.71363
BRK-B.US,BRK-B,US,Berkshire Hathaway Inc,Financial Services,Insurance-Diversified,United States,North America,1.63875


In [68]:
holdings_df.head()

Unnamed: 0,Code,Exchange,Name,Sector,Industry,Country,Region,Assets_%
AAPL.US,AAPL,US,Apple Inc,Technology,Consumer Electronics,United States,North America,6.32853
MSFT.US,MSFT,US,Microsoft Corporation,Technology,Software-Infrastructure,United States,North America,5.35457
AMZN.US,AMZN,US,Amazon.com Inc,Consumer Cyclical,Internet Retail,United States,North America,2.64115
GOOGL.US,GOOGL,US,Alphabet Inc Class A,Communication Services,Internet Content & Information,United States,North America,1.71363
BRK-B.US,BRK-B,US,Berkshire Hathaway Inc,Financial Services,Insurance-Diversified,United States,North America,1.63875


In [69]:
valuations_growth_df.head()

Unnamed: 0,Price/Prospective Earnings,Price/Book,Price/Sales,Price/Cash Flow,Dividend-Yield Factor,Long-Term Projected Earnings Growth,Historical Earnings Growth,Sales Growth,Cash-Flow Growth,Book-Value Growth
Valuations_Rates_Portfolio,18.44287,3.43229,2.09261,10.65504,1.80814,,,,,
Valuations_Rates_To_Category,17.41992,3.80028,2.17691,11.131,1.94524,,,,,
Growth_Rates_Portfolio,,,,,,11.2708,22.15675,11.41034,8.06471,4.58625
Growth_Rates_To_Category,,,,,,10.43816,30.68086,-5.13758,-28.83055,-2.58379


In [70]:
weights_df.head()

Unnamed: 0_level_0,region_weights,region_weights,region_weights,region_weights,region_weights,region_weights,region_weights,region_weights,region_weights,region_weights,...,sector_weights,sector_weights,sector_weights,sector_weights,sector_weights,sector_weights,sector_weights,sector_weights,sector_weights,sector_weights
Unnamed: 0_level_1,North America,United Kingdom,Europe Developed,Europe Emerging,Africa/Middle East,Japan,Australasia,Asia Developed,Asia Emerging,Latin America,...,Consumer Cyclicals,Financial Services,Real Estate,Communication Services,Energy,Industrials,Technology,Consumer Defensive,Healthcare,Utilities
Equity_%,98.903,0.55529,0.502,0.0,0.0,0.0,0.0,0.04,0.0,0.0,...,10.46035,13.89861,2.86686,7.85484,4.91258,8.74034,24.06966,7.11355,14.66101,2.90454
Relative_to_Category,97.429,0.92916,1.181,0.001,0.035,0.07,0.008,0.185,0.088,0.072,...,9.82157,14.27109,2.4105,7.09048,4.38005,10.4354,22.4632,7.42991,16.04951,2.79934
