## Data Importation, Cleaning, and Transformation

Data Source: https://eodhistoricaldata.com/

In [1]:
# import dependencies
import numpy as np
import scipy as sp
import dask.dataframe as dd
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib as jl
import datetime as dt
import requests

from icecream import ic

import os
from dotenv import load_dotenv

from ta import add_all_ta_features
from ta.trend import MACD
from ta.volatility import BollingerBands
from ta.volume import VolumeWeightedAveragePrice
from ta.momentum import StochRSIIndicator

In [2]:
load_dotenv()
TOKEN = os.getenv('TOKEN')

In [3]:
def get_etf_tickers(): 
    
    # pulls all tickers of ETFs on NYSE or NASDAQ

    r = requests.get('https://eodhistoricaldata.com/api/exchange-symbol-list/US', 
        params={'api_token': '63dc0e2f4efc43.34327983', 'fmt': 'json'}
        )
    data = r.json()
    r.close()

    df = pd.DataFrame(data)
    df = df[
        (df.Type == 'ETF') &
        ((df.Exchange == 'NYSE ARCA') |
        (df.Exchange == 'NASDAQ'))
        ]

    df.index = df.Code
    df.drop('Code', axis = 1, inplace=True)
    ticker_list = list(df.index)
    return ticker_list

In [4]:
# get etf tickers
# tickers = get_etf_tickers()

In [5]:
# save ticker list to csv
#df = pd.DataFrame(tickers)
#df.to_csv('data/tickers.csv')

In [6]:
def get_historical_price(tickers, data_type):

    # pulls historical daily or intraday OLHC prices and volume

    d = {}

    for i in range(len(tickers)): 

        r = requests.get('https://eodhistoricaldata.com/api' + '/' + data_type + '/' + tickers[i] + '.US', 
            params={'api_token': '63dc0e2f4efc43.34327983', 'fmt': 'json'}
            )
        data = r.json()
        r.close()

        # ADD PROGRESS BAR
        
        for symbol in tickers:
            d[symbol] = pd.DataFrame.from_records(data)

            indicator_macd = MACD(close = d[symbol]['close'], window_slow=26, window_fast=12, window_sign=9, fillna=True)
            d[symbol]['macd'] = indicator_macd.macd()
            d[symbol]['macd_diff'] = indicator_macd.macd_diff()
            d[symbol]['macd_signal'] = indicator_macd.macd_signal()

            indicator_bb = BollingerBands(close = d[symbol]['close'], window=20, window_dev=2, fillna=True)
            d[symbol]['bb_mavg'] = indicator_bb.bollinger_mavg()
            d[symbol]['bb_hband'] = indicator_bb.bollinger_hband()
            d[symbol]['bb_lband'] = indicator_bb.bollinger_lband()
            d[symbol]['bb_hband_ind'] = indicator_bb.bollinger_hband_indicator()
            d[symbol]['bb_lband_ind'] = indicator_bb.bollinger_lband_indicator()

            indicator_vwap = VolumeWeightedAveragePrice(
                high = d[symbol]['high'],
                low = d[symbol]['low'],
                close = d[symbol]['close'],
                volume = d[symbol]['volume'],
                window=14, fillna=True)
            d[symbol]['vwap'] = indicator_vwap.volume_weighted_average_price()

            indicator_stochrsi = StochRSIIndicator(close = d[symbol]['close'], window=14, smooth1=3, smooth2=3, fillna=True)
            d[symbol]['stoch_rsi'] = indicator_stochrsi.stochrsi()
            d[symbol]['stochrsi_d'] = indicator_stochrsi.stochrsi_d()
            d[symbol]['stochrsi_k'] = indicator_stochrsi.stochrsi_k()

    return d

In [7]:
# placeholder tickers
tickers = ['SPXL', 'SPXS']

# get intraday prices | dictionary of dataframes
intraday_data = get_historical_price(tickers, 'intraday')

# get daily prices | dictionary of dataframes
daily_data = get_historical_price(tickers, 'eod')

In [8]:
# concat intraday dataframes together and convert to dask df
intraday_df = pd.concat(intraday_data.values(), axis=1, keys=intraday_data.keys())
intraday_dask_df = dd.from_pandas(intraday_df, npartitions=6)

# concat daily dataframes together and convert to dask df
daily_df = pd.concat(daily_data.values(), axis=1, keys=daily_data.keys())
daily_dask_df = dd.from_pandas(daily_df, npartitions=6)

#daily_df.to_csv('data/test_daily_df.csv')

In [9]:
daily_df.head()

Unnamed: 0_level_0,SPXL,SPXL,SPXL,SPXL,SPXL,SPXL,SPXL,SPXL,SPXL,SPXL,...,SPXS,SPXS,SPXS,SPXS,SPXS,SPXS,SPXS,SPXS,SPXS,SPXS
Unnamed: 0_level_1,date,open,high,low,close,adjusted_close,volume,macd,macd_diff,macd_signal,...,macd_signal,bb_mavg,bb_hband,bb_lband,bb_hband_ind,bb_lband_ind,vwap,stoch_rsi,stochrsi_d,stochrsi_k
0,2008-11-05,62.5621,69.8196,61.38,69.51,85142.5238,11,0.0,0.0,0.0,...,0.0,69.51,69.51,69.51,0.0,0.0,66.9032,0.0,0.0,0.0
1,2008-11-06,75.0,80.5,69.85,79.94,97918.1755,99,0.832023,0.665618,0.166405,...,0.166405,74.725,85.155,64.295,0.0,0.0,75.77732,0.0,0.0,0.0
2,2008-11-07,78.39,79.83,73.25,73.9499,90580.9267,267,0.996568,0.664131,0.332437,...,0.332437,74.466633,83.013989,65.919277,0.0,0.0,75.706011,0.0,0.0,0.0
3,2008-11-10,69.64,79.32,67.89,75.58,92577.6417,194,1.244165,0.729382,0.514783,...,0.514783,74.744975,82.209736,67.280214,0.0,0.0,75.215855,0.0,0.0,0.0
4,2008-11-11,80.27,85.4,76.24,80.25,98297.8932,281,1.796508,1.02538,0.771128,...,0.771128,75.84598,83.844325,67.847635,0.0,0.0,77.001506,0.0,0.0,0.0


### Current Dev
- next job, implement get_fundementals and organize data output
- then develop flow for analysis on whiteboard, use paper as resource

### ETF Fundementals

In [227]:
def test(tickers): 

    d = {}

    for i in range(len(tickers)): 

        r = requests.get('https://eodhistoricaldata.com/api/fundamentals/' + tickers[i] + '.US', 
            params={'api_token': '63dc0e2f4efc43.34327983', 'fmt': 'json'}
            )
        data = r.json()
        
        r.close()
        return data

In [228]:
tickers = ['SPY']#, 'QQQ']
fundemental_data = test(tickers)

In [232]:
fundemental_data.keys()
fundemental_data['General'].keys()
fundemental_data['Technicals'].keys()
fundemental_data['ETF_Data'].keys()

dict_keys(['ISIN', 'Company_Name', 'Company_URL', 'ETF_URL', 'Domicile', 'Index_Name', 'Yield', 'Dividend_Paying_Frequency', 'Inception_Date', 'Max_Annual_Mgmt_Charge', 'Ongoing_Charge', 'Date_Ongoing_Charge', 'NetExpenseRatio', 'AnnualHoldingsTurnover', 'TotalAssets', 'Average_Mkt_Cap_Mil', 'Market_Capitalisation', 'Asset_Allocation', 'World_Regions', 'Sector_Weights', 'Fixed_Income', 'Holdings_Count', 'Top_10_Holdings', 'Holdings', 'Valuations_Growth', 'MorningStar', 'Performance'])

In [220]:
def get_fundementals(tickers): 

    # pulls fundementals and wrangles data into multiple dfs

    raw_data = {}
    single_ticker_clean_data = {}
    all_clean_data = {}


    columns = ['ISIN', 'Company_Name', 'Company_URL', 'ETF_URL', 'Domicile',
        'Index_Name', 'Yield', 'Dividend_Paying_Frequency', 'Inception_Date',
        'Max_Annual_Mgmt_Charge', 'Ongoing_Charge', 'Date_Ongoing_Charge',
        'NetExpenseRatio', 'AnnualHoldingsTurnover', 'TotalAssets', 'Holdings_Count',
        'Average_Mkt_Cap_Mil']

    for i in range(len(tickers)): 

        r = requests.get('https://eodhistoricaldata.com/api/fundamentals/' + tickers[i] + '.US', 
            params={'api_token': '63dc0e2f4efc43.34327983', 'fmt': 'json'}
            )
        data = r.json()
        
        r.close()

        raw_data[tickers[i]] = data    

        a = pd.Series(raw_data[tickers[i]]['General'])

        b = pd.Series([raw_data[tickers[i]]['ETF_Data'][name] for name in columns],
            index=[name for name in columns])

        # c = pd.DataFrame([d[tickers[i]]['ETF_Data']['Market_Capitalisation']])#,
        # index = [d[tickers[i]]['ETF_Data']['Market_Capitalisation'].keys()])
        # c = pd.Series(d[symbol]['ETF_Data']['Market_Capitalisation'],
        #  index = ['Market_Capitalisation'])


        c = pd.Series(raw_data[tickers[i]]['ETF_Data']['MorningStar'])

        # d = pd.DataFrame(d[tickers[i]]['ETF_Data']['Performance'].items(),
        #     index = d[tickers[i]]['ETF_Data']['Performance'].keys()).drop(0, axis=1)
        # d = pd.DataFrame(d[symbol]['ETF_Data']['Performance'].items(),
        #  index = d[symbol]['ETF_Data']['Performance'].keys()).drop(0, axis=1)
        
        
        # e = pd.Series(d[symbol]['Technicals'])
        # e = pd.Series(d[tickers[i]]['Technicals'].items(), 
        #     index = d[tickers[i]]['Technicals'].keys()).drop(0, axis=1)

        single_ticker_clean_data['general'] = pd.DataFrame(pd.concat([a, b, c]), columns = ['data'])


        single_ticker_clean_data['asset_allocation'] = pd.DataFrame(raw_data[tickers[i]]['ETF_Data']['Asset_Allocation'])
        region_weights_df = pd.DataFrame(raw_data[tickers[i]]['ETF_Data']['World_Regions'])
        sector_weights_df = pd.DataFrame(raw_data[tickers[i]]['ETF_Data']['Sector_Weights'])
        single_ticker_clean_data['fixed_income'] = pd.DataFrame(raw_data[tickers[i]]['ETF_Data']['Fixed_Income'])
        single_ticker_clean_data['top_10_holdings'] = pd.DataFrame(raw_data[tickers[i]]['ETF_Data']['Top_10_Holdings'].values(), index = raw_data[tickers[i]]['ETF_Data']['Top_10_Holdings'].keys())
        single_ticker_clean_data['holdings'] = pd.DataFrame(raw_data[tickers[i]]['ETF_Data']['Holdings'].values(), index = raw_data[tickers[i]]['ETF_Data']['Holdings'].keys())
        single_ticker_clean_data['valuations_growth'] = pd.DataFrame(raw_data[tickers[i]]['ETF_Data']['Valuations_Growth']).T

        placeholder = {}
        placeholder['region_weights'] = region_weights_df
        placeholder['sector_weights'] = sector_weights_df
        weights_df = pd.concat(placeholder.values(), axis=1, keys=placeholder.keys())
        single_ticker_clean_data['weights'] = weights_df.copy()

        all_clean_data[tickers[i]] = single_ticker_clean_data


    return all_clean_data, single_ticker_clean_data

In [221]:
# # placeholder tickers
tickers = ['SPY', 'QQQ']

# # get fundemental data
fundemental_data, single_ticker = get_fundementals(tickers)

In [226]:
#single_ticker['general']

type(fundemental_data['QQQ'])

dict

In [37]:
def test(tickers): 

    d = {}

    for i in range(len(tickers)): 

        r = requests.get('https://eodhistoricaldata.com/api/fundamentals/' + tickers[i] + '.US', 
            params={'api_token': '63dc0e2f4efc43.34327983', 'fmt': 'json'}
            )
        data = r.json()
        
        r.close()
        return data

In [61]:
tickers = ['SPY', 'QQQ']
fundemental_data = test(tickers)

In [62]:
#fundemental_data.keys()
fundemental_data['ETF_Data'].keys()

dict_keys(['ISIN', 'Company_Name', 'Company_URL', 'ETF_URL', 'Domicile', 'Index_Name', 'Yield', 'Dividend_Paying_Frequency', 'Inception_Date', 'Max_Annual_Mgmt_Charge', 'Ongoing_Charge', 'Date_Ongoing_Charge', 'NetExpenseRatio', 'AnnualHoldingsTurnover', 'TotalAssets', 'Average_Mkt_Cap_Mil', 'Market_Capitalisation', 'Asset_Allocation', 'World_Regions', 'Sector_Weights', 'Fixed_Income', 'Holdings_Count', 'Top_10_Holdings', 'Holdings', 'Valuations_Growth', 'MorningStar', 'Performance'])

### Stock Fundementals

In [None]:
def test(tickers): 

    d = {}

    for i in range(len(tickers)): 

        r = requests.get('https://eodhistoricaldata.com/api/fundamentals/' + tickers[i] + '.US', 
            params={'api_token': '63dc0e2f4efc43.34327983', 'fmt': 'json'}
            )
        data = r.json()
        
        r.close()
        return data

In [None]:
tickers = ['AAPL']
fundemental_data = test(tickers)

In [None]:
fundemental_data.keys()

In [None]:
# officers = fundemental_data['General'].pop('Officers', None)
# listings = fundemental_data['General'].pop('Listings', None)
# address = fundemental_data['General'].pop('AddressData', None)
# pd.Series(fundemental_data['General'])
# pd.Series(fundemental_data['Highlights'])
# pd.Series(fundemental_data['Valuation'])
# pd.Series(fundemental_data['SharesStats'])
# pd.Series(fundemental_data['Technicals'])
# NumberDividendsByYear = fundemental_data['SplitsDividends'].pop('NumberDividendsByYear', None)
# pd.Series(fundemental_data['SplitsDividends'])
# pd.Series(fundemental_data['AnalystRatings'])
# pd.DataFrame(fundemental_data['Holders']['Institutions']).T.set_index(['name'])
# pd.DataFrame(fundemental_data['Holders']['Funds']).T.set_index(['name'])
# pd.DataFrame(fundemental_data['InsiderTransactions']).T.set_index(['date'])
# pd.DataFrame(fundemental_data['outstandingShares']['annual']).T.set_index('dateFormatted')
# pd.DataFrame(fundemental_data['outstandingShares']['quarterly']).T.set_index('dateFormatted')
# # join on date
# pd.DataFrame(fundemental_data['Earnings']['History']).T
# pd.DataFrame(fundemental_data['Earnings']['Trend']).T
# pd.DataFrame(fundemental_data['Earnings']['Annual']).T
# # join on date
# pd.DataFrame(fundemental_data['Financials']['Balance_Sheet']['quarterly']).T
# pd.DataFrame(fundemental_data['Financials']['Cash_Flow']['quarterly']).T
# pd.DataFrame(fundemental_data['Financials']['Income_Statement']['quarterly']).T
# pd.DataFrame(fundemental_data['Financials']['Balance_Sheet']['yearly']).T
# pd.DataFrame(fundemental_data['Financials']['Cash_Flow']['yearly']).T
# pd.DataFrame(fundemental_data['Financials']['Income_Statement']['yearly']).T