In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to local .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
pip install -r requirements.txt


Explore stock market dataset from Yahoo Finance

In [None]:
import yfinance as yf
import pandas as pd


Load broad market indices

In [None]:
# Capture the S&P 500, NASDAQ-100 and Russell 2000 indices and their equal-weighted counterparts,
# as well as the VIX volatility index, the DXY US Dollar index and the TNX US 10-year Treasury rate index.
# The symbols are kept in one space-separated string, which is passed straight to yf.download below.
broad_market_indicies = '^SPX ^SPXEW ^NDX ^NDXE ^RUT ^R2ESC ^VIX DX-Y.NYB ^TNX'

In [None]:
# Download the longest available history (period='max') for all broad-market symbols.
# NOTE(review): yfinance documents the group_by values 'ticker' and 'column';
# confirm that 'tickers' produces the intended ticker-first column layout.
broad_market = yf.download(broad_market_indicies, period='max', group_by='tickers') 
broad_market

In [None]:
# Persist the download as a bz2-compressed CSV (compression is inferred from
# the file extension). `index` is a boolean switch, so pass True rather than a
# column name; the index keeps whatever name it already carries.
broad_market.to_csv('data/broad_market.csv.bz2', index=True)

Load Sector Indices

In [None]:
# Space-separated symbols for the sector download below.
# NOTE(review): every entry is an '^SP500-NN' symbol except 'XLE', and '^SP500-10' is
# absent — presumably XLE stands in for that sector; confirm this is intentional.
sector_indicies = 'XLE ^SP500-15 ^SP500-20 ^SP500-25 ^SP500-30 ^SP500-35 ^SP500-40 ^SP500-45 ^SP500-50 ^SP500-55 ^SP500-60'

In [None]:
# Download the longest available history for the sector symbols.
# NOTE(review): unlike the other yf.download calls in this notebook, no group_by is
# passed here, so the column layout of `sectors` may differ from the others — confirm.
sectors = yf.download(sector_indicies, period='max') 
sectors

In [None]:
# Persist the sector download as a bz2-compressed CSV (compression inferred from the extension).
sectors.to_csv('data/sectors.csv.bz2')

Load Growth Stocks

In [None]:

# Load the IBD 50 list from a local CSV; a 'Symbol' column is read from it further down.
ibd50 = pd.read_csv('data/IBD50.csv')
ibd50


In [None]:
# Load the IBD 250 list from a local CSV; a 'Symbol' column is read from it further down.
ibd250 = pd.read_csv('data/IBD250.csv')
ibd250

In [None]:
# Build one de-duplicated symbol set covering both the IBD 50 and IBD 250 lists.
ibd50_set, ibd250_set = set(ibd50['Symbol']), set(ibd250['Symbol'])
ibd_growth_set = ibd50_set | ibd250_set

In [None]:
# Number of distinct symbols after merging the two lists.
len(ibd_growth_set)

In [None]:
# yf.download takes the symbols as one space-separated string.
# Sort first so the request string is the same on every run; the iteration
# order of a set of strings can change between interpreter sessions.
ibdgrowth_str = ' '.join(sorted(ibd_growth_set))
ibdgrowth_str



In [None]:
# Download the longest available history for every growth-stock symbol in one call.
# NOTE(review): yfinance documents the group_by values 'ticker' and 'column';
# confirm that 'tickers' produces the intended ticker-first column layout.
ibdgrowth_data = yf.download(ibdgrowth_str, period='max', group_by='tickers') 
ibdgrowth_data

In [None]:
# Inspect the levels of the column MultiIndex returned by the download.
ibdgrowth_data.columns.levels

In [None]:
# DISABLED: per-ticker CSV export, kept for reference only.
# NOTE(review): it refers to `ibd50_data`, which is not defined anywhere in the
# visible notebook (the download above is named `ibdgrowth_data`), so it would
# need renaming before being re-enabled.
# for ticker in ibd50_data.columns.levels[0]:
#    print(f'ticker: {ticker}')
#    ticker_data = ibd50_data[ticker]
#    print(f'ticker historic data: {ticker_data}')
#    # remove missing values
#    ticker_data = ticker_data.dropna()
#    print(f'ticker historic data: {ticker_data}')
#    # save ticker data
#    ticker_data.to_csv(f'data/{ticker}.csv')


In [None]:
# Persist the growth-stock history as a bz2-compressed CSV (compression is
# inferred from the file extension). `index` is a boolean switch, so pass True
# rather than a column name; the index keeps whatever name it already carries.
ibdgrowth_data.to_csv('data/ibdgrowth_hist.csv.bz2', index=True)

In [None]:
# Read the saved history back: the first two rows form the two-level column header
# and the first column becomes the index. No parse_dates is passed, so the index
# values are not converted to datetimes here.
ibdgrowth_loaded = pd.read_csv('data/ibdgrowth_hist.csv.bz2', header=[0, 1], index_col=0)
ibdgrowth_loaded

In [None]:
# Spot-check the first two tickers of the reloaded frame: print each one's
# history as stored, then again with every incomplete row dropped.
for ticker in ibdgrowth_loaded.columns.levels[0][:2]:
    raw_history = ibdgrowth_loaded[ticker]
    print(f'ticker: {ticker}')
    print(f'ticker historic data: {raw_history}')
    # rows with any missing value are discarded
    ticker_data = raw_history.dropna()
    print(f'ticker historic data without missing data: {ticker_data}')


Load historical stock sales and earnings data

In [None]:
from dotenv import load_dotenv
import os

# Pull variables from a local .env file into the process environment.
load_dotenv()

# API key for the fmpsdk calls below; None when the variable is not set.
FMP_API_KEY = os.getenv("FMP_API_KEY")

# Report only whether a key was found. Printing the value itself would store
# the secret in the notebook's saved output.
print(f"FMP_API_KEY is {'set' if FMP_API_KEY else 'NOT set'}")

In [None]:
import fmpsdk

# Company Valuation Methods
# Sample arguments for fmpsdk calls. Only `symbol` is used in this cell.
symbol: str = "AAPL"
# These two were previously written as `name: [...]`, which is a bare annotation
# and never bound the name. They are now real assignments.
symbols: list = ["AAPL", "CSCO", "QQQQ"]
exchange: str = "NYSE"
exchanges: list = ["NYSE", "NASDAQ"]
query: str = "AA"
limit: int = 3
period: str = "quarter"
download: bool = True
market_cap_more_than: int = 1000000000
beta_more_than: int = 1
volume_more_than: int = 10000
sector: str = "Technology"
dividend_more_than: int = 0
industry: str = "Software"
filing_type: str = "10-K"
# Smoke-test the API key by fetching one company profile; the `=` specifier
# prints the call expression next to its result.
print(f"Company Profile: {fmpsdk.company_profile(apikey=FMP_API_KEY, symbol=symbol)=}")


In [None]:
# there should be no duplicate symbols in this list
# NOTE(review): pandas normally stores MultiIndex levels de-duplicated, so this
# check may always pass; confirm whether the actual column labels should be tested instead.
assert not ibdgrowth_loaded.columns.levels[0].duplicated().any() 

In [None]:
# Fetch the historical earnings calendar of every ticker and stack the results
# into one frame indexed by (symbol, date).
earnings_frames = []
for ticker in ibdgrowth_loaded.columns.levels[0]:
    earnings = fmpsdk.historical_earning_calendar(apikey=FMP_API_KEY, symbol=ticker, limit=-1)
    # Skip tickers for which the API returned nothing.
    if earnings is None or len(earnings) == 0:
        continue
    edf = pd.DataFrame(earnings)
    edf['date'] = pd.to_datetime(edf['date'])
    edf = edf.set_index(['symbol', 'date'])
    print(f"Earnings calendar for {ticker}: \n{edf}")
    earnings_frames.append(edf)
    n_earnings = len(earnings)
    print(f"Total earnings reports for {ticker}: {n_earnings}")

# Concatenate once at the end instead of growing a frame inside the loop: that
# re-copied all accumulated rows on every iteration and started from an empty,
# differently-indexed frame. pd.concat rejects an empty list, so fall back to
# an empty frame when no ticker returned data.
earnings_all_df = pd.concat(earnings_frames) if earnings_frames else pd.DataFrame()


In [None]:
# Shows the raw API response left in `earnings` by the last iteration of the loop above.
earnings

In [None]:
# Select every row for one symbol; the list form of .loc keeps the 'symbol' index level.
aaon = earnings_all_df.loc[['AAON']]

In [None]:
aaon

In [None]:
len(earnings_all_df)

In [None]:
earnings_all_df


In [None]:
# make sure there are no duplicate entries in the composite (symbol, date) index
assert not earnings_all_df.index.duplicated().any()

In [None]:
# Collect rows whose column values repeat another row's (keep=False marks every member
# of a duplicate group). This compares the columns only, not the (symbol, date) index.
dupes = earnings_all_df[earnings_all_df.duplicated(keep=False)]

In [None]:
dupes

In [None]:
# Save the duplicated rows to a CSV file.
dupes.to_csv('data/dupes_earnings.csv')

In [None]:
# Output path for the combined earnings calendar (bz2-compressed CSV).
earnings_file = 'data/earnings_calendar.csv.bz2'

In [None]:
# Write the combined earnings calendar, index included, to `earnings_file`.
earnings_all_df.to_csv(earnings_file)

Read back data and verify it

In [None]:
import pandas as pd

# Reload the saved earnings calendar with the (symbol, date) columns as a two-level index.
# No parse_dates is passed, so the 'date' level is not parsed into datetimes here;
# the per-ticker loop further down converts it.
# The path is the same literal assigned to `earnings_file` above.
earnings_loaded_df = pd.read_csv('data/earnings_calendar.csv.bz2', index_col=['symbol', 'date'])
print(earnings_loaded_df)

In [None]:
# The reloaded frame must still have a unique (symbol, date) index.
assert not earnings_loaded_df.index.duplicated().any()

In [None]:
# True when any AAON row repeats the column values of an earlier AAON row.
# Note that `dupes` is rebound here from the DataFrame assigned earlier to a single boolean.
dupes = earnings_loaded_df.loc['AAON'].duplicated().any()
#  and earnings_loaded_df.duplicated(keep=False)
dupes

In [None]:
# check if data for a given symbol was saved and loaded as expected
earnings_loaded_df.loc[['AAON']]


In [None]:
# Convert the 'updatedFromDate' strings into three numeric columns (year, month, day)
# and drop the original column.
# This cell is not re-runnable: after the pop below, 'updatedFromDate' no longer
# exists, so the first line raises on a second run.
ufd = pd.to_datetime(earnings_loaded_df['updatedFromDate'])
ufd_year = ufd.dt.year
ufd_month = ufd.dt.month
ufd_day = ufd.dt.day

# All three inserts use the same position (the column count before inserting), so each
# new column lands in front of the previous one; the final order is day, month, year.
earn_n_cols = len(earnings_loaded_df.columns)
earnings_loaded_df.insert(loc=earn_n_cols, column='updatedFromDate_year', value=ufd_year)
earnings_loaded_df.insert(loc=earn_n_cols, column='updatedFromDate_month', value=ufd_month)
earnings_loaded_df.insert(loc=earn_n_cols, column='updatedFromDate_day', value=ufd_day)
# pop removes the column in place and returns it; as the last expression, the
# removed column is what this cell displays.
earnings_loaded_df.pop('updatedFromDate')


In [None]:
earnings_loaded_df

In [None]:
# Convert the 'fiscalDateEnding' strings into three numeric columns (year, month, day)
# and drop the original column.
fde = pd.to_datetime(earnings_loaded_df['fiscalDateEnding'])
# Take the parts from `fde` itself. They were previously read from the
# updatedFromDate series `ufd`, so the fiscalDateEnding_* columns held the
# wrong dates.
fde_year = fde.dt.year
fde_month = fde.dt.month
fde_day = fde.dt.day

# All three inserts use the same position (the column count before inserting), so each
# new column lands in front of the previous one; the final order is day, month, year.
earn_n_cols = len(earnings_loaded_df.columns)
earnings_loaded_df.insert(loc=earn_n_cols, column='fiscalDateEnding_year', value=fde_year)
earnings_loaded_df.insert(loc=earn_n_cols, column='fiscalDateEnding_month', value=fde_month)
earnings_loaded_df.insert(loc=earn_n_cols, column='fiscalDateEnding_day', value=fde_day)
# pop removes the column in place and returns it; as the last expression, the
# removed column is what this cell displays.
earnings_loaded_df.pop('fiscalDateEnding')


In [None]:
earnings_loaded_df

In [None]:
# Encode the earnings reporting time as an integer:
# 'bmo' (Before Market Open) -> 0, 'amc' (After Market Close) -> 1, '--' -> -1.
time_codes = {'bmo': 0, 'amc': 1, '--': -1}
earnings_loaded_df['time'] = earnings_loaded_df['time'].replace(time_codes).astype('int32')



In [None]:
# Show the rows whose reporting time was unknown. The cell above has already
# replaced the '--' marker with -1 and cast the column to int32, so the filter
# must test the encoded value; comparing against '--' can never match.
earnings_loaded_df.loc[earnings_loaded_df['time'] == -1]

In [None]:
earnings_loaded_df

In [None]:
from darts import TimeSeries

# Inspect the index class of the reloaded earnings frame.
type(earnings_loaded_df.index)

In [None]:
# Names of the index levels (the frame was read with index_col=['symbol', 'date']).
earnings_loaded_df.index.names

In [None]:
# Column dtypes after the date and time conversions above.
earnings_loaded_df.dtypes

In [None]:
# Check that every ticker's earnings rows can be turned into a gap-free daily darts TimeSeries.
tickers = earnings_loaded_df.index.get_level_values('symbol').unique()
for t in tickers:
    # The list form of .loc keeps the 'symbol' level, which is then dropped so
    # that only the date level remains as the index.
    t_earn = earnings_loaded_df.loc[[t]]
    t_earn = t_earn.droplevel('symbol')
    # The dates were read from CSV without parsing, so convert them here.
    t_earn.index = pd.to_datetime(t_earn.index)
    print(f'index type for {t}: {type(t_earn.index)}')
    assert not t_earn.index.duplicated().any()
    print(f'{t} earnings: \n{t_earn}')
    # Request a daily frequency with missing dates added and missing values set to -1.
    # The series is rebound on every iteration and not stored, so only the last
    # ticker's objects survive the loop.
    t_earn_series = TimeSeries.from_dataframe(t_earn, fillna_value=-1, freq='D', fill_missing_dates=True)
    assert len(t_earn_series.gaps()) == 0


In [None]:
# Loop variable left over from the cell above: the last ticker reached, which is
# the failing one if an assertion stopped the loop early.
t

In [None]:
# Rows of the last processed ticker whose column values repeat an earlier row.
# Relies on `t_earn` left over from the loop above.
t_earn.loc[t_earn.duplicated()]

Experiment with other stock data

In [None]:
t_earn

In [None]:

# Create a yfinance handle for a single symbol.
msft = yf.Ticker("MSFT")
print(msft)
# Sample output (kept as a comment so the cell does not echo it as a result):
#   <yfinance.Ticker object at 0x1a1715e898>


In [None]:
import json

# get stock info
print(json.dumps(msft.info, indent=2))

# Sample output (kept as a comment so the cell does not echo it as a result):
# {
#  'quoteType': 'EQUITY',
#  'quoteSourceName': 'Nasdaq Real Time Price',
#  'currency': 'USD',
#  'shortName': 'Microsoft Corporation',
#  'exchangeTimezoneName': 'America/New_York',
#   ...
#  'symbol': 'MSFT'
# }


In [None]:

# get historical market data
msft_hist = msft.history(period="max")
print(msft_hist)
# Sample output (kept as a comment so the cell does not echo it as a result):
#               Open    High    Low    Close      Volume  Dividends  Splits
# Date
# 1986-03-13    0.06    0.07    0.06    0.07  1031788800        0.0     0.0
# 1986-03-14    0.07    0.07    0.07    0.07   308160000        0.0     0.0
# ...
# 2019-04-15  120.94  121.58  120.57  121.05    15792600        0.0     0.0
# 2019-04-16  121.64  121.65  120.10  120.77    14059700        0.0     0.0


In [None]:
# show actions (dividends, splits)
print(msft.actions)
# Sample output (kept as a comment so the cell does not echo it as a result):
#             Dividends  Splits
# Date
# 1987-09-21       0.00     2.0
# 1990-04-16       0.00     2.0
# ...
# 2018-11-14       0.46     0.0
# 2019-02-20       0.46     0.0

In [None]:

# show dividends
print(msft.dividends)
# Sample output (kept as a comment so the cell does not echo it as a result):
# Date
# 2003-02-19    0.08
# 2003-10-15    0.16
# ...
# 2018-11-14    0.46
# 2019-02-20    0.46

In [None]:
# show splits
print(msft.splits)
# Sample output (kept as a comment so the cell does not echo it as a result):
# Date
# 1987-09-21    2.0
# 1990-04-16    2.0
# ...
# 1999-03-29    2.0
# 2003-02-18    2.0

In [None]:
# First rows of the downloaded history.
msft_hist.head()

In [None]:
# Last rows of the downloaded history.
msft_hist.tail()

In [None]:
# Work on an independent copy so later experiments never modify msft_hist
# and the history does not have to be downloaded again.
df = msft_hist.copy()  # DataFrame.copy() is deep by default

print(len(df))
df.head()

Ingest data into darts timeseries


In [None]:
# Number of rows that contain at least one missing value:
# total row count minus the rows that survive dropna().
# https://stackoverflow.com/questions/28199524/best-way-to-count-the-number-of-rows-with-missing-values-in-a-pandas-dataframe
len(df) - len(df.dropna())



In [None]:
# Total number of individual cells (not rows) that hold a missing value.
df.isna().to_numpy().sum()

Convert data to timeseries format that models can work with

Regular time intervals between data points and no missing values

In [None]:
df

In [None]:
import pandas as pd

# Clean up the date index: keep only the calendar date and remove the time zone.
# Converting to UTC first puts every row in one zone; .date then drops both the
# time of day and the zone. The outer to_datetime rebuilds a DatetimeIndex
# without utc=True, so the result stays timezone-naive. Passing utc=True there,
# as before, made the index UTC-aware again.
df.index = pd.to_datetime(pd.to_datetime(df.index, utc=True).date)
df.index.name = 'Date'
df

In [None]:
# Inspect the cleaned index.
df.index

In [None]:
# Inspect the class of the cleaned index.
type(df.index)

In [None]:
from darts import TimeSeries

# Build a darts TimeSeries from all columns of df on a business-day ('B') frequency;
# fill_missing_dates=True asks darts to add the dates absent from the index.
# The next cells count and then fill the missing values this introduces.
series = TimeSeries.from_dataframe(df, fill_missing_dates=True, freq='B') # value_cols=["Close"],  "Open", "High", "Low", "Close", "Volume", "Dividends", "Stock Splits"


In [None]:
# Convert the regularized series back to a DataFrame; the dates added by darts
# appear as rows of missing values.
reg_df = series.pd_dataframe()

In [None]:
# Number of rows in the regularized frame with at least one missing value.
len(reg_df) - len(reg_df.dropna())

In [None]:
# Fill in missing values with DataFrame.interpolate() at its default settings.
# NOTE(review): this interpolates every column, including event-like ones such as
# Dividends and Stock Splits if they are present — confirm that is acceptable.
reg_df = reg_df.interpolate()


In [None]:
# Re-count the rows with missing values after interpolation; the expected result is 0.
len(reg_df) - len(reg_df.dropna())

In [None]:
# Rebuild the series from the regularized, interpolated frame, replacing the
# earlier `series` that still contained missing values.
series = TimeSeries.from_dataframe(reg_df)

Save prepared timeseries data to local csv for model training

In [None]:
data_file_name = 'data/msft_data.csv'

In [None]:
# DISABLED: the save step is commented out, so nothing is written to data_file_name.
# series.to_csv(data_file_name)


Make sure data can load back into timeseries

In [None]:
# DISABLED: the reload step is commented out, so the cells below still use the in-memory `series`.
# series = TimeSeries.from_csv(data_file_name, time_col='Date') #, freq='B')

In [None]:
series

In [None]:
# Convert `series` back to a DataFrame for a final check. With the CSV reload
# above commented out, this is the in-memory series built from the interpolated
# frame, not data read back from disk.
loaded_df = series.pd_dataframe()

In [None]:
# Number of rows with at least one missing value in the final frame.
loaded_df.shape[0] - loaded_df.dropna().shape[0]