In [None]:
pip install -r requirements.txt


Explore stock market dataset from Yahoo Finance

In [None]:
import yfinance as yf
import pandas as pd


Load broad market indices

In [None]:
# Space-separated symbol list in the form accepted by yf.download.
broad_market_indicies = ' '.join((
    '^SPX',
    '^NDX',
    '^VIX',
    '^RUT',
    'DX-Y.NYB',
    '^TNX',
))

In [None]:
# Fetch the longest available history for every broad-market symbol.
broad_market = yf.download(
    tickers=broad_market_indicies,
    period='max',
)
broad_market

In [None]:
# Persist the download; pandas infers bz2 compression from the file suffix.
broad_market.to_csv(path_or_buf='data/broad_market.csv.bz2')

Load sector indices

In [None]:
# Space-separated symbol list: XLE followed by the ^SP500-15 ... ^SP500-60
# series, whose numeric suffixes step by 5.
sector_indicies = ' '.join(
    ['XLE'] + [f'^SP500-{code}' for code in range(15, 65, 5)]
)

In [None]:
# Fetch the longest available history for every sector symbol.
sectors = yf.download(
    tickers=sector_indicies,
    period='max',
)
sectors

In [None]:
# Persist the download; pandas infers bz2 compression from the file suffix.
sectors.to_csv(path_or_buf='data/sectors.csv.bz2')

Load Growth Stocks

In [None]:

# Read the IBD 50 list from the local data directory.
ibd50 = pd.read_csv(filepath_or_buffer='data/IBD50.csv')
ibd50


In [None]:
# Read the IBD 250 list from the local data directory.
ibd250 = pd.read_csv(filepath_or_buffer='data/IBD250.csv')
ibd250

In [None]:
# Combine both lists into one de-duplicated universe: every value of the
# 'Symbol' column that appears in either file.
ibd50_set = set(ibd50['Symbol'])
ibd250_set = set(ibd250['Symbol'])
ibd_growth_set = ibd50_set | ibd250_set

In [None]:
# Number of unique symbols after merging the two lists.
len(ibd_growth_set)

In [None]:
# Build the space-separated symbol string passed to yf.download.
# The symbols are sorted because set iteration order for strings is not stable
# across kernel restarts; sorting makes the request string reproducible.
ibdgrowth_str = ' '.join(sorted(ibd_growth_set))
ibdgrowth_str



In [None]:
# Download the longest available history for every growth symbol in one call.
# group_by='ticker' is the documented spelling; it puts the symbol on the
# first column level, which the later per-symbol indexing relies on.
ibdgrowth_data = yf.download(ibdgrowth_str, period='max', group_by='ticker')
ibdgrowth_data

In [None]:
# Inspect the levels of the downloaded frame's column index.
ibdgrowth_data.columns.levels

In [None]:
# Disabled: write each ticker's history to its own CSV after dropping rows
# with missing values.
# NOTE(review): this loop reads `ibd50_data`, which is not defined in the
# visible cells (the download above is stored in `ibdgrowth_data`) — update
# the name before re-enabling.
# for ticker in ibd50_data.columns.levels[0]:
#    print(f'ticker: {ticker}')
#    ticker_data = ibd50_data[ticker]
#    print(f'ticker historic data: {ticker_data}')
#    # remove missing values
#    ticker_data = ticker_data.dropna()
#    print(f'ticker historic data: {ticker_data}')
#    # save ticker data
#    ticker_data.to_csv(f'data/{ticker}.csv')


In [None]:
# Persist the combined history; pandas infers bz2 compression from the suffix.
# `index` is a boolean flag (`index_label` is the parameter that names the
# index column); True writes the index as the first column, which is what the
# read-back with index_col=0 expects.
ibdgrowth_data.to_csv('data/ibdgrowth_hist.csv.bz2', index=True)

In [None]:
# Read the saved history back to check that it round-trips.
ibdgrowth_loaded = pd.read_csv(
    'data/ibdgrowth_hist.csv.bz2',
    header=[0, 1],  # two header rows form the two-level column index
    index_col=0,    # first column holds the row index
)
ibdgrowth_loaded

In [None]:
# Take the first entry of the top column level as a sample symbol.
ticker = ibdgrowth_loaded.columns.levels[0][0]
print(f'ticker: {ticker}')

# Select that symbol's sub-frame.
ticker_data = ibdgrowth_loaded.loc[:, ticker]
print(f'ticker historic data: {ticker_data}')

# Keep only the rows in which every column has a value.
ticker_data = ticker_data.dropna(axis=0, how='any')
print(f'ticker historic data without missing data: {ticker_data}')


In [None]:
# Save the cleaned single-ticker frame produced in the previous cell.
# NOTE(review): assumes `ticker_data` is the frame meant to be written here and
# that this file name is still the intended target — confirm.
ticker_data.to_csv('data/ibd50_hist.csv')

In [None]:

# Create a Ticker handle for Microsoft and show its repr.
msft = yf.Ticker("MSFT")
print(msft)
# The sample output is kept as comments: a bare string literal as the last
# expression would be echoed as the cell's output.
# Example output:
# <yfinance.Ticker object at 0x1a1715e898>


In [None]:
import json

# get stock info: pretty-print the ticker's info dictionary as indented JSON
print(json.dumps(msft.info, indent=2))

# The sample output is kept as comments: a bare string literal as the last
# expression would be echoed as the cell's output.
# Example output (abridged):
# {
#  'quoteType': 'EQUITY',
#  'quoteSourceName': 'Nasdaq Real Time Price',
#  'currency': 'USD',
#  'shortName': 'Microsoft Corporation',
#  'exchangeTimezoneName': 'America/New_York',
#   ...
#  'symbol': 'MSFT'
# }


In [None]:

# get historical market data (longest available period)
msft_hist = msft.history(period="max")
print(msft_hist)
# The sample output is kept as comments: a bare string literal as the last
# expression would be echoed as the cell's output.
# Example output (abridged):
#               Open    High    Low    Close      Volume  Dividends  Splits
# Date
# 1986-03-13    0.06    0.07    0.06    0.07  1031788800        0.0     0.0
# 1986-03-14    0.07    0.07    0.07    0.07   308160000        0.0     0.0
# ...
# 2019-04-15  120.94  121.58  120.57  121.05    15792600        0.0     0.0
# 2019-04-16  121.64  121.65  120.10  120.77    14059700        0.0     0.0


In [None]:
# show actions (dividends, splits)
print(msft.actions)
# The sample output is kept as comments: a bare string literal as the last
# expression would be echoed as the cell's output.
# Example output (abridged):
#             Dividends  Splits
# Date
# 1987-09-21       0.00     2.0
# 1990-04-16       0.00     2.0
# ...
# 2018-11-14       0.46     0.0
# 2019-02-20       0.46     0.0

In [None]:

# show dividends
print(msft.dividends)
# The sample output is kept as comments: a bare string literal as the last
# expression would be echoed as the cell's output.
# Example output (abridged):
# Date
# 2003-02-19    0.08
# 2003-10-15    0.16
# ...
# 2018-11-14    0.46
# 2019-02-20    0.46

In [None]:
# show splits
print(msft.splits)
# The sample output is kept as comments: a bare string literal as the last
# expression would be echoed as the cell's output.
# Example output (abridged):
# Date
# 1987-09-21    2.0
# 1990-04-16    2.0
# ...
# 1999-03-29    2.0
# 2003-02-18    2.0

In [None]:
# First five rows of the downloaded history.
msft_hist.head(n=5)

In [None]:
# Last five rows of the downloaded history.
msft_hist.tail(n=5)

In [None]:
# Work on an independent deep copy so that experiments never require
# downloading the history again.
df = msft_hist.copy(deep=True)

# Row count, followed by a preview of the first five rows.
print(df.shape[0])
df.head(n=5)

Ingest data into darts timeseries


In [None]:
# Number of rows that contain at least one missing value: total rows minus the
# rows that survive dropna().
# https://stackoverflow.com/questions/28199524/best-way-to-count-the-number-of-rows-with-missing-values-in-a-pandas-dataframe
len(df) - len(df.dropna())



In [None]:
# Total number of individual cells without a value, across all columns.
df.isna().to_numpy().sum()

Convert data to timeseries format that models can work with

Regular time intervals between data points and no missing values

In [None]:
# Display the working copy of the MSFT history.
df

In [None]:
# Clean up the date index: remove the time zone and the time of day.
# The timestamps are converted to UTC, the zone is dropped, and each value is
# normalised to midnight, leaving a tz-naive DatetimeIndex of calendar dates.
# This stays vectorised instead of round-tripping through Python date objects.
df.index = pd.to_datetime(df.index, utc=True).tz_localize(None).normalize()
df.index.name = 'Date'
df

In [None]:
# Inspect the index after the clean-up.
df.index

In [None]:
# Confirm the type of the index object.
type(df.index)

In [None]:
from darts import TimeSeries

# Build the series on a business-day ('B') frequency and ask darts to insert
# any dates that are absent from the frame.
# All columns are used; value_cols could restrict this, e.g. to ["Close"].
# Columns listed by the author: "Open", "High", "Low", "Close", "Volume",
# "Dividends", "Stock Splits".
series = TimeSeries.from_dataframe(
    df,
    fill_missing_dates=True,
    freq='B',
)


In [None]:
# Convert the series back to a DataFrame. Because the series was built with
# fill_missing_dates=True and freq='B', inserted dates are expected to appear
# here as NaN rows.
# NOTE(review): newer darts releases may prefer `to_dataframe()` over
# `pd_dataframe()` — confirm against the pinned darts version.
reg_df = series.pd_dataframe()

In [None]:
# Number of rows in the regularized frame with at least one missing value.
len(reg_df) - len(reg_df.dropna())

In [None]:
# Fill the gaps by linear interpolation (the pandas default method).
reg_df = reg_df.interpolate(method='linear')


In [None]:
# Re-count the rows with missing values after interpolation. Should be 0.
len(reg_df) - len(reg_df.dropna())

In [None]:
# Rebuild the series from the gap-filled frame so that it carries the
# regularized dates together with the interpolated values.
series = TimeSeries.from_dataframe(reg_df)

Save prepared timeseries data to local csv for model training

In [None]:
# Path of the CSV the prepared series is written to and reloaded from.
data_file_name = 'data/market_data.csv'

In [None]:
# Write the prepared series to disk.
series.to_csv(data_file_name)

Make sure data can load back into timeseries

In [None]:
# Reload the saved CSV, using the 'Date' column as the time index.
# No freq is passed (a freq='B' argument was previously left commented out);
# presumably darts infers the frequency from the data — confirm.
series = TimeSeries.from_csv(data_file_name, time_col='Date')

In [None]:
# Display the reloaded series.
series

In [None]:
# Convert the reloaded series to a DataFrame for inspection.
loaded_df = series.pd_dataframe()

In [None]:
# Number of rows in the reloaded frame with at least one missing value.
len(loaded_df) - len(loaded_df.dropna())