In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
pip install -r requirements.txt


Explore stock market dataset from Yahoo Finance

In [None]:
import yfinance as yf
import pandas as pd


## Prepare broad market indicies

In [None]:
# Capture S&P500, NASDAQ100 and Russell 200 indecies and their equal weighted counter parts
# As well as VIX volatility index, DYX US Dollar index and TNX US 10 Year Treasuries Rate Index
broad_market_indicies = '^SPX ^SPXEW ^NDX ^NDXE ^RUT ^R2ESC ^VIX DX-Y.NYB ^TNX'

In [None]:
broad_market = yf.download(broad_market_indicies, period='max', group_by='tickers') 
broad_market

In [None]:
broad_market.to_csv('data/broad_market.csv.bz2', index='Date')

## Prepare Sector Indicies

In [None]:
sector_indicies = 'XLE ^SP500-15 ^SP500-20 ^SP500-25 ^SP500-30 ^SP500-35 ^SP500-40 ^SP500-45 ^SP500-50 ^SP500-55 ^SP500-60'

In [None]:
sectors = yf.download(sector_indicies, period='max') 
sectors

In [None]:
sectors.to_csv('data/sectors.csv.bz2')

## Load list of IBD Growth Stocks

In [8]:

ibd50 = pd.read_csv('data/IBD50.csv')
ibd50


Unnamed: 0,Symbol
0,GCT
1,NVDA
2,CRWD
3,RCL
4,AMPH
5,ELF
6,CCL
7,CELH
8,ASND
9,ZS


In [9]:
ibd250 = pd.read_csv('data/IBD250.csv')
ibd250

Unnamed: 0,Symbol
0,INTU
1,ALKT
2,FICO
3,GWRE
4,COIN
...,...
295,CAMT
296,STLA
297,RACE
298,RYAAY


In [10]:
# merge IBD 50 and 250 symbol sets
ibd50_set = set(ibd50['Symbol'])
ibd250_set = set(ibd250['Symbol'])
ibd_growth_set = ibd50_set.union(ibd250_set)

In [11]:
len(ibd_growth_set)

309

In [12]:
ibdgrowth_str = ' '.join(ibd_growth_set)
ibdgrowth_str



'SSD TPH LEN FERG ZG TEAM MEDP LULU ASO FLT FBIN FLEX SCCO ELF VSTS AEO OC ROST LRN AZEK JPM BK RACE TRI YELP CRS MCO ZS CADE CELH DUOL WOR PSN CSWC GE CCL TREX TRIP RELX NFLX VRNS MAIN WAL MFC TSLX FITB STRL PTC LPG OZK HWKN HWM ETN SPOT ERIE HG MBWM SFM TOL CAT ICE WDAY BROS BCO GPS PVH DOCU IBP VRTX QLYS AZTA PH REGN Z CRBG ALLY TDW JBI RCL EXPE ALKT TMUS TBBK OFG ALKS AOS ARM CYBR CHKP WBS TGH KBH APO SKX KKR RAMP DHI AWI BLX FOUR COLB KNF WTFC MAS FI MARA PCOR NBIX RGA GOOGL BKNG TWLO UBS ZTS ITCI QCOM ASND LHX FICO FRPT MMYT AMZN INFA IMGN URBN FTAI GOOG DFH PBRA INTU BAC CDRE GBDC DV CRH GEN MBIN FOR DB EME FNF CAMT NMIH AMK DT PLUS RY LMB RL STLA OSW PCAR GVA AER FSS NTNX ERJ IHG TMHC BLD PATH DXCM CRSP MATX DOCN SMCI BR ALTR FIX GPN SHW COIN ARCO IOT AX WFC CAH AXP COST AGO S GPK DBX NVDA URI RYAAY XP PLAB PANW IT AMP SNOW V STNE PINS BX CMG AVGO CRWD AXON TWST ITT OTEX FRSH ENSG ANF WIX WAB PWR PDD MNST GDDY PRI ASML GWRE MSFT SPXC FLR ABBNY OWL ISRG MLM WRB CLS ROL HOLI WSM 

In [14]:
stocks_ticker_set = ibd_growth_set

## Prepare stocks price data

In [None]:
ibdgrowth_data = yf.download(ibdgrowth_str, period='max', group_by='tickers') 
ibdgrowth_data

In [None]:
ibdgrowth_data.columns.levels

In [None]:
ibdgrowth_data.to_csv('data/ibdgrowth_hist.csv.bz2', index='Date')

In [None]:
ibdgrowth_loaded = pd.read_csv('data/ibdgrowth_hist.csv.bz2', header=[0, 1], index_col=0)
ibdgrowth_loaded

In [None]:
for ticker in ibdgrowth_loaded.columns.levels[0][:2]:
    print(f'ticker: {ticker}')
    ticker_data = ibdgrowth_loaded[ticker]
    print(f'ticker historic data: {ticker_data}')
    # remove missing values
    ticker_data = ticker_data.dropna()
    print(f'ticker historic data without missing data: {ticker_data}')


## Prepare historical stock sales and earnings data

In [None]:
from dotenv import load_dotenv
import os

load_dotenv()

FMP_API_KEY=os.getenv("FMP_API_KEY")

print(f'FMP_API_KEY={FMP_API_KEY}')

In [None]:
import fmpsdk

# Company Valuation Methods
symbol: str = "AAPL"
symbols: ["AAPL", "CSCO", "QQQQ"]
exchange: str = "NYSE"
exchanges: ["NYSE", "NASDAQ"]
query: str = "AA"
limit: int = 3
period: str = "quarter"
download: bool = True
market_cap_more_than: int = 1000000000
beta_more_than: int = 1
volume_more_than: int = 10000
sector: str = "Technology"
dividend_more_than: int = 0
industry: str = "Software"
filing_type: str = "10-K"
print(f"Company Profile: {fmpsdk.company_profile(apikey=FMP_API_KEY, symbol=symbol)=}")


In [None]:
# there should be no duplicate symbols in this list
assert not ibdgrowth_loaded.columns.levels[0].duplicated().any() 

In [None]:
earnings_all_df = pd.DataFrame()
for ticker in stocks_ticker_set:
    earnings = fmpsdk.historical_earning_calendar(apikey=FMP_API_KEY, symbol=ticker, limit=-1)
    if earnings is not None and len(earnings) > 0:
        edf = pd.DataFrame(earnings)
        edf['date'] = pd.to_datetime(edf['date'])
        edf = edf.set_index(['symbol', 'date'])
        print(f"Earnings calendar for {ticker}: \n{edf}")
        # edf = edf.pivot(columns='symbol')
        # edf.swaplevel(i=0,j=1, axis=0)
        # edf.drop(columns=['symbol'])
        earnings_all_df = pd.concat([earnings_all_df, edf])
        print(f"Earnings calendar after pivot for {ticker}: \n{edf}")
        n_earnings = len(earnings)
        print(f"Total earnings reports for {ticker}: {n_earnings}")
#    earliest_earn = earnings[-1] if len(earnings > 0 else 'None')
#    print(f"Earliest earnings report for {ticker}: {earliest_earn}")


In [None]:
earnings

In [None]:
aaon = earnings_all_df.loc[['AAON']]

In [None]:
aaon

In [None]:
len(earnings_all_df)

In [None]:
earnings_all_df


In [None]:
earnings_file = 'data/earnings_calendar.csv.bz2'

In [None]:
earnings_all_df.to_csv(earnings_file)

### Read back data and verify it

In [None]:
import pandas as pd

earnings_loaded_df = pd.read_csv('data/earnings_calendar.csv.bz2', index_col=['symbol', 'date'])
print(earnings_loaded_df)

## Prepare key metrics data for company fundamentals

In [15]:
keymetrics_all_df = pd.DataFrame()
for ticker in stocks_ticker_set:
    kms = fmpsdk.key_metrics(apikey=FMP_API_KEY, symbol=ticker, period='quarter')
    if kms is not None and len(kms) > 0:
        kms_df = pd.DataFrame(kms)
        kms_df['date'] = pd.to_datetime(kms_df['date'])
        kms_df = kms_df.set_index(['symbol', 'date'])
        print(f"Key metrics for {ticker} sample: \n{kms_df.columns")
        keymetrics_all_df = pd.concat([keymetrics_all_df, kms_df])
        print(f"Key metrics concatenated {ticker}: \n{keymetrics_all_df.columns}")
        n_kms = len(kms_df)
        print(f"Total key metrics reports for {ticker}: {n_kms}")


Key metrics for SSD: 
                  calendarYear period  revenuePerShare  netIncomePerShare  \
symbol date                                                                 
SSD    2023-09-30         2023     Q3        13.593701           2.437630   
       2023-06-30         2023     Q2        14.005015           2.512620   
       2023-03-31         2023     Q1        12.542361           2.064163   
       2022-12-31         2022     Q4        11.172179           1.353119   
       2022-09-30         2022     Q3        12.932100           2.061126   
       2022-06-30         2022     Q2        13.749728           2.168733   
       2022-03-31         2022     Q1        11.430788           2.190324   
       2021-12-31         2021     Q4         9.684761           1.615022   
       2021-09-30         2021     Q3         9.167622           1.704825   
       2021-06-30         2021     Q2         9.446079           1.668808   

                   operatingCashFlowPerShare  freeCas

Experiment with other stock data

In [16]:
keymetrics_all_df

Unnamed: 0_level_0,Unnamed: 1_level_0,calendarYear,period,revenuePerShare,netIncomePerShare,operatingCashFlowPerShare,freeCashFlowPerShare,cashPerShare,bookValuePerShare,tangibleBookValuePerShare,shareholdersEquityPerShare,...,averagePayables,averageInventory,daysSalesOutstanding,daysPayablesOutstanding,daysOfInventoryOnHand,receivablesTurnover,payablesTurnover,inventoryTurnover,roe,capexPerShare
symbol,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
SSD,2023-09-30,2023,Q3,13.593701,2.437630,0.000000,0.000000,13.380967,39.122911,19.441544,39.122911,...,96557000.0,513307500.0,54.483075,28.852564,152.776520,1.651889,3.119307,0.589096,0.062307,0.000000
SSD,2023-06-30,2023,Q2,14.005015,2.512620,4.553235,4.104197,9.561555,37.013101,16.815604,37.013101,...,96574500.0,549301000.0,58.423190,28.396751,151.541723,1.540484,3.169377,0.593896,0.067885,-0.449038
SSD,2023-03-31,2023,Q1,12.542361,2.064163,0.069420,-0.370805,5.926801,34.907510,14.563201,34.907510,...,96571500.0,566617000.0,57.202365,30.463712,184.259396,1.573362,2.954335,0.488442,0.059132,-0.440225
SSD,2022-12-31,2022,Q4,11.172179,1.353119,3.204571,2.602015,14.128629,33.370008,13.013107,33.370008,...,98243500.0,548410500.0,50.925231,32.024534,182.247652,1.767297,2.810345,0.493834,0.040549,-0.602556
SSD,2022-09-30,2022,Q3,12.932100,2.061126,2.918389,2.690842,7.223554,31.241796,12.590381,31.241796,...,105807000.0,539932000.0,54.366039,28.718926,157.216657,1.655445,3.133822,0.572458,0.065973,-0.227548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ARVN,2022-06-30,2022,Q2,0.588346,-1.315789,-1.477444,-1.503759,25.248120,12.605263,12.605263,12.605263,...,10450000.0,4500000.0,23.290735,0.000000,0.000000,3.864198,0.000000,0.000000,-0.104384,-0.026316
ARVN,2022-03-31,2022,Q1,0.456604,-1.196226,-1.077358,-1.116981,26.950943,13.647170,13.647170,13.647170,...,19950000.0,1900000.0,24.917355,0.000000,0.000000,3.611940,0.000000,0.000000,-0.087654,-0.039623
ARVN,2021-12-31,2021,Q4,0.498106,-1.003788,-0.677176,-0.713066,28.458333,14.804924,14.804924,14.804924,...,17985802.5,1900000.0,87.946768,4213.483146,-94.231252,1.023346,0.021360,-0.955097,-0.067801,-0.035890
ARVN,2021-09-30,2021,Q3,0.186441,-0.938790,13.685272,13.658202,31.013599,16.420425,16.420425,16.420425,...,4823486.0,4500000.0,83.710177,0.000000,0.000000,1.075138,0.000000,0.000000,-0.057172,-0.027070


In [None]:
kms_file = 'data/keymetrics_history.csv.bz2'
keymetrics_all_df.to_csv(kms_file)

In [None]:

msft = yf.Ticker("MSFT")
print(msft)
"""
returns
<yfinance.Ticker object at 0x1a1715e898>
"""


In [None]:
import json

# get stock info
print(json.dumps(msft.info, indent=2))

"""
returns:
{
 'quoteType': 'EQUITY',
 'quoteSourceName': 'Nasdaq Real Time Price',
 'currency': 'USD',
 'shortName': 'Microsoft Corporation',
 'exchangeTimezoneName': 'America/New_York',
  ...
 'symbol': 'MSFT'
}
"""


In [None]:

# get historical market data
msft_hist = msft.history(period="max")
print(msft_hist)
"""
returns:
              Open    High    Low    Close      Volume  Dividends  Splits
Date
1986-03-13    0.06    0.07    0.06    0.07  1031788800        0.0     0.0
1986-03-14    0.07    0.07    0.07    0.07   308160000        0.0     0.0
...
2019-04-15  120.94  121.58  120.57  121.05    15792600        0.0     0.0
2019-04-16  121.64  121.65  120.10  120.77    14059700        0.0     0.0
"""


In [None]:
# show actions (dividends, splits)
print(msft.actions)
"""
returns:
            Dividends  Splits
Date
1987-09-21       0.00     2.0
1990-04-16       0.00     2.0
...
2018-11-14       0.46     0.0
2019-02-20       0.46     0.0
"""

In [None]:

# show dividends
print(msft.dividends)
"""
returns:
Date
2003-02-19    0.08
2003-10-15    0.16
...
2018-11-14    0.46
2019-02-20    0.46
"""

In [None]:
# show splits
print(msft.splits)
"""
returns:
Date
1987-09-21    2.0
1990-04-16    2.0
...
1999-03-29    2.0
2003-02-18    2.0
"""

In [None]:
msft_hist.head()

In [None]:
msft_hist.tail()

In [None]:
# create a deep copy so we can experiment with data without triggering unnecessary downloads 
df=msft_hist.copy(deep=True)

# df=msft_hist.reset_index(drop=True)
# data['Date']=pd.to_datetime(data['Date'])
print(len(df))
df.head()

Ingest data into darts timeseries


In [None]:
# Count rows without values
# https://stackoverflow.com/questions/28199524/best-way-to-count-the-number-of-rows-with-missing-values-in-a-pandas-dataframe
df.shape[0] - df.dropna().shape[0]



In [None]:
# count any cells without values
df.isnull().values.ravel().sum()

Convert data to timeseries format that models can work with

Regular time intervals between data points and no missing values

In [None]:
df

In [None]:
import pandas as pd

# Clean up date index. Remove time zone.
df.index = pd.to_datetime(df.index, utc=True).date
df.index = pd.to_datetime(df.index, utc=True)
df.index.name = 'Date'
df

In [None]:
df.index

In [None]:
type(df.index)

In [None]:
from darts import TimeSeries

series = TimeSeries.from_dataframe(df, fill_missing_dates=True, freq='B') # value_cols=["Close"],  "Open", "High", "Low", "Close", "Volume", "Dividends", "Stock Splits"


In [None]:
# get regularized time series with NaN fill-ins
reg_df = series.pd_dataframe()

In [None]:
# get number of rows without values
reg_df.shape[0] - reg_df.dropna().shape[0]

In [None]:
# fill in missing values
reg_df = reg_df.interpolate()


In [None]:
# Check again number of rows without values. Should be 0.
reg_df.shape[0] - reg_df.dropna().shape[0]

In [None]:
# update series with new regularized dates and values
series = TimeSeries.from_dataframe(reg_df)

Save prepared timeseries data to local csv for model training

In [None]:
data_file_name = 'data/msft_data.csv'

In [None]:
# series.to_csv(data_file_name)


Make sure data can load back into timeseries

In [None]:
# series = TimeSeries.from_csv(data_file_name, time_col='Date') #, freq='B')

In [None]:
series

In [None]:
# get regularized time series with NaN fill-ins
loaded_df = series.pd_dataframe()

In [None]:
# get number of rows without values
loaded_df.shape[0] - loaded_df.dropna().shape[0]