In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
pip install -r requirements.txt


Note: you may need to restart the kernel to use updated packages.


Explore stock market dataset from Yahoo Finance

In [3]:
import yfinance as yf
import pandas as pd
from pathlib import Path

## Load list of IBD Growth Stocks

In [4]:
all_stocks_file = 'all_stocks.csv'

In [5]:
all_stock_set = set()
stock_files = [
    'IBD50.csv', 
    'IBD250.csv', 
    'IBD250_18Jan2024.csv', 
    'ibdlive_picks.csv', 
    'russell2000_iwm_holdings.csv', 
    'sp500_ivv_holdings.csv',
    'nasdaq100_cndx_holdings.csv',
    all_stocks_file
    ]
for f in stock_files:
    fp = f'data/{f}'
    if Path(fp).is_file():
        stocks = pd.read_csv(fp)
        print(f'loaded {len(stocks)} symbols from {fp}')
        stock_set = set(stocks['Symbol'])
        print(f'{len(stock_set)} symbols in stock set')
        all_stock_set |= stock_set
        print(f'total symbols loaded: {len(all_stock_set)}')
    else:
        print(f'{fp} not found.')


loaded 50 symbols from data/IBD50.csv
50 symbols in stock set
total symbols loaded: 50
loaded 300 symbols from data/IBD250.csv
300 symbols in stock set
total symbols loaded: 309
loaded 300 symbols from data/IBD250_18Jan2024.csv
300 symbols in stock set
total symbols loaded: 354
loaded 253 symbols from data/ibdlive_picks.csv
195 symbols in stock set
total symbols loaded: 402
loaded 1974 symbols from data/russell2000_iwm_holdings.csv
1971 symbols in stock set
total symbols loaded: 2259
loaded 509 symbols from data/sp500_ivv_holdings.csv
509 symbols in stock set
total symbols loaded: 2646
loaded 110 symbols from data/nasdaq100_cndx_holdings.csv
109 symbols in stock set
total symbols loaded: 2656
loaded 2656 symbols from data/all_stocks.csv
2656 symbols in stock set
total symbols loaded: 2656


In [6]:
len(all_stock_set), all_stock_set

(2656,
 {'ACET',
  'EVRI',
  'PHR',
  'LRN',
  'HLF',
  'KLXE',
  'O',
  'KDP',
  'CANO',
  'BBIO',
  'OVLY',
  'BEAM',
  'AGO',
  'WOW',
  'SLDP',
  'STOK',
  'OTLK',
  'PAYX',
  'GTLS',
  'SP',
  'CASS',
  'MACK',
  'HASI',
  'INOD',
  'SG',
  'WRK',
  'CHMG',
  'HUM',
  'RNAC',
  'DRS',
  'ATRC',
  'FRSH',
  'GBP',
  'EVGO',
  'DELL',
  'PRAA',
  'IMXI',
  'NDAQ',
  'FCF',
  'TEL',
  'SYY',
  'VST',
  'MRVL',
  'SPRY',
  'INVH',
  'MGRC',
  'VRNS',
  'NXT',
  'BCO',
  'CVI',
  'CIO',
  'USD',
  'INZY',
  'JAMF',
  'DFH',
  'LXFR',
  'ZG',
  'PLMR',
  'NU',
  'CRS',
  'BTAI',
  'CFFN',
  'EGLE',
  'LHX',
  'IDYA',
  'VRNT',
  'HON',
  'NRIM',
  'NWSA',
  'ACGL',
  'BR',
  'CMCSA',
  'IEX',
  'SLG',
  'LEGH',
  'CCI',
  'GNW',
  'OPI',
  'MMYT',
  'TRGP',
  'FRST',
  'PRO',
  'NEE',
  'RIOT',
  'GNTY',
  'MTTR',
  'AVNW',
  'AJG',
  'FHTX',
  'KALV',
  'XPRO',
  'TH',
  'HP',
  'HOUS',
  'DUK',
  'AAON',
  'NAUT',
  'PEBO',
  'SCCO',
  'XMTR',
  'ITW',
  'SBUX',
  'LBRT',
  'GMS',
  '

In [7]:
stocks_ticker_set = all_stock_set

In [8]:
growth_stocks_df = pd.DataFrame()
growth_stocks_df['Symbol'] = list(stocks_ticker_set)
growth_stocks_df = growth_stocks_df.set_index(['Symbol'])
growth_stocks_df

ACET
EVRI
PHR
LRN
HLF
...
VIR
OMCL
CSTM
DASH
HFWA


In [9]:
growth_stocks_df.to_csv(f'data/{all_stocks_file}')

## Prepare broad market indicies

In [None]:
# Capture S&P500, NASDAQ100 and Russell 200 indecies and their equal weighted counter parts
# As well as VIX volatility index, DYX US Dollar index, TNX US 12 Weeks Treasury Yield, 5 Years Treasury Yield and 10 Year Treasuries Yield
broad_market_indicies = '^SPX ^SPXEW ^NDX ^NDXE ^RUT ^R2ESC ^VIX DX-Y.NYB ^IRX ^FVX ^TNX'

In [None]:
broad_market = yf.download(broad_market_indicies, period='max', group_by='tickers') 
broad_market

In [None]:
broad_market.to_csv('data/broad_market.csv.bz2', index='Date')

## Prepare Sector Indicies

In [None]:
sector_indicies = 'XLE ^SP500-15 ^SP500-20 ^SP500-25 ^SP500-30 ^SP500-35 ^SP500-40 ^SP500-45 ^SP500-50 ^SP500-55 ^SP500-60'

In [None]:
sectors = yf.download(sector_indicies, period='max') 
sectors

In [None]:
sectors.to_csv('data/sectors.csv.bz2')

## Prepare stocks price data

In [10]:
stock_price_data = yf.download(all_stock_set, period='max', group_by='tickers') 
stock_price_data

[**********************72%%*********             ]  1916 of 2656 completed

Failed to get ticker 'THE CONTENT CONTAINED HEREIN IS OWNED OR LICENSED BY BLACKROCK AND/OR ITS THIRD-PARTY INFORMATION PROVIDERS AND IS PROTECTED BY APPLICABLE COPYRIGHTS, TRADEMARKS, SERVICE MARKS, AND/OR OTHER INTELLECTUAL PROPERTY RIGHTS. SUCH CONTENT IS SOLELY FOR YOUR PERSONAL, NON-COMMERCIAL USE. ACCORDINGLY, YOU MAY NOT COPY, DISTRIBUTE, MODIFY, POST, FRAME OR DEEP LINK THIS CONTENT. YOU MAY DOWNLOAD MATERIAL DISPLAYED ON THIS WEBSITE FOR YOUR PERSONAL USE PROVIDED YOU ALSO RETAIN ALL COPYRIGHT AND OTHER PROPRIETARY NOTICES CONTAINED ON THE MATERIALS. MODIFICATION OR USE OF THE MATERIALS FOR ANY OTHER PURPOSE VIOLATES BLACKROCK'S INTELLECTUAL PROPERTY RIGHTS.
HOLDINGS SUBJECT TO CHANGE. SEE WWW.ISHARES.COM FOR THE MOST RECENT FUNDS HOLDINGS.
THE VALUES FOR “PRICE” SHOWN HEREIN GENERALLY REPRESENT A PRICE PROVIDED BY A THIRD-PARTY PRICING VENDOR FOR THE PORTFOLIO HOLDING AND DO NOT REFLECT THE IMPACT OF SYSTEMATIC FAIR VALUATION (“THE VENDOR PRICE”). THE VENDOR PRICE IS NOT NECE

[*********************100%%**********************]  2656 of 2656 completed


24 Failed downloads:
['ESH4', 'SGAFT', 'PBRA', 'P5N994', 'NQH4', 'LGFB', 'RTYH4', 'MOGA', 'BFB', 'PDLI', 'ICSUAGD', 'MLIFT', 'METCV', 'ADRO', 'XTSLA', 'GTXI', 'LGFA', '-', 'GEFB', 'MSFUT', 'BRKB']: Exception('%ticker%: No timezone found, symbol may be delisted')
['\xa0']: Exception('\xa0: No timezone found, symbol may be delisted')
["THE CONTENT CONTAINED HEREIN IS OWNED OR LICENSED BY BLACKROCK AND/OR ITS THIRD-PARTY INFORMATION PROVIDERS AND IS PROTECTED BY APPLICABLE COPYRIGHTS, TRADEMARKS, SERVICE MARKS, AND/OR OTHER INTELLECTUAL PROPERTY RIGHTS. SUCH CONTENT IS SOLELY FOR YOUR PERSONAL, NON-COMMERCIAL USE. ACCORDINGLY, YOU MAY NOT COPY, DISTRIBUTE, MODIFY, POST, FRAME OR DEEP LINK THIS CONTENT. YOU MAY DOWNLOAD MATERIAL DISPLAYED ON THIS WEBSITE FOR YOUR PERSONAL USE PROVIDED YOU ALSO RETAIN ALL COPYRIGHT AND OTHER PROPRIETARY NOTICES CONTAINED ON THE MATERIALS. MODIFICATION OR USE OF THE MATERIALS FOR ANY OTHER PURPOSE VIOLATES BLACKROCK'S INTELLECTUAL PROPERTY RIGHTS.\nHOLDINGS




Unnamed: 0_level_0,BEAM,BEAM,BEAM,BEAM,BEAM,BEAM,CRDO,CRDO,CRDO,CRDO,...,VGR,VGR,VGR,VGR,LTC,LTC,LTC,LTC,LTC,LTC
Unnamed: 0_level_1,Open,High,Low,Close,Adj Close,Volume,Open,High,Low,Close,...,Low,Close,Adj Close,Volume,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1962-01-02,,,,,,,,,,,...,,,,,,,,,,
1962-01-03,,,,,,,,,,,...,,,,,,,,,,
1962-01-04,,,,,,,,,,,...,,,,,,,,,,
1962-01-05,,,,,,,,,,,...,,,,,,,,,,
1962-01-08,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-01-16,25.280001,25.690001,24.410000,25.100000,25.100000,902300.0,18.870001,19.850000,18.570000,18.990000,...,10.65,10.660,10.660,806100.0,32.849998,33.150002,32.820000,32.950001,32.950001,315000.0
2024-01-17,24.340000,24.785000,23.559999,24.129999,24.129999,787500.0,18.770000,18.879999,17.969999,18.750000,...,10.30,10.300,10.300,901500.0,32.700001,32.980000,31.799999,31.889999,31.889999,316100.0
2024-01-18,24.389999,24.400000,23.080000,23.459999,23.459999,665800.0,19.360001,19.412001,18.740000,19.190001,...,10.19,10.310,10.310,1008000.0,32.000000,32.160000,31.410000,31.670000,31.670000,355300.0
2024-01-19,23.459999,23.990000,22.750000,23.690001,23.690001,748400.0,19.500000,20.840000,19.150000,20.780001,...,10.14,10.290,10.290,2874000.0,31.850000,32.090000,31.549999,32.049999,32.049999,301200.0


In [11]:
stock_price_data.columns.levels

FrozenList([['-', 'A', 'AADI', 'AAL', 'AAN', 'AAON', 'AAPL', 'AAT', 'ABBNY', 'ABBV', 'ABCB', 'ABG', 'ABM', 'ABNB', 'ABR', 'ABT', 'ABUS', 'ABVX', 'ACA', 'ACAD', 'ACCD', 'ACCO', 'ACDC', 'ACEL', 'ACET', 'ACGL', 'ACHR', 'ACIC', 'ACIW', 'ACLS', 'ACLX', 'ACMR', 'ACN', 'ACNB', 'ACRE', 'ACRS', 'ACRV', 'ACT', 'ACTG', 'ACVA', 'ADBE', 'ADEA', 'ADI', 'ADM', 'ADMA', 'ADNT', 'ADP', 'ADPT', 'ADRO', 'ADSK', 'ADTN', 'ADUS', 'ADV', 'AEE', 'AEHR', 'AEIS', 'AEL', 'AEO', 'AEP', 'AER', 'AES', 'AESI', 'AEVA', 'AFCG', 'AFL', 'AFRI', 'AFRM', 'AFYA', 'AGEN', 'AGIO', 'AGM', 'AGO', 'AGS', 'AGTI', 'AGX', 'AGYS', 'AHCO', 'AHH', 'AI', 'AIG', 'AIN', 'AIR', 'AIRS', 'AIT', 'AIV', 'AIZ', 'AJG', 'AKAM', 'AKR', 'AKRO', 'AKTS', 'AKYA', 'ALB', 'ALCO', 'ALDX', 'ALE', 'ALEC', 'ALEX', 'ALG', 'ALGN', ...], ['Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume']])

In [12]:
stock_price_data.to_csv('data/all_stocks_price_hist.csv.bz2', index='Date')

In [13]:
# stock_price_data.to_csv('data/all_stocks_price_hist.bak.csv', index='Date')

In [14]:
stock_price_data_loaded = pd.read_csv('data/all_stocks_price_hist.csv.bz2', header=[0, 1], index_col=0)
stock_price_data_loaded

Unnamed: 0_level_0,BEAM,BEAM,BEAM,BEAM,BEAM,BEAM,CRDO,CRDO,CRDO,CRDO,...,VGR,VGR,VGR,VGR,LTC,LTC,LTC,LTC,LTC,LTC
Unnamed: 0_level_1,Open,High,Low,Close,Adj Close,Volume,Open,High,Low,Close,...,Low,Close,Adj Close,Volume,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1962-01-02 00:00:00,,,,,,,,,,,...,,,,,,,,,,
1962-01-03 00:00:00,,,,,,,,,,,...,,,,,,,,,,
1962-01-04 00:00:00,,,,,,,,,,,...,,,,,,,,,,
1962-01-05 00:00:00,,,,,,,,,,,...,,,,,,,,,,
1962-01-08 00:00:00,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-01-16 00:00:00,25.280001,25.690001,24.410000,25.100000,25.100000,902300.0,18.870001,19.850000,18.570000,18.990000,...,10.65,10.660,10.660,806100.0,32.849998,33.150002,32.820000,32.950001,32.950001,315000.0
2024-01-17 00:00:00,24.340000,24.785000,23.559999,24.129999,24.129999,787500.0,18.770000,18.879999,17.969999,18.750000,...,10.30,10.300,10.300,901500.0,32.700001,32.980000,31.799999,31.889999,31.889999,316100.0
2024-01-18 00:00:00,24.389999,24.400000,23.080000,23.459999,23.459999,665800.0,19.360001,19.412001,18.740000,19.190001,...,10.19,10.310,10.310,1008000.0,32.000000,32.160000,31.410000,31.670000,31.670000,355300.0
2024-01-19 00:00:00,23.459999,23.990000,22.750000,23.690001,23.690001,748400.0,19.500000,20.840000,19.150000,20.780001,...,10.14,10.290,10.290,2874000.0,31.850000,32.090000,31.549999,32.049999,32.049999,301200.0


In [None]:
for ticker in stock_price_data_loaded.columns.levels[0][:2]:
    print(f'ticker: {ticker}')
    ticker_data = stock_price_data_loaded[ticker]
    print(f'ticker historic data: {ticker_data}')
    # remove missing values
    ticker_data = ticker_data.dropna()
    print(f'ticker historic data without missing data: {ticker_data}')


## Prepare historical stock sales and earnings data

In [None]:
from dotenv import load_dotenv
import os

load_dotenv()

FMP_API_KEY=os.getenv("FMP_API_KEY")

print(f'FMP_API_KEY={FMP_API_KEY!= None}')

In [None]:
import fmpsdk

# Company Valuation Methods
symbol: str = "AAPL"
symbols: ["AAPL", "CSCO", "QQQQ"]
exchange: str = "NYSE"
exchanges: ["NYSE", "NASDAQ"]
query: str = "AA"
limit: int = 3
period: str = "quarter"
download: bool = True
market_cap_more_than: int = 1000000000
beta_more_than: int = 1
volume_more_than: int = 10000
sector: str = "Technology"
dividend_more_than: int = 0
industry: str = "Software"
filing_type: str = "10-K"
print(f"Company Profile: {fmpsdk.company_profile(apikey=FMP_API_KEY, symbol=symbol)=}")


In [None]:
earnings_all_df = pd.DataFrame()
for ticker in stocks_ticker_set: # ['AAON']: # 
    earnings = fmpsdk.historical_earning_calendar(apikey=FMP_API_KEY, symbol=ticker, limit=-1)
    if earnings is not None and len(earnings) > 0:
        edf = pd.DataFrame(earnings)
        edf['date'] = pd.to_datetime(edf['date'])
        edf = edf.set_index(['symbol', 'date'])
        # edf = edf.pivot(columns='symbol')
        # edf.swaplevel(i=0,j=1, axis=0)
        # edf.drop(columns=['symbol'])
        earnings_all_df = pd.concat([earnings_all_df, edf])
        n_earnings = len(earnings)
        # print(f"Total earnings reports for {ticker}: {n_earnings}")
#    earliest_earn = earnings[-1] if len(earnings > 0 else 'None')
#    print(f"Earliest earnings report for {ticker}: {earliest_earn}")


In [None]:
earnings

In [None]:
aaon = earnings_all_df.loc[['AAON']]

In [None]:
aaon

In [None]:
len(earnings_all_df)

In [None]:
earnings_all_df


In [None]:
len(earnings_all_df.index.levels[0])


In [None]:
earnings_file = 'data/earnings_calendar.csv.bz2'

In [None]:
earnings_all_df.to_csv(earnings_file)

### Read back data and verify it

In [None]:
import pandas as pd

earnings_loaded_df = pd.read_csv('data/earnings_calendar.csv.bz2', index_col=['symbol', 'date'])
print(earnings_loaded_df)

## Prepare historical dividends
  * This is secondary information since growth stocks usually do not have dividends and rarely have splits
  * Additionally the dividends and split information is partially reflected in Adj Close of price history data

In [None]:
def fetch_dividends_history():
    divs_hist_all_df = pd.DataFrame()
    for ticker in stocks_ticker_set: # ['AAON']:
        divs_hist = fmpsdk.historical_stock_dividend(apikey=FMP_API_KEY, symbol=ticker)
        # print(f"Loaded historical dividends for {ticker}: \n{divs_hist}")
        print(f"Loaded {len(divs_hist['historical'])} historical dividends for {ticker}")
        if divs_hist['historical'] is not None and len(divs_hist['historical']) > 0:
            dh_df_tmp = pd.DataFrame.from_dict(data=divs_hist['historical'])
            # print(f"Historical dividends for {ticker} dataframe: \n{dh_df_tmp.head()}")
            dh_df_tmp['symbol'] = ticker
            dh_df = dh_df_tmp
            # print(f"Historical dividends for {ticker} dataframe: \n{dh_df_tmp.head()}")
            # print(f"Historical dividends for {ticker} full dataframe: \n{dh_df.head()}")
            dh_df['date'] = pd.to_datetime(dh_df['date'])
            dh_df = dh_df.set_index(['symbol', 'date'])
            n_divs_hist = len(dh_df)
            print(f"Total dividends history reports for {ticker}: {n_divs_hist}")
            # print(f"Historical dividends for {ticker} full dataframe: \n{dh_df}")
            divs_hist_all_df = pd.concat([divs_hist_all_df, dh_df])
    return divs_hist_all_df


In [None]:
# divs_hist_file = 'data/dividends_history.csv.bz2'

In [None]:
# divs_hist_all_df.to_csv(divs_hist_file)

### Read back data and verify it

In [None]:
import pandas as pd

earnings_loaded_df = pd.read_csv('data/earnings_calendar.csv.bz2', index_col=['symbol', 'date'])
print(earnings_loaded_df)

## Prepare key metrics data for company fundamentals

In [None]:

keymetrics_all_df = pd.DataFrame()
for ticker in stocks_ticker_set:
    kms = fmpsdk.key_metrics(apikey=FMP_API_KEY, symbol=ticker, period='quarter', limit=-1)
    if kms is not None and len(kms) > 0:
        kms_df = pd.DataFrame(kms)
        kms_df['date'] = pd.to_datetime(kms_df['date'])
        kms_df = kms_df.set_index(['symbol', 'date'])
        # print(f"Key metrics for {ticker} sample: \n{kms_df.columns}")
        keymetrics_all_df = pd.concat([keymetrics_all_df, kms_df])
        # print(f"Key metrics concatenated {ticker}: \n{keymetrics_all_df.columns}")
        n_kms = len(kms_df)
        print(f"Total key metrics reports for {ticker}: {n_kms}")
    else:
        print(f"No {ticker} key metrics reports: kms={kms}")

Experiment with other stock data

In [None]:
keymetrics_all_df

In [None]:
kms_file = 'data/keymetrics_history.csv.bz2'
keymetrics_all_df.to_csv(kms_file)

## Prepare forward looking analyst estimates to be used as future covariates

In [None]:
DEFAULT_LIMIT=-1
import typing
from fmpsdk.url_methods import __return_json_v3, __validate_period


def analyst_estimates(
    apikey: str, 
    symbol: str, 
    period: str = "annual",
    limit: int = DEFAULT_LIMIT
) -> typing.Optional[typing.List[typing.Dict]]:
    """
    Query FMP /analyst-estimates/ API.

    :param apikey: Your API key.
    :param symbol: Company ticker.
    :param period: 'annual' or 'quarter'
    :param limit: Number of rows to return.
    :return: A list of dictionaries.
    """
    path = f"/analyst-estimates/{symbol}"
    query_vars = {
        "apikey": apikey,
        "symbol": symbol,
        "period": __validate_period(value=period),
        "limit": limit,
    }
    return __return_json_v3(path=path, query_vars=query_vars)



In [None]:

def fetch_estimates(period=None):
    assert period in ['quarter', 'annual']
    estimates_all_df = pd.DataFrame()
    for ticker in stocks_ticker_set: # ['ALTR']: 
        est = analyst_estimates(apikey=FMP_API_KEY, symbol=ticker, period=period, limit=-1)
        # print('est:', est)
        if est is not None and len(est) > 0:
            est_df = pd.DataFrame(est)
            est_df['date'] = pd.to_datetime(est_df['date'])
            est_df = est_df.set_index(['symbol', 'date'])
            # print(f"Analyst estimates for {ticker} sample: \n{est_df.columns}")
            estimates_all_df = pd.concat([estimates_all_df, est_df])
            # print(f"Key metrics concatenated {ticker}: \n{estimates_all_df.columns}")
            n_est = len(est_df)
            print(f"{n_est} total {ticker} {period} analyst estimates reports")
        else:
            print(f"No {ticker} {period} analyst estimates reports: est={est}")

    return estimates_all_df



In [None]:
# 'TW' in stocks_ticker_set

In [None]:
for p in ['annual', 'quarter']:
    estimates_all_df = fetch_estimates(p)
    est_file_name= f'data/analyst_estimates_{p}.csv.bz2'    
    estimates_all_df.to_csv(est_file_name)
    print(f'all {p} estimates count:', len(estimates_all_df.index))
    