In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# pip install -r requirements.txt
# !pip install -e ./

import sys
!{sys.executable} -m pip install -e ./


Explore stock market dataset from Yahoo Finance

In [None]:
import yfinance as yf
import pandas as pd
from pathlib import Path
import numpy as np

In [None]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# API key for the FMP service; os.getenv returns None when the variable is not set.
FMP_API_KEY = os.getenv("FMP_API_KEY")

# Report only whether a key was found, so the secret never lands in notebook output.
# Identity comparison (`is not None`) is the correct way to test for None.
print(f'FMP_API_KEY={FMP_API_KEY is not None}')

## Load list of IBD Growth Stocks

In [None]:
# File name (inside data/) of the consolidated symbol list. It is read back as one of
# the input lists below and rewritten after the symbols have been cleaned.
all_stocks_file = 'all_stocks.csv'

In [None]:
# Union of the 'Symbol' column of every input list that exists under data/.
all_stock_set = set()
stock_files = [
    'IBD50.csv',
    'IBD250.csv',
    'IBD250_18Jan2024.csv',
    'ibdlive_picks.csv',
    'russell2000_iwm_holdings.csv',
    'sp500_ivv_holdings.csv',
    'nasdaq100_cndx_holdings.csv',
    all_stocks_file,
]
for file_name in stock_files:
    csv_path = f'data/{file_name}'
    # Missing lists are reported and skipped rather than treated as an error.
    if not Path(csv_path).is_file():
        print(f'{csv_path} not found.')
        continue
    symbols_df = pd.read_csv(csv_path)
    print(f'loaded {len(symbols_df)} symbols from {csv_path}')
    file_symbols = set(symbols_df['Symbol'])
    print(f'{len(file_symbols)} symbols in stock set')
    all_stock_set.update(file_symbols)
    print(f'total symbols loaded: {len(all_stock_set)}')


In [None]:
len(all_stock_set), all_stock_set

In [None]:
# Alias for all_stock_set (the same set object, not a copy); the per-ticker download
# loops further down iterate over this name.
stocks_ticker_set = all_stock_set

In [None]:
# drop known junk symbols from the data feed
junk = ['MSFUT', 'GEFB', 'METCV', 'SGAFT', 'NQH4', 'XTSLA', '-', 'PDLI', 'ADRO', 'ICSUAGD', 'BFB', 'GTXI', 'P5N994', 'LGFB', 'MLIFT', 'ESH4', 'LGFA', 'MOGA', 'PBRA', 'BRKB', 'RTYH4', '\xa0', 'CRDA']

# Build a column-less frame whose index ('Symbol') holds the unique, cleaned tickers.
# The rows are filtered on the frame itself instead of re-assigning a shorter index,
# which pandas only tolerates while the frame has no columns.
growth_stocks_df = (
    pd.DataFrame({'Symbol': list(stocks_ticker_set)})
    .drop_duplicates(subset='Symbol')
    .set_index('Symbol')
    # errors='ignore': a junk symbol that is absent from this run's input lists
    # must not abort the cell with a KeyError.
    .drop(index=junk, errors='ignore')
)
growth_stocks_df

In [None]:
growth_stocks_df.to_csv(f'data/{all_stocks_file}')

## Prepare broad market indices

In [None]:
# Space-separated tickers for the broad market download:
#   * S&P 500, NASDAQ-100 and Russell 2000 indices and their equal-weighted counterparts
#   * the VIX volatility index and the US Dollar index (DX-Y.NYB)
#   * the Treasury yield indices ^IRX, ^FVX and ^TNX
#     (presumably 13-week, 5-year and 10-year yields respectively -- TODO confirm;
#     the earlier note here said "12 Weeks" and attributed it to TNX)
broad_market_indicies = '^SPX ^SPXEW ^NDX ^NDXE ^RUT ^R2ESC ^VIX DX-Y.NYB ^IRX ^FVX ^TNX'

In [None]:
# Download the maximum available history for every broad-market ticker in one call.
# group_by='ticker' is the documented spelling for ticker-first columns; the previous
# 'tickers' value is not a documented option.
broad_market = yf.download(broad_market_indicies, period='max', group_by='ticker')
broad_market

In [None]:
# Persist the broad-market history; pandas infers bz2 compression from the extension.
# `index` is a boolean flag, so pass True explicitly -- the former 'Date' string was
# only accepted because it is truthy (a label would belong in `index_label`).
broad_market.to_csv('data/broad_market.csv.bz2', index=True)

## Prepare Sector Indices

In [None]:
# Space-separated tickers for the sector download.
# NOTE(review): XLE is the only entry that is not of the ^SP500-NN form -- presumably
# it stands in for a sector whose ^SP500-NN ticker is missing from this list; confirm.
sector_indicies = 'XLE ^SP500-15 ^SP500-20 ^SP500-25 ^SP500-30 ^SP500-35 ^SP500-40 ^SP500-45 ^SP500-50 ^SP500-55 ^SP500-60'

In [None]:
# Download the maximum available history for all sector tickers in one call.
# No group_by argument is passed here, unlike the broad-market download, so the
# library's default column layout applies.
sectors = yf.download(sector_indicies, period='max') 
sectors

In [None]:
# Persist the sector history as a bz2-compressed CSV (compression is inferred from the extension).
sectors.to_csv('data/sectors.csv.bz2')

## Prepare stocks price data

In [None]:
# Bar interval passed to yf.download and embedded in the price-history file name;
# "1wk" is the alternative that was tried.
price_interval = "1d" # "1wk"

In [None]:
# Download the maximum available price history for every collected symbol.
# group_by='ticker' is the documented spelling for ticker-first columns, which the
# cells below rely on (e.g. stock_price_data['ADV']); 'tickers' is not documented.
# NOTE(review): all_stock_set still contains the junk symbols that were removed from
# growth_stocks_df -- consider downloading from the cleaned list instead.
stock_price_data = yf.download(all_stock_set, period='max', group_by='ticker', interval=price_interval)
stock_price_data

In [None]:
stock_price_data.columns.levels

In [None]:
stock_price_data.tail(20)

In [None]:
stock_price_data.dropna(how='all')

In [None]:
# Boolean mask over the 'Date' index marking the 2024-01-24 row(s), then select with it.
on_jan24 = stock_price_data.index.get_level_values('Date') == pd.Timestamp('2024-01-24')
jan24 = stock_price_data.loc[on_jan24]


In [None]:
jan24

In [None]:
# Keep only the columns that have no missing value on that date
# (axis=1 drops columns, not rows).
jan24nona = jan24.dropna(axis=1)

In [None]:
jan24nona

In [None]:
stock_price_data['ADV'].dropna()

In [None]:
stock_price_data['VRTS'].dropna()

In [None]:
stock_price_data['BHVN'].dropna()

In [None]:
bhvn = stock_price_data['BHVN']

In [None]:
bhvn

In [None]:
bhvn.dropna(how="all")

In [None]:
# Output path for the combined price history; the interval is part of the file name
# so that downloads made with different intervals do not overwrite each other.
price_hist_file = f'data/all_stocks_price_hist_{price_interval}.csv.bz2'

In [None]:
# Persist the price history; pandas infers bz2 compression from the extension.
# `index` is a boolean flag, so pass True explicitly -- the former 'Date' string was
# only accepted because it is truthy (a label would belong in `index_label`).
stock_price_data.to_csv(price_hist_file, index=True)

In [None]:
# stock_price_data.to_csv('data/all_stocks_price_hist.bak.csv', index='Date')

In [None]:
# Read the saved history back: two header rows rebuild the two-level columns and the
# first column is the index. parse_dates=True turns that index back into datetimes;
# without it the dates come back as plain strings.
stock_price_data_loaded = pd.read_csv(price_hist_file, header=[0, 1], index_col=0, parse_dates=True)
stock_price_data_loaded

In [None]:
# Spot-check the reloaded data for the first two entries of the top column level.
sample_tickers = stock_price_data_loaded.columns.levels[0][:2]
for ticker in sample_tickers:
    print(f'ticker: {ticker}')
    history = stock_price_data_loaded[ticker]
    print(f'ticker historic data: {history}')
    # Rows with any missing field are discarded before the second print.
    complete_rows = history.dropna()
    print(f'ticker historic data without missing data: {complete_rows}')


## Prepare historical stock sales and earnings data

In [None]:
import fmpsdk

# Company Valuation Methods
# Sample argument values for trying out fmpsdk calls; only `symbol` is used by the
# smoke test at the bottom of this cell.
symbol: str = "AAPL"
# `symbols` and `exchanges` used to be bare annotations (`symbols: [...]`), which
# never bind a value; they are now real assignments.
symbols: list = ["AAPL", "CSCO", "QQQQ"]
exchange: str = "NYSE"
exchanges: list = ["NYSE", "NASDAQ"]
query: str = "AA"
limit: int = 3
period: str = "quarter"
download: bool = True
market_cap_more_than: int = 1000000000
beta_more_than: int = 1
volume_more_than: int = 10000
sector: str = "Technology"
dividend_more_than: int = 0
industry: str = "Software"
filing_type: str = "10-K"
# Smoke test: fetch one company profile to confirm that the API key works.
print(f"Company Profile: {fmpsdk.company_profile(apikey=FMP_API_KEY, symbol=symbol)=}")


In [None]:
# Fetch the historical earnings calendar of every ticker and combine the results into
# one frame indexed by (symbol, date).
# Per-ticker frames are collected in a list and concatenated once at the end; growing
# a DataFrame with pd.concat inside the loop re-copies all earlier rows every time.
earnings_frames = []
for ticker in stocks_ticker_set:
    # limit=-1 presumably asks the API for all available reports -- TODO confirm.
    earnings = fmpsdk.historical_earning_calendar(apikey=FMP_API_KEY, symbol=ticker, limit=-1)
    if earnings is not None and len(earnings) > 0:
        edf = pd.DataFrame(earnings)
        edf['date'] = pd.to_datetime(edf['date'])
        edf = edf.set_index(['symbol', 'date'])
        earnings_frames.append(edf)

# pd.concat rejects an empty list, so fall back to an empty frame when nothing was fetched.
earnings_all_df = pd.concat(earnings_frames) if earnings_frames else pd.DataFrame()


In [None]:
earnings

In [None]:
aaon = earnings_all_df.loc[['AAON']]

In [None]:
aaon

In [None]:
len(earnings_all_df)

In [None]:
earnings_all_df


In [None]:
len(earnings_all_df.index.levels[0])


In [None]:
# Output path of the combined earnings calendar (bz2-compressed CSV).
earnings_file = 'data/earnings_calendar.csv.bz2'

In [None]:
# Write the combined earnings frame, including its index, to earnings_file.
earnings_all_df.to_csv(earnings_file)

### Read back data and verify it

In [None]:
import pandas as pd

# Reload the saved earnings calendar with the same (symbol, date) index it was written
# with. parse_dates restores the 'date' level as datetimes; without it the level comes
# back as strings and no longer matches the in-memory frame.
earnings_loaded_df = pd.read_csv(
    'data/earnings_calendar.csv.bz2',
    index_col=['symbol', 'date'],
    parse_dates=['date'],
)
print(earnings_loaded_df)

## Prepare historical dividends
  * This is secondary information, since growth stocks usually do not pay dividends and rarely have splits
  * Additionally, dividend and split information is partially reflected in the Adj Close column of the price history data

In [None]:
def fetch_dividends_history(tickers=None):
    """
    Fetch the historical dividends of each ticker and combine them into one frame.

    :param tickers: Iterable of ticker symbols; defaults to the notebook-level
        stocks_ticker_set when omitted.
    :return: A DataFrame indexed by (symbol, date) with one row per dividend record,
        or an empty DataFrame when no ticker returned any record.
    """
    if tickers is None:
        tickers = stocks_ticker_set

    frames = []
    for ticker in tickers:
        divs_hist = fmpsdk.historical_stock_dividend(apikey=FMP_API_KEY, symbol=ticker)
        # Guard against a missing response or a response without the 'historical' key
        # before measuring it; the old code indexed the response first and only then
        # checked for None.
        historical = divs_hist.get('historical') if isinstance(divs_hist, dict) else None
        if not historical:
            print(f"Loaded 0 historical dividends for {ticker}")
            continue
        print(f"Loaded {len(historical)} historical dividends for {ticker}")

        dh_df = pd.DataFrame.from_dict(data=historical)
        # The records carry no symbol of their own here, so tag them before indexing.
        dh_df['symbol'] = ticker
        dh_df['date'] = pd.to_datetime(dh_df['date'])
        dh_df = dh_df.set_index(['symbol', 'date'])
        print(f"Total dividends history reports for {ticker}: {len(dh_df)}")
        frames.append(dh_df)

    # Concatenate once; pd.concat rejects an empty list, hence the fallback.
    return pd.concat(frames) if frames else pd.DataFrame()


In [None]:
# divs_hist_file = 'data/dividends_history.csv.bz2'

In [None]:
# divs_hist_all_df.to_csv(divs_hist_file)

### Read back data and verify it

In [None]:
import pandas as pd

# NOTE(review): this cell sits under the dividends heading but re-reads the *earnings*
# calendar file, exactly like the earlier read-back cell. The dividends save cells are
# commented out, so there is presumably no dividends file to verify yet -- confirm intent.
earnings_loaded_df = pd.read_csv('data/earnings_calendar.csv.bz2', index_col=['symbol', 'date'])
print(earnings_loaded_df)

## Prepare key metrics data for company fundamentals

In [None]:

# Fetch the quarterly key metrics of every ticker and combine them into one frame
# indexed by (symbol, date).
# Per-ticker frames are collected in a list and concatenated once at the end; growing
# a DataFrame with pd.concat inside the loop re-copies all earlier rows every time.
keymetrics_frames = []
for ticker in stocks_ticker_set:
    # limit=-1 presumably asks the API for all available reports -- TODO confirm.
    kms = fmpsdk.key_metrics(apikey=FMP_API_KEY, symbol=ticker, period='quarter', limit=-1)
    if kms is not None and len(kms) > 0:
        kms_df = pd.DataFrame(kms)
        kms_df['date'] = pd.to_datetime(kms_df['date'])
        kms_df = kms_df.set_index(['symbol', 'date'])
        keymetrics_frames.append(kms_df)
        n_kms = len(kms_df)
        print(f"Total key metrics reports for {ticker}: {n_kms}")
    else:
        print(f"No {ticker} key metrics reports: kms={kms}")

# pd.concat rejects an empty list, so fall back to an empty frame when nothing was fetched.
keymetrics_all_df = pd.concat(keymetrics_frames) if keymetrics_frames else pd.DataFrame()

In [None]:
keymetrics_all_df

In [None]:
# Persist the combined key metrics, including the index, as a bz2-compressed CSV.
kms_file = 'data/keymetrics_history.csv.bz2'
keymetrics_all_df.to_csv(kms_file)

## Prepare institutional ownership data

In [None]:

from fmpsdk.settings import DEFAULT_LIMIT, SEC_RSS_FEEDS_FILENAME, BASE_URL_v3
from fmpsdk.url_methods import __return_json_v4
import typing


def institutional_symbol_ownership(
    apikey: str,
    symbol: str,
    limit: int,
    includeCurrentQuarter: bool = False,
) -> typing.Optional[typing.List[typing.Dict]]:
    """
    Call the FMP institutional-ownership/symbol-ownership endpoint for one ticker.

    The request is delegated to fmpsdk's v4 JSON helper; whatever that helper
    returns is passed straight back to the caller.

    :param apikey: Your API key.
    :param symbol: Company ticker.
    :param limit: Upper bound on the number of quarterly reports to return.
    :param includeCurrentQuarter: Whether data already available for the current
        quarter should be included.
    :return: A list of dictionaries.
    """
    return __return_json_v4(
        path="institutional-ownership/symbol-ownership",
        query_vars={
            "symbol": symbol,
            "apikey": apikey,
            "includeCurrentQuarter": includeCurrentQuarter,
            "limit": limit,
        },
    )


In [None]:

# Fetch the institutional ownership reports of every ticker and combine them into one
# frame indexed by (symbol, date).
# Per-ticker frames are collected in a list and concatenated once at the end; growing
# a DataFrame with pd.concat inside the loop re-copies all earlier rows every time.
inst_ownership_frames = []
for ticker in stocks_ticker_set:
    # limit=-1 presumably asks the API for all available reports -- TODO confirm.
    inst_ownership = institutional_symbol_ownership(apikey=FMP_API_KEY, symbol=ticker, limit=-1)
    if inst_ownership is not None and len(inst_ownership) > 0:
        inst_ownership_df = pd.DataFrame(inst_ownership)
        inst_ownership_df['date'] = pd.to_datetime(inst_ownership_df['date'])
        inst_ownership_df = inst_ownership_df.set_index(['symbol', 'date'])
        n_iown = len(inst_ownership_df)
        print(f"Total institutional ownership reports for {ticker}: {n_iown}")
        inst_ownership_frames.append(inst_ownership_df)
    else:
        print(f"No {ticker} institutional ownership reports: inst_ownership={inst_ownership}")

# pd.concat rejects an empty list, so fall back to an empty frame when nothing was fetched.
inst_ownership_all_df = (
    pd.concat(inst_ownership_frames) if inst_ownership_frames else pd.DataFrame()
)

In [None]:
inst_ownership_all_df

In [None]:
# Positional slice of two rows (positions 15707 and 15708) picked for inspection.
# NOTE(review): the positions are hard-coded; row order follows iteration over a set
# and whatever the API returned, so they presumably point at different rows on a
# fresh run -- confirm before relying on this cell.
bad1 = inst_ownership_all_df[15707:15709]

In [None]:
bad1[['totalPutsChange', 'totalCallsChange']]

In [None]:
# Rows in which np.isreal is not true for every cell: applymap tests each cell,
# .all(1) reduces per row, and ~ inverts the mask to keep the failing rows.
# NOTE(review): recent pandas releases deprecate DataFrame.applymap in favour of
# DataFrame.map -- switch once the minimum supported pandas version allows it.
offenders = inst_ownership_all_df[~inst_ownership_all_df.applymap(np.isreal).all(1)]

In [None]:
offenders[['totalPutsChange', 'totalCallsChange']]

In [None]:
import numpy as np

def check_int(value):
    """
    Surface values that cannot be converted to an integer.

    :param value: Any object.
    :return: NaN when ``int(value)`` succeeds, otherwise the original value, so that
        applying this to a column leaves only the offending entries visible.
    """
    try:
        int(value)
    except (ValueError, TypeError):
        # int() raises TypeError (not ValueError) for None and other non-numeric
        # objects; those are offenders as well, so hand them back instead of crashing.
        return value
    # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
    return np.nan

In [None]:
offenders['totalPutsChange'].apply(check_int)

In [None]:
# Try to cast the two change columns to int64. With errors="ignore" a failed cast
# hands back the original data instead of raising, so `nans` ends up being the
# missing-value mask of whatever astype returned.
nans = offenders.astype({'totalPutsChange': 'int64', 'totalCallsChange': 'int64'}, errors="ignore").isna()

In [None]:
len(offenders[['totalPutsChange']])

In [None]:
inst_ownership_all_df.dtypes

In [None]:
# Persist the combined institutional ownership data, including the index, as a
# bz2-compressed CSV.
inst_ownership_file = 'data/institutional_symbol_ownership.csv.bz2'
inst_ownership_all_df.to_csv(inst_ownership_file)

## Prepare forward looking analyst estimates to be used as future covariates

In [None]:
# Rebinds DEFAULT_LIMIT, shadowing the value imported from fmpsdk.settings earlier in
# the notebook; analyst_estimates below uses it as the default for `limit`.
DEFAULT_LIMIT=-1
import typing
from fmpsdk.url_methods import __return_json_v3, __validate_period


def analyst_estimates(
    apikey: str,
    symbol: str,
    period: str = "annual",
    limit: int = DEFAULT_LIMIT
) -> typing.Optional[typing.List[typing.Dict]]:
    """
    Query FMP /analyst-estimates/ API.

    :param apikey: Your API key.
    :param symbol: Company ticker.
    :param period: 'annual' or 'quarter'
    :param limit: Number of rows to return.
    :return: A list of dictionaries.
    """
    # Relative path without a leading slash, matching the path style used by
    # institutional_symbol_ownership in this notebook. The leading "/" presumably
    # produced a double slash once joined to the SDK's base URL -- confirm against
    # fmpsdk's BASE_URL_v3.
    path = f"analyst-estimates/{symbol}"
    query_vars = {
        "apikey": apikey,
        "symbol": symbol,
        # Validation of the period value is delegated to fmpsdk's own helper.
        "period": __validate_period(value=period),
        "limit": limit,
    }
    return __return_json_v3(path=path, query_vars=query_vars)



In [None]:

def fetch_estimates(period=None, tickers=None):
    """
    Fetch the analyst estimates of each ticker and combine them into one frame.

    :param period: 'quarter' or 'annual'. Required despite the None default; any
        other value fails the assertion below.
    :param tickers: Iterable of ticker symbols; defaults to the notebook-level
        stocks_ticker_set when omitted.
    :return: A DataFrame indexed by (symbol, date), or an empty DataFrame when no
        ticker returned any estimate.
    """
    assert period in ['quarter', 'annual'], f"period must be 'quarter' or 'annual', got {period!r}"
    if tickers is None:
        tickers = stocks_ticker_set

    # Collect per-ticker frames and concatenate once at the end; growing a DataFrame
    # with pd.concat inside the loop re-copies all earlier rows every time.
    frames = []
    for ticker in tickers:
        est = analyst_estimates(apikey=FMP_API_KEY, symbol=ticker, period=period, limit=-1)
        if est is not None and len(est) > 0:
            est_df = pd.DataFrame(est)
            est_df['date'] = pd.to_datetime(est_df['date'])
            est_df = est_df.set_index(['symbol', 'date'])
            frames.append(est_df)
            n_est = len(est_df)
            print(f"{n_est} total {ticker} {period} analyst estimates reports")
        else:
            print(f"No {ticker} {period} analyst estimates reports: est={est}")

    # pd.concat rejects an empty list, hence the fallback to an empty frame.
    return pd.concat(frames) if frames else pd.DataFrame()



In [None]:
# 'TW' in stocks_ticker_set

In [None]:
# Fetch and save the analyst estimates once per reporting period, one file per period.
for p in ('annual', 'quarter'):
    est_file_name = f'data/analyst_estimates_{p}.csv.bz2'
    estimates_all_df = fetch_estimates(p)
    estimates_all_df.to_csv(est_file_name)
    n_estimates = len(estimates_all_df.index)
    print(f'all {p} estimates count:', n_estimates)
    