In [None]:
# default_exp priceHist

# priceHist

> Parse historical stock price data files.

In [None]:
#hide
%load_ext autoreload
%autoreload 2
from nbdev import show_doc

In [None]:
#export

import collections
import datetime
import numpy as np
import os
import pandas as pd

from secscan import utils,tickerMap,scrape8K,scrape6K

# Exchange codes used as the default list of exchanges in getCombDayMap.
USExchanges = ['AMEX','NASDAQ','NYSE','OTCBB']

This module uses the historical price data from eoddata.com,
which comes as one CSV file per exchange per day. Sample:

```
Symbol,Date,Open,High,Low,Close,Volume
AACG,08-Jan-2020,1.41,1.58,1.3642,1.5112,90800
AAL,08-Jan-2020,27.1,28.09,27.07,27.84,10497200
...
```


In [None]:
#export

def getHistFStuff(exch,dateStr) :
    """
    Locates the CSV file of historical prices for the given exchange and date.

    The file is named <exch>_<dateStr>.csv and is kept in a directory under
    utils.stockPriceRoot named by the first len(exch)+5 characters of the file
    name - i.e. <exch>_<year> when dateStr starts with a 4-character year.

    Returns (fName, fDir, fPath).
    """
    csvName = '_'.join((exch,dateStr)) + '.csv'
    # directory name = exchange code + '_' + the 4 year characters
    dirName = csvName[:len(exch)+len('_YYYY')]
    csvDir = os.path.join(utils.stockPriceRoot,dirName)
    return csvName, csvDir, os.path.join(csvDir,csvName)

In [None]:
# test getHistFStuff:
# checks the file name, the per-exchange/per-year directory, and the full path

t = getHistFStuff('NASDAQ','20240510')
assert (t[0]=='NASDAQ_20240510.csv'
        and t[1]==os.path.join(utils.stockPriceRoot,'NASDAQ_2024')
        and t[2]==os.path.join(utils.stockPriceRoot,'NASDAQ_2024',t[0]))

In [None]:
#export

def getDayMap(dateStr, exch, symCol='Symbol', priceCol='Close') :
    """
    Reads the price CSV file for one exchange on one day.

    symCol names the column holding the stock symbols and priceCol the column
    whose values are wanted. Rows whose symbol was not parsed as a string are
    reported on stdout and left out.

    Returns a dict:<stock symbol> -> <value>, which is empty if the file doesn't exist.
    """
    _,_,csvPath = getHistFStuff(exch,dateStr)
    if not os.path.exists(csvPath) :
        return {}
    # na_filter=False keeps symbols such as 'NA' as plain strings instead of NaN
    table = pd.read_csv(csvPath, na_filter=False)
    symToVal = {}
    for symbol,value in zip(table[symCol],table[priceCol]) :
        if not isinstance(symbol,str) :
            print(dateStr,exch,'non-string symbol',repr(symbol))
            continue
        symToVal[symbol] = value
    return symToVal

In [None]:
# test getDayMap:

# Unless the price root already contains the '\ikedi\' path component
# (presumably the author's machine holding the full price history - confirm),
# switch to the 'testdata' price root.
if not ('\\ikedi\\' in utils.stockPriceRoot) :
    utils.setStockPriceRoot('testdata')

# expected symbol count, plus first and last symbols with their closing prices
m = getDayMap('20200108','AMEX')
assert (len(m)==2024
        and min(m.keys())=='AAAU' and m['AAAU']==15.56
        and max(m.keys())=='ZSL' and m['ZSL']==26.04)

m = getDayMap('20200108','NASDAQ')
assert (len(m)==3233
        and min(m.keys())=='AACG' and m['AACG']==1.5112
        and max(m.keys())=='ZYXI' and m['ZYXI']==8.47)

In [None]:
#export

@utils.delegates(getDayMap)
def getCombDayMap(dateStr, exchs=USExchanges, checkDups=True, **kwargs) :
    """
    Merges the day maps of several exchanges for one day.

    Exchanges are processed in the order given, so when a symbol appears on more
    than one exchange the value from the later exchange wins. If checkDups is
    True, up to 10 of the symbols an exchange shares with the earlier exchanges
    are printed. kwargs are passed on to getDayMap.

    Returns a dict:<stock symbol> -> <value>
    """
    merged = {}
    for exchange in exchs :
        exchMap = getDayMap(dateStr,exchange,**kwargs)
        if checkDups :
            clashes = [sym for sym in exchMap if sym in merged]
            if clashes :
                print('duplicated keys:',exchange,dateStr,sorted(clashes)[:10])
        merged.update(exchMap)
    return merged

In [None]:
# test getCombDayMap:
# the combined size equals the sum of the AMEX (2024) and NASDAQ (3233) sizes,
# so no symbol is duplicated between these two exchanges on this day

m = getCombDayMap('20200108', exchs=['AMEX','NASDAQ'])
assert len(m)==2024+3233

In [None]:
#export

@utils.delegates(getCombDayMap)
def getCombDayMapWithLookback(forD, dayMapCache=None, lookback=7, **kwargs) :
    """
    Builds the day map for forD, filling in stocks that didn't trade that day
    with their most recent value from up to lookback days earlier.

    The combined day maps for the dates from forD-lookback up to forD+1, as
    generated by utils.dateStrsBetween, are layered in that order so that later
    days overwrite earlier ones.

    dayMapCache, if supplied, is a dict:dateStr -> combined day map; it is
    consulted before reading any files and updated with each newly read day,
    so it can be shared between calls. kwargs are passed on to getCombDayMap.

    Returns a dict:<stock symbol> -> <value>
    """
    cache = {} if dayMapCache is None else dayMapCache
    targetD = utils.toDate(forD)
    firstD = targetD + datetime.timedelta(days=-lookback)
    endD = targetD + datetime.timedelta(days=1)
    layered = {}
    for dStr in utils.dateStrsBetween(firstD, endD) :
        if dStr not in cache :
            cache[dStr] = getCombDayMap(dStr, **kwargs)
        layered.update(cache[dStr])
    return layered

@utils.delegates(getCombDayMapWithLookback)
def getCombDayMapsForRangeWithLookback(d1, d2, **kwargs) :
    """
    Builds the combined day map, with lookback, for every date from d1 (inclusive)
    to d2 (exclusive), leaving out weekend dates.

    A single day map cache is shared by all the dates so each day's files are
    read only once. kwargs are passed on to getCombDayMapWithLookback.

    Returns a dict:dateStr -> {<stock symbol> -> <value>}
    """
    sharedCache = {}
    return {
        dStr : getCombDayMapWithLookback(dStr, dayMapCache=sharedCache, **kwargs)
        for dStr in utils.dateStrsBetween(d1, d2, excludeWeekends=True)
    }

Direct indexing tests:

In [None]:
#export

@utils.delegates(getCombDayMapsForRangeWithLookback)
def getCleanedPriceData(d1, d2, minPrice=None, restrictToNames=False, **kwargs) :
    """
    Constructs a price dataset for direct indexing tests.

    First gets all the combined day maps from d1 to d2, with specified lookback,
    and skipping weekend days (kwargs are passed on to
    getCombDayMapsForRangeWithLookback).

    Then, cuts down to stocks that have values for all the specified days.

    If minPrice is not None, also drops any stock with value < minPrice on any day
    (i.e. keeps stocks with value >= minPrice on all days).

    If restrictToNames is True, also cuts down to stocks present in the SEC ticker list.

    Prints the resulting number of symbols and dates.

    Returns a tuple (syms, symNames, dateStrs, priceMat), where:
    syms is a sorted list of ticker symbols
    symNames is a list of corresponding display suffixes: ' (<name>)' using the name
        from the SEC ticker list, or '' if the symbol is not in that list
    dateStrs is a sorted list of date strings in the format '20240624'
    priceMat is a matrix of values with len(syms) rows and len(dateStrs) columns

    If the date range contains no weekdays, all three lists are empty and
    priceMat has shape (0, 0).
    """
    dayMaps = getCombDayMapsForRangeWithLookback(d1, d2, **kwargs)
    dateStrs = sorted(dayMaps.keys())
    if dayMaps :
        symsPresentAllDays = sorted(set.intersection(*(set(dayMap.keys())
                                                       for dayMap in dayMaps.values())))
    else :
        # set.intersection() needs at least one set, so an empty range is handled here
        symsPresentAllDays = []
    if minPrice is not None :
        # only the stocks present on all days can survive, so only those are checked;
        # "not any(<)" rather than "all(>=)" so NaN values are treated as before
        symsPresentAllDays = [sym for sym in symsPresentAllDays
                              if not any(dayMap[sym] < minPrice
                                         for dayMap in dayMaps.values())]
    tickerNames = tickerMap.getSecTickerDict(field2='name')
    if restrictToNames :
        symsPresentAllDays = [sym for sym in symsPresentAllDays
                              if sym in tickerNames]
    priceMat = np.zeros((len(symsPresentAllDays), len(dateStrs)))
    for dateNo,d in enumerate(dateStrs) :
        dayMap = dayMaps[d]
        for symNo,sym in enumerate(symsPresentAllDays) :
            priceMat[symNo,dateNo] = dayMap[sym]
    symNames = [
        f' ({tickerNames[sym]})' if sym in tickerNames else ''
        for sym in symsPresentAllDays
    ]
    print(len(symsPresentAllDays),'symbols',len(dateStrs),'dates')
    return (symsPresentAllDays, symNames, dateStrs, priceMat)

@utils.delegates(getCleanedPriceData)
def getForwardReturns(d1, d2, d3, weekdaysForward=20, **kwargs) :
    """
    Computes forward price ratios for direct indexing tests.

    Gets the cleaned price data for the dates from d1 to d3 (kwargs are passed on
    to getCleanedPriceData). Then, for every date before d2, divides each stock's
    value weekdaysForward dates later (weekend dates are not counted) by its
    value on that date. The range up to d3 must therefore extend far enough past
    d2 to supply the later values.

    d2 is compared directly against the date strings, so it should be a string
    in the same format, e.g. '20240624'.

    Returns a tuple (syms, symNames, dateStrs, returnMat), where:
    syms and symNames are as returned by getCleanedPriceData
    dateStrs is the list of date strings before d2
    returnMat is a matrix of price ratios with len(syms) rows and len(dateStrs) columns

    Raises ValueError if weekdaysForward is negative, and IndexError if the price
    data doesn't extend weekdaysForward dates past the last date before d2.
    """
    if weekdaysForward < 0 :
        # a negative offset would silently wrap around to the last columns of the matrix
        raise ValueError(f'weekdaysForward must be >= 0, got {weekdaysForward}')
    syms, symNames, dateStrs, priceMat = getCleanedPriceData(d1,d3,**kwargs)
    # the date strings are sorted, so the dates before d2 are the leading columns
    dateStrs = [d for d in dateStrs if d<d2]
    nDates = len(dateStrs)
    if nDates > 0 and nDates+weekdaysForward > priceMat.shape[1] :
        raise IndexError(f'need {nDates+weekdaysForward} dates of prices for'
                         f' weekdaysForward={weekdaysForward}'
                         f' but only {priceMat.shape[1]} are available up to d3')
    returnMat = priceMat[:,weekdaysForward:weekdaysForward+nDates] / priceMat[:,:nDates]
    return (syms, symNames, dateStrs, returnMat)

def getClosestReturn(sym, syms, symNames, dateStrs, returnMat) :
    """
    Ranks all the stocks by how close their row of returnMat is to the row for sym.

    The distance is the sum over columns of the squared difference between the
    two rows. syms, symNames, dateStrs and returnMat are as returned by
    getForwardReturns (dateStrs is not used).

    Returns a list of (<symbol>+<symName>, <distance>) sorted by increasing
    distance, so sym itself comes first with distance 0.
    Raises ValueError if sym is not in syms.
    """
    targetRow = returnMat[syms.index(sym)]
    sqDists = np.square(returnMat - targetRow).sum(axis=1)
    labels = [s+name for s,name in zip(syms,symNames)]
    return sorted(zip(labels,sqDists), key=lambda pair : pair[1])

In [None]:
#notest

# forward returns on opening prices (default 20 weekdays forward) for the dates in 2023,
# with price data loaded up to 20240201; m is unpacked into getClosestReturn below
m = getForwardReturns('20230101','20240101','20240201',minPrice=1.0,priceCol='Open',lookback=4)

duplicated keys: NYSE 20230306 ['AMB.W']
duplicated keys: NYSE 20230307 ['AMB.W']
duplicated keys: NYSE 20230308 ['AMB.W']
duplicated keys: NYSE 20230309 ['AMB.W']
duplicated keys: NYSE 20230310 ['AMB.W']
duplicated keys: NYSE 20230313 ['AMB.W']
duplicated keys: NYSE 20230314 ['AMB.W']
duplicated keys: NYSE 20230315 ['AMB.W']
duplicated keys: NYSE 20230316 ['AMB.W']
duplicated keys: NYSE 20230317 ['AMB.W']
duplicated keys: NYSE 20230320 ['AMB.W']
duplicated keys: NYSE 20230321 ['AMB.W']
duplicated keys: NYSE 20230322 ['AMB.W']
duplicated keys: OTCBB 20230323 ['CETY']
duplicated keys: OTCBB 20230614 ['LTRY', 'LTRYW']
SEC ticker list length 10283 with 10283 unique values
dict: ticker->name [multiValue=last] has 10283 keys with 8025 unique values
8541 symbols 283 dates


In [None]:
#notest

# the 40 stocks whose forward-return rows are closest to MNDO's (MNDO itself comes first)
getClosestReturn('MNDO',*m)[:40]

[('MNDO (MIND CTI LTD)', 0.0),
 ('FPE', 0.3939572159416754),
 ('FPEI', 0.4037418391021024),
 ('LOAN (MANHATTAN BRIDGE CAPITAL, INC)', 0.4058725785512839),
 ('CHI (CALAMOS CONVERTIBLE OPPORTUNITIES & INCOME FUND)', 0.4152784927005114),
 ('IHTA (Invesco High Income 2024 Target Term Fund)', 0.44170900864517204),
 ('JPC (Nuveen Preferred & Income Opportunities Fund)', 0.452376416767534),
 ('EFSCP (ENTERPRISE FINANCIAL SERVICES CORP)', 0.45330330487244486),
 ('JPI (Nuveen Preferred & Income Term Fund)', 0.4547505807153403),
 ('HIO (WESTERN ASSET HIGH INCOME OPPORTUNITY FUND INC.)',
  0.46793309830180163),
 ('SPN-B', 0.4700152766777501),
 ('PRE-J', 0.47212628763457065),
 ('ACR-C', 0.47461934816855417),
 ('HIPS', 0.47686817879050203),
 ('JHPI', 0.47973534129229467),
 ('FINS (Angel Oak Financial Strategies Income Term Trust)',
  0.4845376741716778),
 ('PGZ (Principal Real Estate Income Fund)', 0.4862788449951719),
 ('PQDI', 0.4975648213837466),
 ('OAK-A', 0.4991444262088841),
 ('GPM-A', 0.5011

In [None]:
#notest

# the 20 stocks whose forward-return rows are closest to TAIT's
getClosestReturn('TAIT',*m)[:20]

[('TAIT (TAITRON COMPONENTS INC)', 0.0),
 ('NNI (NELNET INC)', 0.7368105665953435),
 ('SPN-B', 0.771899859504044),
 ('ICVT', 0.81698375759366),
 ('COD-B', 0.8352495377240741),
 ('CLSM', 0.8366884639203913),
 ('CET (CENTRAL SECURITIES CORP)', 0.8384678465364781),
 ('CWB', 0.8487942439973535),
 ('ET-D', 0.8519271383152587),
 ('TLGPY (Telstra Group Ltd)', 0.8523217702721007),
 ('CSF', 0.8566815134696606),
 ('MGMT', 0.858181003231267),
 ('TDSC', 0.8644684103862601),
 ('WEDXF', 0.8715553578868909),
 ('WBIF', 0.8718234798733043),
 ('NCV-A', 0.8744353593018128),
 ('STT-D', 0.8744449134382206),
 ('QRMI', 0.8770904594690824),
 ('ET-C', 0.8804238997785947),
 ('XRMI', 0.8821812102633625)]

In [None]:
#notest

# the 20 stocks whose forward-return rows are closest to RELL's
getClosestReturn('RELL',*m)[:20]

[('RELL (RICHARDSON ELECTRONICS, LTD.)', 0.0),
 ('CASS (CASS INFORMATION SYSTEMS INC)', 2.3632986384989505),
 ('POWI (POWER INTEGRATIONS INC)', 2.8332377785971987),
 ('GWRS (Global Water Resources, Inc.)', 2.8404981620316088),
 ('ANGO (ANGIODYNAMICS INC)', 2.8471128912348176),
 ('OARK', 2.9107047683916987),
 ('RRBI (RED RIVER BANCSHARES INC)', 2.9644257369823506),
 ('RICK (RCI HOSPITALITY HOLDINGS, INC.)', 2.9732589455373617),
 ('GTY (GETTY REALTY CORP /MD/)', 2.982627077477133),
 ('RGCO (RGC RESOURCES INC)', 2.9997206622373067),
 ('TMP (TOMPKINS FINANCIAL CORP)', 3.0179666671694),
 ('KLIC (KULICKE & SOFFA INDUSTRIES INC)', 3.0608739237335296),
 ('SEAS', 3.0614640639877395),
 ('AMBA (AMBARELLA INC)', 3.0634315959649765),
 ('CWE.A', 3.0799862214034004),
 ('CNRG', 3.136485251345724),
 ('CWEN (Clearway Energy, Inc.)', 3.1397049276381717),
 ('KEYS (Keysight Technologies, Inc.)', 3.1410272634367935),
 ('MYE (MYERS INDUSTRIES INC)', 3.1592785252589612),
 ("DENN (DENNY'S Corp)", 3.16430883555

In [None]:
#notest

# the 20 stocks whose forward-return rows are closest to SILC's
getClosestReturn('SILC',*m)[:20]

[('SILC (SILICOM LTD.)', 0.0),
 ('B (BARNES GROUP INC)', 2.661399007253552),
 ('AMN (AMN HEALTHCARE SERVICES INC)', 2.724160992667765),
 ('VMI (VALMONT INDUSTRIES INC)', 2.7338617655898383),
 ('CALX (CALIX, INC)', 3.023091318498608),
 ('NTCT (NETSCOUT SYSTEMS INC)', 3.093619695327906),
 ('WHR (WHIRLPOOL CORP /DE/)', 3.1580493246124544),
 ('NJDCY', 3.193515114816411),
 ('CNHI', 3.29110063382151),
 ('HTLD (HEARTLAND EXPRESS INC)', 3.3082190659668576),
 ('GGT (GABELLI MULTIMEDIA TRUST INC.)', 3.3232999413469373),
 ('PAYC (Paycom Software, Inc.)', 3.3241541343912147),
 ('IRDM (Iridium Communications Inc.)', 3.3623235906548),
 ('CCRN (CROSS COUNTRY HEALTHCARE INC)', 3.366726739268066),
 ('NNDNF', 3.3774229025067783),
 ('F (FORD MOTOR CO)', 3.4039068620830566),
 ('TAN', 3.4594237422058955),
 ('RAYS', 3.4860890952497),
 ('EL (ESTEE LAUDER COMPANIES INC)', 3.4909226509975593),
 ('NWG (NatWest Group plc)', 3.5315279137915208)]

In [None]:
#notest

# the 20 stocks whose forward-return rows are closest to ELV's
getClosestReturn('ELV',*m)[:20]

[('ELV (Elevance Health, Inc.)', 0.0),
 ('IHF', 0.1953185559368255),
 ('UNH (UNITEDHEALTH GROUP INC)', 0.25919901364555165),
 ('SIXL', 0.28807940930622417),
 ('AVIE', 0.2918642231635463),
 ('DIAX (Nuveen Dow 30sm Dynamic Overwrite Fund)', 0.2923529537316265),
 ('FBCV', 0.3074473957197455),
 ('VFMV', 0.31679367718052276),
 ('HYLG', 0.3243023540402277),
 ('NDJI', 0.3318950586036155),
 ('DIVZ', 0.3353862296524296),
 ('TPYP', 0.33630163046910805),
 ('SMMV', 0.33704934235129946),
 ('DIVO', 0.3371396383561065),
 ('PHDG', 0.34182974217536133),
 ('FTCS', 0.34346836052288554),
 ('WES (Western Midstream Partners, LP)', 0.34389658771690557),
 ('RSF (RiverNorth Capital & Income Fund, Inc.)', 0.3485322830580354),
 ('XLV', 0.3486680373936523),
 ('INFL', 0.3514287227508094)]

Text-based prediction tests:

In [None]:
#export

# Filing scrapers used by getTextPriceDataset, one entry per form type:
#   'class'        - the scraper class, instantiated as class(startD=..., endD=...)
#   'fClass'       - the form type label, shown in progress messages
#   'infoTextKeys' - names of text fields for this form type; not read by any active
#                    code in this notebook (TODO confirm the intended use)
textScraperL = [
    {
        'class': scrape8K.scraper8K,
        'fClass': '8-K',
        'infoTextKeys': ['explanatoryNote','itemTexts','text99'],
    },
    {
        'class': scrape6K.scraper6K,
        'fClass': '6-K',
        'infoTextKeys': ['mainText','text99'],
    },
]

@utils.delegates(getCombDayMapWithLookback)
def getTextPriceDataset(d1, d2, d3, d4, minPrice=None, **kwargs) :
    """
    Work-in-progress construction of a text/price dataset for text-based prediction tests.

    First gets the day maps for d3 and d4, with specified lookback (kwargs are
    passed on to getCombDayMapWithLookback).

    Then, cuts down to stocks that have values for both days.

    If minPrice is not None, also cuts down to stocks with value >= minPrice on both days.

    Then, cuts down to stocks whose symbol is the first ticker of some CIK.

    Finally, creates the scrapers in textScraperL with startD=d1, endD=d2 and,
    for each non-weekend date from d1 (inclusive) to d2 (exclusive), goes through
    the scraped filings, skipping those recorded as 'ERROR'. For each filing CIK
    that maps to one of the remaining stocks, a placeholder 'TEXT' entry is
    recorded for that stock - the actual filing text is not extracted yet.

    Prints progress counts along the way. Side effect: each scraper instance is
    stored in its textScraperL entry under the 'scraper' key.

    Currently returns None - the collected data is only counted and printed.
    """
    dm3 = getCombDayMapWithLookback(d3, **kwargs)
    dm4 = getCombDayMapWithLookback(d4, **kwargs)
    symsWithPrices = sorted(set(dm3.keys()).intersection(dm4.keys()))
    print(len(symsWithPrices),'stock symbols found')
    if minPrice is not None :
        print('restricting to price >=',minPrice,end=' ... ')
        symsWithPrices = [sym for sym in symsWithPrices
                          if dm3[sym]>=minPrice and dm4[sym]>=minPrice]
        print('now',len(symsWithPrices),'stocks')
    cikToTicker = tickerMap.getCikToFirstTickerMap()
    # only membership is needed, so a set of the tickers is enough
    tickersWithCik = set(cikToTicker.values())
    print('restricting to CIKs',end=' ... ')
    symsWithPrices = [sym for sym in symsWithPrices if sym in tickersWithCik]
    print('now',len(symsWithPrices),'stocks')
    for s in textScraperL :
        print('loading',s['fClass'],end=' ... ')
        s['scraper'] = s['class'](startD=d1, endD=d2)
        s['scraper'].printCounts(verbose=False)
    print('getting CIK texts',end=' ... ')
    symsWithPricesAsSet = set(symsWithPrices)
    symTexts = collections.defaultdict(list)
    for d in utils.dateStrsBetween(d1, d2, excludeWeekends=True) :
        for s in textScraperL :
            for accNo,info in s['scraper'].infoMap[d].items() :
                if info == 'ERROR' :
                    continue
                for cik in info.get('ciks',[]) :
                    # filing CIKs may be zero-padded; the ticker map keys are not
                    ticker = cikToTicker.get(cik.lstrip('0'))
                    if ticker in symsWithPricesAsSet :
                        # TODO: record the filing text here instead of a placeholder
                        symTexts[ticker].append('TEXT')
    print(len(symTexts),'stocks with CIK text found')

In [None]:
#notest

# run the dataset construction (prints progress counts), then create 8-K and 6-K
# scrapers for the same filing period so they are available as s8 and s6
getTextPriceDataset('20231001','20240101','20240101','20240401',minPrice=1.0)
s8 = scrape8K.scraper8K(startD='20231001',endD='20240101')
s6 = scrape6K.scraper6K(startD='20231001',endD='20240101')

15656 stock symbols found
restricting to price >= 1.0 ... now 11738 stocks
SEC ticker list length 10283 with 8026 unique values
dict: cik->ticker [multiValue=first] has 8026 keys with 8026 unique values
restricting to CIKs ... now 5542 stocks
loading 8-K ... Total filings: 17114
loading 6-K ... Total filings: 6629
getting CIK texts ... 4916 stocks with CIK text found


In [None]:
# s.infoMap['20231002'].keys()

dict_keys(['0000919574-23-005442', '0001654954-23-012485', '0001654954-23-012499', '0001292814-23-004049', '0001654954-23-012483', '0001654954-23-012503', '0001171843-23-005961', '0001292814-23-004061', '0001292814-23-004063', '0001341004-23-000259', '0001654954-23-012477', '0000950170-23-050915', '0001159508-23-000066', '0001171843-23-005962', '0001654954-23-012452', '0001062993-23-018650', '0001292814-23-004057', '0000950157-23-001016', '0001279569-23-001252', '0001013762-23-000479', '0001193125-23-248004', '0001383395-23-000060', '0001193125-23-248965', '0001395064-23-000127', '0001292814-23-004059', '0001292814-23-004053', '0001292814-23-004055', '0001292814-23-004051', '0001493152-23-034964', '0001477932-23-007292', '0001171843-23-005966', '0000950103-23-014435', '0001104659-23-105920', '0001171843-23-005968', '0001013762-23-000143', '0000950170-23-051129', '0001493152-23-034958', '0001648416-23-000182', '0001013762-23-000318', '0001650107-23-000073', '0001668438-23-000002', '0001

In [None]:
# [k for k,v in s.infoMap['20231002'].items() if 'explanatoryNote' in v]

['0000950170-23-051160',
 '0001104659-23-105520',
 '0001335258-23-000089',
 '0001193125-23-249190',
 '0000714395-23-000046']

In [None]:
# s8.getTextDigest(s8.infoMap['20231002']['0000950170-23-051160'])

In [None]:
# s6.getTextDigest(s6.infoMap['20231002']['0000919574-23-005442'])

In [None]:
#hide
# uncomment and run to regenerate all library Python files
# from nbdev.export import notebook2script; notebook2script()