In [None]:
# default_exp priceHist

# priceHist

> Parse historical stock price data files.

In [None]:
#hide
%load_ext autoreload
%autoreload 2
from nbdev import show_doc

In [None]:
#export

import collections
import datetime
import numpy as np
import os
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from secscan import utils,tickerMap,scrape8K,scrape6K

USExchanges = ['AMEX','NASDAQ','NYSE','OTCBB']

Uses the historical price data from eoddata.com, which comes as one CSV file
per exchange per day. Sample:

```
Symbol,Date,Open,High,Low,Close,Volume
AACG,08-Jan-2020,1.41,1.58,1.3642,1.5112,90800
AAL,08-Jan-2020,27.1,28.09,27.07,27.84,10497200
...
```


In [None]:
#export

def getHistFStuff(exch,dateStr) :
    """
    Locates the CSV file of historical prices for one exchange on one date.

    The file is named '<exch>_<dateStr>.csv' and lives in a directory under
    utils.stockPriceRoot whose name is the first len(exch)+5 characters of the
    file name, i.e. '<exch>_<yyyy>' when dateStr has the form 'yyyymmdd'.

    Returns a tuple (fName, fDir, fPath).
    """
    csvName = '_'.join((exch, dateStr)) + '.csv'
    # exchange name + '_' + four-digit year
    yearDirName = csvName[:len(exch)+5]
    csvDir = os.path.join(utils.stockPriceRoot, yearDirName)
    return csvName, csvDir, os.path.join(csvDir, csvName)

In [None]:
# test getHistFStuff:

t = getHistFStuff('NASDAQ','20240510')
expectedDir = os.path.join(utils.stockPriceRoot,'NASDAQ_2024')
assert t == ('NASDAQ_20240510.csv',
             expectedDir,
             os.path.join(expectedDir,'NASDAQ_20240510.csv'))

In [None]:
#export

def getDayMap(dateStr, exch, symCol='Symbol', priceCol='Open') :
    """
    Reads the price CSV file for one exchange on one day.

    Returns a dict:<stock symbol> -> <value from priceCol>, or an empty dict
    if there is no file for that exchange and date. Rows whose symbol is not
    a string are reported and left out.
    """
    csvPath = getHistFStuff(exch,dateStr)[2]
    if not os.path.exists(csvPath) :
        return {}
    # na_filter=False turns off pandas' missing-value detection, so a symbol
    # that looks like an NA marker (e.g. 'NA') stays a string.
    prices = pd.read_csv(csvPath, na_filter=False)
    symToVal = {}
    for sym,val in zip(prices[symCol],prices[priceCol]) :
        if not isinstance(sym,str) :
            print(dateStr,exch,'non-string symbol',repr(sym))
            continue
        symToVal[sym] = val
    return symToVal

In [None]:
# test getDayMap:

# Use the bundled test data unless running on the machine with the full data set.
if '\\ikedi\\' not in utils.stockPriceRoot :
    utils.setStockPriceRoot('testdata')

m = getDayMap('20200108','AMEX')
assert len(m)==2024
assert min(m.keys())=='AAAU' and m['AAAU']==15.75
assert max(m.keys())=='ZSL' and m['ZSL']==25.33

m = getDayMap('20200108','NASDAQ')
assert len(m)==3233
assert min(m.keys())=='AACG' and m['AACG']==1.41
assert max(m.keys())=='ZYXI' and m['ZYXI']==8.52

In [None]:
#export

@utils.delegates(getDayMap)
def getCombDayMap(dateStr, exchs=USExchanges, checkDups=True, **kwargs) :
    """
    Merges the day maps of several exchanges for one date.

    Exchanges are processed in the order given; when a symbol appears on more
    than one exchange the value from the later exchange wins. If checkDups is
    True, up to 10 such duplicated symbols are printed per exchange.

    Returns a dict:<stock symbol> -> <value>
    """
    merged = {}
    for exch in exchs :
        exchMap = getDayMap(dateStr,exch,**kwargs)
        if checkDups :
            alreadySeen = sorted(sym for sym in exchMap if sym in merged)
            if alreadySeen :
                print('duplicated keys:',exch,dateStr,alreadySeen[:10])
        merged.update(exchMap)
    return merged

In [None]:
# test getCombDayMap:

# 2024 and 3233 are the AMEX and NASDAQ symbol counts asserted in the getDayMap test,
# so this also checks that no symbol appears on both exchanges on this date.
m = getCombDayMap('20200108', exchs=['AMEX','NASDAQ'])
assert len(m)==2024+3233

In [None]:
#export

# Cache of combined day maps. Keyed by the date string for calls that pass no
# extra options, and by (date string, repr of the sorted options) otherwise.
dayMapCache = {}

@utils.delegates(getCombDayMap)
def getCombDayMapWithLookback(forD, lookback=7, **kwargs) :
    """
    Get the day map for a given day, looking back a given number of days -
    i.e. if a stock doesn't trade on a particular day, looks back the specified
    number of days to find the most recent trade.

    The per-day combined maps are cached in dayMapCache. The cache key includes
    the options forwarded to getCombDayMap (exchs, priceCol, ...), so a call with
    different options does not pick up maps cached by an earlier call.

    Returns a dict:<stock symbol> -> <value>
    """
    endD = utils.toDate(forD)
    d1 = endD + datetime.timedelta(-lookback)
    d2 = endD + datetime.timedelta(1)
    # repr gives a hashable key even when an option value is a list (e.g. exchs);
    # kwargs keys are unique strings, so sorting never compares the values.
    optKey = repr(sorted(kwargs.items())) if kwargs else None
    dayMap = {}
    # presumably dateStrsBetween yields ascending dates from d1 up to (not including) d2,
    # so values from later days overwrite earlier ones - TODO confirm against utils
    for d in utils.dateStrsBetween(d1, d2) :
        cacheKey = d if optKey is None else (d, optKey)
        if cacheKey not in dayMapCache :
            dayMapCache[cacheKey] = getCombDayMap(d, **kwargs)
        dayMap.update(dayMapCache[cacheKey])
    return dayMap

@utils.delegates(getCombDayMapWithLookback)
def getCombDayMapsForRangeWithLookback(d1, d2, **kwargs) :
    """
    Builds the combined day map, with lookback, for each date from d1 (inclusive)
    to d2 (exclusive). Weekend dates are left out of the result.

    Returns a dict:dateStr -> {<stock symbol> -> <value>}
    """
    return {d : getCombDayMapWithLookback(d, **kwargs)
            for d in utils.dateStrsBetween(d1, d2, excludeWeekends=True)}

Direct indexing tests:

In [None]:
#export

# Current SEC ticker mappings: ticker -> name, cik -> first ticker,
# and the inverse of the latter (ticker -> cik).
tickerNames = tickerMap.getSecTickerDict(field2='name')
cikToTicker = tickerMap.getCikToFirstTickerMap()
tickerToCik = {ticker : cik for cik,ticker in cikToTicker.items()}

SEC ticker list length 10161 with 10161 unique values
dict: ticker->name [multiValue=last] has 10161 keys with 7987 unique values
SEC ticker list length 10161 with 7988 unique values
dict: cik->ticker [multiValue=first] has 7988 keys with 7988 unique values


In [None]:
#export

@utils.delegates(getCombDayMapsForRangeWithLookback)
def getCleanedPriceData(d1, d2, minPrice=None, restrictToNames=False, **kwargs) :
    """
    Constructs a price dataset for direct indexing tests.

    First gets all the combined day maps from d1 to d2, with specified lookback,
    and skipping weekend days.

    Then, cuts down to stocks that have values for all the specified days.

    If minPrice is not None, also drops any stock whose value is < minPrice
    on some day (i.e. keeps stocks with value >= minPrice on all days).

    If restrictToNames is True, also cuts down to stocks present in the SEC ticker list.

    If the range yields no dates, the result is empty: ([], [], [], <0x0 matrix>).

    Returns a tuple (syms, symNames, dateStrs, priceMat), where:
    syms is a sorted list of ticker symbols
    symNames is a list of corresponding names from the SEC ticker list ('' if not present)
    dateStrs is a list of date strings in the format '20240624'
    priceMat is a matrix of values with len(syms) rows and len(dateStrs) columns
    """
    dayMaps = getCombDayMapsForRangeWithLookback(d1, d2, **kwargs)
    dateStrs = sorted(dayMaps.keys())
    if len(dayMaps) == 0 :
        # set.intersection(*...) needs at least one set, so a range that
        # yields no dates is handled explicitly instead of raising TypeError.
        symsPresentAllDays = []
    else :
        symsPresentAllDays = sorted(set.intersection(*(set(dayMap.keys())
                                                       for dayMap in dayMaps.values())))
    if minPrice is not None :
        # Only symbols that survived the all-days filter need checking.
        # 'not any(... < minPrice)' keeps exactly the symbols that are never below minPrice.
        symsPresentAllDays = [sym for sym in symsPresentAllDays
                              if not any(dayMap[sym] < minPrice
                                         for dayMap in dayMaps.values())]
    if restrictToNames :
        symsPresentAllDays = [sym for sym in symsPresentAllDays
                              if sym in tickerNames]
    priceMat = np.zeros((len(symsPresentAllDays), len(dateStrs)))
    for dateNo,d in enumerate(dateStrs) :
        dayMap = dayMaps[d]
        # one column per date, rows in the same order as symsPresentAllDays
        priceMat[:,dateNo] = [dayMap[sym] for sym in symsPresentAllDays]
    symNames = [tickerNames.get(sym,'') for sym in symsPresentAllDays]
    print(len(symsPresentAllDays),'symbols',len(dateStrs),'dates')
    return (symsPresentAllDays, symNames, dateStrs, priceMat)

@utils.delegates(getCleanedPriceData)
def getSortedReturns(d1, d2, highestFirst=True, **kwargs) :
    """
    Ranks stocks by their return between two dates (d2 is included in the range).

    The return is fractional: <value on last date>/<value on first date> - 1.
    Only stocks with a non-empty name in the SEC ticker list are kept.

    Returns a list of tuples, sorted by return (highest first by default):
        [(sym, symName, <value on first date>, <value on last date>, <return>)]
    """
    # getCleanedPriceData excludes its end date, so ask for one day past d2.
    dayAfterD2 = utils.toDateStr(utils.toDate(d2) + datetime.timedelta(1))
    syms, symNames, dateStrs, priceMat = getCleanedPriceData(d1, dayAfterD2, **kwargs)
    firstVals = priceMat[:,0]
    lastVals = priceMat[:,-1]
    rows = list(zip(syms, symNames, firstVals, lastVals, lastVals/firstVals - 1.0))
    rows.sort(key = lambda row : row[4], reverse = highestFirst)
    print('restricting to named stocks',end=' - ')
    rows = [row for row in rows if row[1]]
    print('now',len(rows),'stocks')
    return rows

@utils.delegates(getCleanedPriceData)
def getForwardReturns(d1, d2, weekdaysForward=20, **kwargs) :
    """
    Builds a matrix of forward returns, looking weekdaysForward weekdays ahead
    from each date from d1 (inclusive) to d2 (exclusive). Only weekdays are
    included in the matrix to avoid over-emphasizing forward returns from Fridays.

    Each return is fractional: <value weekdaysForward columns later>/<value> - 1.
    d2 is compared directly against the 'yyyymmdd' date strings.

    Returns a tuple (syms, symNames, dateStrs, returnMat), as for getCleanedPriceData,
    except returnMat is a matrix of forward returns looking ahead from each day.
    """
    # Calendar days to fetch past d2: the weekdays themselves plus two weekend
    # days for every (possibly partial) block of five weekdays.
    calendarDaysAhead = weekdaysForward + 2*((weekdaysForward+4)//5)
    d3 = utils.toDateStr(utils.toDate(d2) + datetime.timedelta(calendarDaysAhead))
    syms, symNames, allDateStrs, priceMat = getCleanedPriceData(d1,d3,**kwargs)
    dateStrs = [d for d in allDateStrs if d<d2]
    returnMat = np.zeros((len(syms), len(dateStrs)))
    for startCol in range(len(dateStrs)) :
        endCol = startCol + weekdaysForward
        returnMat[:,startCol] = (priceMat[:,endCol]/priceMat[:,startCol] - 1.0)
    return (syms, symNames, dateStrs, returnMat)

def getClosestReturn(sym, syms, symNames, dateStrs, returnMat, fName=None, n=20) :
    """
    Ranks symbols by how closely their forward returns track those of a given symbol.

    The distance for each symbol is the sum, over all columns of returnMat, of the
    squared difference between its forward return and that of sym. dateStrs is not
    used here; it lets the tuple from getForwardReturns be unpacked into the arguments.

    Keeps the n closest symbols (all of them if n is None) and, if fName is given,
    also saves them with genServList.

    Returns a list, closest first:
        [(sym, symName, <sum of squared forward return differences>)]
    """
    targetRow = returnMat[syms.index(sym)]
    sqDists = ((returnMat - targetRow)**2).sum(axis=1)
    # slicing with n=None keeps the whole list
    closest = sorted(zip(syms,symNames,sqDists), key=lambda entry : entry[2])[:n]
    if fName is not None :
        genServList(fName,closest)
    return closest


def genServList(fName, syms) :
    """
    Writes a list of stock symbols and their CIKs, in display format for my
    secscrape server, to ~/Dropbox/sw/secScripts/<fName>.txt.

    Each element of syms is either a symbol or a tuple whose first item is the
    symbol. Symbols with no entry in tickerToCik are reported and skipped.
    """
    print('saving serv list',fName)
    outPath = os.path.expanduser(os.path.join('~','Dropbox',
                    'sw','secScripts',fName+'.txt'))
    with open(outPath,'w') as f :
        for entry in syms :
            ticker = entry[0] if isinstance(entry,tuple) else entry
            if ticker in tickerToCik :
                f.write(f"    {tickerToCik[ticker]} # {ticker}\n")
            else :
                print('skipping',ticker,'(no CIK)')

Max and min returns over an interval:

In [None]:
#notest

m = getSortedReturns('20240401','20240714', minPrice=1.0)

duplicated keys: OTCBB 20240423 ['GBTC', 'RILY']
duplicated keys: OTCBB 20240502 ['GBTC']
10561 symbols 75 dates
restricting to named stocks - now 5695 stocks


In [None]:
#notest

m[:20]

[('CATX', 'Perspective Therapeutics, Inc.', 1.25, 12.12, 8.696),
 ('GDXD', 'BANK OF MONTREAL /CAN/', 3.15, 16.66, 4.288888888888889),
 ('REBN', 'Reborn Coffee, Inc.', 1.26, 6.11, 3.8492063492063497),
 ('INSG', 'INSEEGO CORP.', 2.79, 11.56, 3.1433691756272406),
 ('CORZZ', 'Core Scientific, Inc./tx', 2.32, 9.47, 3.081896551724139),
 ('ASTS', 'AST SpaceMobile, Inc.', 2.93, 11.88, 3.054607508532423),
 ('EBS', 'Emergent BioSolutions Inc.', 2.55, 10.03, 2.9333333333333336),
 ('PSIX',
  'POWER SOLUTIONS INTERNATIONAL, INC.',
  2.35,
  9.04,
  2.846808510638297),
 ('FTEL', 'Fitell Corp', 7.9, 29.99, 2.7962025316455694),
 ('WGS', 'GeneDx Holdings Corp.', 9.1, 34.17, 2.7549450549450554),
 ('NYCB', 'NEW YORK COMMUNITY BANCORP, INC.', 3.24, 11.91, 2.6759259259259256),
 ('KOSS', 'KOSS CORP', 2.64, 9.33, 2.534090909090909),
 ('CADL', 'Candel Therapeutics, Inc.', 1.72, 6.0, 2.488372093023256),
 ('AEHL', 'Antelope Enterprise Holdings Ltd', 1.64, 5.56, 2.3902439024390243),
 ('UVXY', 'ProShares Trust II

In [None]:
#notest

m[-20:]

[('ZNTL', 'Zentalis Pharmaceuticals, Inc.', 15.58, 4.03, -0.7413350449293967),
 ('VSTM', 'Verastem, Inc.', 11.86, 3.03, -0.7445193929173692),
 ('IVVD', 'Invivyd, Inc.', 4.7, 1.2, -0.7446808510638299),
 ('BIG', 'BIG LOTS INC', 4.4, 1.11, -0.7477272727272728),
 ('AONC', 'American Oncology Network, Inc.', 5.35, 1.31, -0.7551401869158878),
 ('IZM', 'ICZOOM Group Inc.', 8.5, 1.99, -0.7658823529411765),
 ('ENGNW', 'enGene Holdings Inc.', 5.31, 1.23, -0.768361581920904),
 ('ZKH', 'ZKH Group Ltd', 16.11, 3.73, -0.7684667908131595),
 ('CRTD', 'Creatd, Inc.', 4.4, 1.0, -0.7727272727272727),
 ('NWGL', 'Nature Wood Group Ltd', 15.04, 3.19, -0.7878989361702128),
 ('CRNC', 'Cerence Inc.', 15.8, 3.19, -0.7981012658227848),
 ('NOTV', 'Inotiv, Inc.', 11.0, 2.0, -0.8181818181818181),
 ('GCTS', 'GCT Semiconductor Holding, Inc.', 29.0, 5.1, -0.8241379310344827),
 ('FUJIY', 'FUJI PHOTO FILM CO LTD /FI', 70.25, 12.12, -0.8274733096085409),
 ('NWTN', 'NWTN, Inc.', 6.24, 1.04, -0.8333333333333334),
 ('MRNS', 

Closest forward returns over all the days in an interval:

In [None]:
#notest

m = getForwardReturns('20240101','20240615',minPrice=1.0)

9981 symbols 140 dates


In [None]:
#notest

getClosestReturn('MNDO',*m)

[('MNDO', 'MIND CTI LTD', 0.0),
 ('PRI-L', '', 0.150786798250306),
 ('REGCO', 'REGENCY CENTERS CORP', 0.15623323991179444),
 ('ANCTF', '', 0.15825376544828385),
 ('AIF', '', 0.15884752792716933),
 ('BME', 'BlackRock Health Sciences Trust', 0.16135522353362927),
 ('WRB-G', '', 0.162102727295135),
 ('ABT', 'ABBOTT LABORATORIES', 0.16869169232840456),
 ('FAIL', '', 0.17584905810304813),
 ('RSF', 'RiverNorth Capital & Income Fund, Inc.', 0.176096806139761),
 ('TRT-C', '', 0.1767701230331341),
 ('CHY', 'CALAMOS CONVERTIBLE & HIGH INCOME FUND', 0.17740224547552594),
 ('DLR-K', '', 0.17758247900251733),
 ('ENX', 'EATON VANCE NEW YORK MUNICIPAL BOND FUND', 0.1787138901154057),
 ('DLR-J', '', 0.18046714129889124),
 ('CULL', 'Cullman Bancorp, Inc. /MD/', 0.18257851993533203),
 ('TAK', 'TAKEDA PHARMACEUTICAL CO LTD', 0.18378324625111211),
 ('PMF', 'PIMCO MUNICIPAL INCOME FUND', 0.18503522954135557),
 ('PSA-J', '', 0.18720806266925138),
 ('BCV-A', '', 0.18777675160267926)]

In [None]:
#notest

getClosestReturn('TAIT',*m)

[('TAIT', 'TAITRON COMPONENTS INC', 0.0),
 ('KFFB', 'Kentucky First Federal Bancorp', 0.14538973846305878),
 ('FLBR', '', 0.17198881940562005),
 ('KLIP', '', 0.1787211560796597),
 ('NOC', 'NORTHROP GRUMMAN CORP /DE/', 0.17949008898455426),
 ('EWZ', '', 0.18528258102466955),
 ('ESLT', 'ELBIT SYSTEMS LTD', 0.18604156120943144),
 ('FLLA', '', 0.18631143809515038),
 ('SRG-A', '', 0.1920282855710523),
 ('CPPTL', 'Copper Property CTL Pass Through Trust', 0.1935995140600607),
 ('BRAZ', '', 0.19735889334794265),
 ('KWR', 'QUAKER CHEMICAL CORP', 0.19855884616373368),
 ('FLSA', '', 0.20960313835676808),
 ('THD', '', 0.2096700252778919),
 ('MSM', 'MSC INDUSTRIAL DIRECT CO INC', 0.20977692680253807),
 ('ILF', '', 0.21382553394218418),
 ('TD', 'TORONTO DOMINION BANK', 0.21413124924008203),
 ('LFGP', '', 0.21501306778188117),
 ('FLN', '', 0.21608857272755927),
 ('CCOR', '', 0.21744128436155158)]

In [None]:
#notest

getClosestReturn('RELL',*m)

[('RELL', 'RICHARDSON ELECTRONICS, LTD.', 0.0),
 ('ARLP', 'ALLIANCE RESOURCE PARTNERS LP', 0.8426632731221821),
 ('HR', 'Healthcare Realty Trust Inc', 0.8685501138643191),
 ('NXRT', 'NexPoint Residential Trust, Inc.', 0.8877812501283826),
 ('BRT', 'BRT Apartments Corp.', 0.8944790543762936),
 ('RILYL', 'B. Riley Financial, Inc.', 0.9117271819743408),
 ('CHDN', 'Churchill Downs Inc', 0.9602184808245319),
 ('WHITF', '', 0.9867460113787685),
 ('NEM', 'NEWMONT Corp /DE/', 0.9937202679545482),
 ('DSWL', 'DESWELL INDUSTRIES INC', 0.9968389583402446),
 ('FSNUY', '', 0.9983663328171469),
 ('RWWI', '', 1.000894652211174),
 ('STRS', 'STRATUS PROPERTIES INC', 1.005896142416717),
 ('NHYKF', '', 1.013227836348273),
 ('EWV', '', 1.0164930525688431),
 ('KRBN', '', 1.0325450561350438),
 ('KDP', 'Keurig Dr Pepper Inc.', 1.0460477212218473),
 ('PSHG', 'Performance Shipping Inc.', 1.0605056929521017),
 ('AMDS', '', 1.0677273526710604),
 ('VTR', 'Ventas, Inc.', 1.0692890187562403)]

In [None]:
#notest

getClosestReturn('RELL',*m,'zzrellret')

saving serv list zzrellret
skipping OARK (no CIK)
skipping SEAS (no CIK)
skipping CWE.A (no CIK)
skipping CNRG (no CIK)


[('RELL', 'RICHARDSON ELECTRONICS, LTD.', 0.0),
 ('CASS', 'CASS INFORMATION SYSTEMS INC', 2.3632986384989505),
 ('POWI', 'POWER INTEGRATIONS INC', 2.8332377785971987),
 ('GWRS', 'Global Water Resources, Inc.', 2.8404981620316088),
 ('ANGO', 'ANGIODYNAMICS INC', 2.8471128912348176),
 ('OARK', '', 2.9107047683916987),
 ('RRBI', 'RED RIVER BANCSHARES INC', 2.9644257369823506),
 ('RICK', 'RCI HOSPITALITY HOLDINGS, INC.', 2.9732589455373617),
 ('GTY', 'GETTY REALTY CORP /MD/', 2.982627077477133),
 ('RGCO', 'RGC RESOURCES INC', 2.9997206622373067),
 ('TMP', 'TOMPKINS FINANCIAL CORP', 3.0179666671694),
 ('KLIC', 'KULICKE & SOFFA INDUSTRIES INC', 3.0608739237335296),
 ('SEAS', '', 3.0614640639877395),
 ('AMBA', 'AMBARELLA INC', 3.0634315959649765),
 ('CWE.A', '', 3.0799862214034004),
 ('CNRG', '', 3.136485251345724),
 ('CWEN', 'Clearway Energy, Inc.', 3.1397049276381717),
 ('KEYS', 'Keysight Technologies, Inc.', 3.1410272634367935),
 ('MYE', 'MYERS INDUSTRIES INC', 3.1592785252589612),
 ('DENN

In [None]:
#notest

getClosestReturn('SILC',*m,'zzsilcret')

saving serv list zzsilcret
skipping NJDCY (no CIK)
skipping CNHI (no CIK)
skipping NNDNF (no CIK)
skipping TAN (no CIK)
skipping RAYS (no CIK)


[('SILC', 'SILICOM LTD.', 0.0),
 ('B', 'BARNES GROUP INC', 2.661399007253552),
 ('AMN', 'AMN HEALTHCARE SERVICES INC', 2.724160992667765),
 ('VMI', 'VALMONT INDUSTRIES INC', 2.7338617655898383),
 ('CALX', 'CALIX, INC', 3.023091318498608),
 ('NTCT', 'NETSCOUT SYSTEMS INC', 3.093619695327906),
 ('WHR', 'WHIRLPOOL CORP /DE/', 3.1580493246124544),
 ('NJDCY', '', 3.193515114816411),
 ('CNHI', '', 3.29110063382151),
 ('HTLD', 'HEARTLAND EXPRESS INC', 3.3082190659668576),
 ('GGT', 'GABELLI MULTIMEDIA TRUST INC.', 3.3232999413469373),
 ('PAYC', 'Paycom Software, Inc.', 3.3241541343912147),
 ('IRDM', 'Iridium Communications Inc.', 3.3623235906548),
 ('CCRN', 'CROSS COUNTRY HEALTHCARE INC', 3.366726739268066),
 ('NNDNF', '', 3.3774229025067783),
 ('F', 'FORD MOTOR CO', 3.4039068620830566),
 ('TAN', '', 3.4594237422058955),
 ('RAYS', '', 3.4860890952497),
 ('EL', 'ESTEE LAUDER COMPANIES INC', 3.4909226509975593),
 ('NWG', 'NatWest Group plc', 3.5315279137915208)]

In [None]:
#notest

getClosestReturn('ELV',*m,'zzelvret')

saving serv list zzelvret
skipping LBO (no CIK)
skipping CHM-B (no CIK)
skipping FAPR (no CIK)
skipping MBNKP (no CIK)
skipping INBKZ (no CIK)
skipping EFC-A (no CIK)
skipping CIM-B (no CIK)
skipping HAWX (no CIK)
skipping MAYT (no CIK)
skipping DX-C (no CIK)
skipping GAPR (no CIK)
skipping AGNCM (no CIK)


[('ELV', 'Elevance Health, Inc.', 0.0),
 ('FRA',
  'BLACKROCK FLOATING RATE INCOME STRATEGIES FUND, INC.',
  0.07792145358025468),
 ('PG', 'PROCTER & GAMBLE Co', 0.08380181782246093),
 ('IHTA', 'Invesco High Income 2024 Target Term Fund', 0.08649365255295244),
 ('EFT', 'Eaton Vance Floating-Rate Income Trust', 0.09259227807810386),
 ('LBO', '', 0.09527177383962684),
 ('CHM-B', '', 0.09736963092743309),
 ('PHD', 'Pioneer Floating Rate Fund, Inc.', 0.09987258228675128),
 ('FAPR', '', 0.10052034936773645),
 ('MBNKP', 'MEDALLION FINANCIAL CORP', 0.10133025031148749),
 ('FCT',
  'FIRST TRUST SENIOR FLOATING RATE INCOME FUND II',
  0.10353472774718903),
 ('FMN',
  'Federated Hermes Premier Municipal Income Fund',
  0.10409115273864003),
 ('INBKZ', 'First Internet Bancorp', 0.1041426621148287),
 ('EFC-A', '', 0.10460069379087342),
 ('CIM-B', '', 0.10489488496021222),
 ('HAWX', '', 0.10557372797964441),
 ('MAYT', '', 0.1056304937108109),
 ('DX-C', '', 0.10599815419396316),
 ('GAPR', '', 0.1065

Text-based prediction tests:

In [None]:
#export

# Scraper classes whose filings supply the text for getTextPriceDataset.
textScraperClasses = [scrape8K.scraper8K, scrape6K.scraper6K]

@utils.delegates(getCombDayMapWithLookback)
def getTextPriceDataset(d1, d2, d3, d4, minPrice=None, **kwargs) :
    """
    Constructs a text/price dataset for text-based prediction tests.

    First gets day maps for d3 and d4, with specified lookback.

    Then, cuts down to stocks that have values for both days.

    If minPrice is not None, also cuts down to stocks with value >= minPrice on both days.

    Then, cuts down to stocks whose symbol has a CIK in tickerToCik.

    Then, creates a dict symTexts mapping each stock symbol to the text digests for
    that symbol between dates d1 and d2, taken from the scrapers in textScraperClasses
    (SEC 8-K and 6-K filings). Each digest is wrapped in START/END markers giving the
    form class and how many days before the end of the d1-d2 window it was found.

    Then, cuts down to stocks for which some text was found between dates d1 and d2.

    Returns a tuple (symsWithPrices, dmStart, dmEnd, mReturns, symTexts), where:
    symsWithPrices is a sorted list of ticker symbols that passed the filters above.
    dmStart, dmEnd are dicts: sym -> <value for d3>, sym -> <value for d4>,
        restricted to symbols for which text was found
    mReturns is a dict: sym -> <fractional return between d3 and d4>
    symTexts is a defaultdict: sym -> [text1, text2, ... ], earlier texts first
    """
    dmStart = getCombDayMapWithLookback(d3, **kwargs)
    dmEnd = getCombDayMapWithLookback(d4, **kwargs)
    symsWithPrices = sorted(set(dmStart.keys()).intersection(dmEnd.keys()))
    print(len(symsWithPrices),'stock symbols found')
    if minPrice is not None :
        print('restricting to price >=',minPrice,end=' ... ')
        symsWithPrices = [sym for sym in symsWithPrices
                          if dmStart[sym]>=minPrice and dmEnd[sym]>=minPrice]
        print('now',len(symsWithPrices),'stocks')
    print('restricting to CIKs',end=' ... ')
    symsWithPrices = [sym for sym in symsWithPrices if sym in tickerToCik]
    print('now',len(symsWithPrices),'stocks')
    # One scraper instance per filing type, each covering the text window d1-d2.
    scraperL = []
    for cl in textScraperClasses :
        print('loading',cl,end=' ... ')
        scraperL.append(cl(startD=d1, endD=d2))
        scraperL[-1].printCounts(verbose=False)
    print('getting CIK texts',end=' ... ')
    symsWithPricesAsSet = set(symsWithPrices)
    symTexts = collections.defaultdict(list)
    dStrList = utils.dateStrsBetween(d1, d2)
    nDays = len(dStrList)
    # Dates are visited in dStrList order, so each symbol's texts are appended in that order.
    for dNo,d in enumerate(dStrList) :
        for s in scraperL :
            # NOTE(review): assumes s.infoMap has an entry for every date in dStrList - confirm
            for accNo,info in s.infoMap[d].items() :
                if info == 'ERROR' :
                    continue
                # presumably cikToTicker keys have no leading zeros; verify against tickerMap
                ciks = [cik.lstrip('0') for cik in info.get('ciks',[])]
                # keep only CIKs whose ticker passed the price filters above
                ciks = [cik for cik in ciks
                        if cikToTicker.get(cik,'-') in symsWithPricesAsSet]
                if len(ciks)==0 :
                    continue
                tDigest = s.getTextDigest(info).strip()
                if tDigest == '' :
                    continue
                # nDays-dNo counts back from the end of the text window:
                # nDays for the first date in dStrList, 1 for the last
                fDesc = f'FORM {s.formClass} -{nDays-dNo} DAYS.'
                tDigest = 'START ' + fDesc + ' ' + tDigest + ' END ' + fDesc
                for cik in ciks :
                    symTexts[cikToTicker[cik]].append(tDigest)
    print(len(symTexts),'stocks with CIK text found')
    # 'in' on a defaultdict does not create entries, so these filters leave symTexts unchanged
    symsWithPrices = [sym for sym in symsWithPrices if sym in symTexts]
    dmStart = dict((sym,val) for sym,val in dmStart.items() if sym in symTexts)
    dmEnd = dict((sym,val) for sym,val in dmEnd.items() if sym in symTexts)
    mReturns = dict((sym, dmEnd[sym]/dmStart[sym] - 1.0)
                    for sym in symsWithPrices)
    return symsWithPrices, dmStart, dmEnd, mReturns, symTexts

def getCombTextDigest(sym, symTexts, maxLen=8191) :
    """
    Builds one combined text for a symbol from symTexts, which maps symbols
    to lists of text digests, earliest first.

    Walks the digests from newest to oldest, keeping each one while the running
    total (each digest plus one newline) stays within maxLen characters, and stops
    at the first digest that does not fit. The kept digests are returned oldest
    first, each followed by a newline; the result is '' if none fits.
    """
    kept = []
    usedLen = 0
    for digest in reversed(symTexts[sym]) :
        print('text length',len(digest),digest[:20],end='... ')
        neededLen = len(digest) + 1  # +1 for the newline that follows the digest
        if usedLen + neededLen > maxLen :
            print('limit exceeded, stopping',end=' ')
            break
        kept.append(digest)
        usedLen += neededLen
    # kept is newest first; emit oldest first
    return ''.join(digest + '\n' for digest in kept[::-1])

# Set this to an OpenAI client instance before requesting embeddings.
openAI_client = None

@utils.delegates(getCombTextDigest)
def getCombTextEmbedding(sym, symTexts, model='text-embedding-3-small', **kwargs) :
    """
    Gets the embedding of the combined text digest for a given symbol by calling
    the OpenAI embeddings API through openAI_client. Returns it as a numpy array.
    """
    digest = getCombTextDigest(sym, symTexts, **kwargs)
    print('symbol',sym,'text length',len(digest))
    embeddingData = openAI_client.embeddings.create(input=digest, model=model).data
    return np.array(embeddingData[0].embedding)

@utils.delegates(getCombTextEmbedding)
def cacheEmbeddings(fName, syms, symTexts, **kwargs) :
    """
    Caches the embeddings for the combined text digests for the given symbols
    in the named file under utils.stockDataRoot, subdir 'embeddings'.

    Symbols already in the cache file are not requested again. If the loop is cut
    short by an exception (including KeyboardInterrupt), any embeddings fetched so
    far are saved before the exception is re-raised, so they need not be fetched again.

    Returns the cache, a dict: sym -> embedding.
    Raises MemoryError if some embeddings are missing and openAI_client is None
    (exception type kept for existing callers).
    """
    fDir = os.path.join(utils.stockDataRoot,'embeddings')
    embCache = utils.loadPklFromDir(fDir,fName,{})
    dirty = False
    nMissing = 0
    try :
        for sym in syms :
            if sym in embCache :
                print(sym,'loaded',end='; ')
            elif openAI_client is None :
                print('MISSING',sym,end='; ')
                nMissing += 1
            else :
                print('getting',sym,'embedding',end=' ')
                embCache[sym] = getCombTextEmbedding(sym, symTexts, **kwargs)
                dirty = True
    except BaseException :
        # Keep what was already fetched, then let the original error propagate.
        if dirty :
            print()
            print('interrupted - saving partial cache to',fName)
            utils.savePklToDir(fDir,fName,embCache)
        raise
    if nMissing > 0 :
        print()
        print('total of',nMissing,'missing embeddings')
        raise MemoryError('openAI client not initialized '
                          + '- initialize it and rerun to get embeddings')
    if dirty :
        print('saving cache to',fName)
        utils.savePklToDir(fDir,fName,embCache)
    print(len(embCache),'embeddings in cache')
    return embCache

def findClosestEmbs(toSym, syms, cemb, fName=None, n=20,
                    distFunc=lambda x,y : x.dot(y), reverse=True) :
    """
    Ranks the symbols in the given list by distFunc(cemb[toSym], cemb[sym]), where
    cemb maps symbols to embeddings (as returned by cacheEmbeddings).

    With the defaults the score is a dot product and the highest scores come first;
    pass reverse=False for a distFunc where smaller means closer. Keeps the first n
    entries (all of them if n is None) and, if fName is given, also saves them with
    genServList.

    Returns a list [(sym, <score>)].
    """
    refEmb = cemb[toSym]
    scored = [(sym, distFunc(refEmb,cemb[sym])) for sym in syms]
    # slicing with n=None keeps the whole list
    ranked = sorted(scored, key = lambda pair : pair[1], reverse=reverse)[:n]
    if fName is not None :
        genServList(fName, ranked)
    return ranked

def makePortEmbDataSet(portSyms, syms, cemb, posWeight=1.0, negWeight=0.001) :
    """
    Builds a training set for classification models from a list of portfolio symbols.

    Returns a tuple (x, y, weights) with one row/entry per symbol in syms:
    x holds the symbol's embedding from cemb,
    y is 1 (int32) for symbols in portSyms and 0 for the others,
    weights is posWeight for symbols in portSyms and negWeight for the others.
    """
    # the embedding length is taken from the first symbol
    x = np.zeros((len(syms), len(cemb[syms[0]])))
    for row,sym in zip(x,syms) :
        row[:] = cemb[sym]
    inPort = np.array([sym in portSyms for sym in syms], dtype=bool)
    y = inPort.astype(np.int32)
    weights = np.full(len(syms), negWeight, dtype=float)
    weights[inPort] = posWeight
    return x,y,weights

@utils.delegates(makePortEmbDataSet)
def getRecsFromPort(portSyms, syms, cemb,
                    n=30, model=None, **kwargs) :
    """
    Fits a classification model (LogisticRegression unless one is given) to tell
    the portfolio symbols apart from the other symbols by their cached embeddings,
    then scores the same symbols with it.

    Returns the n best symbols as a list [(sym, name, p)], best first, where name
    is taken from tickerNames and prefixed with '* ' for portfolio symbols, and p is
    the first column of the predicted class probabilities - the probability of
    class 0, i.e. of NOT being a portfolio symbol - so lower p is better.
    """
    x,y,weights = makePortEmbDataSet(portSyms, syms, cemb, **kwargs)
    if model is None :
        model = LogisticRegression()
    model.fit(x,y,weights)
    probNotInPort = np.exp(model.predict_log_proba(x))[:,0]
    ranked = sorted(zip(syms,probNotInPort),key=lambda pair : pair[1])[:n]
    recs = []
    for sym,p in ranked :
        marker = '* ' if sym in portSyms else ''
        recs.append((sym, marker + tickerNames[sym], p))
    return recs

Code to initialize OpenAI client:

In [None]:
#notest

from openai import OpenAI
#from openai.embeddings_utils import get_embedding, cosine_similarity
# Read the API key from a file in the parent directory. Strip surrounding
# whitespace so that a trailing newline in the file does not become part of the key.
with open(os.path.join('..','xyzzy.txt'),'r') as f :
    openAI_api_key = f.read().strip()
openAI_client = OpenAI(api_key=openAI_api_key)

Testing on text 20230701 - 20240101:

In [None]:
#notest

m = getTextPriceDataset('20230701', '20240101',
                        '20240101', '20240401',
                        minPrice=1.0, priceCol='Open', lookback=14)

16906 stock symbols found
restricting to price >= 1.0 ... now 12415 stocks
restricting to CIKs ... now 5487 stocks
loading <class 'secscan.scrape8K.scraper8K'> ... Total filings: 33632
loading <class 'secscan.scrape6K.scraper6K'> ... Total filings: 12995
getting CIK texts ... 4971 stocks with CIK text found


In [None]:
#notest

cemb = cacheEmbeddings('2023H2-te3-small.pkl',m[0],m[-1])

A loaded; AA loaded; AACG loaded; AACT loaded; AADI loaded; AAL loaded; AAMC loaded; AAME loaded; AAN loaded; AAOI loaded; AAON loaded; AAP loaded; AAPL loaded; AAT loaded; AAWH loaded; AB loaded; ABAT loaded; ABBV loaded; ABCB loaded; ABCL loaded; ABEO loaded; ABEV loaded; ABG loaded; ABL loaded; ABLV loaded; ABLZF loaded; ABM loaded; ABNB loaded; ABOS loaded; ABR loaded; ABSI loaded; ABT loaded; ABUS loaded; ABVC loaded; AC loaded; ACA loaded; ACAB loaded; ACAC loaded; ACAD loaded; ACCD loaded; ACCO loaded; ACDC loaded; ACEL loaded; ACET loaded; ACFN loaded; ACGL loaded; ACHC loaded; ACHR loaded; ACHV loaded; ACI loaded; ACIC loaded; ACIU loaded; ACIW loaded; ACLS loaded; ACLX loaded; ACM loaded; ACMR loaded; ACN loaded; ACNB loaded; ACNT loaded; ACR loaded; ACRE loaded; ACRG loaded; ACRS loaded; ACRV loaded; ACST loaded; ACT loaded; ACTG loaded; ACU loaded; ACVA loaded; ACXP loaded; ADAG loaded; ADBE loaded; ADC loaded; ADCT loaded; ADEA loaded; ADI loaded; ADIL loaded; ADM loaded; 

FGB loaded; FGBI loaded; FGF loaded; FGI loaded; FGPR loaded; FHB loaded; FHI loaded; FHN loaded; FHTX loaded; FI loaded; FIAC loaded; FIBK loaded; FICO loaded; FIGS loaded; FIHL loaded; FINV loaded; FINW loaded; FIP loaded; FIS loaded; FISI loaded; FITB loaded; FIVE loaded; FIVN loaded; FIX loaded; FIZZ loaded; FKWL loaded; FKYS loaded; FL loaded; FLCX loaded; FLEX loaded; FLGC loaded; FLGT loaded; FLIC loaded; FLL loaded; FLNC loaded; FLNG loaded; FLO loaded; FLR loaded; FLS loaded; FLUX loaded; FLWS loaded; FLXS loaded; FLYW loaded; FLYX loaded; FMAO loaded; FMBH loaded; FMBM loaded; FMC loaded; FMCB loaded; FMFG loaded; FMNB loaded; FMS loaded; FMST loaded; FMX loaded; FMY loaded; FN loaded; FNA loaded; FNB loaded; FNCH loaded; FND loaded; FNF loaded; FNGR loaded; FNKO loaded; FNLC loaded; FNRN loaded; FNV loaded; FNVT loaded; FNWB loaded; FNWD loaded; FOLD loaded; FONR loaded; FOR loaded; FORA loaded; FORL loaded; FORM loaded; FORR loaded; FORTY loaded; FOSL loaded; FOUR loaded; F

loaded; NRXS loaded; NSA loaded; NSC loaded; NSIT loaded; NSP loaded; NSPR loaded; NSSC loaded; NSTC loaded; NSTD loaded; NSTS loaded; NSYS loaded; NTAP loaded; NTB loaded; NTBL loaded; NTCT loaded; NTES loaded; NTG loaded; NTGR loaded; NTIC loaded; NTIP loaded; NTLA loaded; NTNX loaded; NTPIF loaded; NTR loaded; NTRA loaded; NTRB loaded; NTRS loaded; NTST loaded; NTWK loaded; NTZ loaded; NU loaded; NUE loaded; NURO loaded; NUS loaded; NUV loaded; NUVB loaded; NUVL loaded; NUVR loaded; NUW loaded; NUZE loaded; NVAX loaded; NVCR loaded; NVCT loaded; NVDA loaded; NVEC loaded; NVEE loaded; NVEI loaded; NVFY loaded; NVG loaded; NVGS loaded; NVMI loaded; NVNI loaded; NVNO loaded; NVO loaded; NVR loaded; NVRI loaded; NVRO loaded; NVS loaded; NVST loaded; NVT loaded; NVTS loaded; NVX loaded; NWBI loaded; NWE loaded; NWFL loaded; NWG loaded; NWGL loaded; NWL loaded; NWN loaded; NWPX loaded; NWSA loaded; NWTN loaded; NX loaded; NXC loaded; NXDT loaded; NXE loaded; NXGL loaded; NXJ loaded; NXN l

VTEX loaded; VTGN loaded; VTLE loaded; VTMX loaded; VTNR loaded; VTOL loaded; VTR loaded; VTRS loaded; VTS loaded; VTSI loaded; VTVT loaded; VTYX loaded; VUZI loaded; VVI loaded; VVOS loaded; VVPR loaded; VVV loaded; VVX loaded; VWFB loaded; VYGR loaded; VYNE loaded; VYX loaded; VZ loaded; VZIO loaded; VZLA loaded; W loaded; WAB loaded; WABC loaded; WAFD loaded; WAL loaded; WALD loaded; WASH loaded; WAT loaded; WATT loaded; WAVE loaded; WAVS loaded; WB loaded; WBA loaded; WBD loaded; WBQNL loaded; WBS loaded; WBX loaded; WCC loaded; WCN loaded; WD loaded; WDAY loaded; WDC loaded; WDFC loaded; WDH loaded; WDS loaded; WEAV loaded; WEBNF loaded; WEC loaded; WEL loaded; WELL loaded; WEN loaded; WERN loaded; WES loaded; WEST loaded; WETH loaded; WEX loaded; WEYS loaded; WF loaded; WFC loaded; WFCF loaded; WFG loaded; WFRD loaded; WGO loaded; WGS loaded; WH loaded; WHD loaded; WHF loaded; WHG loaded; WHLM loaded; WHR loaded; WILC loaded; WINA loaded; WING loaded; WIT loaded; WIX loaded; WK l

In [None]:
#notest

# Rank stocks against a four-symbol portfolio (RELL, ESP, TRT, TLF).
# Judging by the output, each row is (symbol, company name, score), rows are
# sorted by ascending score, and portfolio members carry a leading '* ' in the
# name.
# NOTE(review): getRecsFromPort, m and cemb are defined outside this section;
# n presumably caps the number of rows and negWeight presumably weights the
# non-portfolio examples - confirm against the definition.
getRecsFromPort(['RELL','ESP','TRT','TLF'],m[0],cemb,
                negWeight=0.001, n=50)

[('ESP', '* ESPEY MFG & ELECTRONICS CORP', 0.5103443778693106),
 ('TLF', '* TANDY LEATHER FACTORY INC', 0.5228942125882281),
 ('RELL', '* RICHARDSON ELECTRONICS, LTD.', 0.5283264842587194),
 ('POCI', 'PRECISION OPTICS CORPORATION, INC.', 0.5318226823118917),
 ('TAYD', 'TAYLOR DEVICES INC', 0.5336584962639619),
 ('TRT', '* TRIO-TECH INTERNATIONAL', 0.5337946397012571),
 ('SPRS', 'SURGE COMPONENTS INC', 0.5355219875040557),
 ('VALU', 'VALUE LINE INC', 0.5370922394598208),
 ('AMPG', 'AmpliTech Group, Inc.', 0.5377604920127741),
 ('GLSI', 'Greenwich LifeSciences, Inc.', 0.5389770590601708),
 ('ATNM', 'Actinium Pharmaceuticals, Inc.', 0.540936250396843),
 ('EPSN', 'Epsilon Energy Ltd.', 0.5412352003486124),
 ('CCEL', 'CRYO CELL INTERNATIONAL INC', 0.5435622612699902),
 ('TEL', 'TE Connectivity Ltd.', 0.5442406664409611),
 ('NEOV', 'NeoVolta Inc.', 0.5445194553388609),
 ('KTCC', 'KEY TRONIC CORP', 0.5447756174645937),
 ('LYTS', 'LSI INDUSTRIES INC', 0.544811406320904),
 ('TWIN', 'TWIN DISC I

In [None]:
#notest

# NOTE(review): this call is character-for-character the same as the one in the
# cell above, yet the captured scores differ slightly and extra symbols (INRD,
# TSRI) appear - the two outputs presumably come from runs against different
# m / cemb contents. Confirm which output is current and drop the other cell.
getRecsFromPort(['RELL','ESP','TRT','TLF'],m[0],cemb,
                negWeight=0.001, n=50)

[('ESP', '* ESPEY MFG & ELECTRONICS CORP', 0.5149025714566895),
 ('TLF', '* TANDY LEATHER FACTORY INC', 0.5274361633259722),
 ('RELL', '* RICHARDSON ELECTRONICS, LTD.', 0.5326767101260544),
 ('POCI', 'PRECISION OPTICS CORPORATION, INC.', 0.5365172356060033),
 ('TAYD', 'TAYLOR DEVICES INC', 0.538388076615355),
 ('TRT', '* TRIO-TECH INTERNATIONAL', 0.5384500817639699),
 ('SPRS', 'SURGE COMPONENTS INC', 0.5402048803993464),
 ('VALU', 'VALUE LINE INC', 0.5418022449295139),
 ('AMPG', 'AmpliTech Group, Inc.', 0.5424905428362589),
 ('INRD', 'Inrad Optics, Inc.', 0.5431471911896562),
 ('GLSI', 'Greenwich LifeSciences, Inc.', 0.5437110570740555),
 ('ATNM', 'Actinium Pharmaceuticals, Inc.', 0.5456385213305739),
 ('EPSN', 'Epsilon Energy Ltd.', 0.5459450848375516),
 ('TSRI', 'TSR INC', 0.5466850595687373),
 ('CCEL', 'CRYO CELL INTERNATIONAL INC', 0.5483699233509016),
 ('TEL', 'TE Connectivity Ltd.', 0.5488421410865695),
 ('NEOV', 'NeoVolta Inc.', 0.5492950899801988),
 ('LYTS', 'LSI INDUSTRIES INC

In [None]:
#notest

# Same portfolio as above, but with a seeded 1000-tree random forest passed as
# the model instead of the helper's default.
# In the output the four portfolio members score about 0.35-0.38 while every
# other listed symbol scores 0.985 or higher.
# NOTE(review): the stray "return np.log(proba)" line in the output looks like
# the tail of a captured warning (presumably log of a zero probability inside
# scikit-learn) - confirm and consider whether it needs handling.
getRecsFromPort(['RELL','ESP','TRT','TLF'],m[0],cemb,
                negWeight=0.001, n=50,
                model=RandomForestClassifier(n_estimators=1000, random_state=42))

  return np.log(proba)


[('ESP', '* ESPEY MFG & ELECTRONICS CORP', 0.3509999999999999),
 ('TLF', '* TANDY LEATHER FACTORY INC', 0.355),
 ('TRT', '* TRIO-TECH INTERNATIONAL', 0.357),
 ('RELL', '* RICHARDSON ELECTRONICS, LTD.', 0.377),
 ('GLSI', 'Greenwich LifeSciences, Inc.', 0.985),
 ('POCI', 'PRECISION OPTICS CORPORATION, INC.', 0.985),
 ('AMPG', 'AmpliTech Group, Inc.', 0.987),
 ('VALU', 'VALUE LINE INC', 0.989),
 ('EPSN', 'Epsilon Energy Ltd.', 0.992),
 ('TAYD', 'TAYLOR DEVICES INC', 0.992),
 ('TXN', 'TEXAS INSTRUMENTS INC', 0.992),
 ('ATNM', 'Actinium Pharmaceuticals, Inc.', 0.993),
 ('CMPR', 'CIMPRESS plc', 0.993),
 ('INRD', 'Inrad Optics, Inc.', 0.993),
 ('NEOV', 'NeoVolta Inc.', 0.993),
 ('SPRS', 'SURGE COMPONENTS INC', 0.993),
 ('FN', 'Fabrinet', 0.994),
 ('JBSAY', 'JBS S.A.', 0.994),
 ('CELZ', 'CREATIVE MEDICAL TECHNOLOGY HOLDINGS, INC.', 0.995),
 ('FALC', 'FALCONSTOR SOFTWARE INC', 0.995),
 ('HBIA', 'HILLS BANCORPORATION', 0.995),
 ('KTCC', 'KEY TRONIC CORP', 0.995),
 ('LDDD', 'Longduoduo Co Ltd', 0

In [None]:
#notest

# Single-symbol portfolio: rank stocks against TT alone, asking for 20 rows.
# The output lists TT itself first (flagged with '* '), followed by the other
# symbols in ascending score order.
getRecsFromPort(['TT'],m[0],cemb,
                negWeight=0.001, n=20)

[('TT', '* Trane Technologies plc', 0.7956323696200615),
 ('TSE', 'Trinseo PLC', 0.8256587275706493),
 ('TNC', 'TENNANT CO', 0.826817724818906),
 ('TROX', 'Tronox Holdings plc', 0.827081538653363),
 ('HON', 'HONEYWELL INTERNATIONAL INC', 0.8272827700481711),
 ('TRN', 'TRINITY INDUSTRIES INC', 0.8280596145304949),
 ('TSCO', 'TRACTOR SUPPLY CO /DE/', 0.8280821516863961),
 ('NVT', 'nVent Electric plc', 0.8286850256671003),
 ('ITT', 'ITT INC.', 0.828754304276702),
 ('ITRI', 'ITRON, INC.', 0.8288244104511093),
 ('TRS', 'TRIMAS CORP', 0.8291020863908336),
 ('TRU', 'TransUnion', 0.8298047962971123),
 ('TTI', 'TETRA TECHNOLOGIES INC', 0.8300957554397238),
 ('TNET', 'TRINET GROUP, INC.', 0.8303290243491801),
 ('CXT', 'Crane NXT, Co.', 0.830420969882765),
 ('TYL', 'TYLER TECHNOLOGIES INC', 0.8304801044982141),
 ('WTW', 'WILLIS TOWERS WATSON PLC', 0.8306393809810833),
 ('AGCO', 'AGCO CORP /DE', 0.8307468347811318),
 ('MPTI', 'M-tron Industries, Inc.', 0.8310697528985487),
 ('JCI', 'Johnson Contro

In [None]:
#notest

# List the symbols whose embeddings are closest to TT's.
# The output is 20 (symbol, score) pairs in descending score order, with TT
# itself first at a score of ~1.0.
# NOTE(review): the default score looks like a cosine / dot-product similarity
# (self-score ~1.0) - confirm against the findClosestEmbs definition.
findClosestEmbs('TT', m[0], cemb)

[('TT', 1.0000000016680874),
 ('ITRI', 0.7669934195683893),
 ('TRN', 0.757928899837897),
 ('TRNS', 0.756768570159696),
 ('HON', 0.7548761482965816),
 ('TSE', 0.7512960897800307),
 ('MPTI', 0.7497710113532638),
 ('TROX', 0.7488504607593199),
 ('TRS', 0.7452240652159831),
 ('TACT', 0.7428685543299199),
 ('NVT', 0.7413787849785848),
 ('ITT', 0.7413341193540852),
 ('TNC', 0.7390700427766047),
 ('TTMI', 0.7381065794704046),
 ('RXT', 0.7341613749671928),
 ('TRT', 0.7339877752530977),
 ('FTI', 0.7332245872229285),
 ('TRU', 0.7326898761216825),
 ('TXT', 0.7290777573386786),
 ('TTI', 0.7268809876810808)]

In [None]:
#notest

# Rank stocks against a four-symbol financial-sector portfolio
# (ELV, PRI, FNF, DFS); n is left at the helper's default here.
# As in the other cells, portfolio members are flagged with '* ' in the output.
getRecsFromPort(['ELV','PRI','FNF','DFS'],m[0],cemb,
                negWeight=0.001)

[('PRI', '* Primerica, Inc.', 0.5157342121322201),
 ('DFS', '* Discover Financial Services', 0.5200267005277418),
 ('FNF', '* Fidelity National Financial, Inc.', 0.5254892332307803),
 ('ELV', '* Elevance Health, Inc.', 0.5445726972920442),
 ('FNB', 'FNB CORP/PA/', 0.544944657359473),
 ('FG', 'F&G Annuities & Life, Inc.', 0.5459453171044197),
 ('COF', 'CAPITAL ONE FINANCIAL CORP', 0.5461629720972857),
 ('ALLY', 'Ally Financial Inc.', 0.5479493556743291),
 ('ALL', 'ALLSTATE CORP', 0.5483801290986048),
 ('SYF', 'Synchrony Financial', 0.549529425450311),
 ('GNW', 'GENWORTH FINANCIAL INC', 0.5508265648249211),
 ('FRAF', 'FRANKLIN FINANCIAL SERVICES CORP /PA/', 0.5512589972405304),
 ('FISI', 'FINANCIAL INSTITUTIONS INC', 0.5521895165822301),
 ('TFC', 'TRUIST FINANCIAL CORP', 0.5523641827036722),
 ('FDBC', 'FIDELITY D & D BANCORP INC', 0.5526955601156165),
 ('MKL', 'MARKEL GROUP INC.', 0.5529749568004889),
 ('GBNY', 'Generations Bancorp NY, Inc.', 0.5537906603059422),
 ('FHN', 'FIRST HORIZON 

In [None]:
#notest

# Larger 14-symbol portfolio, asking for 80 rows.
# In the visible part of the output some non-portfolio symbols (e.g. LGIH, LDI)
# are listed before portfolio members, so the '* ' rows are not guaranteed to
# occupy the top of the list.
getRecsFromPort(['ELV','PRI','FNF','DFS','APO','SNX',
                 'OMF', 'JPM', 'FG', 'LAD', 'LEN', 'AN', 'DHI', 'MPLX',
                ],m[0],cemb,
                negWeight=0.001, n=80)

[('FNF', '* Fidelity National Financial, Inc.', 0.24445361799505047),
 ('OMF', '* OneMain Holdings, Inc.', 0.24571814992726226),
 ('LEN', '* LENNAR CORP /NEW/', 0.2469936074260246),
 ('DFS', '* Discover Financial Services', 0.24721682521933352),
 ('LGIH', 'LGI Homes, Inc.', 0.2492440504453396),
 ('FG', '* F&G Annuities & Life, Inc.', 0.24970189191538672),
 ('LDI', 'loanDepot, Inc.', 0.2501259805255017),
 ('APO', '* Apollo Global Management, Inc.', 0.25249283840485615),
 ('PFSI', 'PennyMac Financial Services, Inc.', 0.2533775898232725),
 ('DHI', '* HORTON D R INC /DE/', 0.2535329137518526),
 ('LNC', 'LINCOLN NATIONAL CORP', 0.2538253731871031),
 ('ALLY', 'Ally Financial Inc.', 0.25388235411474736),
 ('DFH', 'Dream Finders Homes, Inc.', 0.2539150232840064),
 ('LPLA', 'LPL Financial Holdings Inc.', 0.2541089100547662),
 ('JPM', '* JPMORGAN CHASE & CO', 0.25493225268759667),
 ('CGBD', 'Carlyle Secured Lending, Inc.', 0.2558418544695642),
 ('AN', '* AUTONATION, INC.', 0.2558548717872401),
 

In [None]:
#notest

# Nearest embeddings to RELL. Passing a fourth argument makes the helper report
# "saving serv list zzrellemb" before returning, so this call has a side effect
# beyond displaying the list.
# NOTE(review): where and how that list is saved is defined outside this
# section - confirm before re-running.
findClosestEmbs('RELL', m[0], cemb, 'zzrellemb')

saving serv list zzrellemb


[('RELL', 0.99999998520508),
 ('FEIM', 0.8033973201328413),
 ('LINK', 0.7948925790936721),
 ('FELE', 0.7919128218594447),
 ('MOD', 0.7805745192643543),
 ('POWI', 0.7776443612440531),
 ('FSLR', 0.7766557944190637),
 ('DAKT', 0.7761381632463071),
 ('GNRC', 0.7759018965718922),
 ('PFIE', 0.7736163270397118),
 ('RHI', 0.7723403163604735),
 ('AE', 0.7705252411208116),
 ('ASYS', 0.7703331818099399),
 ('VICR', 0.7701325901641378),
 ('HAYN', 0.7688475719398149),
 ('BHE', 0.7688410442780154),
 ('PLOW', 0.7682959551916562),
 ('JELD', 0.7678314585217809),
 ('ITRI', 0.767473160958188),
 ('RFIL', 0.7670078278427928)]

In [None]:
#notest

# Nearest embeddings to NVEC using the helper's default scoring (higher is
# closer in the output); also saves the result under the name 'zznvecemb'
# according to the "saving serv list" message.
findClosestEmbs('NVEC', m[0], cemb, 'zznvecemb')

saving serv list zznvecemb


[('NVEC', 0.9999999595896119),
 ('NVR', 0.8148641465240714),
 ('NDSN', 0.8072408553665515),
 ('NVT', 0.7999009376151255),
 ('NHC', 0.7961864340735034),
 ('NC', 0.7961223346932811),
 ('VREX', 0.7944423962739553),
 ('VVOS', 0.7905214345714126),
 ('NINE', 0.7890582799778165),
 ('VTS', 0.7864372093691397),
 ('AE', 0.785869554745357),
 ('NVEE', 0.7858566252445665),
 ('VECO', 0.7852216365083438),
 ('NGVC', 0.7849770364752587),
 ('NR', 0.7849442946149126),
 ('EXTR', 0.7848996546840913),
 ('CNMD', 0.7829721286194089),
 ('XGN', 0.7826288895267541),
 ('GVP', 0.7826076538752202),
 ('PEN', 0.7815270140258042)]

In [None]:
#notest

# Same query as the previous NVEC cell, but scored with squared Euclidean
# distance; reverse=False gives ascending order in the output (smaller is
# closer, NVEC itself at 0.0).
# The captured numbers satisfy distance = 2 - 2*similarity against the default
# run (e.g. NVR: 0.3703 vs 0.8149), which is what unit-length embeddings would
# give, and the resulting ranking is identical.
findClosestEmbs('NVEC', m[0], cemb, 'zznvecemb2',
                distFunc=lambda x,y : ((x-y)**2).sum(), reverse=False)

saving serv list zznvecemb2


[('NVEC', 0.0),
 ('NVR', 0.3702717346752302),
 ('NDSN', 0.3855183483110959),
 ('NVT', 0.4001980889048715),
 ('NHC', 0.40762702514415294),
 ('NC', 0.40775524711728833),
 ('VREX', 0.4111151887723203),
 ('VVOS', 0.41895708554228944),
 ('NINE', 0.42188345079417494),
 ('VTS', 0.4271255425210745),
 ('AE', 0.4282609293643752),
 ('NVEE', 0.42828676168988744),
 ('VECO', 0.4295566309390183),
 ('NGVC', 0.43004594327517387),
 ('NR', 0.43011141929178504),
 ('EXTR', 0.43020058880706674),
 ('CNMD', 0.434055804031503),
 ('XGN', 0.43474218133036496),
 ('GVP', 0.43478468595671144),
 ('PEN', 0.43694592564665846)]

In [None]:
#notest

# Nearest embeddings to ESP with default scoring; the helper also reports
# saving the result as 'zzespemb'.
findClosestEmbs('ESP', m[0], cemb, 'zzespemb')

saving serv list zzespemb


[('ESP', 1.0000000651174648),
 ('POCI', 0.8425144684377223),
 ('CCEL', 0.8121768408997727),
 ('SPRS', 0.8119299463498335),
 ('GLSI', 0.803299929455882),
 ('CRMZ', 0.7814491082747064),
 ('EPSN', 0.7805094840289106),
 ('ATNM', 0.7723179956242227),
 ('NERV', 0.7685074065481914),
 ('AMPG', 0.7665394827122134),
 ('VALU', 0.7582190872170226),
 ('INRD', 0.7578145233414169),
 ('APLD', 0.7576054640611938),
 ('CELZ', 0.7529517204185117),
 ('FALC', 0.7480291547069043),
 ('CMTL', 0.742820786576228),
 ('EVTV', 0.7390334579039042),
 ('PSQH', 0.7340597552979069),
 ('GYRO', 0.7297576767136275),
 ('NEOV', 0.7284170765077527)]

In [None]:
#notest

# Nearest embeddings to LGL with default scoring; the helper also reports
# saving the result as 'zzlglemb'.
findClosestEmbs('LGL', m[0], cemb, 'zzlglemb')

saving serv list zzlglemb


[('LGL', 1.0000000316916184),
 ('VRAR', 0.8329806846910496),
 ('GLPI', 0.8118543976831943),
 ('LQDT', 0.8060634884018472),
 ('GL', 0.8053673580930423),
 ('LPG', 0.7988023707652656),
 ('GNLX', 0.7953724846090395),
 ('LITE', 0.7933398646865277),
 ('LRCX', 0.7925885691546906),
 ('GEG', 0.7919369454783258),
 ('LPLA', 0.788723375644633),
 ('SLRC', 0.788719133872568),
 ('SWIM', 0.7884134817995698),
 ('DBGI', 0.7867347078095468),
 ('FGF', 0.7837447137585667),
 ('GVP', 0.7829309881951108),
 ('LAZ', 0.7827788901460306),
 ('GOOD', 0.7826216661924021),
 ('LAND', 0.7812661803642427),
 ('UPLD', 0.7812262562887007)]

In [None]:
# Testing on text 20230601 - 20240601:

In [None]:
#notest

# Build a fresh text + price dataset and rebind m, replacing the dataset the
# cells above were run against.
# The log output shows the stages: all symbols found -> price >= minPrice
# filter -> restriction to symbols with CIKs -> loading 8-K and 6-K filings ->
# symbols for which CIK text was found.
# NOTE(review): presumably the first date pair is the filing-text window and
# the second pair is the price window - confirm against getTextPriceDataset.
m = getTextPriceDataset('20230601', '20240601',
                        '20240601', '20240620',
                        minPrice=0.5, priceCol='Open', lookback=30)

18575 stock symbols found
restricting to price >= 0.5 ... now 14308 stocks
restricting to CIKs ... now 6124 stocks
loading <class 'secscan.scrape8K.scraper8K'> ... Total filings: 71150
loading <class 'secscan.scrape6K.scraper6K'> ... Total filings: 26975
getting CIK texts ... 5662 stocks with CIK text found


In [None]:
#notest

# Load or compute embeddings for the new dataset and rebind cemb, replacing the
# embeddings used by the cells above.
# In the output, symbols already present in the cache are reported as
# "<SYM> loaded;" while a missing one (ABLZF) goes through a
# "getting ... embedding" path that logs each text chunk and stops at a limit.
# NOTE(review): the first argument looks like the cache file name and m[-1]
# the per-symbol text - confirm against cacheEmbeddings.
cemb = cacheEmbeddings('2023June1Y-te3-small.pkl',m[0],m[-1])

A loaded; AA loaded; AAAU loaded; AACG loaded; AACT loaded; AADI loaded; AAL loaded; AAMC loaded; AAME loaded; AAN loaded; AAOI loaded; AAON loaded; AAP loaded; AAPL loaded; AAT loaded; AAWH loaded; AB loaded; ABAT loaded; ABBV loaded; ABCB loaded; ABCL loaded; ABEO loaded; ABEV loaded; ABG loaded; ABL loaded; ABLV loaded; getting ABLZF embedding text length 1077 START FORM 6-K -9 DA... text length 1080 START FORM 6-K -44 D... text length 1080 START FORM 6-K -65 D... text length 1080 START FORM 6-K -71 D... text length 1080 START FORM 6-K -99 D... text length 1082 START FORM 6-K -121 ... text length 1082 START FORM 6-K -183 ... text length 1082 START FORM 6-K -215 ... limit exceeded, stopping symbol ABLZF text length 7568


ABM loaded; ABNB loaded; ABOS loaded; ABR loaded; ABSI loaded; ABT loaded; ABTS loaded; ABUS loaded; ABVC loaded; ABVX loaded; AC loaded; ACA loaded; ACAB loaded; ACAC loaded; ACAD loaded; ACB loaded; ACCD loaded; ACCO loaded; ACDC loaded; ACEL loaded; ACET loaded; ACFN loaded; ACGL loaded; ACHC loaded; ACHL loaded; ACHR loaded; ACHV loaded; ACI loaded; ACIC loaded; ACIU loaded; ACIW loaded; ACLS loaded; ACLX loaded; ACM loaded; ACMR loaded; ACN loaded; ACNB loaded; ACNT loaded; ACR loaded; ACRE loaded; ACRG loaded; ACRS loaded; ACRV loaded; ACST loaded; ACT loaded; ACTG loaded; ACU loaded; ACVA loaded; ACXP loaded; ADAG loaded; ADAP loaded; ADBE loaded; ADC loaded; ADCT loaded; ADEA loaded; ADI loaded; ADIL loaded; ADM loaded; ADMA loaded; ADN loaded; ADNT loaded; ADP loaded; ADPT loaded; ADRT loaded; ADSE loaded; ADSK loaded; ADT loaded; ADTN loaded; ADTX loaded; ADUS loaded; ADV loaded; ADVM loaded; ADXN loaded; AE loaded; AEAE loaded; AEE loaded; AEG loaded; AEHL loaded; AEHR loade

 loaded; LTBR loaded; LTC loaded; LTCN loaded; LTH loaded; LTRN loaded; LTRX loaded; LTRY loaded; LU loaded; LUCD loaded; LULU loaded; LUMN loaded; LUMO loaded; LUNA loaded; LUNG loaded; LUNR loaded; LUV loaded; LVLU loaded; LVO loaded; LVRO loaded; LVS loaded; LVTX loaded; LVWR loaded; LW loaded; LWAY loaded; LWLG loaded; LX loaded; LXEO loaded; LXFR loaded; LXP loaded; LXRX loaded; LXU loaded; LYB loaded; LYEL loaded; LYFT loaded; LYG loaded; LYT loaded; LYTS loaded; LYV loaded; LZ loaded; LZB loaded; LZM loaded; M loaded; MA loaded; MAA loaded; MAC loaded; MACT loaded; MAG loaded; MAIA loaded; MAIN loaded; MAMA loaded; MAMO loaded; MAN loaded; MANH loaded; MANU loaded; MAPS loaded; MAQC loaded; MAR loaded; MARA loaded; MARPS loaded; MARX loaded; MAS loaded; MASI loaded; MASS loaded; MAT loaded; MATH loaded; MATV loaded; MATW loaded; MATX loaded; MAX loaded; MAXN loaded; MBBC loaded; MBC loaded; MBCN loaded; MBI loaded; MBIN loaded; MBLY loaded; MBOT loaded; MBRX loaded; MBUU loaded;

TVGN loaded; TVTX loaded; TW loaded; TWG loaded; TWI loaded; TWIN loaded; TWKS loaded; TWLO loaded; TWO loaded; TWST loaded; TX loaded; TXG loaded; TXMD loaded; TXN loaded; TXO loaded; TXRH loaded; TXT loaded; TY loaded; TYG loaded; TYGO loaded; TYL loaded; TYRA loaded; TZOO loaded; TZUP loaded; U loaded; UAA loaded; UAL loaded; UAN loaded; UAVS loaded; UBCP loaded; UBER loaded; UBFO loaded; UBS loaded; UBSI loaded; UBX loaded; UBXG loaded; UCAR loaded; UCL loaded; UCTT loaded; UDMY loaded; UDN loaded; UDR loaded; UE loaded; UEC loaded; UEIC loaded; UELMO loaded; UFCS loaded; UFI loaded; UFPI loaded; UFPT loaded; UG loaded; UGA loaded; UGI loaded; UGP loaded; UGRO loaded; UHAL loaded; UHG loaded; UHS loaded; UHT loaded; UI loaded; UIS loaded; UK loaded; UL loaded; ULBI loaded; ULCC loaded; ULH loaded; ULS loaded; ULTA loaded; ULY loaded; UMAC loaded; UMBF loaded; UMC loaded; UMH loaded; UNB loaded; UNCY loaded; UNF loaded; UNFI loaded; UNG loaded; UNH loaded; UNIT loaded; UNL loaded; U

In [None]:
#notest

# Same RELL/ESP/TRT/TLF portfolio query as earlier in the notebook, now run
# against the m and cemb rebuilt in the two cells above (20230601-20240601
# dataset), so the scores and the recommended symbols differ from the earlier
# output.
getRecsFromPort(['RELL','ESP','TRT','TLF'],m[0],cemb,
                negWeight=0.001, n=50)

[('TLF', '* TANDY LEATHER FACTORY INC', 0.5439353793970153),
 ('RELL', '* RICHARDSON ELECTRONICS, LTD.', 0.5509400449276975),
 ('ESP', '* ESPEY MFG & ELECTRONICS CORP', 0.5555067970645569),
 ('TRT', '* TRIO-TECH INTERNATIONAL', 0.5589580165747611),
 ('TIKK', 'TEL INSTRUMENT ELECTRONICS CORP', 0.5691793867249226),
 ('TAYD', 'TAYLOR DEVICES INC', 0.5716039528827362),
 ('TR', 'TOOTSIE ROLL INDUSTRIES INC', 0.5729662931189359),
 ('FRD', 'FRIEDMAN INDUSTRIES INC', 0.5735350725090798),
 ('TXN', 'TEXAS INSTRUMENTS INC', 0.5735660709742564),
 ('TSM', 'TAIWAN SEMICONDUCTOR MANUFACTURING CO LTD', 0.5757774277794379),
 ('JELD', 'JELD-WEN Holding, Inc.', 0.5760525556427224),
 ('TFII', 'TFI International Inc.', 0.5766927642059759),
 ('TAIT', 'TAITRON COMPONENTS INC', 0.5771052135728352),
 ('TSCO', 'TRACTOR SUPPLY CO /DE/', 0.5781890542260639),
 ('ELSE', 'ELECTRO SENSORS INC', 0.5783149075358451),
 ('TXT', 'TEXTRON INC', 0.5791805679500381),
 ('FEIM', 'FREQUENCY ELECTRONICS INC', 0.5796291802355151)

In [None]:
#notest

# Random-forest variant of the portfolio query on the rebuilt dataset
# (seeded, 1000 trees). As in the earlier random-forest cell, the portfolio
# members score far lower (about 0.36-0.39) than every other listed symbol
# (0.988 or higher).
# NOTE(review): the "return np.log(proba)" line in the output again looks like
# the tail of a captured warning - confirm its source.
getRecsFromPort(['RELL','ESP','TRT','TLF'],m[0],cemb,
                negWeight=0.001, n=50,
                model=RandomForestClassifier(n_estimators=1000, random_state=42))

  return np.log(proba)


[('TRT', '* TRIO-TECH INTERNATIONAL', 0.356),
 ('RELL', '* RICHARDSON ELECTRONICS, LTD.', 0.357),
 ('TLF', '* TANDY LEATHER FACTORY INC', 0.375),
 ('ESP', '* ESPEY MFG & ELECTRONICS CORP', 0.394),
 ('SPRS', 'SURGE COMPONENTS INC', 0.988),
 ('TAYD', 'TAYLOR DEVICES INC', 0.992),
 ('SCIA', 'SCI Engineered Materials, Inc.', 0.993),
 ('TIKK', 'TEL INSTRUMENT ELECTRONICS CORP', 0.993),
 ('ATNM', 'Actinium Pharmaceuticals, Inc.', 0.994),
 ('INDI', 'indie Semiconductor, Inc.', 0.994),
 ('MTD', 'METTLER TOLEDO INTERNATIONAL INC/', 0.994),
 ('TBBB', 'BBB FOODS INC', 0.994),
 ('AVGO', 'Broadcom Inc.', 0.995),
 ('FRD', 'FRIEDMAN INDUSTRIES INC', 0.995),
 ('LEA', 'LEAR CORP', 0.995),
 ('MOV', 'MOVADO GROUP INC', 0.995),
 ('ALCO', 'ALICO, INC.', 0.996),
 ('DLTR', 'DOLLAR TREE, INC.', 0.996),
 ('ELSE', 'ELECTRO SENSORS INC', 0.996),
 ('EMR', 'EMERSON ELECTRIC CO', 0.996),
 ('ETN', 'Eaton Corp plc', 0.996),
 ('FOSL', 'Fossil Group, Inc.', 0.996),
 ('FXY', 'Invesco CurrencyShares Japanese Yen Trust', 

In [None]:
#notest

# Show the 200 entries of m[3] with the largest values, as
# (symbol, m[3] value, m[1] value, m[2] value) in ascending m[3] order.
# In the captured output the second field equals (fourth - third) / third,
# e.g. FGPR: (11.83 - 7.5) / 7.5 = 0.5773, so it looks like the fractional
# price change from the m[1] price to the m[2] price - TODO confirm against
# getTextPriceDataset.
sorted(
    ((sym, m[3][sym], m[1][sym], m[2][sym]) for sym in m[3]),
    key=lambda row: row[1],
)[-200:]

[('FGPR', 0.5773333333333333, 7.5, 11.83),
 ('ADAG', 0.5789473684210529, 1.71, 2.7),
 ('HRTX', 0.5819209039548021, 1.77, 2.8),
 ('HEAR', 0.5831037649219466, 10.89, 17.24),
 ('SRZN', 0.5906562847608454, 8.99, 14.3),
 ('AHCO', 0.5928961748633879, 7.32, 11.66),
 ('CEG', 0.5936777178103314, 116.73, 186.03),
 ('POWL', 0.5957518166573506, 89.45, 142.74),
 ('ELMD', 0.5958702064896755, 10.17, 16.23),
 ('LADX', 0.596113809854268, 1.4409999999999998, 2.3),
 ('OSCR', 0.5967567567567567, 9.25, 14.77),
 ('TPC', 0.5969162995594715, 9.08, 14.5),
 ('ATNM', 0.5979591836734692, 4.9, 7.83),
 ('PRM', 0.6017130620985012, 4.67, 7.48),
 ('CVNA', 0.6022685693377241, 54.66, 87.58),
 ('CAVA', 0.6026044005388413, 44.54, 71.38),
 ('SRTS', 0.6041666666666667, 2.4, 3.85),
 ('PRPL', 0.6055045871559632, 1.09, 1.75),
 ('HRTG', 0.6087613293051359, 6.62, 10.65),
 ('KROS', 0.6108674463937622, 41.04, 66.11),
 ('ARM', 0.6125574272588055, 78.36, 126.36),
 ('IPX', 0.6153846153846154, 9.36, 15.12),
 ('EME', 0.6212886192042244

In [None]:
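# Records the installed openai package version. The '1.33.0' output below must
# have been captured with these two lines uncommented; uncomment to re-check.
# import openai
# openai.version.VERSION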

'1.33.0'

In [None]:
#hide
# uncomment and run to regenerate all library Python files
# from nbdev.export import notebook2script; notebook2script()