In [None]:
# default_exp dailyList

# dailyList

> Parse the SEC's archived daily list of filings.

In [None]:
#hide
%load_ext autoreload
%autoreload 2
from nbdev import show_doc

In [None]:
#export

import collections
import csv
import os
import re

from secscan import utils,recentFeed

# Default directory for the dailyList pickle files (daily lists and CIK name maps),
# located under utils.stockDataRoot.
defaultDLDir = os.path.join(utils.stockDataRoot,'dlMaps')

Download and parse the SEC's archived daily list of filings:

In [None]:
#export

def getQStr(dateStr) :
    """
    Maps a YYYYMMDD date string to the quarter path fragment 'YYYY/QTRn/',
    with n (1 to 4) being the calendar quarter that contains the month.
    """
    year, month = dateStr[:4], int(dateStr[4:6])
    quarter = (month - 1) // 3 + 1  # months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4
    return f'{year}/QTR{quarter}/'

def getSecDailyIndexUrls(dateStr) :
    """
    Builds the two SEC daily-index paths needed for a YYYYMMDD date string:
        (path of that day's master.YYYYMMDD.idx, path of the quarter's index.json)
    Both live in the quarter directory given by getQStr(dateStr).
    """
    quarterDir = '/Archives/edgar/daily-index/' + getQStr(dateStr)
    masterUrl = f'{quarterDir}master.{dateStr}.idx'
    indexUrl = quarterDir + 'index.json'
    return masterUrl, indexUrl

# Regex matched against the file-path field of an index line, of the form
# "edgar/data/<cik>/<accession no>.txt" (optional surrounding whitespace, case insensitive).
# Group 1 is the cik, group 2 the accession number (pattern from utils.accessNoPatStr).
edgarTxtFPat = re.compile(
            # gets cik and accession no from a file url within the daily index
            r"\s*edgar/data/(\d+)" # cik - should be same as on the index line
            + r"/("+utils.accessNoPatStr+r")\.txt\s*$", # accession no
            re.IGNORECASE)

def getDailyFList(d, listIndexCache=None) :
    """
    Retrieves the filings listed in the SEC daily index for one day.

    d is converted with utils.toDateStr, so it can be whatever that helper accepts
    (the original documentation mentions a date or an ISO date string).
    listIndexCache, if given, is a dict: quarter index.json URL -> set of master.*.idx
    names present for that quarter; it is consulted first and filled in on a miss,
    which avoids re-downloading the quarter index for every day.

    Returns a list of SEC filed forms:
        [(cik, cikName, formType, fileDate, accNo), ... ]
    or [] if the quarter index has no master file for that day (reported as HOLIDAY).
    Progress information is printed on the current output line.
    """
    dateStr = utils.toDateStr(d)
    listUrl, listIndexUrl = getSecDailyIndexUrls(dateStr)
    useCache = listIndexCache is not None
    if useCache and listIndexUrl in listIndexCache :
        listIndex = listIndexCache[listIndexUrl]
    else :
        # cache miss (or no cache): fetch the quarter directory listing and
        # keep just the names of the master index files
        listIndexJson = utils.downloadSecUrl(listIndexUrl, toFormat='json')
        listIndex = {item['name']
                     for item in listIndexJson['directory']['item']
                     if item['name'].startswith('master')}
        print(f'### list index {len(listIndex)}',end=' ')
        if useCache :
            listIndexCache[listIndexUrl] = listIndex
    masterName = 'master.'+dateStr+'.idx'
    if masterName not in listIndex :
        # no index was published for this day
        print('HOLIDAY',end=' ')
        return []
    dayFilings = downloadSecFormList(listUrl)
    print('count for',dateStr+':', len(dayFilings), end=' ')
    return dayFilings

def downloadSecFormList(listUrl) :
    """
    Downloads a pipe-delimited SEC form index (daily or full master index) from listUrl
    using utils.downloadSecUrl, and parses it into a list of filings:
        [(cik, cikName, formType, fileDate, accNo), ... ]
    fileDate has any '-' or '/' separators removed.

    Lines that don't have exactly 5 fields with a numeric cik are skipped; they are
    reported as invalid only once at least one filing has been parsed, so the header
    lines at the top of the file are skipped silently. Lines whose file path doesn't
    match edgarTxtFPat are reported and skipped. A cik in the file path that differs
    from the cik field is reported, but the entry is still kept.
    """
    fListRes = utils.downloadSecUrl(listUrl)
    # The index is plain '|'-separated text, not quoted CSV. With the csv module's default
    # quoting, a field starting with '"' would have its quotes stripped, and an unbalanced
    # '"' would swallow the following delimiters and lines. QUOTE_NONE keeps every
    # character literal.
    r = csv.reader(fListRes.splitlines(), delimiter='|', quoting=csv.QUOTE_NONE)
    res = []
    for entry in r :
        if len(entry)==5 and entry[0].isdigit() :
            cik, cikName, formType, fileDate, txtF = entry
        else :
            if len(res) > 0 :
                print('invalid entry', entry)
            continue
        fileDate = fileDate.replace('-','').replace('/','')
        m = edgarTxtFPat.match(txtF)
        if not m :
            print('missing accession no in', entry)
            continue
        if m.group(1) != cik :
            print('cik mismatch in', entry)
        res.append((cik,cikName,formType,fileDate,m.group(2)))
    return res

Test downloading and parsing SEC's archived daily list of filings:

In [None]:
# Pure string checks: quarter path fragment and daily index URLs.
assert ((getQStr('20200101'),getQStr('20200401'),getQStr('20201201'))
        == ('2020/QTR1/','2020/QTR2/','2020/QTR4/')), "quarter string calc"
assert (getSecDailyIndexUrls('20201201')
        ==('/Archives/edgar/daily-index/2020/QTR4/master.20201201.idx',
           '/Archives/edgar/daily-index/2020/QTR4/index.json')), "daily index URL"
# The checks below call getDailyFList, which downloads through utils.downloadSecUrl,
# so they presumably need network access to the SEC site.
# A day with no master index file yields an empty list:
assert getDailyFList('20210531')==[],"daily list holiday test"
# A regular day: check the number of filings and the last parsed entry.
r = getDailyFList('20201201')
assert len(r)==3464 and r[-1]== ('97745', 'THERMO FISHER SCIENTIFIC INC.',
                                 '8-K', '20201201', '0000097745-20-000055'), "daily list regular day test"

### list index 64 HOLIDAY ### list index 61 count for 20201201: 3464 

Form classes (how to specify groups of form types):

In [None]:
#export

# Each value is either a string (isInFormClass treats it as a form type prefix)
# or a compiled regex (isInFormClass applies it with .match, i.e. anchored at the start).
namedFormClasses = {  # readable names for some groups of form types
    'ALL' : '',  # empty prefix - every form type starts with it
    'FINANCIAL' : re.compile('10-[KQ]',re.IGNORECASE),
    'ACTIVIST' : 'SC 13D',
    'FIVEPERCENT' : re.compile('SC 13[DG]',re.IGNORECASE),
    'INVESTOR' : '13F-HR',
    'ALLINVESTOR' : re.compile('13F-HR|SC 13[DG]',re.IGNORECASE),
    'INSIDER' : re.compile('4(?:/A)?$',re.IGNORECASE),  # exactly '4' or '4/A'
}

# Regex form class covering form types starting with SC 13D, SC 13G or 424.
noPeriodFormTypes = re.compile('SC 13[DG]|424',re.IGNORECASE)

def isInFormClass(formClass,formType) :
    """
    Tells whether formType belongs to formClass. formClass may be:
        - None or '' - every formType belongs
        - the name of an entry in namedFormClasses (case insensitive) - that entry is used
        - any other string - upper-cased, then formType must start with it
        - a compiled regex - formType must match it from the start (regex .match)
    """
    if formClass is None :
        return True
    if isinstance(formClass,str) :
        # strings are upper-cased, then swapped for the named class if there is one
        classKey = formClass.upper()
        formClass = namedFormClasses.get(classKey, classKey)
    if not isinstance(formClass,str) :
        # anything that is not a string by now is assumed to be a compiled regex
        return formClass.match(formType) is not None
    return formType.startswith(formClass)

Test form classes:

In [None]:
# None matches everything; a plain string is a prefix test on the form type.
assert ((isInFormClass(None,'10-Q'),isInFormClass('10','10-Q'),isInFormClass('4','10-Q'))
        == (True,True,False)), "string form classes"
# Named classes are looked up case insensitively in namedFormClasses.
assert ((isInFormClass('INVESTOR','13F-HR'),isInFormClass('investor','10-Q'))
        == (True,False)), "names form classes"
# A compiled regex is applied with .match.
assert ((isInFormClass(noPeriodFormTypes,'SC 13D'),isInFormClass(noPeriodFormTypes,'10-Q'))
        == (True,False)), "regex form class"

The dailyList class maintains a trio of dicts:
```
    dl: dateStr -> list of filings [(cik, formType, accNo, fDate), ... ]
    cikNames: cik -> (name, latestDateStr)
    cikOldNames: cik -> [name1, name2, ...]
```
using the SEC's archived daily list of filings.

In [None]:
#export 

def findCikName(cikName,oldNames) :
    """
    Looks for a CIK name in a list of old names, comparing case-insensitively
    (using casefold). Returns the index of the first match, or -1 if there is none.
    """
    target = cikName.casefold()
    matches = (pos for pos,candidate in enumerate(oldNames)
               if candidate.casefold() == target)
    return next(matches, -1)

def checkMapDates(dMap, verbose=True) :
    """
    Prints info on the dates present in a map keyed by date string, checking for missing dates.
    Prints the first and last date, then (if verbose) each date string produced by
    utils.dateStrsBetween(first, last) that is not a key of dMap, then a summary line
    with the number of dates present and the number missing.
    An empty map just prints the summary line (there is no date range to check).
    """
    if len(dMap) == 0 :
        # min()/max() would raise ValueError on an empty map
        print('total of 0 dates, 0 missing')
        return
    startD = min(dMap.keys())
    endD = max(dMap.keys())
    print(f'start date: {startD}, end date: {endD}')
    nNotPresent = 0
    for dStr in utils.dateStrsBetween(startD,endD) :
        if dStr not in dMap :
            nNotPresent += 1
            if verbose :
                print(dStr,'not present!')
    print(f'total of {len(dMap)} dates, {nNotPresent} missing')

class dailyList(object) :
    """
    Maintains a trio of dicts built from the SEC's archived daily list of filings:
        self.dl: dateStr -> list of filings [(cik, formType, accNo, fileDate), ... ]
        self.cikNames: cik -> (name, latestDateStr)
        self.cikOldNames: cik -> [oldname1, oldname2, ... ]
    and loads/saves them as pickle files in self.dlDir.
    """
    def __init__(self, dlDir=defaultDLDir, startD=None, endD=None, fSuff='m.pkl', **pickle_kwargs) :
        """
        Creates a dailyList object and loads lists for the date range [startD..endD), along with
        the full CIK name maps. By default (startD=None, endD=None) loads all dates present.
        Use startD='empty' to create an empty object.
        fSuff and pickle_kwargs are passed on to the utils pickle load/save helpers.
        """
        self.dlDir = dlDir
        self.fSuff = fSuff
        self.pickle_kwargs = dict(pickle_kwargs)
        self.dl = {}
        if startD=='empty' :
            self.cikNames, self.cikOldNames = {}, {}
            return
        self.loadDays(startD=startD, endD=endD)
        self.cikNames = utils.loadPklFromDir(dlDir, 'cikNames.pkl', {}, **pickle_kwargs)
        self.cikOldNames = utils.loadPklFromDir(dlDir, 'cikOldNames.pkl', {}, **pickle_kwargs)
    def loadDays(self, startD=None, endD=None) :
        """
        Loads lists for the given date range into an already created dailyList object.
        Days loaded replace any lists already held for the same dates.
        """
        self.dl.update(utils.loadSplitPklFromDir(self.dlDir, startK=startD, endK=endD,
                                                 fSuff=self.fSuff, **self.pickle_kwargs))
    def save(self, dirtySet=None) :
        """
        Saves daily lists and name maps to self.dlDir.
        By default just saves days with no list already present.
        See utils.saveSplitPklToDir for other possibilities based on the dirtySet argument.
        """
        utils.saveSplitPklToDir(self.dl, self.dlDir, dirtySet=dirtySet, fSuff=self.fSuff, **self.pickle_kwargs)
        utils.savePklToDir(self.dlDir, 'cikNames.pkl', self.cikNames, **self.pickle_kwargs)
        utils.savePklToDir(self.dlDir, 'cikOldNames.pkl', self.cikOldNames, **self.pickle_kwargs)
    def updateCikNamesFromEntry(self, dStr, cik, cikName) :
        """
        Updates the name maps
            self.cikNames: cik -> (name, latestDateStr)
            self.cikOldNames: cik -> [oldname1, oldname2, ... ]
        to reflect an entry (cik, cikName, ... ) from the daily index for dStr.
        Names are compared case insensitively (casefold).
        """
        if cik not in self.cikNames : # completely new name
            self.cikNames[cik] = (cikName, dStr)
            return
        # make sure self.cikNames contains the latest name
        if dStr < self.cikNames[cik][1] :
            # name in self.cikNames is newer than this entry
            oldName = cikName
        else :
            # this entry is newer than the name in self.cikNames - update it
            oldName = self.cikNames[cik][0]
            self.cikNames[cik] = (cikName, dStr)
        curName = self.cikNames[cik][0]
        if curName.casefold() == oldName.casefold() :
            # new and old names are the same (case insensitive)
            return
        # oldName is different from curName - update self.cikOldNames:
        if cik not in self.cikOldNames :
            self.cikOldNames[cik] = [oldName]
            return
        # add the old name if it's not in the list of old names:
        oldNames = self.cikOldNames[cik]
        if findCikName(oldName, oldNames) < 0 :
            oldNames.append(oldName)
        # delete the current name if it's in the list of old names:
        i = findCikName(curName, oldNames)
        if i >= 0 :
            del oldNames[i]
    def updateDayUsingL(self, dStr, dailyL, clearDay=True) :
        """
        Adds the filings in dailyL, a list in the format returned by getDailyFList:
            [(cik, cikName, formType, fileDate, accNo), ... ]
        to the list for dStr, and updates the CIK name maps from each entry.
        If clearDay is True the existing list for dStr (if any) is discarded first;
        otherwise the filings are appended to it.
        """
        if clearDay or dStr not in self.dl :
            # also start a fresh list when appending to a day not present yet,
            # rather than failing with a KeyError
            self.dl[dStr] = []
        dayL = self.dl[dStr]
        for cik, cikName, formType, fileDate, accNo in dailyL :
            dayL.append((cik, formType, accNo, fileDate))
            self.updateCikNamesFromEntry(dStr, cik, cikName)
    def checkAgainstMaster(self, year=None, quarter=None, fixMissingDate=False,
                           returnMissing=False) :
        """
        Checks this list against the SEC combined master list - the one for the given
        year and quarter, or the top-level full-index master.idx if year is None.
        If returnMissing is True, just returns the list of missing filings
        (filings in master but not in this list), in the format
            [(cik, cikName, formType, fileDate, accNo), ... ]
        Otherwise returns True if no missing filings.
        If missing filings are found:
        - if fixMissingDate is False, returns False.
        - otherwise tries to fix this list by adding the missing filings and saving it,
          and returns True if successful. This is only done if all the missing filings
          have the same file date, and that date is already present in self.dl.
        """
        if year is None :
            url = '/Archives/edgar/full-index/master.idx'
        else :
            url = f'/Archives/edgar/full-index/{year}/QTR{quarter}/master.idx'
        masterL = downloadSecFormList(url)
        allAccNos = self.getAllAccNos()
        print('checking against master ...')
        # entries are (cik, cikName, formType, fileDate, accNo)
        missingL = [tup for tup in masterL if tup[-1] not in allAccNos]
        if returnMissing :
            return missingL
        if len(missingL) == 0 :
            print('no missing filings found!')
            return True
        missingFDates = sorted(set(tup[-2] for tup in missingL))
        print(len(missingL),'missing filings found, fDates',missingFDates)
        print('fTypes',sorted(set(tup[2] for tup in missingL)))
        print('accNos[:50]',sorted(set(tup[-1] for tup in missingL))[:50])
        if not fixMissingDate :
            print('*** RUN WITH fixMissingDate=True TO FIX ***')
            return False
        if len(missingFDates) != 1 :
            print('unable to fix missing dates - ambiguous')
            return False
        dStr = missingFDates[0]
        if dStr not in self.dl :
            print('unable to fix missing dates - unexpected day, not in daily map')
            return False
        print('adding',len(missingL),'entries to',dStr)
        self.updateDayUsingL(dStr, missingL, clearDay=False)
        self.save(dirtySet={dStr})
        return True
    def updateForDays(self, startD=None, endD=None) :
        """
        Update to reflect the filings for dates between startD (inclusive)
        and endD (exclusive). If startD is None, uses the last date already
        in self.dl, or the start of the current year if self.dl is empty.
        If endD is None, uses today.
        Retrieves the filings info from the SEC daily index using getDailyFList.
        Dates already in self.dl are skipped, and weekend dates get an empty list
        without any download. Dates are processed from the latest to the earliest.
        """
        if startD is None :
            if len(self.dl) == 0 :
                startD = utils.toDateStr()[:4]+'0101' # start of current year
            else :
                startD = max(self.dl.keys())
        # shared across days so each quarter's index listing is fetched only once
        listIndexCache = {}
        for dStr in reversed(utils.dateStrsBetween(startD,endD)) :
            if dStr in self.dl :
                print('SKIP'+dStr, end=' ')
            elif utils.isWeekend(dStr) :
                self.dl[dStr] = []
                print('WEEKEND'+dStr, end=' ')
            else :
                print('UPDATE'+dStr, end=' ')
                self.updateDayUsingL(dStr, getDailyFList(dStr,listIndexCache))
                print('*',end=' ')
    def getAllFormTypes(self) :
        "Returns the set of all form types found."
        res = set()
        for l in self.dl.values() :
            res.update(formType for _,formType,_,_ in  l)
        return res
    def getCiksFiling(self, formClass=None) :
        "Returns the set of all ciks who have filed any forms in formClass (see isInFormClass)."
        res = set()
        for l in self.dl.values() :
            res.update(cik for cik,formType,_,_ in l
                       if isInFormClass(formClass,formType))
        return res
    def getAllAccNos(self) :
        "Returns the set of accession numbers of all filings found."
        res = set()
        for l in self.dl.values() :
            res.update(accNo for _,_,accNo,_ in l)
        return res
    def getFilingsList(self, ciks=None, formClass=None) :
        """
        Returns a list of filings for the given ciks (may be a set or None for all CIKs),
        that are in formClass. The list is in the form:
            [(dlDate, name, formType, accNo, fileDate),
             ... ]
        and is sorted lexicographically on those fields, except in reverse order on dlDate
        (most recent dates appear first). The name is the latest name in self.cikNames.
        Also returns a dict mapping each cik -> the set of accession numbers for its filings.
        """
        res = []
        accNosByCik = collections.defaultdict(set)
        for dDate,l in self.dl.items() :
            for cik, formType, accNo, fileDate in l :
                if (ciks is None or cik in ciks) and isInFormClass(formClass,formType):
                    res.append((dDate,self.cikNames[cik][0],formType,accNo,fileDate))
                    accNosByCik[cik].add(accNo)
        # full lexicographic sort, then a stable sort on the date alone in reverse,
        # so that entries within each date keep their ascending order
        res.sort()
        res.sort(key = lambda x : x[0], reverse=True)
        return res, accNosByCik
    def restrictedCikNameMap(self, ciks) :
        "Create a dict : cik -> latest name restricted to a given list or set of ciks."
        return dict((cik,self.cikNames[cik][0]) for cik in ciks)
    def checkDates(self, verbose=True) :
        "Prints info on dates present, checking for missing dates."
        checkMapDates(self.dl, verbose=verbose)

def dlCountFilings(dlDir=defaultDLDir, startD=None, endD=None, ciks=None,
                   formClass=None, noAmend=False,
                   fSuff='m.pkl', **pickle_kwargs) :
    """
    Convenience function to count number of filings in a date range, optionally restricting
    to ciks and formClass (anything accepted by isInFormClass: None, a named form class,
    a form type prefix, or a compiled regex).
    If noAmend is True, filings whose form type contains a '/' (amendments such as
    10-K/A or 13F-HR/A) are not counted. In that case a string formClass that is not
    a named form class is interpreted as a case insensitive regex, as it has always been.
    """
    dl = dailyList(dlDir=dlDir, startD=startD, endD=endD, fSuff=fSuff, **pickle_kwargs)
    if (noAmend and isinstance(formClass,str)
            and formClass.upper() not in namedFormClasses) :
        # backward compatibility: with noAmend, plain strings were compiled as regexes
        formClass = re.compile(formClass,re.IGNORECASE)
    fList,_ = dl.getFilingsList(ciks=ciks, formClass=formClass)
    if noAmend :
        # Filter on the form type itself (fList entries are
        # (dlDate, name, formType, accNo, fileDate)) rather than bolting a lookahead
        # onto formClass: this works for None, named classes and compiled regexes, and
        # also drops amendments whose form type extends the requested prefix.
        fList = [filing for filing in fList if '/' not in filing[2]]
    return len(fList)

def loadAndUpdateDL(dlDir=defaultDLDir, startD=None, endD=None, uStartD=None, uEndD=None,
                    fSuff='m.pkl', dirtySet=None, **pickle_kwargs) :
    """
    Load, update and save a dailyList in one step; returns the dailyList object.

    Loading: lists for the date range [startD..endD) plus the full CIK name maps are read
    from dlDir. With the defaults (startD=None, endD=None) all dates present are loaded;
    startD='empty' starts from an empty dailyList instead.

    Updating: filings for dates from uStartD (inclusive) to uEndD (exclusive) are added.
    If uStartD is None, the last date already in the loaded dailyList is used, or the start
    of the current year if nothing was loaded. If uEndD is None, today is used.

    Saving: the result is saved back to dlDir, with dirtySet passed on to dailyList.save.

    To initialize a dailyList for a date range starting from an empty directory:
        dl = loadAndUpdateDL(dlDir=emptyDir, startD='empty', uStartD=drangeStart, uEndD=drangeEnd)
    To update a dailyList in an existing directory up to and including yesterday:
        dl = loadAndUpdateDL(dlDir=existingDir)
    """
    loadedDL = dailyList(dlDir=dlDir, startD=startD, endD=endD,
                         fSuff=fSuff, **pickle_kwargs)
    loadedDL.updateForDays(startD=uStartD, endD=uEndD)
    loadedDL.save(dirtySet=dirtySet)
    return loadedDL

In [None]:
# hide
# # checking for missing filings in old quarter master list (Q4 2021)
# # these seem to be specific types (DRS, EFFECT, etc)
# dl = dailyList(startD='20211001',endD='20220101')
# dl.checkAgainstMaster(2021,4)

3081 missing filings found, fDates ['20211001', '20211004', '20211005', '20211006', '20211007', '20211008', '20211011', '20211012', '20211013', '20211014', '20211015', '20211018', '20211019', '20211020', '20211021', '20211022', '20211024', '20211025', '20211026', '20211027', '20211028', '20211029', '20211101', '20211102', '20211103', '20211104', '20211105', '20211108', '20211109', '20211110', '20211111', '20211112', '20211115', '20211116', '20211117', '20211118', '20211119', '20211122', '20211123', '20211124', '20211126', '20211129', '20211130', '20211201', '20211202', '20211203', '20211206', '20211207', '20211208', '20211209', '20211210', '20211213', '20211214', '20211215', '20211216', '20211217', '20211220', '20211221', '20211222', '20211223', '20211227', '20211228', '20211229', '20211230', '20211231']
fTypes ['19B-4E', '8-M', 'ABS-EE', 'CFPORTAL', 'CFPORTAL-W', 'CFPORTAL/A', 'CORRESP', 'D/A', 'DOS', 'DOS/A', 'DRS', 'DRS/A', 'DRSLTR', 'EFFECT', 'FOCUSN', 'FOCUSN/A', 'MA-A', 'MA-W', '

In [None]:
# hide
# # fixing missing filings from a specific day (20220804) in the SEC daily index
# # using the current quarter master index:
# dl = dailyList(startD='20220701') # start of current quarter
# dl.checkAgainstMaster()  # check missing fDates and filings, then run next line:
# # dl.checkAgainstMaster(fixMissingDate=True)  # adds the missing filings to that day and saves

36 missing filings found, fDates ['20220804']
fTypes ['10-Q', '13F-HR', '3', '4', '8-K', 'D', 'DEFA14A', 'POS EX']
accNos[:50] ['0000709283-22-000033', '0000733269-22-000032', '0000899243-22-027699', '0000950170-22-014749', '0001062993-22-017282', '0001069258-22-000053', '0001104659-22-086131', '0001171843-22-005361', '0001171843-22-005362', '0001174947-22-000922', '0001193125-22-212520', '0001209191-22-044415', '0001213900-22-044758', '0001287032-22-000254', '0001289636-22-000026', '0001370880-22-000033', '0001393311-22-000028', '0001443646-22-000155', '0001500435-22-000045', '0001558370-22-012193', '0001604028-22-000051', '0001606366-22-000048', '0001606587-22-001606', '0001628280-22-021070', '0001654954-22-010601', '0001683168-22-005337', '0001683168-22-005338', '0001706946-22-000124', '0001759655-22-000111', '0001939971-22-000001', '0001941353-22-000001']


Test dailyList class:

In [None]:
# These checks call updateForDays, which retrieves the daily index through getDailyFList,
# so they presumably need network access to the SEC site.
# Start from an empty dailyList and fetch a single day:
tdl = dailyList(startD='empty')
tdl.updateForDays('20210611','20210612')
assert (len(tdl.dl)==1
        and len(tdl.dl['20210611'])==4263
        and len(tdl.cikNames)==2821
        and tdl.dl['20210611'][0]==('1000045', '8-K', '0001564590-21-032560', '20210611')
       ), "initializing daily list"

# Extend the range: the day already present is skipped, the two weekend days
# get empty lists, and the following Monday is fetched.
tdl.updateForDays('20210611','20210615')
assert (len(tdl.dl)==4
        and [len(tdl.dl[dStr]) for dStr in ['20210611','20210612','20210613','20210614']]==[4263,0,0,4379]
        and len(tdl.cikNames)==5063
        and tdl.dl['20210614'][0]==('1000230', '10-Q', '0001437749-21-014672', '20210614')
       ), "continuing daily list maps"

# Query helpers over the loaded days:
assert (len(tdl.getAllFormTypes())==157
        and len(tdl.getCiksFiling())==5063
        and len(tdl.getCiksFiling('financial'))==110
        and min(tdl.getCiksFiling('financial'))=='1000230'), "checking daily list filings/CIKs"

20210611 ### list index 64 filings for 20210611: 4263 * 20210614 ### list index 64 filings for 20210614: 4379 * 20210613 WEEKEND 20210612 WEEKEND SKIP20210611 

In [None]:
#hide
# uncomment and run to regenerate all library Python files
# from nbdev.export import notebook2script; notebook2script()