In [None]:
# default_exp recentFeed

# recentFeed

> Parse the SEC's recent filings feed.

In [None]:
#hide
%load_ext autoreload
%autoreload 2
from nbdev import show_doc

In [None]:
#export

import re
import xml.etree.cElementTree as cElTree

from secscan import utils

In [None]:
#export

def secMostRecentListUrl(count=100) :
    """
    Returns the URL for the SEC's atom-format feed of most recent filings.

    The returned value is the path-and-query part of the URL (it starts with
    '/cgi-bin/'). A 'count=<count>&' query parameter is included unless count is None.
    """
    urlParts = ['/cgi-bin/browse-edgar?']
    if count is not None :
        urlParts.append(f'count={count}&')
    urlParts.append('action=getcurrent&output=atom')
    return ''.join(urlParts)

def printXmlParseWarning(msg,el) :
    """
    Prints a three-line warning: msg wrapped in '***' markers, the serialized
    form of the XML element el, and a closing row of asterisks.
    """
    banner = ' '.join(('***', str(msg), '***'))
    print(banner)
    # tostring is called with its defaults, so its return value is printed as-is
    print(cElTree.tostring(el))
    print('************************')

# Pattern for the text of a feed entry's title element, used with .match()
# (anchored at the start of the text). Capture groups, in order:
#   formType - everything before the first whitespace-then-dash separator
#   cikName  - the text between that dash and the parenthesized number
#   cik      - exactly ten digits enclosed in parentheses
# Anything after the closing parenthesis is ignored.
titlePat = re.compile(
        r"\s*(.+?)\s+-" # formType, ignoring surrounding whitespace
        + r"\s+(.+?)\s*" # cikName, ignoring surrounding whitespace
        + r"\((\d{10})\)") # cik
# Pattern for the text of a feed entry's summary element, used with .search().
# Capture groups, in order:
#   the date following "filed": YYYY MM DD with an optional '-' or '/' between parts
#   the accession number following "accno", as defined by utils.accessNoPatStr
# Matching is case-insensitive. Each keyword must be followed by at least one
# non-digit character and then whitespace before its value, and each value must
# be followed by whitespace. DOTALL is not set, so the ".*" between the two
# fields does not cross a newline.
filedPat = re.compile(
        r"filed\D+?\s(\d\d\d\d[-/]?\d\d[-/]?\d\d)\s.*"
        + r"accno\D+?\s("+utils.accessNoPatStr+r")\s",
        re.IGNORECASE)
def getRecentChunk(count=100) :
    """
    Parses the SEC's atom-format feed of most recent filings and returns a list of tuples:
        [(fileDate, cikName, accNo, formType, cik),
         ... ]
    with the most recent filings first

    count is passed to secMostRecentListUrl to build the feed URL, which is
    fetched with utils.downloadSecUrl. Tuples are returned in the order their
    entries appear in the feed, and cik has its leading zeros stripped.

    An entry is left out of the result if any of its five fields is missing or
    empty. A warning is printed (via printXmlParseWarning) whenever a title or
    summary element is present but cannot be parsed, including when the element
    has no text at all.
    """
    mrListXml = cElTree.fromstring(utils.downloadSecUrl(secMostRecentListUrl(count=count)))
    res = []
    for listEntry in mrListXml :
        # Tags are compared by suffix, so a namespace prefix on the tag is ignored.
        if not listEntry.tag.lower().endswith("entry") :
            continue
        cik = formType = accNo = fDate = cikName = None
        for entryItem in listEntry :
            itemTag = entryItem.tag.lower()
            if itemTag.endswith('title') :
                # Element.text is None for an empty element; use '' so this is
                # reported as a parse failure instead of raising TypeError in re.
                m = titlePat.match(entryItem.text or '')
                if m is None :
                    printXmlParseWarning('unable to parse title element',listEntry)
                    continue
                formType,cikName,cik = m.groups()
                cik = cik.lstrip('0')
            elif itemTag.endswith('summary') :
                # Same None-text guard as for the title element.
                m = filedPat.search(entryItem.text or '')
                if m is None :
                    printXmlParseWarning('unable to parse summary element',listEntry)
                    continue
                fDate,accNo = m.groups()
        fTup = (fDate, cikName, accNo, formType, cik)
        # Keep only fully-populated entries; all() also rejects empty strings
        # (e.g. a cik that was all zeros before stripping).
        if all(fTup) :
            res.append(fTup)
    return res

Test downloading and parsing the recent filings feed:

In [None]:
# Live check: downloads the current feed through getRecentChunk (default count=100),
# prints a sample of the parsed tuples, and requires that all 100 requested
# entries were parsed into complete tuples.
l = getRecentChunk()
utils.printSamp(l,5)
assert len(l)==100, 'parsing recent SEC filings feed'

0 ('2021-06-08', 'Ajila Rohan', '0001104659-21-078439', '3', '1864266')
1 ('2021-06-08', 'Global Consumer Acquisition Corp', '0001104659-21-078439', '3', '1846288')
2 ('2021-06-08', 'Pai Gautham', '0001104659-21-078438', '3', '1863979')
3 ('2021-06-08', 'Global Consumer Acquisition Corp', '0001104659-21-078438', '3', '1846288')
4 ('2021-06-08', 'Clausen Tom', '0001104659-21-078437', '3', '1864275')


Accumulating the recent filings feed to an S3 bucket:

In [None]:
#export

def curEasternTimeStampAndDate() :
    """
    Returns a tuple (now, timeStamp, date) based on utils.curEasternUSTime():
        now       - the object returned by utils.curEasternUSTime()
        timeStamp - first 19 characters of its ISO format with 'T' replaced
                    by a space, i.e. 'YYYY-MM-DD HH:MM:SS'
        date      - first 10 characters of the same string, i.e. 'YYYY-MM-DD'
    """
    easternNow = utils.curEasternUSTime()
    isoText = easternNow.isoformat().replace('T',' ')
    timeStamp, dateStr = isoText[:19], isoText[:10]
    return easternNow, timeStamp, dateStr

def initRecentFeedS3(bucket, prevDay=None) :
    """
    Creates (or overwrites) 'today-feed.pkl' in bucket with an empty feed for
    the current Eastern-time day:
        {'updated': <current timestamp>, 'filings': set(),
         'curDay': <current date>, 'prevDay': prevDay}

    prevDay, if given, is stored as the feed's previous day. updateRecentFeedS3
    loads '<prevDay>-feed.pkl' from the bucket when prevDay is not None, so it
    should be the 'YYYY-MM-DD' date of a feed already saved there. The default
    of None records that there is no previous day.
    """
    _, curTS, today = curEasternTimeStampAndDate()
    # Store the caller's prevDay rather than a hard-coded None, so the argument takes effect.
    utils.pickSaveToS3(bucket, 'today-feed.pkl',
                       {'updated':curTS, 'filings':set(), 'curDay':today, 'prevDay':prevDay},
                       use_gzip=True, make_public=True, protocol=2)

def updateRecentFeedS3(bucket, skipOffHours=True) :
    """
    Fetches the recent filings feed and merges today's filings into the
    'today-feed.pkl' object stored in bucket, printing progress as it goes.

    If skipOffHours is true, nothing is fetched or saved on weekends
    (per utils.isWeekend), before 06:00, or after 22:10 Eastern time.

    When the stored feed's 'curDay' differs from the current date, the stored
    feed is first saved as '<curDay>-feed.pkl' and a new empty feed is started
    with that day recorded as 'prevDay'.

    Each feed tuple is (fileDate, cikName, accNo, formType, cik). Tuples dated
    today are stored in the 'filings' set without the date. Tuples dated the
    previous day that are not already in that day's filings, and tuples with
    any other date, are only reported on stdout; they are not stored.
    """
    nowET, stamp, todayStr = curEasternTimeStampAndDate()
    print('updating at', stamp, end='; ')
    if skipOffHours :
        # (hour, minute) > (22, 10) covers both "hour after 22" and "22:11 or later"
        offHours = (utils.isWeekend(nowET)
                    or nowET.hour < 6
                    or (nowET.hour, nowET.minute) > (22, 10))
        if offHours :
            print('SEC off hours, skipping update')
            return
    feedEntries = getRecentChunk()
    feed = utils.pickLoadFromS3(bucket, 'today-feed.pkl', use_gzip=True)
    print('last update', feed['updated'])
    lastDay = feed['curDay']
    if todayStr != lastDay :
        # Day rollover: archive the stored feed under its own date, then start fresh.
        print('starting new day; last day found was',lastDay)
        utils.pickSaveToS3(bucket, lastDay+'-feed.pkl', feed,
                           use_gzip=True, make_public=True, protocol=2)
        olderFilings, olderDay = feed['filings'], lastDay
        feed = {'filings':set(), 'curDay':todayStr, 'prevDay':olderDay}
    elif feed['prevDay'] is None :
        print('continuing current day; no previous day found')
        olderFilings, olderDay = set(), None
    else :
        print('continuing current day; most recent previous day was',feed['prevDay'])
        olderFeed = utils.pickLoadFromS3(bucket, feed['prevDay']+'-feed.pkl', use_gzip=True)
        olderFilings, olderDay = olderFeed['filings'], olderFeed['curDay']
    todaysFilings = feed['filings']
    addedCount = 0
    for entry in feedEntries :
        filedOn, filing = entry[0], entry[1:]
        if filedOn == todayStr :
            if filing not in todaysFilings :
                todaysFilings.add(filing)
                addedCount += 1
        elif filedOn != olderDay :
            print('*** unexpected filing date',entry)
        elif filing not in olderFilings :
            # reported only; the archived previous-day feed is not modified
            print('*** new filing from previous day',entry)
    print(len(feedEntries), 'feed filings,', addedCount, 'new, total now',len(todaysFilings))
    feed['updated'] = stamp
    utils.pickSaveToS3(bucket, 'today-feed.pkl', feed,
                       use_gzip=True, make_public=True, protocol=2)
    print('--- update complete at',curEasternTimeStampAndDate()[1])

def getRecentFromS3(bucket, key='today') :
    """
    Loads and returns the feed object stored in bucket under '<key>-feed.pkl',
    using utils.pickLoadFromS3 with gzip enabled. The default key 'today' names
    the feed maintained by updateRecentFeedS3; a 'YYYY-MM-DD' key names the
    feed saved for that day.
    """
    objectName = key + '-feed.pkl'
    return utils.pickLoadFromS3(bucket, objectName, use_gzip=True)

def getRecentFromS3Public(bucket, key='today') :
    """
    Same as getRecentFromS3, but loads '<key>-feed.pkl' through
    utils.pickLoadFromS3Public (presumably the bucket's public access path -
    confirm against utils).
    """
    objectName = key + '-feed.pkl'
    return utils.pickLoadFromS3Public(bucket, objectName, use_gzip=True)

In [None]:
# initRecentFeedS3('bucket_name')

In [None]:
# r = getRecentFromS3Public('bucket_name')
# print(len(r['filings']))
# utils.printSamp(sorted(r['filings']))

In [None]:
#hide
# uncomment and run to regenerate all library Python files
# from nbdev.export import notebook2script; notebook2script()