In [None]:
# default_exp recentFeed

# recentFeed

> Parse the SEC's recent filings feed, and XBRL filings feed.

In [None]:
#hide
%load_ext autoreload
%autoreload 2
from nbdev import show_doc

In [None]:
#export

import re
import xml.etree.cElementTree as cElTree

from secscan import utils

Download and parse the SEC's recent filings feed and its XBRL filings feed. Both parsers return a list of `(fileDate, name, accNo, formType, cik)` tuples:

In [None]:
#export

def secMostRecentListUrl(count=100) :
    """
    Builds the URL path of the SEC's atom-format feed of most recent filings.

    count is placed in the query string as `count=<count>&`; pass None to
    leave the count parameter out entirely.
    """
    urlParts = ['/cgi-bin/browse-edgar?']
    if count is not None :
        urlParts.append(f'count={count}&')
    urlParts.append('action=getcurrent&output=atom')
    return ''.join(urlParts)

def printXmlParseWarning(msg,el) :
    """
    Prints a parse warning: a banner line containing msg, then the XML
    serialization of element el, then a closing line of asterisks.
    """
    print('***',msg,'***')
    # encoding='unicode' makes tostring return str rather than bytes, so the
    # XML is printed as readable text instead of a b'...' repr with escaped newlines.
    print(cElTree.tostring(el, encoding='unicode'))
    print('************************')

# Matches the start of a feed entry's title text, which is expected to look like
# "<formType> - <cikName> (<10-digit cik>)"; groups are (formType, cikName, cik).
titlePat = re.compile(
        r"\s*(.+?)\s+-" # formType, ignoring surrounding whitespace
        + r"\s+(.+?)\s*" # cikName, ignoring surrounding whitespace
        + r"\((\d{10})\)") # cik
# Searched for within a feed entry's summary text (case-insensitive): captures the
# date following "filed" (YYYY-MM-DD, with '-' or '/' separators or none) and the
# accession number following "accno"; getRecentChunk unpacks the groups as (fDate, accNo).
filedPat = re.compile(
        r"filed\D+?\s(\d\d\d\d[-/]?\d\d[-/]?\d\d)\s.*"
        + r"accno\D+?\s("+utils.accessNoPatStr+r")\s",
        re.IGNORECASE)
def getRecentChunk(count=100) :
    """
    Parses the SEC's atom-format feed of most recent filings and returns a list of tuples:
        [(fileDate, cikName, accNo, formType, cik),
         ... ]
    in the order the entries appear in the feed (the feed is expected to list
    the most recent filings first).

    count is forwarded to secMostRecentListUrl. Leading zeros are stripped
    from cik. Entries whose title or summary cannot be parsed (including
    elements with no text) are reported with printXmlParseWarning and are
    left out of the result.
    """
    mrListXml = utils.downloadSecUrl(secMostRecentListUrl(count=count), toFormat='xml')
    res = []
    for listEntry in mrListXml :
        # Tags carry an XML namespace prefix, so match on the tag's suffix.
        if not listEntry.tag.lower().endswith("entry") :
            continue
        cik = formType = accNo = fDate = cikName = None
        for entryItem in listEntry :
            itemTag = entryItem.tag.lower()
            # An element with no content (e.g. <title/>) has text None; use ''
            # so the regex simply fails to match and the entry is reported and
            # skipped, rather than raising TypeError and aborting the whole parse.
            itemText = entryItem.text or ''
            if itemTag.endswith('title') :
                m = titlePat.match(itemText)
                if m is None :
                    printXmlParseWarning('unable to parse title element',listEntry)
                    continue
                formType,cikName,cik = m.groups()
                cik = cik.lstrip('0')
            elif itemTag.endswith('summary') :
                m = filedPat.search(itemText)
                if m is None :
                    printXmlParseWarning('unable to parse summary element',listEntry)
                    continue
                fDate,accNo = m.groups()
        fTup = (fDate, cikName, accNo, formType, cik)
        # Keep only entries for which every field was found and is non-empty.
        if all(fTup) :
            res.append(fTup)
    return res

# Path of the SEC's XBRL filings feed, as passed to utils.downloadSecUrl by getXbrlFeed.
secXbrlFeedUrl = '/Archives/edgar/xbrlrss.all.xml'
# Matches a complete MM/DD/YYYY (or MM-DD-YYYY) date string; groups are (month, day, year).
dateStrMMDDPat = re.compile(r"(\d\d)[/\-](\d\d)[/\-](\d\d\d\d)$")
def getXbrlFeed() :
    """
    Downloads the SEC's XBRL filings feed and returns a list of tuples:
        [(fileDate, companyName, accNo, formType, cik),
         ... ]
    in feed order. fileDate is converted from MM/DD/YYYY to YYYY-MM-DD and
    leading zeros are stripped from cik. An item that cannot be parsed is
    reported on stdout and left out of the result.
    """
    feedSoup = utils.downloadSecUrl(secXbrlFeedUrl, toFormat='soup')
    feedItems = feedSoup.find_all('item')
    fieldTags = ['filingdate','companyname','accessionnumber','formtype','ciknumber']
    filings = []
    for item in feedItems :
        try :
            fields = []
            for tag in fieldTags :
                fields.append(item.find('edgar:'+tag).string.strip())
            dateMatch = dateStrMMDDPat.match(fields[0])
            if dateMatch is None :
                raise Exception("MM/DD/YYYY format expected for filingdate")
            month, day, year = dateMatch.groups()
            fields[0] = year+'-'+month+'-'+day
            fields[4] = fields[4].lstrip('0')
            filings.append(tuple(fields))
        except Exception as e:
            # Best effort: report the bad item and carry on with the rest of the feed.
            print('**** ERROR',e)
            print('**** PARSING',item)
    return filings

Test downloading and parsing the SEC's recent filings feed and XBRL filings feed:

In [None]:
# Smoke test against the live feeds: both calls download from the SEC via
# utils.downloadSecUrl (presumably requires network access — TODO confirm).
l = getRecentChunk()
print('recent filings feed:')
utils.printSamp(l,5)
# getRecentChunk defaults to count=100, so a fully parsed feed should give 100 tuples.
assert len(l)==100, 'parsing SEC recent filings feed'
print()

lxbrl = getXbrlFeed()
print('XBRL filings feed:')
utils.printSamp(lxbrl,5)
# NOTE(review): expects exactly 200 parsed items — presumably the feed's fixed size; confirm.
assert len(lxbrl)==200, 'parsing SEC XBRL filings feed'

recent filings feed:
0 ('2022-01-14', 'Brannan Michael A.', '0001654954-22-000545', '4', '1807248')
1 ('2022-01-14', 'AEHR TEST SYSTEMS', '0001654954-22-000545', '4', '1040470')
2 ('2022-01-14', 'Burdick Kenneth A', '0001071739-22-000032', '3', '1498992')
3 ('2022-01-14', 'CENTENE CORP', '0001071739-22-000032', '3', '1071739')
4 ('2022-01-14', 'SPINK KENNETH B.', '0001654954-22-000544', '4', '1652627')

XBRL filings feed:
0 ('2022-01-14', 'Recro Pharma, Inc.', '0000950170-22-000333', '8-K/A', '1588972')
1 ('2022-01-14', 'CXJ GROUP CO., Ltd', '0001493152-22-001328', '10-Q', '1823635')
2 ('2022-01-14', 'CAMBER ENERGY, INC.', '0001477932-22-000282', '8-K', '1309082')
3 ('2022-01-14', 'Coursera, Inc.', '0001193125-22-010064', '8-K', '1651562')
4 ('2022-01-14', 'Breeze Holdings Acquisition Corp.', '0001564590-22-001386', '10-Q', '1817640')


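Accumulate the recent filings feed in an S3 bucket, keeping the current day's filings in `today-feed.pkl` and archiving each completed day as `<date>-feed.pkl`: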

In [None]:
#export

def curEasternTimeStampAndDate() :
    """
    Returns a tuple (nowET, timeStamp, dateStr) where nowET is the value of
    utils.curEasternUSTime(), timeStamp is the first 19 characters of its ISO
    format with 'T' replaced by a space (e.g. '2022-01-14 09:30:15'), and
    dateStr is the first 10 characters (e.g. '2022-01-14').
    """
    nowET = utils.curEasternUSTime()
    isoText = nowET.isoformat()
    timeStamp = isoText.replace('T',' ')[:19]
    dateStr = timeStamp[:10]
    return nowET, timeStamp, dateStr

def initRecentFeedS3(bucket, prevDay=None) :
    """
    Saves an empty feed for the current Eastern-time day to 'today-feed.pkl'
    in the given S3 bucket, overwriting any existing object under that key.

    prevDay, if given, is recorded as the feed's 'prevDay' (a YYYY-MM-DD
    string). updateRecentFeedS3 loads '<prevDay>-feed.pkl' from the bucket
    whenever 'prevDay' is not None, so that object should already exist.
    With the default of None no previous day is recorded.
    """
    _, curTS, today = curEasternTimeStampAndDate()
    # Store the caller's prevDay; previously the argument was ignored and
    # None was always written.
    utils.pickSaveToS3(bucket, 'today-feed.pkl',
                       {'updated':curTS, 'filings':set(), 'curDay':today, 'prevDay':prevDay},
                       use_gzip=True, make_public=True, protocol=2)

def updateRecentFeedS3(bucket, skipOffHours=True) :
    """
    Fetches the latest chunk of the recent filings feed and merges any filings
    not seen before into 'today-feed.pkl' in the given S3 bucket.

    If the stored feed belongs to an earlier day, it is first archived as
    '<thatDay>-feed.pkl' and a new empty feed is started for today. Filings
    already present in today's feed or in the previous day's feed are not
    added again. Progress is reported on stdout.

    With skipOffHours true, nothing is downloaded or saved on weekends.
    """
    nowET, curTS, today = curEasternTimeStampAndDate()
    print('updating at', curTS, end='; ')
    # Only weekends count as off hours; no time-of-day cutoff is applied.
    if skipOffHours and utils.isWeekend(nowET) :
        print('SEC off hours, skipping update')
        return
    recentFilings = getRecentChunk()
    feed = utils.pickLoadFromS3(bucket, 'today-feed.pkl', use_gzip=True)
    print('last update', feed['updated'])
    storedDay = feed['curDay']
    if today != storedDay :
        # Day rollover: archive the stored feed under its own date, then start afresh.
        print('starting new day; last day found was',storedDay)
        utils.pickSaveToS3(bucket, storedDay+'-feed.pkl', feed,
                           use_gzip=True, make_public=True, protocol=2)
        earlierFilings = feed['filings']
        feed = {'filings':set(), 'curDay':today, 'prevDay':storedDay}
    elif feed['prevDay'] is None :
        print('continuing current day; no previous day found')
        earlierFilings = set()
    else :
        print('continuing current day; most recent previous day was',feed['prevDay'])
        earlierFeed = utils.pickLoadFromS3(bucket, feed['prevDay']+'-feed.pkl', use_gzip=True)
        earlierFilings = earlierFeed['filings']
        earlierDay = earlierFeed['curDay']  # day recorded in the archived feed (not used below)
    todaysFilings = feed['filings']
    nFromPrevDay = 0
    nNewToday = 0
    nNewOtherDay = 0
    for filing in recentFilings :
        if filing in todaysFilings :
            continue
        if filing in earlierFilings :
            nFromPrevDay += 1
            continue
        todaysFilings.add(filing)
        filedOn = filing[0]
        if filedOn == today :
            nNewToday += 1
            continue
        # New filing dated some other day: still recorded, but flagged.
        nNewOtherDay += 1
        if filedOn < today :
            print('*** old filing date',filing)
        else :
            print('*** unexpected future filing date',filing)
    print(len(recentFilings),'filings,',
          nFromPrevDay,'from prev day,',nNewToday,'new fToday,',nNewOtherDay,'new fOther,',
          'total now',len(todaysFilings))
    feed['updated'] = curTS
    utils.pickSaveToS3(bucket, 'today-feed.pkl', feed,
                       use_gzip=True, make_public=True, protocol=2)
    print('--- update complete at',curEasternTimeStampAndDate()[1])

def getRecentFromS3(bucket, key='today') :
    """
    Loads the feed stored under '<key>-feed.pkl' in the given S3 bucket via
    utils.pickLoadFromS3. key defaults to 'today'; updateRecentFeedS3 archives
    earlier days under their date string (e.g. '2022-01-13').
    """
    feedKey = key + '-feed.pkl'
    return utils.pickLoadFromS3(bucket, feedKey, use_gzip=True)

def getRecentFromS3Public(bucket, key='today') :
    """
    Loads the feed stored under '<key>-feed.pkl' in the given S3 bucket via
    utils.pickLoadFromS3Public. key defaults to 'today'.
    """
    # NOTE(review): presumably this unpickles the downloaded object — only use
    # with buckets whose contents are trusted; confirm against utils.
    feedKey = key + '-feed.pkl'
    return utils.pickLoadFromS3Public(bucket, feedKey, use_gzip=True)

In [None]:
#hide
# initRecentFeedS3('bucket_name')
# updateRecentFeedS3('bucket_name')

# r = getRecentFromS3Public('bucket_name')
# print(len(r['filings']))
# utils.printSamp(sorted(r['filings']))

In [None]:
#hide
# uncomment and run to regenerate all library Python files
# from nbdev.export import notebook2script; notebook2script()