In [None]:
# default_exp scrape8K

# scrape8K

> Scrape item summaries from 8-K SEC filings.

In [None]:
#hide
%load_ext autoreload
%autoreload 2
from nbdev import show_doc

In [None]:
#export

import collections
import itertools
import os
import re

from secscan import utils, dailyList, basicInfo, infoScraper

# Default infoDir for scraper8K: the 'scraped8K' subdirectory of utils.stockDataRoot.
default8KDir = os.path.join(utils.stockDataRoot,'scraped8K')

8-K scraper class - scrapes the item summaries from an SEC filing:

In [None]:
#export

# Matches the item number (e.g. "2.02") at the start of an item header string.
itemPat = re.compile(r'item\s*(\d+(?:\.\d*)?)',re.IGNORECASE)
# Matches the start of an explanatory note in the filing text.
explanPat = re.compile(r'explanatory\s*note',re.IGNORECASE)
def parse8K(accNo, formType=None, textLimit=basicInfo.defaultTextLimit) :
    """
    Parses the 8-K filing with accession number accNo and returns the info dict
    obtained from basicInfo.getSecFormInfo (requested with get99=True), extended with:

    - info['itemTexts']: one string per entry of info['items']. For an item whose
      header is located in the main document text, this is the text starting at
      that header and running up to the next located header (or the end of the text),
      capped at textLimit characters. For an item whose header can't be parsed
      or located, this is the item string followed by ' ???'.
    - info['explanatoryNote']: only set for form type 8-K/A, and only if the main text
      contains an explanatory note; holds up to textLimit characters starting at the note.

    If formType is None, it is taken from the first entry of info['links'].
    The info dict is returned without these additions if info['links'] is empty
    (after calling utils.printErrInfoOrAccessNo) or if there are no items.
    """
    info = basicInfo.getSecFormInfo(accNo, formType=formType, get99=True, textLimit=textLimit)
    links = info['links']
    if len(links) == 0 :
        utils.printErrInfoOrAccessNo('NO LINKS LIST in',accNo)
        return info
    if formType is None :
        formType = links[0][2]
    items = info.get('items',[])
    if len(items) == 0 :
        return info
    mainText = utils.downloadSecUrl(links[0][3], toFormat='souptext')
    if formType.lower() == '8-k/a' :
        m = explanPat.search(mainText)
        if m is not None :
            info['explanatoryNote'] = mainText[m.start():m.start()+textLimit]
    # itemPosL[0] is the initial search position; each located header appends its
    # absolute start offset, so headers are searched for in the order of the items list.
    itemPosL = [0]
    info['itemTexts'] = itemTexts = [None] * len(items)
    for i,item in enumerate(items) :
        m = itemPat.match(item)
        if m is None :
            utils.printErrInfoOrAccessNo(f"unexpected format for item header {item}",accNo)
            continue
        # Allow optional whitespace between the characters of the item number.
        # The trailing (?!\d) stops a shorter number from matching the prefix
        # of a longer one (e.g. "Item 1" inside "Item 12").
        itemNoPat = r'\s*'.join(re.escape(ch) for ch in m.group(1))
        headerPat = re.compile(r'item[\s\-\.]*' + itemNoPat + r'(?!\d)', re.IGNORECASE)
        # Searching with a start position avoids copying the tail of mainText
        # for every item, and the match offsets are already absolute.
        m = headerPat.search(mainText, itemPosL[-1])
        if m is None :
            utils.printErrInfoOrAccessNo(f"couldn't find {item}",accNo)
            continue
        itemPosL.append(m.start())
        itemTexts[i] = ''   # placeholder marking this item as located
    itemPosL.append(len(mainText))
    # Second pass: j walks through the located header offsets in step with
    # the items that were marked as located above.
    j = 1
    for i in range(len(itemTexts)) :
        if itemTexts[i] is None :
            itemTexts[i] = items[i] + ' ???'
        else :
            startPos, nextPos = itemPosL[j], itemPosL[j+1]
            itemTexts[i] = mainText[startPos : min(startPos+textLimit, nextPos)]
            j += 1
    return info

class scraper8K(infoScraper.scraperBase) :
    """
    Scraper for 8-K filings, built on infoScraper.scraperBase and using parse8K
    to extract the info for each filing.
    """
    @utils.delegates(infoScraper.scraperBase.__init__)
    def __init__(self, infoDir=default8KDir, **kwargs) :
        """
        Initializes the base scraper with the given infoDir and the form class '8-K';
        any other keyword arguments are passed through to scraperBase.__init__.
        """
        super().__init__(infoDir, '8-K', **kwargs)
    def scrapeInfo(self, accNo, formType=None) :
        """
        Returns a 2-tuple: the info dict from parse8K(accNo, formType), and None.
        """
        return parse8K(accNo, formType), None
    def getTextDigest(self, info) :
        """
        Builds a single space-joined string from the texts in info:
        the explanatory note (if present), then each non-blank item text,
        then each non-blank text in info['text99'], with each piece stripped
        and wrapped in START/END marker strings.
        """
        def bracketed(texts, opener, closer) :
            # Yields opener, stripped text, closer for every text that is non-empty after stripping.
            for text in texts :
                body = text.strip()
                if body :
                    yield from (opener, body, closer)
        digest = []
        if 'explanatoryNote' in info :
            # The note is included whenever the key is present, even if it strips to ''.
            digest += ['START NOTE.', info['explanatoryNote'].strip(), 'END NOTE.']
        digest.extend(bracketed(info.get('itemTexts',[]), 'START ITEM.', 'END ITEM.'))
        digest.extend(bracketed(info.get('text99',[]), 'START PRESS RELEASE.', 'END PRESS RELEASE.'))
        return ' '.join(digest)

Test 8-K scraper class:

In [None]:
# Daily list check: after updating for 20210701-20210704, the first element
# returned by getFilingsList(None,'8-K') should have 600 entries.
dl = dailyList.dailyList(startD='empty')
dl.updateForDays('20210701','20210704')
assert len(dl.getFilingsList(None,'8-K')[0])==600,"testing 8-K scraper class (daily list count)"

# Parsing check: item texts and the press release text for a known 8-K filing.
# The checks run in sequence, so the first failing one identifies the mismatch.
info = parse8K('0001165002-21-000068', formType='8-K', textLimit=1000)
assert info['itemTexts'][0].startswith(
    'ITEM 2.02: RESULTS OF OPERATIONS AND FINANCIAL CONDITION '
    'On July 27, 2021, Westwood'
),"testing 8-K scraper class (parsing)"
assert info['itemTexts'][0].endswith(
    'otherwise expressly stated in such filing. '
),"testing 8-K scraper class (parsing)"
assert info['itemTexts'][1].startswith(
    'ITEM 7.01: REGULATION FD DISCLOSURE Westwood'
),"testing 8-K scraper class (parsing)"
assert info['itemTexts'][1].endswith(
    'of record on August 6, 2021. '
),"testing 8-K scraper class (parsing)"
assert info['itemTexts'][2].startswith(
    'ITEM 9.01: FINANCIAL STATEMENTS AND EXHIBITS (d) '
),"testing 8-K scraper class (parsing)"
assert info['itemTexts'][2].endswith(
    'Financial Officer and Treasurer'
),"testing 8-K scraper class (parsing)"
assert info['text99'][1].startswith(
    'EX-99.1 2 a2q21earningsrelease.htm EX-99.1 '
    'Document Westwood Holdings Group, Inc. Reports'
),"testing 8-K scraper class (parsing)"
assert info['text99'][1].endswith(
    'High Income achieved a top decile ranking, Income Opportunity and Total Retur'
),"testing 8-K scraper class (parsing)"

# Explanatory note check for a known 8-K/A filing.
info = parse8K('0001606757-21-000040', formType='8-K/A', textLimit=1000)
assert info['explanatoryNote'].startswith(
    'Explanatory Note This Amendment No. 1'
),"testing 8-K scraper class (parsing explanatory note)"
assert info['explanatoryNote'].endswith(
    'Ms. Croom accepted a written offer '
),"testing 8-K scraper class (parsing explanatory note)"

WEEKEND20210703 UPDATE20210702 ### list index 64 count for 20210702: 6569 * UPDATE20210701 count for 20210701: 5573 * 

In [None]:
#hide
# uncomment and run to regenerate all library Python files
# from nbdev.export import notebook2script; notebook2script()