In [None]:
# default_exp scrape6K

# scrape6K

> Scrape item summaries from 6-K SEC filings.

In [None]:
#hide
%load_ext autoreload
%autoreload 2
from nbdev import show_doc

In [None]:
#export

import collections
import itertools
import os
import re

from secscan import utils, dailyList, basicInfo, infoScraper

# Default directory for scraped 6-K info: the 'scraped6K' subdirectory of
# utils.stockDataRoot. Used as the default infoDir argument of scraper6K.
default6KDir = os.path.join(utils.stockDataRoot,'scraped6K')

6-K scraper class - scrape text from the first sublink and from any EX-99 links in the SEC filing:

In [None]:
#export

# Regular expressions tried in order by parse6K against the text of the main
# 6-K document. Group 1 of the first pattern that yields more than 20
# characters (after stripping) is used as the filing's main text.
# All patterns are case-insensitive.
extract6KPats = [
    # Text following a "12g ... no" passage (optionally skipping an
    # "... is marked ... 82." passage), up to the last "signature"/"pursuant".
    re.compile(r'12g.*?no(?:.{1,20}is marked.{1,100}82.)?(.*)(?:signature|pursuant)',re.IGNORECASE),
    # Text following "101(b)(7)" (optionally skipping a "note: ... on edgar." passage).
    re.compile(r'101\s*\(b\)\s*\(7\)(?:.{1,20}note\s*:.*?on edgar.)?(.*)(?:signature|pursuant)',re.IGNORECASE),
    # NOTE(review): the literal "20-Fb)" prefix looks unusual next to the
    # "101(b)(7)" pattern above - confirm against real filings before changing.
    re.compile(r'20-Fb\)\s*\(1\)(?:.{1,20}note\s*:.*?on edgar.)?(.*)(?:signature|pursuant)',re.IGNORECASE),
    # Text following a "20-F ... 40-F" passage.
    re.compile(r'20-F.{1,40}40-F(.*)(?:signature|pursuant)',re.IGNORECASE),
    # Fallbacks: everything after "announce"/"announces"/"announced", etc.
    # The character class is [sd], not [s|d]: inside [...] a '|' is a literal
    # pipe, not alternation, so the old form could also swallow a '|'.
    re.compile(r'announce[sd]?(.*)',re.IGNORECASE),
    re.compile(r'explanatory\s+note(.*)',re.IGNORECASE),
    re.compile(r'contents(.*)',re.IGNORECASE),
    re.compile(r'exhibits?(?:\s+index)?(.*)',re.IGNORECASE),
]

def parse6K(accNo, formType=None, textLimit=basicInfo.defaultTextLimit, verbose=False) :
    """
    Parse a 6-K filing and try to extract a summary of its main text.

    Gets the filing info from basicInfo.getSecFormInfo (with get99=True),
    downloads the document at info['links'][0][3] using toFormat='souptext',
    and tries each regex in extract6KPats in order. The first pattern whose
    group 1, after stripping, is longer than 20 characters supplies the text,
    which is stored in info['mainText'] truncated to textLimit characters.
    If no pattern qualifies, a '*** no main text' message is printed and
    info is returned without a 'mainText' entry added here.

    Arguments:
        accNo - identifier of the filing (presumably an SEC accession number
            such as '0001157523-21-000888'), passed to basicInfo.getSecFormInfo
        formType - passed through to basicInfo.getSecFormInfo
        textLimit - passed to basicInfo.getSecFormInfo and also used as the
            maximum length of the extracted main text
        verbose - if True, print the full downloaded text and each pattern
            as it is tried (debugging aid; off by default)

    Returns the info object from basicInfo.getSecFormInfo, possibly with
    a 'mainText' entry added.
    """
    info = basicInfo.getSecFormInfo(accNo, formType=formType, get99=True, textLimit=textLimit)
    mainText = utils.downloadSecUrl(info['links'][0][3], toFormat='souptext')
    if verbose :
        print(mainText)
    # Explicitly initialized so the result is well-defined even if no pattern
    # matches (or the pattern list is empty).
    extracted = None
    for extract6KPat in extract6KPats :
        if verbose :
            print('PAT',extract6KPat)
        match = extract6KPat.search(mainText)
        if match is None :
            continue
        candidate = match.group(1).strip()
        # Very short captures are treated as misses; keep trying later patterns.
        if len(candidate)>20 :
            extracted = candidate
            break
    if extracted :
        info['mainText'] = extracted[:textLimit]
    else :
        print('*** no main text')
    return info

class scraper6K(infoScraper.scraperBase) :
    """
    Scraper for 6-K filings, built on infoScraper.scraperBase.
    """
    def __init__(self, infoDir=default6KDir, startD=None, endD=None, fSuff='m.pkl', **pickle_kwargs) :
        """
        Set up the base scraper for form type '6-K'; all other arguments
        are forwarded unchanged to infoScraper.scraperBase.
        """
        super().__init__(
            infoDir, '6-K',
            startD=startD, endD=endD, fSuff=fSuff,
            **pickle_kwargs
        )
    def scrapeInfo(self, accNo, formType=None) :
        """
        Scrape one filing using parse6K.

        Returns a pair: the info from parse6K and None.
        """
        parsedInfo = parse6K(accNo, formType)
        return parsedInfo, None

Test 6-K scraper class:

In [None]:
# Create a daily list with startD='empty', update it for the
# '20210701'-'20210704' range, then display the number of entries in the
# first element of the 6-K filings list.
# NOTE(review): this cell fetches data via dailyList; the expected count
# check (188) is left commented out on the last line.
dl = dailyList.dailyList(startD='empty')
dl.updateForDays('20210701','20210704')
len(dl.getFilingsList(None,'6-K')[0])#==188,"testing 6-K scraper class (daily list count)"

20210703 WEEKEND 20210702 ### list index 15 filings for 20210702: 6569 * 20210701 filings for 20210701: 5573 * 

188

In [None]:
# dl = dailyList.dailyList(startD='20200101')

In [None]:
# s = scraper6K(startD='empty')

In [None]:
# s.updateForDays(dl,startD='20210719',endD='20210722')
# print(); s.getCounts()

=====20210721===== =====20210720===== =====NEW 20210719===== [0000910680-21-000052] [0001292814-21-003001] [0001178913-21-002304] [0001193125-21-218580] [0001193125-21-218585] [0001193125-21-218103] [0001193125-21-218466] [0001279569-21-000986] [0001292814-21-002998] [0001292814-21-002999] [0001493152-21-017121] [0001062993-21-006597] [0001683168-21-003004] [0001140361-21-024697] [0001564590-21-036947] [0001171843-21-004958] [0001217160-21-000049] [0001193125-21-218153] [0001279569-21-000981] [0001193125-21-218375] [0001193125-21-218152] [0001178913-21-002300] [0001395064-21-000155] [0001292814-21-002995] [0001279569-21-000987] [0001193125-21-217659] [0001171843-21-004934] [0001171843-21-004944] [0001279569-21-000982] [0001292814-21-002993] [0001062993-21-006604] [0001437749-21-017152] [0001654954-21-008034] [0001279569-21-000980] [0001178913-21-002302] [0001628280-21-014005] [0001104659-21-093230] [0001104659-21-093479] [0001493152-21-017207] [0001605484-21-000070] [0001213900-21-0374

In [None]:
# utils.secBrowse('0001193125-21-219747')

In [None]:
# parse6K('0001157523-21-000888')

In [None]:
# for k,v in s.infoMap['20210720'].items() :
#     print(f"'{k}'")
#     print(v['mainText'])

In [None]:
# dailyList.dlCountFilings(startD='20210101',endD='20210331',formClass='6-K',noAmend=False)

5788

In [None]:
#hide
# uncomment and run to regenerate all library Python files
# from nbdev.export import notebook2script; notebook2script()