In [None]:
# default_exp scrape6K

# scrape6K

> Scrape item summaries from 6-K SEC filings.

In [None]:
#hide
%load_ext autoreload
%autoreload 2
from nbdev import show_doc

In [None]:
#export

import collections
import itertools
import os
import re

from secscan import utils, dailyList, basicInfo, infoScraper

# Default infoDir for scraper6K: the 'scraped6K' folder under utils.stockDataRoot.
default6KDir = os.path.join(utils.stockDataRoot,'scraped6K')

6-K scraper class - scrape text from the first sublink and from any EX-99 links in the SEC filing:

In [None]:
#export

# Fragment matching "12g3", optionally followed by one arbitrary character
# and "2(b)" (with optional whitespace) - used in two header alternatives below.
reg12gStr = r'12g3(?:\s*.\s*2\s*\(b\))?'

# Header pattern, applied with .match() from the start of the main text.
# The leading greedy '.*' pushes the match as far right as possible, so the
# match ends after the right-most phrase found among the alternatives.
# No DOTALL flag is set, so '.' never crosses a newline.
header6KPat = re.compile(
    r'.*(?:'
    + r'|'.join((
        # "pursuant ... 12g3 ... 1934", optionally followed by "... no" and two characters
        r'pursuant.{1,20}' + reg12gStr + r'.{1,100}?\b1934\b(?:.{1,40}\bno\b..)?',
        # "is marked ... 12g3" followed by two characters
        r'is marked.{1,100}' + reg12gStr + '..',
        # "101(b)(7)", optionally followed by "only permits ... on edgar" and one character
        r'101\s*\(b\)\s*\(7\)(?:\s*only\s*permits.{1,700}on edgar.)?',
        # "101(b)(1)", optionally followed by "only permits ... holders" and one character
        r'101\s*\(b\)\s*\(1\)(?:\s*only\s*permits.{1,150}holders.)?',
        # NOTE(review): "20-Fb)(1)" has no opening parenthesis before "b" -
        # presumably targets a garbled rendering of the text; confirm it is intended.
        r'20-Fb\)\s*\(1\)',
        # "20-F ... 40-F" within 40 characters of each other
        r'20-F.{1,40}40-F',
    ))
    + r')',
    re.IGNORECASE)

# Signature paragraph at the start of the text: "signature(s) pursuant ... authorized"
# plus one character, optionally extended up to a following officer/president/ceo.
signaturePat = re.compile(
    r'.{1,20}signatures?\s*pursuant.{1,200}authorized.'
    r'(?:.{1,300}(?:officer|president|ceo))?',
    re.IGNORECASE)

# A run of 1 to 40 leading characters that are not letters.
skipJunkPat = re.compile(r'[^a-z]{1,40}', re.IGNORECASE)

def parse6K(accNo, formType=None, textLimit=basicInfo.defaultTextLimit) :
    """
    Parse a 6-K filing and return its info with a trimmed 'mainText' entry added.

    The info is obtained from basicInfo.getSecFormInfo (called with get99=True),
    then the URL stored at info['links'][0][3] is downloaded as 'souptext'.
    From the start of that text the following are removed, in order:

    - everything up to the end of a header6KPat match (prints 'no header'
      if the pattern does not match),
    - a leading signaturePat match, if any,
    - a leading skipJunkPat match, if any.

    The remainder is cut to textLimit characters, stripped of surrounding
    whitespace and stored under info['mainText'].

    accNo, formType and textLimit are forwarded to basicInfo.getSecFormInfo;
    textLimit is also used for the final truncation.
    """
    info = basicInfo.getSecFormInfo(accNo, formType=formType, get99=True, textLimit=textLimit)
    firstLinkUrl = info['links'][0][3]
    body = utils.downloadSecUrl(firstLinkUrl, toFormat='souptext')

    headerMatch = header6KPat.match(body)
    if headerMatch is None :
        print('no header')
    else :
        body = body[headerMatch.end():]

    # Each pattern is tried against whatever is left after the previous removal.
    for leadingPat in (signaturePat, skipJunkPat) :
        leadingMatch = leadingPat.match(body)
        if leadingMatch is not None :
            body = body[leadingMatch.end():]

    info['mainText'] = body[:textLimit].strip()
    return info

class scraper6K(infoScraper.scraperBase) :
    """
    infoScraper.scraperBase subclass for 6-K filings; per-filing info comes from parse6K.
    """
    @utils.delegates(infoScraper.scraperBase.__init__)
    def __init__(self, infoDir=default6KDir, **kwargs) :
        """
        Initialize the base scraper with infoDir and the form type '6-K';
        any further keyword arguments are passed through to scraperBase.__init__.
        """
        super().__init__(infoDir, '6-K', **kwargs)
    def scrapeInfo(self, accNo, formType=None) :
        """
        Return a 2-tuple: the info parsed by parse6K for accNo (using parse6K's
        default text limit), and None.
        """
        parsedInfo = parse6K(accNo, formType=formType)
        return parsedInfo, None

Test 6-K scraper class:

In [None]:
# Daily list check: start from an 'empty' list, update it for the given days,
# and verify the number of 6-K entries in the first element of getFilingsList.
dl = dailyList.dailyList(startD='empty')
dl.updateForDays('20210701','20210704')
assert (
    len(dl.getFilingsList(None,'6-K')[0])==188
),"testing 6-K scraper class (daily list count)"

# Parsing check on one known filing with a 1000-character text limit.
# The conditions are asserted one at a time, in the same order as a chained
# 'and' would evaluate them, so the first failing condition is pinpointed.
info = parse6K('0001178913-21-002357', formType='6-K', textLimit=1000)
assert info['text99'][1].startswith(
    'TAT TECHNOLOGIES LTD. NOTICE OF ANNUAL'),"testing 6-K scraper class (parsing)"
assert info['text99'][1].endswith(
    'PricewaterhouseCoopers International Ltd., as our i'),"testing 6-K scraper class (parsing)"
assert info['text99'][2].startswith(
    'TAT Technologies Ltd. P.O. Box 80, Gedera'),"testing 6-K scraper class (parsing)"
assert info['text99'][2].endswith(
    'be entitled to vote, with all powers the un'),"testing 6-K scraper class (parsing)"
assert info['mainText'].startswith(
    'TAT Technologies Ltd. 6-K Exhibits: 1. Notice'),"testing 6-K scraper class (parsing)"
assert info['mainText'].endswith(
    'Ehud Ben-Yair Chief Financial Officer Date: July 26, 2021 3'),"testing 6-K scraper class (parsing)"

In [None]:
#hide
# uncomment and run to regenerate all library Python files
# from nbdev.export import notebook2script; notebook2script()