In [None]:
# default_exp infoScraper

# infoScraper

> Base class to scrape and save information from SEC filings.

In [None]:
#hide
%load_ext autoreload
%autoreload 2
from nbdev import show_doc

In [None]:
#export

import os
import re

from secscan import utils, dailyList, basicInfo

# Default directory for data saved by the base scraper, located under utils.stockDataRoot.
defaultBaseScrapeDir = os.path.join(utils.stockDataRoot,'scrapedBase')

Base scraper class - just scrapes basic filing information using `basicInfo.getSecFormInfo`:

In [None]:
#export

class scraperBase(object) :
    """
    Base class for scraping SEC filings and storing the results by day.

    Scraped results are kept in self.infoMap, a dict of the form
        dateStr -> {accessionNo -> info}
    where info is the first element returned by self.scrapeInfo, or the string
    'ERROR' if scraping that filing raised an exception. self.dirtySet holds the
    date strings whose entries have changed since they were last saved.

    Subclasses are expected to customize scraping by overriding scrapeInfo; this
    base version just returns the result of basicInfo.getSecFormInfo.
    """
    def __init__(self, infoDir, formClass, startD=None, endD=None, fSuff='m.pkl', **pickle_kwargs) :
        """
        infoDir - directory the scraped info is loaded from and saved to.
        formClass - passed to dailyList.isInFormClass to select which form types get scraped.
        startD, endD - date range passed to loadDays; startD='empty' skips loading entirely.
        fSuff - file suffix passed to the utils split-pickle load/save helpers.
        pickle_kwargs - extra keyword arguments forwarded to the utils pickle helpers.
        """
        self.infoDir = infoDir
        self.formClass = formClass
        self.fSuff = fSuff
        self.pickle_kwargs = dict(pickle_kwargs)
        self.infoMap = {}
        self.dirtySet = set()
        if startD=='empty' :
            # start with an empty infoMap, e.g. when infoDir has nothing saved yet
            return
        self.loadDays(startD=startD, endD=endD)
    def loadDays(self, startD=None, endD=None) :
        "Merges into self.infoMap whatever utils.loadSplitPklFromDir loads from self.infoDir for startD..endD."
        self.infoMap.update(utils.loadSplitPklFromDir(self.infoDir, startK=startD, endK=endD,
                                                      fSuff=self.fSuff, **self.pickle_kwargs))
    def save(self) :
        "Saves the days listed in self.dirtySet to self.infoDir, then clears self.dirtySet."
        utils.saveSplitPklToDir(self.infoMap, self.infoDir, dirtySet=self.dirtySet,
                                fSuff=self.fSuff, **self.pickle_kwargs)
        self.dirtySet.clear()
    def saveDays(self, daySet) :
        "Saves the days in daySet to self.infoDir, and removes them from self.dirtySet."
        utils.saveSplitPklToDir(self.infoMap, self.infoDir, dirtySet=daySet,
                                fSuff=self.fSuff, **self.pickle_kwargs)
        self.dirtySet.difference_update(daySet)
    def scrapeInfo(self, accNo, formType=None) :
        """
        Scrapes a single filing, returning a pair (info, xInfo).
        info is stored in self.infoMap; if xInfo is not None it is saved
        separately by scrapeForAccNo using saveXInfo.
        The base version returns the basicInfo.getSecFormInfo result and no xInfo.
        """
        return basicInfo.getSecFormInfo(accNo, formType), None
    def saveXInfo(self, dStr, accNo, xInfo) :
        "Saves xInfo for a filing to the file accNo-xinfo.pkl in the dStr subdirectory of self.infoDir."
        utils.savePklToDir(os.path.join(self.infoDir,dStr), accNo+'-xinfo.pkl', xInfo, **self.pickle_kwargs)
    def loadXInfo(self, dStr, accNo) :
        "Loads the xInfo saved by saveXInfo for the given day and accession number."
        return utils.loadPklFromDir(os.path.join(self.infoDir,dStr), accNo+'-xinfo.pkl', None, **self.pickle_kwargs)
    def retryErrs(self, startD=None, endD=None, justShow=False) :
        """
        Retries scraping every filing recorded as 'ERROR' for dates between
        startD (inclusive) and endD (exclusive); None means no bound on that side.
        Days where a retry succeeds are added to self.dirtySet, but nothing is saved here.
        If justShow is True, just prints the accession numbers without retrying.
        """
        for dStr,dInfo in self.infoMap.items() :
            if ((startD is not None and dStr<startD)
                or (endD is not None and endD<=dStr)) :
                continue
            for accNo in dInfo :
                if dInfo[accNo] == 'ERROR' :
                    if justShow :
                        print(accNo,end=' ')
                        continue
                    print('retrying',accNo)
                    # pass the day along so that any xInfo can be saved under it
                    dInfo[accNo] = self.scrapeForAccNo(accNo, dStr=dStr)
                    if dInfo[accNo] != 'ERROR' :
                        self.dirtySet.add(dStr)
    def retryErrsAndSave(self, startD=None, endD=None) :
        "Runs retryErrs for the given date range, then saves."
        self.retryErrs(startD=startD, endD=endD)
        self.save()
    def showErrs(self, startD=None, endD=None) :
        "Prints the accession numbers recorded as 'ERROR' in the given date range."
        self.retryErrs(startD=startD, endD=endD, justShow=True)
    def checkDates(self, verbose=True) :
        "Prints info on dates present, checking for missing dates."
        dailyList.checkMapDates(self.infoMap, verbose=verbose)
    def printCounts(self, startD=None, endD=None, verbose=True) :
        """
        Prints the total number of filings in self.infoMap for dates between
        startD (inclusive) and endD (exclusive), preceded by the count
        for each day if verbose is True.
        """
        if verbose :
            print()
            print('Counts by day:')
        tot = 0
        for dStr in sorted(self.infoMap.keys()) :
            if ((startD is not None and dStr<startD)
                or (endD is not None and endD<=dStr)) :
                continue
            dCount = len(self.infoMap[dStr])
            if verbose :
                print(f'{dStr}: {dCount}')
            tot += dCount
        print('Total filings:',tot)
    def _findDayForAccNo(self, accNo) :
        "Returns the date string in self.infoMap whose entry contains accNo; raises ValueError if there is none."
        for dStr,dInfo in self.infoMap.items() :
            if accNo in dInfo :
                return dStr
        raise ValueError(f'no date found in infoMap for accession number {accNo}')
    def scrapeForAccNo(self, accNo, formType=None, dStr=None) :
        """
        Scrapes a single filing using self.scrapeInfo and returns the info,
        or the string 'ERROR' if any exception was raised (the exception is printed).
        If scrapeInfo also returned xInfo, sets info['hasXInfo'] and saves
        the xInfo under the day dStr using saveXInfo. If dStr is None, the day
        is looked up by searching self.infoMap for accNo.
        """
        try :
            info, xInfo = self.scrapeInfo(accNo, formType)
            if xInfo is not None :
                if dStr is None :
                    # caller didn't say which day the filing belongs to - look it up
                    dStr = self._findDayForAccNo(accNo)
                info['hasXInfo'] = True
                self.saveXInfo(dStr, accNo, xInfo)
            return info
        except Exception as e :
            print('*** ERROR ***',e)
            return 'ERROR'
    def updateForDays(self, dl, startD=None, endD=None, ciks=None, errLimitPerDay=25,
                      verbose=True, saveAfterEachDay=False) :
        """
        Update to reflect the filings for dates between startD (inclusive)
        and endD (exclusive). If startD is None, uses the last date already
        in self.infoMap, or the start of the current year if self.infoMap is empty.
        If endD is None, uses today. The dl argument should be a dailyList object
        that includes those dates.
        Optionally restricts to a given set of CIKs.

        Filings already present in self.infoMap are skipped, as are filings whose
        form type is not in self.formClass. Days are processed in reverse of the order
        returned by utils.dateStrsBetween. The update is aborted if a date is missing
        from dl, or if errLimitPerDay scraping errors occur within a single day.
        Changed days are added to self.dirtySet, and are saved right away
        if saveAfterEachDay is True - otherwise the caller needs to save.
        """
        if startD is None :
            if len(self.infoMap) == 0 :
                startD = utils.toDateStr()[:4]+'0101' # start of current year
            else :
                startD = max(self.infoMap.keys())
        for dStr in reversed(utils.dateStrsBetween(startD,endD)) :
            if dStr not in dl.dl :
                print('date',dStr,'not found in dailyList, aborting update!')
                return
            dayIsDirty = (dStr not in self.infoMap)
            if dayIsDirty :
                self.infoMap[dStr] = {}
            if verbose or dayIsDirty :
                print(f'=========={"NEW " if dayIsDirty else ""}{dStr}==========', end=' ', flush=True)
            errCount = 0
            dInfo = self.infoMap[dStr]
            for cik, formType, accNo, fileDate in dl.dl[dStr] :
                if (accNo in dInfo
                    or (ciks is not None and cik not in ciks)
                    or not dailyList.isInFormClass(self.formClass, formType)) :
                    continue
                print(f"'{accNo}'", end=' ', flush=True)
                # accNo is not in dInfo yet, so the day must be passed explicitly for saving xInfo
                dInfo[accNo] = self.scrapeForAccNo(accNo, formType, dStr=dStr)
                dayIsDirty = True
                if dInfo[accNo] == 'ERROR' :
                    errCount += 1
                    if errCount >= errLimitPerDay :
                        print('Error limit exceeded, aborting update!')
                        return
            if dayIsDirty :
                self.dirtySet.add(dStr)
                if saveAfterEachDay :
                    self.save()
    def loadAndUpdate(self, dlOrDir=dailyList.defaultDLDir,
                      startD=None, endD=None, ciks=None, errLimitPerDay=10,
                      verbose=True, saveAfterEachDay=False) :
        """
        Loads a dailyList for the given date range (this must already have been saved),
        and then updates the scraper for the given date range and saves it.
        If startD is None, uses the last date already in self.infoMap, or the start of
        the current year if self.infoMap is empty. If endD is None, uses today.
        Optionally restricts to a given set of CIKs.
        The dlOrDir argument can also be an already loaded dailyList object,
        which is then used as is.
        A scraperBase or subclass can be initialized for a date range starting from an empty directory by:
            s = scraperBase(emptyDir, formClass, startD='empty')  # or s = subclass(startD='empty', ...)
            s.loadAndUpdate(startD=drangeStart, endD=drangeEnd)
            s.printCounts()
        assuming the dailyList is already saved for that date range.
        This will also work to extend a scraperBase or subclass to a new date range.
        """
        if isinstance(dlOrDir, dailyList.dailyList) :
            dl = dlOrDir
        else :
            dl = dailyList.dailyList(dlDir=dlOrDir, startD=startD, endD=endD)
        self.loadDays(startD=startD, endD=endD)
        self.updateForDays(dl, startD=startD, endD=endD, ciks=ciks, errLimitPerDay=errLimitPerDay,
                           verbose=verbose, saveAfterEachDay=saveAfterEachDay)
        self.save()

Test base scraper class:

In [None]:
# Build a daily list from scratch covering the single day 20210702,
# and check the number of 10-K filings it finds.
dl = dailyList.dailyList(startD='empty')
dl.updateForDays('20210702','20210703')
tenKFilings = dl.getFilingsList(None,'10-K')[0]
assert len(tenKFilings)==6,"testing base scraper class (daily list count)"

# Scrape the 10-K filings for that day with the base scraper.
b = scraperBase(defaultBaseScrapeDir,'10-K')
b.updateForDays(dl,'20210702','20210703')
scrapedDay = b.infoMap['20210702']
assert len(scrapedDay)==6,"testing base scraper class (info count)"

# Spot check the first and last links scraped for one of the filings.
links = scrapedDay['0001640334-21-001482']['links']
expectedFirstLink = ('ptco_10k.htm','FORM 10-K','10-K',
                     '/Archives/edgar/data/1609258/000164033421001482/ptco_10k.htm')
expectedLastLink = ('ptco-20210331_def.xml','XBRL TAXONOMY EXTENSION DEFINITION LINKBASE','EX-101.DEF',
                    '/Archives/edgar/data/1609258/000164033421001482/ptco-20210331_def.xml')
scrapedInfoMsg = "testing base scraper class (scraped info)"
assert len(links)==9, scrapedInfoMsg
assert links[0]==expectedFirstLink, scrapedInfoMsg
assert links[-1]==expectedLastLink, scrapedInfoMsg

20210702 ### list index 6 filings for 20210702: 6569 * 20210702 

In [None]:
#hide
# uncomment and run to regenerate all library Python files
# from nbdev.export import notebook2script; notebook2script()