In [None]:
# default_exp basicInfo

# basicInfo

> Parse basic info from an SEC filing's index page.

In [None]:
#hide
%load_ext autoreload
%autoreload 2
from nbdev import show_doc

In [None]:
#export

import re

from secscan import utils,dailyList

Parse basic info from a filing's index page HTML using BeautifulSoup:

In [None]:
#export

def getSecFormLinkList(indexSoup,accessNo) :
    """
    Collects the document links from the table rows of a filing index page.

    Returns
        sublinkList,completeTextLink
    where sublinkList is:
        [(name, description, type, sublink), ... ]
    with one entry per row that has at least 4 cells and an all-digit first cell:
        name        - text of the row's link (row.a)
        description - text of the row's second cell
        type        - text of the row's fourth cell
        sublink     - href of the row's link ('' if it has none)
    completeTextLink is the href of the link in a row whose second cell
    starts with 'complete' (case-insensitive), or None if there is no such row.
    Prints a warning message if the complete text link is missing.
    """
    cellText = utils.getCombSoupText
    docLinks = []
    fullTextHref = None
    for tableRow in indexSoup.find_all('tr') :
        cells = tableRow.find_all('td')
        anchor = tableRow.a
        # only rows with at least two cells and a link are of interest
        if len(cells) <= 1 or anchor is None :
            continue
        if cellText(cells[0]).isdigit() and len(cells) >= 4 :
            # numbered row: one of the filing's documents
            docLinks.append((cellText(anchor),
                             cellText(cells[1]),
                             cellText(cells[3]),
                             anchor.get('href','')))
            continue
        if cellText(cells[1]).lower().startswith('complete') :
            fullTextHref = anchor.get('href','')
    if not fullTextHref :
        print('missing complete text link in',utils.secIndexUrl(accessNo,True))
    return docLinks, fullTextHref

# Matches "<company name> (... CIK: <digits>"; group 1 is the name, group 2 the CIK digits.
companyNameAndCikPat = re.compile(r'(.*)\s*\(.*cik[\s:]*(\d+)',re.IGNORECASE)
def getSecFormCikList(indexSoup,accessNo) :
    """
    Returns list of CIKs for the form: [cik, ... ]

    The CIKs are read from the text of every <span class="companyName">
    element, kept as strings in order of first appearance with duplicates
    dropped. Prints a warning for each such element whose text does not
    match companyNameAndCikPat, and another if no CIK was found at all.
    """
    foundCiks = []
    for nameTag in indexSoup.find_all('span','companyName') :
        nameText = utils.getCombSoupText(nameTag)
        parsed = companyNameAndCikPat.match(nameText)
        if parsed is None :
            print('missing company name or CIK in',nameText)
            print(utils.secIndexUrl(accessNo,True))
            continue
        cik = parsed.group(2)
        if cik not in foundCiks :
            foundCiks.append(cik)
    if not foundCiks :
        print('no company names in',utils.secIndexUrl(accessNo,True))
    return foundCiks

def getTextAfterTag(resDict, resKey, top, firstTagPat,
                    firstTagName='div', firstTagClass='infoHead',
                    nextTagName='div', nextTagClass='info',
                    missingMessage=None) :
    """
    Searches top for a heading element (firstTagName/firstTagClass) whose text
    matches the compiled regular expression firstTagPat from its start, and
    which has a following sibling element (nextTagName/nextTagClass).
    For the first such pair, stores the sibling's text in resDict[resKey].
    Heading elements that match but have no such sibling are skipped.
    If no pair is found, resDict is left untouched and missingMessage
    is printed unless it is None.
    """
    for headTag in top.find_all(firstTagName,firstTagClass) :
        if not firstTagPat.match(utils.getCombSoupText(headTag)) :
            continue
        valueTag = headTag.find_next_sibling(nextTagName,nextTagClass)
        if valueTag is None :
            continue
        resDict[resKey] = utils.getCombSoupText(valueTag)
        break
    else :
        # loop finished without finding a heading/value pair
        if missingMessage is not None :
            print(missingMessage)

# Heading patterns used to locate the "period" and "accepted" fields on the index page.
periodPat = re.compile('period',re.IGNORECASE)
# A date in YYYY-MM-DD form.
periodDatePatStr = r'\d\d\d\d-\d\d-\d\d'
periodDatePat = re.compile(periodDatePatStr)
acceptedPat = re.compile('accepted',re.IGNORECASE)
# A YYYY-MM-DD date, then a space or the letter t/T, then an HH:MM:SS time.
acceptedDateTimePat = re.compile('('+periodDatePatStr+r')[ t](\d\d:\d\d:\d\d)',
                                 re.IGNORECASE)

def getSecFormInfo(accessNo) :
    """
    Downloads the index page for the filing with accession number accessNo
    and returns a dict of basic info parsed from it:
        'links'      - list of (name, description, type, sublink) tuples
        'complete'   - link to the complete text of the filing (or None)
        'ciks'       - list of CIKs found on the page
        'period'     - period text, present only if it was found and
                       starts with a YYYY-MM-DD date
        'acceptDate' - date part of the accepted date/time, present only
        'acceptTime'   (together with the time part) if the accepted text
                       was found and well-formed
    Prints a warning for each missing or malformed field; the missing period
    warning is suppressed if the type of the first link matches
    dailyList.noPeriodFormTypes.
    """
    indexSoup = utils.downloadSecUrl(accessNo, toFormat='soup')
    indexFullUrl = utils.secIndexUrl(accessNo,True)
    links, completeLink = getSecFormLinkList(indexSoup,accessNo)
    res = dict(links=links,
               complete=completeLink,
               ciks=getSecFormCikList(indexSoup,accessNo))

    # raw field texts are collected here and only copied to res once validated
    rawFields = {}

    periodExpected = not (links and dailyList.noPeriodFormTypes.match(links[0][2]))
    missingPeriodMessage = ('missing period in ' + indexFullUrl) if periodExpected else None
    getTextAfterTag(rawFields, 'period', indexSoup, periodPat,
                    missingMessage=missingPeriodMessage)
    if 'period' in rawFields :
        periodText = rawFields['period']
        if periodDatePat.match(periodText) :
            res['period'] = periodText
        else :
            print('malformed period',periodText,'in',indexFullUrl)

    getTextAfterTag(rawFields, 'acceptDateTime', indexSoup, acceptedPat,
                    missingMessage='missing accepted in ' + indexFullUrl)
    if 'acceptDateTime' in rawFields :
        acceptedText = rawFields['acceptDateTime']
        parsed = acceptedDateTimePat.match(acceptedText)
        if parsed :
            acceptDate, acceptTime = parsed.groups()
            res['acceptDate'] = acceptDate
            res['acceptTime'] = acceptTime
        else :
            print('malformed accept date/time',acceptedText)
            print('in',indexFullUrl)
    return res

Test parsing basic info (these tests download the filing index pages, so they need network access):

In [None]:
# Form 4 filing: two document links, two CIKs and a period.
form4Expected = {
    'links': [('doc1.html','FORM 4','4',
               '/Archives/edgar/data/83350/000114036118003143/xslF345X03/doc1.xml'),
              ('doc1.xml','FORM 4','4',
               '/Archives/edgar/data/83350/000114036118003143/doc1.xml')],
    'complete': '/Archives/edgar/data/83350/000114036118003143/0001140361-18-003143.txt',
    'ciks': ['0000083350', '0001436951'],
    'period': '2010-03-04',
    'acceptDate': '2018-01-25',
    'acceptTime': '11:20:33'}
form4Info = getSecFormInfo('0001140361-18-003143')
assert form4Info == form4Expected, "parsing form 4 basic info"

# SC 13G filing: a single document link and no period.
sc13gExpected = {
    'links': [('ffa13g03312021vidler.htm','FIRST FOUNDATION ADVISORS_VIDLER','SC 13G',
               '/Archives/edgar/data/830122/000108676321000007/ffa13g03312021vidler.htm')],
    'complete': '/Archives/edgar/data/830122/000108676321000007/0001086763-21-000007.txt',
    'ciks': ['0000830122', '0001086763'],
    'acceptDate': '2021-05-17',
    'acceptTime': '19:32:43'}
sc13gInfo = getSecFormInfo('0001086763-21-000007')
assert sc13gInfo == sc13gExpected, "parsing form 13G basic info"

In [None]:
#hide
# uncomment and run to regenerate all library Python files
# from nbdev.export import notebook2script; notebook2script()