In [None]:
# default_exp scrape13F

# scrape13F

> Scrape investor holdings from 13F-HR SEC filings.

In [None]:
#hide
%load_ext autoreload
%autoreload 2
from nbdev import show_doc

In [None]:
#export

import collections
import itertools
import numpy as np
import os
import re

from secscan import utils, dailyList, basicInfo, infoScraper

# Default directory for scraped 13F data: the 'scraped13F' folder under
# utils.stockDataRoot. Used as the default infoDir of scraper13F.
default13FDir = os.path.join(utils.stockDataRoot,'scraped13F')

13F-HR scraper class - scrapes the table of investor holdings from the XML information table in the SEC filing:

In [None]:
#export

def findChildEndingWith(el,tagEnd) :
    """
    Finds first child of an XML element with tag ending in tagEnd (case insensitive).

    el is any iterable of child nodes that each have a .tag attribute.
    Returns the first matching child, or None if no child matches.

    Children whose tag is not a string are skipped. In ElementTree-style trees
    comments and processing instructions have a callable as their tag, so calling
    .lower() on it would raise AttributeError instead of just not matching.
    """
    tagEnd = tagEnd.lower()
    for child in el :
        tag = child.tag
        # only real elements have string tags; ignore comment / PI nodes
        if isinstance(tag, str) and tag.lower().endswith(tagEnd) :
            return child
    return None

def findChildSeries(el,tagEnds) :
    """
    Finds a nested series of children by tag using findChildEndingWith.

    Starting from el, descends one level per entry of tagEnds, each time taking
    the first child whose tag ends with that entry (case insensitive).
    Returns the element reached at the end of the series, or None as soon as
    any level is missing (rather than failing while iterating over None).
    With an empty tagEnds, el itself is returned.
    """
    for tagEnd in tagEnds :
        if el is None :
            # a previous level was not found - nothing further to descend into
            return None
        el = findChildEndingWith(el,tagEnd)
    return el

# Case-insensitive patterns used to spot option positions from the issuer name
# when a row has no explicit putCall tag: "call"/"put", optional whitespace, "opt".
callOptPat = re.compile(r'call\s*opt', flags=re.I)
putOptPat = re.compile(r'put\s*opt', flags=re.I)

def getRowInfo(row) :
    """
    Extracts the fields of one row element of a 13F holdings table.

    Returns the tuple:
        (cusip, name, value, title, count, putCall)
    All fields are the element texts with surrounding whitespace stripped;
    cusip and title are also upper-cased. putCall is the upper-cased text of an
    explicit putCall tag if one is present (looked up on the row first, then
    inside the shrsOrPrnAmt element); otherwise it is inferred as 'CALL' or 'PUT'
    from the issuer name / title of class, or '' if nothing indicates an option.
    """
    def childText(parent, tagEnd) :
        # raw text of the first child of parent whose tag ends with tagEnd
        return findChildEndingWith(parent,tagEnd).text

    cusip = childText(row,'cusip').upper().strip()
    name = childText(row,'issuer').strip()
    value = childText(row,'value').strip()
    title = childText(row,'titleOfClass').upper().strip()
    amountEl = findChildEndingWith(row,'shrsOrPrnAmt')
    count = childText(amountEl,'sshPrnamt').strip()

    # Explicit put/call tag: normally a child of the row, but fall back to
    # looking inside the shares/principal amount element.
    optionEl = findChildEndingWith(row,'putCall')
    if optionEl is None :
        optionEl = findChildEndingWith(amountEl,'putCall')
    if optionEl is not None :
        return (cusip, name, value, title, count, optionEl.text.upper().strip())

    # No explicit tag: infer from the issuer name or the title of class.
    if callOptPat.search(name) or title.startswith('CALL') or title=='CAL' :
        putCall = 'CALL'
    elif putOptPat.search(name) or title.startswith('PUT') :
        putCall = 'PUT'
    else :
        putCall = ''
    return (cusip, name, value, title, count, putCall)

def parse13FHoldings(accNo, formType=None) :
    """
    Parses a 13F filing, returning the result in the form:
    {
        'period': 'YYYY-MM-DD',
        'acceptDate': 'YYYY-MM-DD',
        'acceptTime': 'HH:MM:SS',
        'cik' : 'DDDDDDDDDD',
        'holdings': [(cusip, name, value, title, count, putCall), ... ]
    }
    where each holdings entry is produced by getRowInfo.

    The filing's links whose first field ends in 'xml' are collected. If there is
    exactly one, it is taken to be the summary only: its table entry total is read
    just to print a diagnostic, and holdings is empty. Otherwise the last XML link
    is downloaded as the holdings table and every child whose tag ends in
    'infotable' becomes one holdings row. Diagnostics are printed (not raised) for
    unexpected children, an empty table, and a CIK count other than 1; the first
    CIK is the one returned.
    """
    info = basicInfo.getSecFormInfo(accNo, formType)
    xmlUrls = []
    for link in info['links'] :
        if link[0].lower().endswith('xml') :
            xmlUrls.append(link[-1])

    if len(xmlUrls) != 1 :
        # the holdings table is the last XML document linked from the filing
        xmlTab = utils.downloadSecUrl(xmlUrls[-1],toFormat='xml')
        tabRows = []
        for tabRow in xmlTab :
            if tabRow.tag.lower().endswith('infotable') :
                tabRows.append(tabRow)
        if len(xmlTab) != len(tabRows) :
            print('*** #rows mismatch',len(xmlTab),'all children',len(tabRows),'table rows')
        if len(tabRows) == 0 :
            print('*** no holdings in table')
        holdings = list(map(getRowInfo, tabRows))
    else :
        # only the summary document is present - report whether that is consistent
        xmlSummTab = utils.downloadSecUrl(xmlUrls[0],toFormat='xml')
        totEl = findChildSeries(xmlSummTab,['formdata','summarypage','tableentrytotal'])
        tot = int(totEl.text.strip())
        if tot != 0 :
            print('*** nonzero total, but table not present')
        else :
            print('*** zero total, table not present')
        holdings = []

    if len(info['ciks']) != 1 :
        print('*** unexpected number of CIKs!=1',info['ciks'])
    res = {}
    res['period'] = info['period']
    res['acceptDate'] = info['acceptDate']
    res['acceptTime'] = info['acceptTime']
    res['cik'] = info['ciks'][0]
    res['holdings'] = holdings
    return res

class scraper13F(infoScraper.scraperBase) :
    """
    Scraper for 13F-HR filings, built on infoScraper.scraperBase with the form
    class fixed to '13F-HR' and the storage directory defaulting to default13FDir.
    """
    def __init__(self, infoDir=default13FDir, startD=None, endD=None, fSuff='m.pkl', **pickle_kwargs) :
        "Passes all arguments through to the base scraper, adding the '13F-HR' form class."
        super().__init__(
            infoDir, '13F-HR',
            startD=startD, endD=endD, fSuff=fSuff,
            **pickle_kwargs
        )
    def scrapeInfo(self, accNo, formType=None) :
        "Returns (parsed holdings info from parse13FHoldings, None) for one filing."
        parsed = parse13FHoldings(accNo, formType)
        return parsed, None

Test 13F-HR scraper class:

In [None]:
# Build a fresh daily list covering a single day and check the 13F-HR count.
dl = dailyList.dailyList(startD='empty')
dl.updateForDays('20210614','20210615')
assert len(dl.getFilingsList(None,'13F-HR')[0])==4, "testing 13F scraper class (daily list count)"

# Scrape that day's 13F-HR filings and spot-check one parsed holdings table:
# its size, first row and last row.
s = scraper13F(startD='empty')
s.updateForDays(dl,'20210614','20210615')
hList = s.infoMap['20210614']['0001104659-21-080656']['holdings']
assert len(hList)==39, "testing 13F scraper class (parsing holdings table)"
assert hList[0]==('00289Y107', 'Abeona Therapeutics Inc', '5376', 'COM', '2859767', ''), \
    "testing 13F scraper class (parsing holdings table)"
assert hList[-1]==('82835P103', 'SILVERCORP METALS INC', '179', 'COM', '36500', ''), \
    "testing 13F scraper class (parsing holdings table)"

20210614 ### list index 64 filings for 20210614: 4379 * =====NEW 20210614===== [0001104659-21-080656] [0001104659-21-080840] [0000921895-21-001608] [0000921895-21-001612] 

Generate a combined holdings matrix based on all 13F-HR filings in a quarter:

In [None]:
#export

def condenseHoldings(holdings, cutoff=0.0) :
    """
    Condenses a list of stock and option holdings as parsed from the 13F:
        [(cusip, name, value, title, count, putCall), ... ]
    which may contain several entries per stock, into a list with options
    (entries whose putCall is not '') dropped and one summed entry per cusip:
        [(cusip, val, frac) ... ]
    where val is the summed value as a float and frac is val divided by the total
    value of all kept stock entries (0.0 if that total is not positive).
    The result is in descending order by value (ties in cusip order) and stops
    at the first stock whose frac is below cutoff.
    """
    # keep only non-option rows, reduced to (cusip, numeric value)
    stockVals = []
    for cusip, _name, value, _title, _count, putCall in holdings :
        if putCall=='' :
            stockVals.append((cusip, float(value)))
    # groupby needs equal cusips to be adjacent
    stockVals.sort()
    perCusip = []
    for cusip, group in itertools.groupby(stockVals, key=lambda pair : pair[0]) :
        perCusip.append((cusip, sum(val for _,val in group)))
    perCusip.sort(key = lambda pair : pair[1], reverse=True)
    tot = sum(val for _,val in perCusip)
    res = []
    for cusip,val in perCusip :
        frac = val/tot if tot>0.0 else 0.0
        if not (frac >= cutoff) :
            # values are in descending order, so nothing later can qualify
            break
        res.append((cusip,val,frac))
    return res

def get13FAmendmentType(accNo, formType=None) :
    """
    Gets the amendment type for a 13F-HR/A filing - may be RESTATEMENT or NEW HOLDINGS.
    Returns None if the cover page has no isAmendment flag, or the flag's text
    is not 'true' or 'yes' (case insensitive).

    This turned out to be unreliable (often missing or wrong), so it is not used to
    build the combined holdings for an investor. Instead the number of holdings
    in an amendment is compared to the previous filing, and the amendment is treated
    as a restatement if the new number of holdings is more than half the old number.
    """
    info = basicInfo.getSecFormInfo(accNo, formType)
    xmlUrls = []
    for link in info['links'] :
        if link[0].lower().endswith('xml') :
            xmlUrls.append(link[-1])
    # the first XML link is read as the document holding the cover page
    summaryDoc = utils.downloadSecUrl(xmlUrls[0],toFormat='xml')
    coverPage = findChildSeries(summaryDoc,['formdata','coverpage'])
    amendFlag = findChildEndingWith(coverPage,'isamendment')
    flaggedAsAmendment = (amendFlag is not None
                          and amendFlag.text.strip().lower() in ['true','yes'])
    if not flaggedAsAmendment :
        return None
    typeEl = findChildSeries(coverPage,['amendmentinfo','amendmenttype'])
    return typeEl.text.strip()

class cikConsolidatedHoldings(object) :
    """
    Per-investor (CIK) consolidated stock positions for one 13F period, with
    a helper to turn them into an investors x stocks matrix of portfolio fractions.
    """
    def __init__(self, scraped13F, period, cutoff=0.0) :
        """
        Consolidate holdings for each CIK based on all filings with a given period.
        Restrict to stocks only (no options) with fraction of total portfolio >= cutoff.
        Sets self.cikToPosList: cik -> [(cusip, val, frac) ... ]
        and  self.investorsPerStock: cusip -> nInvestors

        scraped13F must expose infoMap: dateStr -> accNo -> parsed filing info
        (or the string 'ERROR' for filings that failed to scrape; those are
        reported and skipped).
        """
        #
        # Collect, for each cik, the list [(dateStr, accNo, holdingsList) ... ]
        # of all of its 13F filings for the requested period.
        filingsByCik = collections.defaultdict(list)
        nFilings = 0
        for dStr, accNoToInfo in scraped13F.infoMap.items() :
            for accNo, info in accNoToInfo.items() :
                if info == 'ERROR' :
                    print('ERR',accNo)
                elif info['period'] == period :
                    filingsByCik[info['cik']].append((dStr, accNo, info['holdings']))
                    nFilings += 1
        print('period',period,'- total of',len(filingsByCik),'ciks,',nFilings,'13F filings')
        #
        # Reduce each cik's filings to one position list. With several filings,
        # later ones are amendments. The filed amendment type is unreliable, so a
        # rule of thumb is used: an amendment with more than half as many positions
        # as the current base filing is taken as a restatement and becomes the new
        # base; smaller amendments are added on top of the base.
        self.cikToPosList = {}
        for cik, filings in filingsByCik.items() :
            filings.sort()  # chronological: by day, then by accession number
            nForCik = len(filings)
            base = 0
            for k in range(1, nForCik) :
                if len(filings[k][2]) > len(filings[base][2])//2 :
                    # relatively many positions - assume filing k is a restatement
                    base = k
            if nForCik != 1 :
                print('CIK',cik,base,'-',nForCik,[(dStr,accNo,len(holdings))
                                                  for dStr,accNo,holdings in filings])
            # start from the base filing and append everything filed after it
            combHoldings = filings[base][2]
            for _,_,laterHoldings in filings[base+1:] :
                combHoldings = combHoldings + laterHoldings
            self.cikToPosList[cik] = condenseHoldings(combHoldings, cutoff)
        # count how many investors hold each stock
        self.investorsPerStock = collections.Counter()
        for posList in self.cikToPosList.values() :
            for cusip,_,_ in posList :
                self.investorsPerStock[cusip] += 1
    def getHoldingsMatrix(self, minInvestorsPerStock=3, minStocksPerInvestor=1, dtype=np.float64) :
        """
        Calculates a combined matrix of investor holdings.
        Returns ciks, cusips, mat
        where ciks and cusips are sorted lists and mat is a matrix of shape
        (len(ciks), len(cusips)) in which each row has the fractions held by the
        corresponding cik in each cusip.

        Only stocks held by at least minInvestorsPerStock investors are kept, and
        then only investors left with at least minStocksPerInvestor kept stocks.
        """
        print('requiring ', minInvestorsPerStock, 'investors per stock,',
              minStocksPerInvestor, 'stocks per investor')
        keptCusips = {cusip for cusip,nInvestors in self.investorsPerStock.items()
                      if nInvestors >= minInvestorsPerStock}
        keptPosLists = {}
        for cik,posList in self.cikToPosList.items() :
            remaining = [pos for pos in posList if pos[0] in keptCusips]
            if len(remaining) >= minStocksPerInvestor :
                keptPosLists[cik] = remaining
        print(len(keptPosLists), 'investors,', len(keptCusips), 'stocks')
        # fixed, sorted row / column orderings
        cusips = sorted(keptCusips)
        ciks = sorted(keptPosLists.keys())
        colOf = {cusip : col for col,cusip in enumerate(cusips)}
        rowOf = {cik : row for row,cik in enumerate(ciks)}
        res = np.zeros((len(ciks), len(cusips)), dtype=dtype)
        nPositions = 0
        for cik,posList in keptPosLists.items() :
            row = rowOf[cik]
            for cusip,_,frac in posList :
                res[row, colOf[cusip]] = frac
                nPositions += 1
        print('total of',nPositions,'positions')
        return ciks,cusips,res

# MMDD of the first day of quarters 1-4, plus the first day of the following
# quarter 1 so that index qNo is always the (exclusive) end of quarter qNo.
qStartEnds = ['0101','0401','0701','1001','0101']
# '-MM-DD' of the last day of quarters 1-4.
qPeriods = ['-03-31','-06-30','-09-30','-12-31']
def getPeriodAndNextQStartEnd(y, qNo) :
    """
    Returns the 13F period date for a given year and quarter number (this is the
    last day in the quarter), along with the start and end dateStrs for the next
    quarter (this is the date range when the 13Fs for this quarter should be filed).
    Quarters are numbered 1-4.

    The result has the form:
        ('YYYY-MM-DD', {'startD': 'YYYYMMDD', 'endD': 'YYYYMMDD'})
    where endD is the first day of the quarter after the filing quarter.

    Raises ValueError if qNo is not 1, 2, 3 or 4. Without this check a qNo of 0
    or below would index the lookup lists from the end and silently return a
    wrong period and date range.
    """
    if qNo not in (1, 2, 3, 4) :
        raise ValueError('quarter number must be 1, 2, 3 or 4, got '+repr(qNo))
    # the filing quarter is the one after (y, qNo), rolling over into the next year
    nextY = y+1 if qNo==4 else y
    nextQNo = 1 if qNo==4 else qNo+1
    return (str(y)+qPeriods[qNo-1],
            {'startD' : str(nextY) + qStartEnds[nextQNo-1],
             # a filing quarter of Q4 ends on Jan 1 of the year after it
             'endD' : str(nextY+1 if nextQNo==4 else nextY) + qStartEnds[nextQNo]})

def getMatrixFor(y, qNo, cutoff=0.0, minInvestorsPerStock=3, minStocksPerInvestor=1) :
    """
    Builds the combined holdings matrix for the 13F period ending in quarter qNo
    (1-4) of year y, using a scraper13F restricted to the following quarter's
    date range as the source of filings.
    cutoff is passed to cikConsolidatedHoldings; minInvestorsPerStock and
    minStocksPerInvestor are passed to its getHoldingsMatrix.
    Returns ciks, cusips, mat as returned by getHoldingsMatrix.
    """
    period, filingWindow = getPeriodAndNextQStartEnd(y,qNo)
    scraper = scraper13F(**filingWindow)
    consolidated = cikConsolidatedHoldings(scraper, period, cutoff=cutoff)
    return consolidated.getHoldingsMatrix(minInvestorsPerStock, minStocksPerInvestor)

Test generating a combined holdings matrix:

In [None]:
# Period / filing-window calculation, including the roll-over into the next year.
assert getPeriodAndNextQStartEnd(2020,1)==(
    '2020-03-31', {'startD': '20200401', 'endD': '20200701'}), "13F qstart/end"
assert getPeriodAndNextQStartEnd(2020,4)==(
    '2020-12-31', {'startD': '20210101', 'endD': '20210401'}), "13F qstart/end"

# Combined matrix from the filings scraped in the scraper test cell above.
c = cikConsolidatedHoldings(s,'2021-03-31')
ciks,cusips,m = c.getHoldingsMatrix(minInvestorsPerStock=1)
assert ciks==['0001325083', '0001867040'], "combined holdings matrix"
assert cusips[:2]==['00289Y107','003279730'], "combined holdings matrix"
assert cusips[-2:]==['G11196105','M40527109'], "combined holdings matrix"
assert m.shape==(2,42), "combined holdings matrix"
assert abs(m[0,0]-0.03502463)<1e-6, "combined holdings matrix"
assert abs(m[1,13]-0.9675410812420477)<1e-6, "combined holdings matrix"

period 2021-03-31 - total of 2 ciks, 2 13F filings
requiring  1 investors per stock, 1 stocks per investor
2 investors, 42 stocks
total of 42 positions


In [None]:
# dailyList.dlCountFilings(startD='20201001',endD='20210101',formClass='13F-HR')

5686

In [None]:
# s = scraper13F(startD='empty')
# s.loadAndUpdate(startD='20201001',endD='20210101')
# s.getCounts()

=====20201231===== =====20201230===== =====20201229===== =====20201228===== =====20201227===== =====20201226===== =====20201225===== =====20201224===== =====20201223===== =====20201222===== =====20201221===== =====20201220===== =====20201219===== =====20201218===== =====20201217===== =====20201216===== =====20201215===== =====20201214===== =====NEW 20201213===== =====NEW 20201212===== =====20201211===== =====20201210===== =====20201209===== =====20201208===== =====20201207===== =====NEW 20201206===== =====NEW 20201205===== =====20201204===== =====20201203===== =====20201202===== =====20201201===== =====20201130===== =====NEW 20201129===== =====NEW 20201128===== =====20201127===== =====NEW 20201126===== =====20201125===== =====20201124===== =====20201123===== =====NEW 20201122===== =====NEW 20201121===== =====20201120===== =====20201119===== =====20201118===== =====20201117===== =====20201116===== =====NEW 20201115===== =====NEW 20201114===== =====20201113===== =====20201112===== =====N

[0001104659-20-114603] [0001611848-20-000007] [0001624729-20-000007] [0001606587-20-001038] [0001630363-20-000006] [0001085146-20-002516] [0001085146-20-002512] [0001635342-20-000004] [0001665359-20-000005] [0001172661-20-002011] [0001672265-20-000006] [0001606587-20-001019] [0001681490-20-000004] [0000870156-20-000033] [0001694284-20-000007] [0001085146-20-002525] [0001698750-20-000006] [0001606587-20-001032] [0001706669-20-000007] [0001713936-20-000005] [0001721608-20-000008] [0001085146-20-002515] [0000870156-20-000034] [0001606587-20-001022] [0001172661-20-002012] [0001606587-20-001024] [0001736260-20-000007] [0001085146-20-002521] [0001740871-20-000004] [0001606587-20-001014] [0001085146-20-002524] [0001085146-20-002513] [0001754960-20-000070] [0001765594-20-000007] [0001606587-20-001017] [0001767617-20-000004] [0001085146-20-002517] [0001606587-20-001021] [0001790525-20-000004] [0001754960-20-000069] [0001798026-20-000007] [0001606587-20-001023] [0001085146-20-002527] [0001085146

[0001509550-20-000004] [0001062993-20-004804] [0001398344-20-019748] [0001398344-20-019749] [0001542108-20-000005] [0001542265-20-000006] [0000909012-20-000129] [0001062993-20-004800] [0001563525-20-000005] [0001623883-20-000008] [0001632554-20-000008] [0001635925-20-000006] [0001653926-20-000004] [0001664017-20-000004] [0001715740-20-000004] [0001580642-20-003670] [0001731601-20-000005] [0001734565-20-000005] [0001799859-20-000004] [0001800513-20-000007] [0001802091-20-000005] [0001806226-20-000004] [0000764106-20-000006] [0000791191-20-000004] [0000811360-20-000004] [0000819864-20-000008] [0001104659-20-112675] [0000950123-20-010097] [0000892914-20-000004] [0001683168-20-003370] =====NEW 20201005===== [0001053054-20-000004] [0001054257-20-000004] [0001214659-20-008396] [0001269134-20-000007] [0001297731-20-000006] [0001370102-20-000010] [0001567619-20-017539] [0001104659-20-112039] [0001563690-20-000004] [0001588873-20-000005] [0001214659-20-008390] [0001601622-20-000004] [0001637246

In [None]:
# ciks,cusips,mat = getMatrixFor(2021, 1, 0.005, 3, 1)

period 2021-03-31 - total of 5911 ciks, 6039 13F filings
CIK 0001731061 1 - 2 [('20210408', '0001104659-21-047884', 829), ('20210506', '0001104659-21-062185', 895)]
CIK 0001386929 1 - 2 [('20210412', '0001386929-21-000005', 79), ('20210414', '0001386929-21-000006', 79)]
CIK 0001105410 1 - 2 [('20210413', '0001214659-21-004137', 50), ('20210421', '0001214659-21-004362', 50)]
CIK 0001566601 1 - 2 [('20210413', '0001566601-21-000005', 764), ('20210414', '0001566601-21-000006', 764)]
CIK 0001573767 1 - 2 [('20210413', '0001573767-21-000006', 49), ('20210505', '0001573767-21-000007', 66)]
CIK 0001840740 1 - 2 [('20210413', '0001840740-21-000005', 51), ('20210414', '0001840740-21-000006', 51)]
CIK 0001053994 1 - 2 [('20210414', '0001085146-21-001185', 184), ('20210419', '0001085146-21-001232', 296)]
CIK 0001381055 2 - 3 [('20210416', '0001381055-21-000002', 101), ('20210505', '0001381055-21-000003', 101), ('20210506', '0001381055-21-000004', 102)]
CIK 0001844568 1 - 2 [('20210416', '00018445

CIK 0001063296 1 - 2 [('20210517', '0000905718-21-000694', 10), ('20210518', '0000905718-21-000710', 10)]
CIK 0001080383 1 - 2 [('20210517', '0000919574-21-003508', 53), ('20210518', '0000919574-21-003778', 55)]
CIK 0001086619 1 - 2 [('20210517', '0001567619-21-010327', 3238), ('20210528', '0001567619-21-011331', 3238)]
CIK 0001119376 1 - 2 [('20210517', '0001567619-21-010351', 38), ('20210518', '0001567619-21-010595', 40)]
CIK 0001164688 1 - 2 [('20210517', '0001172661-21-001214', 16), ('20210521', '0001172661-21-001373', 16)]
CIK 0001224962 1 - 2 [('20210517', '0001012975-21-000229', 138), ('20210610', '0001012975-21-000242', 139)]
CIK 0001332784 1 - 2 [('20210517', '0001567619-21-010483', 12), ('20210623', '0001567619-21-012524', 12)]
CIK 0001387369 1 - 2 [('20210517', '0000919574-21-003641', 39), ('20210517', '0000919574-21-003738', 40)]
CIK 0001393818 1 - 2 [('20210517', '0000950123-21-007113', 500), ('20210519', '0000950123-21-007183', 500)]
CIK 0001423053 1 - 3 [('20210517', '00

In [None]:
# ciks,cusips,mat = getMatrixFor(2020, 4, 0.005, 3, 1)

ERR 0000950123-21-001365
ERR 0000950123-21-001368
ERR 0000950123-21-002850
period 2020-12-31 - total of 5872 ciks, 6034 13F filings
CIK 0001802098 1 - 2 [('20210107', '0001802098-21-000002', 78), ('20210208', '0001802098-21-000002', 78)]
CIK 0001056825 1 - 2 [('20210111', '0001056825-21-000001', 690), ('20210201', '0001056825-21-000002', 690)]
CIK 0001802533 1 - 2 [('20210111', '0001802533-21-000001', 197), ('20210112', '0001802533-21-000002', 197)]
CIK 0001714506 1 - 2 [('20210112', '0001754960-21-000006', 135), ('20210113', '0001754960-21-000011', 135)]
CIK 0001089707 1 - 2 [('20210113', '0001089707-21-000003', 85), ('20210125', '0001089707-21-000006', 87)]
CIK 0001780985 1 - 2 [('20210113', '0001780985-21-000001', 205), ('20210208', '0001780985-21-000003', 128)]
CIK 0001819955 2 - 3 [('20210113', '0001819955-21-000002', 90), ('20210119', '0001819955-21-000004', 92), ('20210217', '0001819955-21-000007', 91)]
CIK 0001599923 1 - 3 [('20210114', '0001085146-21-000111', 75), ('20210119',

CIK 0001677253 1 - 2 [('20210212', '0001677253-21-000001', 253), ('20210218', '0001677253-21-000002', 238)]
CIK 0001697366 1 - 2 [('20210212', '0001104659-21-021634', 186), ('20210223', '0001104659-21-026600', 187)]
CIK 0001698661 1 - 2 [('20210212', '0001213900-21-008583', 2), ('20210317', '0001213900-21-015767', 2)]
CIK 0001771067 1 - 2 [('20210212', '0000919574-21-000930', 9), ('20210216', '0000919574-21-001579', 9)]
CIK 0001837984 1 - 2 [('20210212', '0001837984-21-000001', 76), ('20210311', '0001837984-21-000002', 76)]
CIK 0001843358 1 - 2 [('20210212', '0001606587-21-000310', 116), ('20210323', '0001606587-21-000375', 166)]
CIK 0001844444 1 - 2 [('20210212', '0001844444-21-000001', 93), ('20210212', '0001844444-21-000002', 94)]
CIK 0000873630 1 - 2 [('20210212', '0000873630-21-000001', 7573), ('20210225', '0000873630-21-000002', 7573)]
CIK 0000897599 1 - 2 [('20210212', '0000897599-21-000003', 74), ('20210212', '0000897599-21-000004', 74)]
CIK 0000923338 1 - 2 [('20210212', '0000

In [None]:
#hide
# uncomment and run to regenerate all library Python files
# from nbdev.export import notebook2script; notebook2script()