In [None]:
# default_exp scrape13D

# scrape13D

> Scrape holdings information from 13D SEC filings.

In [None]:
#hide
%load_ext autoreload
%autoreload 2
from nbdev import show_doc

In [None]:
#export

import collections
import itertools
import numpy as np
import os
from pathlib import Path
import re

from secscan import utils, dailyList, basicInfo, infoScraper, scrape13F, scrape13G

# Default directory for scraped 13D info: the 'scraped13D' folder under utils.stockDataRoot.
# Used as the default infoDir of the scraper13D class.
default13DDir = os.path.join(utils.stockDataRoot,'scraped13D')

13D scraper class - scrape holdings information from the SEC filing:

In [None]:
#export

class scraper13D(infoScraper.scraperBase) :
    """
    Scraper for 'SC 13D' filings, built on infoScraper.scraperBase.
    The actual parsing of each filing is delegated to scrape13G.parse13GD.
    """

    def __init__(self, infoDir=default13DDir, startD=None, endD=None, fSuff='m.pkl', **pickle_kwargs) :
        """
        Set up the base scraper for form class 'SC 13D'.

        infoDir, startD, endD, fSuff and any extra pickle_kwargs are forwarded
        unchanged to infoScraper.scraperBase.__init__.
        """
        super().__init__(
            infoDir,
            'SC 13D',
            startD=startD,
            endD=endD,
            fSuff=fSuff,
            **pickle_kwargs
        )

    def scrapeInfo(self, accNo, formType=None) :
        """
        Parse the filing with accession number accNo using scrape13G.parse13GD.

        Returns a 2-tuple (parsed info, None); the meaning of the second slot is
        defined by the base class, which is not visible here.
        """
        parsedInfo = scrape13G.parse13GD(accNo, formType=formType)
        return parsedInfo, None

Test 13D scraper class:

In [None]:
# Start from an empty daily list and load the index for the 20210701 - 20210704 range
# (NOTE: this presumably downloads from the SEC, so the cell needs network access).
dl = dailyList.dailyList(startD='empty')
dl.updateForDays('20210701','20210704')
# The first item returned by getFilingsList for form 'SC 13D' should hold 84 filings for this range.
assert len(dl.getFilingsList(None,'SC 13D')[0])==84,"testing 13D scraper class (daily list count)"
# Parse one known 13D/A filing and pin the extracted CIKs, positions and purpose text.
info = scrape13G.parse13GD('0000921895-21-001173', formType='SC 13D/A')
assert (info['ciks']==['0001165002', '0001461945']
        and info['positions']==[('412,925', '5.0'), ('430,214', '5.2'), ('843,139', '10.2'),
                                ('843,139', '10.2'), ('860,416', '10.4'), ('860,416', '10.4')]
        and info['purpose'].startswith('Item 4 is hereby amended and restated to read')
        and info['purpose'].endswith('Item 4 of Schedule 13D. 8 CUSIP No. 961765104')
    ),"testing 13D scraper class (parsing)"

20210703 WEEKEND 20210702 ### list index 64 filings for 20210702: 6569 * 20210701 filings for 20210701: 5573 * 

Combine 13F, 13G, and 13D filings for a quarter:

In [None]:
#export

def get13GDDatesForQ(y, qNo) :
    """
    Date-range keyword arguments to use for the 13G/13D scrapers for quarter qNo of year y.

    Starts from the keyword dict returned as the second item of
    scrape13F.getPeriodAndNextQStartEnd(y, qNo) and moves its 'startD' entry back
    two years by rewriting the leading four-digit year (startD is assumed to be a
    string that begins with the year, e.g. 'YYYYMMDD' - TODO confirm).
    The same dict object is updated in place and returned.
    """
    _period, dateArgs = scrape13F.getPeriodAndNextQStartEnd(y, qNo)
    origStart = dateArgs['startD']
    yearPart, restPart = origStart[:4], origStart[4:]
    dateArgs['startD'] = f'{int(yearPart)-2}{restPart}'
    return dateArgs

def getCombNSSForQ(y, qNo, minFrac=0.01, maxFrac=1.0,
                   minStocksPerInvestor=3, maxStocksPerInvestor=100,
                   minTop10Frac=0.4, minAUM=None, dtype=np.float64,
                   minInvestorsPerStock=None, maxInvestorsPerStock=None,
                   minAllInvestorsPerStock=None, maxAllInvestorsPerStock=None,
                   cusipNameFilter=lambda cusip,name : name is not None,
                   max13GDBonus=0.2, min13GDBonus=0.02, max13GDCount=50,
                   include13F=True, include13G=False, include13D=False,
                   outsInfoFName='', outDir='ratings') :
    """
    Calculates a matrix of investor holdings for a quarter, based on all 13F filings filed
    during the succeeding quarter, optionally combined (include13G / include13D) with 13G and
    13D filings from the date window given by get13GDDatesForQ (start date moved back two years).

    Return value:
      - If outsInfoFName is None, returns the result of scrape13F.getNSSForQ unchanged:
        mat, ciks, cusips where mat is a matrix of shape (len(ciks), len(cusips))
        in which each row has the fractions held by the corresponding cik in each cusip.
      - Otherwise (including the default ''), mat is divided by 0.01 and capped at 20.0,
        four pickle files are written under utils.stockDataRoot/outDir (or directly under
        utils.stockDataRoot if outDir is None), each named
        f'{outsInfoFName}{y}Q{qNo}<suffix>.pkl' with suffix sInfo, cusipMap, hold13GD, hold13F,
        and a dict is returned with keys 'Y' (the scaled matrix), 'ciks', 'cusips',
        'cusipinfo' (always an empty list) and 'deletedcusips' (the returned cusips whose
        name contains 'DELETED'). The output directory is created if needed.

    If minFrac and/or maxFrac is supplied, restricts to stocks with fraction of
    total portfolio >=minFrac and/or <=maxFrac.

    If minStocksPerInvestor, maxStocksPerInvestor, minTop10Frac or minAUM are specified,
    omits investors with too few stocks, too many stocks, too small a fraction in the
    top 10 holdings, or too small a total stock value.
    If minInvestorsPerStock is specified, restricts to stocks with at least that many investors;
    likewise, maxInvestorsPerStock can be used to give an upper bound.

    Note min/max StocksPerInvestor/InvestorsPerStock limit based on the counts of stocks/investors
    in the returned matrix. If minAllInvestorsPerStock or maxAllInvestorsPerStock is specified,
    this instead restricts based on a count of all investors that have any position in each stock.

    If cusipNameFilter is specified, this should be a function that gets two arguments (cusip and
    name, where name will be None if no name was found in either the SEC 13F CUSIP name index or
    in the CUSIP-CIK correspondence from 13D and 13G forms), and returns True for cusips to keep.

    13GD bonus fractions are 1.0/#positions, but restricted to [min13GDBonus..max13GDBonus]
    If max13GDCount is not None, restricts to investors with at most max13GDCount combined 13G
    and 13D positions.
    """
    # Both of these are passed to the helpers below and saved afterwards;
    # presumably the helpers fill them in place - confirm in scrape13F / scrape13G.
    allCusipCounter = collections.Counter()
    all13FHoldingsMap = {}

    # cikNames.pkl maps cik -> (name, dateStr); only the name is needed here.
    cikNames = utils.loadPklFromDir(dailyList.defaultDLDir, 'cikNames.pkl', {})
    cikNames = {cik: name for cik,(name,dStr) in cikNames.items()}
    cusipNames = utils.pickLoad(os.path.join(utils.stockDataRoot,'13FLists',f'{y}Q{qNo}secCusipMap.pkl'))

    cikBonusMaps = []
    cik13GDSortedPosMap = {}
    if include13G or include13D :
        dates = get13GDDatesForQ(y,qNo)
        scrapedL = []
        if include13G :
            scrapedL.append(scrape13G.scraper13G(**dates))
        if include13D :
            scrapedL.append(scraper13D(**dates))
        cik13GDPosMap = scrape13G.updateCik13GDPos(scrapedL, cusipNames=cusipNames, cikNames=cikNames,
                                                   includeTickers=True)
        cikBonusMaps = [scrape13G.calcBonusMap(cik13GDPosMap,
                                               max13GDBonus=max13GDBonus, min13GDBonus=min13GDBonus,
                                               max13GDCount=max13GDCount, allCusipCounter=allCusipCounter)]
        # For each cik, list its (cusip, pos) pairs sorted by descending pos[2]
        # (position size per the original comment), then by lower-cased name.
        cik13GDSortedPosMap = {
            cik: sorted(posMap.items(),
                        key=lambda x : (-x[1][2], cusipNames.get(x[0],'CUSIP-'+x[0]).lower()))
            for cik,posMap in cik13GDPosMap.items()
        }

    res = scrape13F.getNSSForQ(y, qNo, minFrac=minFrac, maxFrac=maxFrac,
                               minStocksPerInv=minStocksPerInvestor,
                               maxStocksPerInv=maxStocksPerInvestor,
                               minTop10Frac=minTop10Frac, minAUM=minAUM, dtype=dtype,
                               minInvestorsPerStock=minInvestorsPerStock,
                               maxInvestorsPerStock=maxInvestorsPerStock,
                               minAllInvestorsPerStock=minAllInvestorsPerStock,
                               maxAllInvestorsPerStock=maxAllInvestorsPerStock,
                               allCusipCounter=allCusipCounter,
                               cusipFilter=lambda cusip : cusipNameFilter(cusip,cusipNames.get(cusip)),
                               extraHoldingsMaps=cikBonusMaps, include13F=include13F,
                               all13FHoldingsMap=all13FHoldingsMap)
    if outsInfoFName is None :
        # No output files requested: hand back the raw (mat, ciks, cusips) result.
        return res

    outDir = Path(utils.stockDataRoot) if outDir is None else Path(utils.stockDataRoot)/outDir
    # parents/exist_ok: also works when intermediate directories are missing, and avoids
    # the check-then-create race of a separate exists() test.
    outDir.mkdir(parents=True, exist_ok=True)

    mat, ciks, cusips = res
    # Rescale by 1/0.01 (i.e. x100) and cap each entry at 20.0.
    mat /= 0.01
    mat = np.minimum(mat,20.0)
    res = {'Y': mat, 'ciks': ciks, 'cusips': cusips, 'cusipinfo': [],
           'deletedcusips': set(cusip for cusip in cusips
                                if 'DELETED' in cusipNames.get(cusip,''))}
    # NOTE(review): protocol=2 / fix_imports=True presumably keep the pickles readable
    # by older Python consumers - confirm before changing.
    filePrefix = f'{outsInfoFName}{y}Q{qNo}'
    utils.pickSave(outDir/f'{filePrefix}sInfo.pkl', res,
                   fix_imports=True, protocol=2)
    # Only keep names for cusips that were actually counted.
    utils.pickSave(outDir/f'{filePrefix}cusipMap.pkl',
                   dict((cusip, name)
                        for cusip,name in cusipNames.items() if cusip in allCusipCounter),
                   fix_imports=True, protocol=2)
    utils.pickSave(outDir/f'{filePrefix}hold13GD.pkl', cik13GDSortedPosMap,
                   fix_imports=True, protocol=2)
    utils.pickSave(outDir/f'{filePrefix}hold13F.pkl', all13FHoldingsMap,
                   fix_imports=True, protocol=2)
    return res

def oddballScreen(yNo, qNo) :
    """
    Screens for stocks that only one investor reports holding.
    These will mostly be mistakes, so I try to remove the mistakes using cusipNameFilter:
    only cusips present in the global cusipMap.pkl, and whose name there does not
    contain 'DELETED', are kept.

    Returns the list of names (from cusipMap.pkl) of the cusips that pass the screen.
    """
    cusipNames = utils.pickLoad(os.path.join(utils.stockDataRoot,'cusipMap.pkl'))

    def keepCusip(cusip, name) :
        # The name supplied by getCombNSSForQ is ignored on purpose; the decision is
        # based on the cusipMap.pkl loaded above.
        return cusip in cusipNames and 'DELETED' not in cusipNames[cusip]

    # getCombNSSForQ takes the filter as cusipNameFilter (two arguments), not cusipFilter.
    # outsInfoFName=None makes it return the plain (mat, ciks, cusips) tuple instead of
    # writing output files and returning a dict.
    mat, ciks, cusips = getCombNSSForQ(yNo, qNo, max13GDCount=50,
                                       include13F=True, include13G=True, include13D=True,
                                       minAllInvestorsPerStock=None, maxAllInvestorsPerStock=1,
                                       cusipNameFilter=keepCusip,
                                       outsInfoFName=None)
    return [cusipNames[cusip] for cusip in cusips]

In [None]:
# hide

# get13GDDatesForQ(2021,2)

# for y,qNo in itertools.islice(itertools.product([2018,2019,2020,2021],[1,2,3,4]),13,14) :
#     getCombNSSForQ(y, qNo, outDir='ratingsNGD',
#                        include13D=True,include13G=True,include13F=True)

No event date in 0000903064-19-000010; using 2019-06-25
No event date in 0000903064-19-000009; using 2019-06-25
No CUSIP in 0001085146-19-001821
No CUSIP in 0000215457-19-008358
No event date in 0000315066-19-001607; using 2019-07-03
No event date in 0000315066-19-001596; using 2019-07-03
No event date in 0000315066-19-001608; using 2019-07-03
No event date in 0000315066-19-001612; using 2019-07-03
No CUSIP in 0000215457-19-008097
No CUSIP in 0000215457-19-008290
No CUSIP in 0000215457-19-008296
No CUSIP in 0000215457-19-008308
No CUSIP in 0000215457-19-008334
No CUSIP in 0000215457-19-008338
No event date in 0000315066-19-001605; using 2019-07-03
No event date in 0000315066-19-001613; using 2019-07-03
No event date in 0000315066-19-001600; using 2019-07-03
No CUSIP in 0001422848-19-000128
No CUSIP in 0001422848-19-000130
No event date in 0000315066-19-001598; using 2019-07-03
No event date in 0000315066-19-001594; using 2019-07-03
No event date in 0000315066-19-001606; using 2019-07-0

No CUSIP in 0001422849-20-000124
No event date in 0001567619-20-003950; using 2020-02-07
No CUSIP in 0001104659-20-020535
*** No positions found in 0001436126-20-000003
No event date in 0001140361-20-003345; using 2020-02-07
No event date in 0001104659-20-021444; using 2020-02-07
No event date in 0001104659-20-021447; using 2020-02-07
missing or ambiguous subject CIK '0001484737-20-000002'
*** No positions found in 0001493152-20-002361
No event date in 0001193125-20-036782; using 2020-02-07
No event date in 0001193125-20-036786; using 2020-02-07
*** ERROR in  0001520023-20-000002
*** ERROR in  0001520023-20-000003
No event date in 0000807985-20-000007; using 2020-02-07
No event date in 0001213900-20-003906; using 2020-02-07
No CUSIP in 0001562230-20-000036
No CUSIP in 0001562230-20-000007
No CUSIP in 0001562230-20-000011
No CUSIP in 0001562230-20-000031
No CUSIP in 0001562230-20-000042
No event date in 0001437749-20-002761; using 2020-02-07
missing or ambiguous subject CIK '0000943374-

No event date in 0001137774-21-000019; using 2021-02-02
No event date in 0001137774-21-000020; using 2021-02-02
No event date in 0001137774-21-000021; using 2021-02-02
No event date in 0001137774-21-000022; using 2021-02-02
No event date in 0001137774-21-000023; using 2021-02-02
No event date in 0001137774-21-000024; using 2021-02-02
No event date in 0001137774-21-000025; using 2021-02-02
No event date in 0001137774-21-000026; using 2021-02-02
No event date in 0001137774-21-000027; using 2021-02-02
No event date in 0001137774-21-000010; using 2021-02-02
No event date in 0001137774-21-000011; using 2021-02-02
No event date in 0001137774-21-000012; using 2021-02-02
No event date in 0001137774-21-000013; using 2021-02-02
No event date in 0001137774-21-000014; using 2021-02-02
No event date in 0001137774-21-000015; using 2021-02-02
No event date in 0001567619-21-002615; using 2021-02-02
No event date in 0001567619-21-002623; using 2021-02-02
No event date in 0001567619-21-002641; using 202

No event date in 0001193125-21-041533; using 2021-02-05
No event date in 0001193125-21-041582; using 2021-02-05
No event date in 0001193125-21-041614; using 2021-02-05
No event date in 0001193125-21-041640; using 2021-02-05
No event date in 0001104659-21-024016; using 2021-02-09
missing or ambiguous subject CIK '0000711669-21-000019'
missing or ambiguous subject CIK '0000860413-21-000039'
*** No positions found in 0001193125-21-046579
No event date in 0001185185-21-000234; using 2021-02-10
*** No positions found in 0001193125-21-046786
*** No positions found in 0001193125-21-046773
*** No positions found in 0001193125-21-046782
*** No positions found in 0001193125-21-046778
*** No positions found in 0000921669-21-000008
*** No positions found in 0001193125-21-046774
*** No positions found in 0001193125-21-046779
*** No positions found in 0001193125-21-046803
No CUSIP in 0001104659-21-025277
No event date in 0001104659-21-026270; using 2021-02-15
No CUSIP in 0000929638-21-000404
No CUSI

count1 6309 count2 2829
total of 7627 ciks, 57886 13G/D filings
min stock fraction of portfolio 0.01
max stock fraction of portfolio 1.0
min fraction of portfolio in top 10 positions 0.4
period 2021-06-30 - total of 5949 ciks, 6110 13F filings
CIK 1755670 1 - 2 [('20210708', '0001755670-21-000004', 396), ('20210804', '0001755670-21-000005', 509)]
CIK 1842560 1 - 2 [('20210709', '0001842560-21-000003', 332), ('20210709', '0001842560-21-000004', 332)]
CIK 1353395 1 - 2 [('20210712', '0001085146-21-001901', 260), ('20210713', '0001085146-21-001945', 154)]
CIK 1092903 1 - 2 [('20210713', '0001096906-21-001592', 708), ('20210805', '0001096906-21-001852', 708)]
CIK 1026720 1 - 2 [('20210715', '0001012975-21-000271', 24), ('20210831', '0001012975-21-000336', 24)]
CIK 1840740 1 - 2 [('20210720', '0001840740-21-000007', 52), ('20210720', '0001840740-21-000008', 52)]
CIK 1609674 1 - 2 [('20210721', '0001609674-21-000004', 175), ('20210816', '0001609674-21-000005', 176)]
CIK 1666786 1 - 2 [('2021

CIK 1318757 0 - 2 [('20210813', '0001318757-21-000006', 2650), ('20210825', '0001325091-21-000014', 327)]
CIK 1325091 1 - 3 [('20210813', '0001325091-21-000010', 2041), ('20210825', '0001325091-21-000012', 2055), ('20210825', '0001325091-21-000013', 135)]
CIK 1388736 1 - 2 [('20210813', '0001388736-21-000003', 350), ('20210813', '0001388736-21-000004', 539)]
CIK 1440771 1 - 2 [('20210813', '0001567619-21-015505', 29), ('20210816', '0001567619-21-015565', 26)]
CIK 1451531 1 - 2 [('20210813', '0001451531-21-000004', 570), ('20210819', '0001451531-21-000005', 566)]
CIK 1483339 1 - 2 [('20210813', '0001483339-21-000004', 141), ('20210813', '0001483339-21-000005', 145)]
CIK 1499066 2 - 3 [('20210813', '0001172661-21-001687', 38), ('20210816', '0001172661-21-001886', 4), ('20210816', '0001172661-21-001887', 42)]
CIK 1512858 1 - 2 [('20210813', '0001512858-21-000003', 46), ('20210826', '0001512858-21-000004', 46)]
CIK 1519964 0 - 2 [('20210813', '0001325091-21-000011', 208), ('20210825', '000

In [None]:
# hide
# assorted test code: 

# dl = dailyList.dailyList(startD='20200101')

# s = scraper13D(startD='empty')
# s.updateForDays(dl,startD='20210702',endD='20210703')
# print(); s.printCounts()

# dailyList.dlCountFilings(startD='20210726',endD='20210731',formClass='13F-HR',noAmend=False)

# accNo = '0001174947-20-001195'
# i = scrape13G.parse13GD(accNo, formType='SC 13D')
# i

# for accNo,info in s.infoMap['20210702'].items() :
#     print(accNo,info['positions'])

# b = utils.downloadSecUrl('',toFormat='souptext')
# utils.secBrowse('0001104659-21-079401')
# scrape13G.parse13GD('0001171520-19-000289')

In [None]:
#hide
# uncomment and run to regenerate all library Python files
# from nbdev.export import notebook2script; notebook2script()