In [None]:
# default_exp cikFInfo

# cikFInfo

> Save parsed form info for each CIK in a separate JSON-format file.

In [None]:
#hide
%load_ext autoreload
%autoreload 2
from nbdev import show_doc

In [None]:
#export

import collections
import json
import os

from secscan import utils, dailyList
from secscan import scrape13F, scrape8K, scrape6K, scrape13G, scrape13D, scrape4

# Root directory under which the per-CIK JSON files are stored.
defaultCikFInfoDir = os.path.join(utils.stockDataRoot, 'cikFInfo')

# Every scraper class from the scrape* modules imported above.
allScraperClasses = [
    scrape13F.scraper13F,
    scrape8K.scraper8K,
    scrape6K.scraper6K,
    scrape13G.scraper13G,
    scrape13D.scraper13D,
    scrape4.scraper4,
]

# Number of leading CIK characters used to name the subdirectory holding a CIK's file.
cikFPrefLen = 4

Save parsed form info for each CIK in a separate JSON-format file.
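The format is designed so that info for additional forms can simply be appended to the file: each file holds the members of a single JSON object, without the enclosing braces, and every appended chunk ends with a comma.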

In [None]:
#export

def getCikFInfoDirAndPath(cik, cikFInfoDir=defaultCikFInfoDir) :
    """
    Returns a (directory, file path) pair locating the JSON file for a CIK.

    The directory is cikFInfoDir joined with the first cikFPrefLen characters
    of the CIK; the file is named <cik>.json inside that directory.
    Nothing is created or checked on disk.

    Raises ValueError unless cik has at least two characters, consists
    only of digits, and does not start with 0.
    """
    cikLooksValid = len(cik)>=2 and cik.isdigit() and cik[0]!='0'
    if not cikLooksValid :
        raise ValueError(f'invalid CIK "{cik}"')
    prefixDir = os.path.join(cikFInfoDir, cik[:cikFPrefLen])
    fileName = cik+'.json'
    return prefixDir, os.path.join(prefixDir, fileName)

def jsonValError(msg, s) :
    """
    Builds (but does not raise) a ValueError with the message msg + ' in ' + s.

    If s is longer than 200 characters, only its first and last 100 characters
    are shown, separated by ' ... ', to keep the message readable.
    """
    shown = s if len(s) <= 200 else s[:100] + ' ... ' + s[-100:]
    return ValueError(msg + ' in ' + shown)

def loadCikFInfo(cik, cikFInfoDir=defaultCikFInfoDir) :
    """
    Loads everything saved for a CIK and returns it as a dict.

    The file is expected to contain JSON object members without the enclosing
    braces, ending with a comma (the layout written by saveCikFInfo). The final
    comma is dropped and the text is wrapped in {} before being parsed.

    Returns {} if the file does not exist, or exists but contains only whitespace.
    Raises ValueError if the (non-empty) content does not end with a comma;
    json.loads raises json.JSONDecodeError (a ValueError subclass) on malformed content.
    """
    fPath = getCikFInfoDirAndPath(cik, cikFInfoDir)[1]
    if not os.path.exists(fPath) :
        return {}
    with open(fPath,'r',encoding='ascii') as f :
        s = f.read().strip()
    if not s :
        # An empty file holds no entries; without this guard s[-1] raises IndexError.
        return {}
    if s[-1] != ',' :
        raise jsonValError('missing ending ,', s)
    return json.loads('{'+s[:-1]+'}')

def saveCikFInfo(cik, cikFInfo, removeDups=False, cikFInfoDir=defaultCikFInfoDir) :
    """
    Appends the entries of the dict cikFInfo to the JSON file for a CIK.

    The dict is serialized with json.dumps, the enclosing braces are removed,
    and the result is appended to the file followed by a comma and a newline,
    so that later calls can keep appending to the same file.

    If removeDups is true, the existing file is loaded first and only entries
    whose keys are not already present are written.
    Nothing is written (and no directory is created) if there is nothing to save.

    Raises ValueError if the serialized form is not enclosed in {}.
    """
    if removeDups :
        existingCikFInfo = loadCikFInfo(cik, cikFInfoDir=cikFInfoDir)
        cikFInfo = {k:v for k,v in cikFInfo.items() if k not in existingCikFInfo}
    if len(cikFInfo) == 0 :
        return
    s = json.dumps(cikFInfo, indent=0).strip()
    if s[0]!='{' or s[-1]!='}' :
        raise jsonValError('missing start/end {}', s)
    fDir, fPath = getCikFInfoDirAndPath(cik, cikFInfoDir)
    # exist_ok avoids the check-then-create race if another process
    # creates the same prefix directory concurrently.
    os.makedirs(fDir, exist_ok=True)
    with open(fPath,'a',encoding='ascii') as f :
        # Write the chunk and its terminating comma in a single call.
        f.write(s[1:-1] + ',\n')

def saveAllCikFInfo(startD, endD, scraperClasses,
                    removeDups=True, cikFInfoDir=defaultCikFInfoDir,
                    ciks=None) :
    """
    Collects form info from the given scraper classes and saves it per CIK.

    A dailyList and one scraper instance per class are created for the range
    startD..endD; each scraper's addToCikInfoMap is called with the daily list,
    a shared map, and ciks. Every (cik, info) item of that map is then passed to
    saveCikFInfo, except that when ciks is not None, CIKs not in ciks are skipped.

    NOTE(review): the map is presumably keyed by CIK string with a dict of form
    info as value - confirm against the scraper classes' addToCikInfoMap.
    """
    dList = dailyList.dailyList(startD=startD, endD=endD)
    infoByCik = {}
    for scraperCls in scraperClasses :
        scraperCls(startD=startD, endD=endD).addToCikInfoMap(dList, infoByCik, ciks=ciks)
    for cik,info in infoByCik.items() :
        if ciks is None or cik in ciks :
            saveCikFInfo(cik, info, removeDups=removeDups, cikFInfoDir=cikFInfoDir)

Code to check CIK format and figure out the right prefix length:

In [None]:
def checkCiks() :
    """
    Prints diagnostics about the CIKs found in dlMaps/cikNames.pkl:
    those starting with 0, those shorter than 4 characters, and, for prefix
    lengths 3 and 4, how many distinct prefixes (folders) there are and the
    largest number of CIKs (files) sharing one prefix.
    """
    cikNames = utils.pickLoad(os.path.join(utils.stockDataRoot,'dlMaps','cikNames.pkl'))
    leadingZero = [cik for cik in cikNames if cik[0]=='0']
    print('ciks with leading 0', leadingZero)
    tooShort = [cik for cik in cikNames if len(cik)<4]
    print('less than 4 long', tooShort)
    for prefLen in (3,4) :
        # Group the CIKs by their first prefLen characters.
        buckets = collections.defaultdict(list)
        for cik in cikNames :
            buckets[cik[:prefLen]].append(cik)
        nFolders = len(buckets)
        maxFiles = max(len(group) for group in buckets.values())
        print(f'prefix length {prefLen}: {nFolders} folders,'
              +f' max {maxFiles} files')
# checkCiks()
# OUTPUT: 
# ciks with leading 0 []
# less than 4 long ['63']
# prefix length 3: 773 folders, max 9154 files
# prefix length 4: 4003 folders, max 933 files
# - chose prefix length 4

In [None]:
# saveAllCikFInfo('20221001','2024',allScraperClasses)
# saveAllCikFInfo('20220701','20221001',allScraperClasses)

In [None]:
# saveAllCikFInfo('20221001','2024',[scrape13F.scraper13F,scrape8K.scraper8K],ciks=['1553733','33533','1634379'])
# mm = loadCikFInfo('1634379')
# mm.keys()

In [None]:
# saveAllCikFInfo('20220701','20221001',[scrape13F.scraper13F,scrape8K.scraper8K],ciks=['1553733','33533','1634379'])
# mm2 = loadCikFInfo('1634379')
# mm2.keys()

Test the CIK file info functions:

In [None]:
import tempfile

# Path computation touches no files, so it can be checked against the default directory.
fDir, fPath = getCikFInfoDirAndPath('12345')
assert fDir.endswith('12345'[:cikFPrefLen]) and fPath.endswith('12345.json')

# Invalid CIKs (too short, leading 0, non-digit) must be rejected.
for badCik in ['', '7', '0123', '12a4'] :
    try :
        getCikFInfoDirAndPath(badCik)
    except ValueError :
        pass
    else :
        assert False, f'expected ValueError for CIK "{badCik}"'

# Run load/save in a throwaway directory so the result does not depend on
# whatever has already been saved under the default data directory.
with tempfile.TemporaryDirectory() as tmpDir :
    assert loadCikFInfo('123', cikFInfoDir=tmpDir)=={}
    saveCikFInfo('12345', {'a':{'x':1}}, cikFInfoDir=tmpDir)
    assert loadCikFInfo('12345', cikFInfoDir=tmpDir)=={'a':{'x':1}}
    # With removeDups, the already-saved key 'a' is skipped and only 'b' is appended.
    saveCikFInfo('12345', {'a':{'x':2},'b':[1,2]}, removeDups=True, cikFInfoDir=tmpDir)
    assert loadCikFInfo('12345', cikFInfoDir=tmpDir)=={'a':{'x':1},'b':[1,2]}

In [None]:
#hide
# uncomment and run to regenerate all library Python files
# from nbdev.export import notebook2script; notebook2script()