In [None]:
# default_exp utils

# utils

> Assorted low-level utilities for a flexible SEC filings scanner.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import datetime
import gzip
import os
import pickle
import re
from pytz import timezone
import requests
import time

We store all scraped data under stockDataRoot.

In [None]:
#export
# Root directory for all scraped data: the secData2 folder under the current user's home directory.
stockDataRoot = os.path.join(os.path.expanduser('~'), 'secData2')

In [None]:
# Show where scraped data will be stored (the value depends on the current user's home directory).
stockDataRoot

'/home/ubuntu/secData2'

We download SEC data using the requests library.
We retry a few times in case of a temporary internet glitch,
and also recognize an SEC-specific temporary outage message and raise an Exception for it
so that we can flag the problem and retry later.

In [None]:
#export

def requestUrl(url, timeout=5.0, nTries=5, returnText=False, **kwargs) :
    """
    Downloads a URL using the requests package, retrying on failure.

    Makes up to nTries attempts with requests.get, passing timeout through and
    sending any extra keyword arguments as the request's params.
    An error HTTP status counts as a failure (via raise_for_status).
    Every failure is printed; the exception from the last attempt is re-raised.
    Returns the response text if returnText is true, otherwise the response object.
    If nTries is not positive, no request is made and None is returned.
    """
    for attempt in range(1, nTries+1) :
        try :
            response = requests.get(url, timeout=timeout, params=kwargs)
            response.raise_for_status()
            if returnText :
                return response.text
            return response
        except Exception as err :
            print('Error','downloading',url,'-',err)
            # Out of attempts: let the most recent error propagate to the caller.
            if attempt >= nTries :
                raise

# Prefix prepended to every SEC sub-URL passed to downloadSecUrl.
secUrlPref = 'https://www.sec.gov'
# Case-insensitive pattern for the text that marks a temporary outage page.
pageUnavailableRE = re.compile('page is temporarily unavailable', flags=re.IGNORECASE)
def downloadSecUrl(secSubUrl) :
    """
    Downloads secUrlPref+secSubUrl via requestUrl and returns the page text.
    Raises an Exception ('temporary SEC outage') if the text contains
    the SEC-specific temporary outage message.
    """
    pageText = requestUrl(secUrlPref+secSubUrl).text
    if pageUnavailableRE.search(pageText) is None :
        return pageText
    raise Exception('temporary SEC outage')

Test downloading from the SEC website:

In [None]:
# Smoke test against the live SEC site (needs network access): fetch the site root
# (empty sub-URL) and check that the page mentions the agency's name.
t = downloadSecUrl('')
assert 'securities and exchange' in t.lower()

We store scraped data in pickled format,
either storing an object in a single pickled file
or storing a dict by saving one file per key
(for example, one file per date).
We can optionally use gzip compression (smaller files, but slower to read).

In [None]:
#export

def openFp(fpath, mode, use_gzip) :
    """
    Opens fpath in the given mode and returns the resulting file object.
    Uses gzip.open if use_gzip is true, and the built-in open otherwise.
    """
    if use_gzip :
        return gzip.open(fpath, mode)
    return open(fpath, mode)

def pickSave(fpath, ob, use_gzip=False, **kwargs) :
    """
    Pickles ob into the file at fpath, optionally using gzip compression.
    The file is opened for binary writing through openFp;
    any extra keyword arguments are forwarded to pickle.dump.
    """
    with openFp(fpath, mode='wb', use_gzip=use_gzip) as outFile :
        pickle.dump(ob, outFile, **kwargs)

def pickLoad(fpath, use_gzip=False) :
    """
    Reads and returns the pickled object stored in the file at fpath,
    optionally using gzip decompression. The file is opened through openFp.
    """
    # pickle.load can execute arbitrary code, so only use this on files we wrote ourselves.
    with openFp(fpath, mode='rb', use_gzip=use_gzip) as inFile :
        loaded = pickle.load(inFile)
    return loaded

def pickLoadIfPath(path_or_ob) :
    """
    Accepts either a path or an already loaded object.
    A str argument is treated as a path and the object pickled there is loaded
    (uncompressed) and returned; anything else is returned unchanged.
    """
    if not isinstance(path_or_ob, str) :
        return path_or_ob
    return pickLoad(path_or_ob)

Test pickled data storage in single files:

In [None]:
# Round-trip a small dict of reproducible pseudo-random floats (seeded RNG) through
# pickSave / pickLoad, first uncompressed and then gzip-compressed; the second save
# overwrites the first file. test_rand is reused by the later test cells.
import random
rng = random.Random(42)
test_rand = dict((f'r{i}', rng.random()) for i in range(10))
pickSave('test.pkl', test_rand)
assert test_rand == pickLoad('test.pkl')
pickSave('test.pkl', test_rand, use_gzip=True)
assert test_rand == pickLoad('test.pkl', use_gzip=True)
# NOTE(review): the reason for this pause before cleanup isn't evident from the code - confirm it is needed.
time.sleep(1)
# Clean up the file written to the current working directory.
os.unlink('test.pkl')

In [None]:
#export

def savePklToDir(toDir, fName, ob, use_gzip=False) :
    """
    Saves a pickled object to a file under a directory, optionally using gzip compression.
    Creates the directory (including missing parents) if it doesn't exist.
    An empty toDir means the current directory, so nothing is created in that case.
    Raises OSError if the directory can't be created or the file can't be written.
    """
    # exist_ok=True avoids failing if the directory gets created by something else
    # between an existence check and the creation; os.makedirs('') would raise,
    # so skip the creation for an empty toDir.
    if toDir :
        os.makedirs(toDir, exist_ok=True)
    fPath = os.path.join(toDir, fName)
    pickSave(fPath, ob, use_gzip=use_gzip)

def loadPklFromDir(fromDir, fName, defaultVal, use_gzip=False) :
    """
    Loads the pickled object stored in file fName under directory fromDir,
    optionally using gzip decompression.
    If that file doesn't exist, returns defaultVal instead.
    """
    fPath = os.path.join(fromDir, fName)
    if not os.path.exists(fPath) :
        return defaultVal
    return pickLoad(fPath, use_gzip=use_gzip)

Test pickled data storage under directory:

In [None]:
# Save test_rand into the testdirpkl directory (savePklToDir creates it if missing),
# check that loading gives back the same dict, then remove the file and the directory.
savePklToDir('testdirpkl','test.pkl', test_rand)
assert test_rand == loadPklFromDir('testdirpkl','test.pkl',None)
# NOTE(review): the reason for this pause before cleanup isn't evident from the code - confirm it is needed.
time.sleep(1)
os.unlink(os.path.join('testdirpkl','test.pkl'))
os.rmdir('testdirpkl')

In [None]:
#export

def saveSplitPklToDir(m, toDir, fSuff='m.pkl', dirtyMap=None) :
    """
    Saves a dict with str keys to a separate file for each key.
    The file for key k is named k+fSuff and is written under toDir,
    which is created (including missing parents) if it doesn't exist.
    If dirtyMap is True, saves all keys.
    If dirtyMap is None (default), saves only keys that don't yet have a file saved.
    Otherwise, also saves keys k for which dirtyMap.get(k) is true.
    Keys are processed in sorted order.
    """
    # exist_ok=True avoids a check-then-create race; os.makedirs('') would raise,
    # so an empty toDir (the current directory) is left alone.
    if toDir :
        os.makedirs(toDir, exist_ok=True)
    for k in sorted(m) :
        fPath = os.path.join(toDir, k+fSuff)
        # A key is written if everything is dirty, if it has no file yet,
        # or if the dirty map flags it specifically.
        needToSave = (dirtyMap is True
                      or not os.path.exists(fPath)
                      or (dirtyMap is not None and bool(dirtyMap.get(k))))
        if needToSave :
            pickSave(fPath, m[k])

def loadSplitPklFromDir(fromDir, startK=None, endK=None, fSuff='m.pkl') :
    """
    Loads a pickled dict with str keys stored with a separate file for each key,
    optionally restricting to keys in [startK .. endK)
    Only files under fromDir whose names end with fSuff are considered;
    the key is the file name with that suffix removed.
    Returns an empty dict if fromDir doesn't exist.
    Keys are inserted in sorted file-name order.
    """
    m = {}
    if not os.path.exists(fromDir) :
        return m
    for fName in sorted(os.listdir(fromDir)) :
        if not fName.endswith(fSuff) :
            continue
        # Strip the suffix by length from the front: fName[:-len(fSuff)] would
        # turn into fName[:0] == '' for an empty suffix and collapse all keys.
        fPref = fName[:len(fName)-len(fSuff)]
        if startK is not None and fPref < startK :
            continue
        if endK is not None and endK <= fPref :
            continue
        m[fPref] = pickLoad(os.path.join(fromDir, fName))
    return m

Test pickled dict storage split by key:

In [None]:
# Save test_rand with one file per key, then check that both a full load and a load
# restricted to keys in ['r3' .. 'r7') give back the expected items.
# Only the default dirtyMap=None path of saveSplitPklToDir is exercised here.
saveSplitPklToDir(test_rand, 'testsplitpkl')
assert test_rand == loadSplitPklFromDir('testsplitpkl')
test_sub = dict((k,v) for k,v in test_rand.items() if 'r3'<=k<'r7')
assert test_sub == loadSplitPklFromDir('testsplitpkl',startK='r3',endK='r7')
# NOTE(review): the reason for this pause before cleanup isn't evident from the code - confirm it is needed.
time.sleep(1)
# Clean up: remove the per-key files (key + the default 'm.pkl' suffix), then the directory.
for k in test_rand.keys() :
    os.unlink(os.path.join('testsplitpkl',k+'m.pkl'))
os.rmdir('testsplitpkl')

We use the current Eastern US time to control when to check for SEC filings.

In [None]:
# pytz time zone object for 'US/Eastern', used by curEasternUSTime.
# NOTE(review): unlike the other cells that define utilities, this cell has no #export
# marker - confirm whether these definitions are meant to be part of the exported module.
easternUSTimeZone = timezone('US/Eastern')
def curEasternUSTime() :
    "Returns the current time as a timezone-aware datetime in easternUSTimeZone."
    return datetime.datetime.now(tz=easternUSTimeZone)

In [None]:
# Show the current Eastern US time as an ISO 8601 string (includes the UTC offset).
curEasternUSTime().isoformat()

'2021-06-03T12:21:12.877201-04:00'

In [None]:
def printSamp(m,n=10) :
    """
    Prints a sample of n items from object m , where m is a list or dict;
    for other objects just prints the whole thing.

    For a list, each of the first n items is printed after its index;
    for a dict, each of the first n keys (in the dict's iteration order) is
    printed followed by its value. Items and values are printed recursively
    with the same n, so nested lists and dicts are sampled too.
    """
    if isinstance(m,list) :
        for i,item in enumerate(m[:n]) :
            print(i,end=' ')
            printSamp(item,n)
    elif isinstance(m,dict) :
        # Dict key views can't be sliced in Python 3, so materialize the keys first.
        for k in list(m)[:n] :
            print(k,end=' ')
            printSamp(m[k],n)
    else :
        print(m)