## Crawl SEC Data

In [1]:
import os
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
import time

### Get Ticker to CIK mapping

In [6]:
from typing import Dict, Optional

def dummy_sec_request_headers(user_agent: Optional[str] = None) -> Dict[str, str]:
    """Build the HTTP headers sent with each SEC request.

    Args:
        user_agent: Value for the ``User-Agent`` header. When ``None``, a
            placeholder string is used instead.

    Returns:
        A dict holding the ``User-Agent`` and ``Accept-Encoding`` headers.
    """
    placeholder_agent = 'Sample Company Name AdminContact@<sample company domain>.com'
    headers: Dict[str, str] = {}
    headers['User-Agent'] = placeholder_agent if user_agent is None else user_agent
    headers['Accept-Encoding'] = 'gzip, deflate'
    return headers
# Headers reused for every SEC request in this notebook; the User-Agent
# string carries a contact name and e-mail address.
request_headers = dummy_sec_request_headers("Vernicosa Investment Club vernicosa.retrievers@slmail.me")

In [10]:
# Ticker lookup table. ticker_to_cik() indexes it by ticker label and reads
# the "cik_str" field, so the JSON file is presumably keyed by ticker --
# verify against the data file.
company_tickers = pd.read_json("data/company_tickers.json")

In [11]:
def ticker_to_cik(ticker):
    """Return the CIK for *ticker* as a zero-padded 10-character string.

    The lookup is done against the module-level ``company_tickers`` table,
    indexed by ticker label.

    Args:
        ticker: Index label to look up in ``company_tickers``.

    Returns:
        The ``cik_str`` value of the matching row, converted to ``str`` and
        left-padded with zeros to 10 characters, or ``None`` when the ticker
        is unknown or does not resolve to exactly one row.
    """
    try:
        row = company_tickers.loc[ticker]
    except KeyError:
        return None

    # A label that occurs more than once makes ``.loc`` return a DataFrame
    # rather than a Series; such ambiguous tickers are treated as unresolved.
    if not isinstance(row, pd.Series):
        return None
    return str(row.loc["cik_str"]).zfill(10)

In [12]:
# Quick check of the lookup with a single ticker.
ticker_to_cik("AAPL")

'0000320193'

### Get relevant index companies

In [13]:
def clean_ticker(ticker):
    """Return the part of *ticker* that precedes its first space."""
    symbol, *_ = ticker.split(" ", 1)
    return symbol
# Load the index constituents from the semicolon-separated CSV and strip
# each ticker down to its first space-delimited token.
companies = pd.read_csv("data/spx.csv", sep=";")
companies["Ticker"] = companies["Ticker"].apply(clean_ticker)

# Attach the CIK for every ticker and discard rows where no CIK was found.
companies["cik"] = companies["Ticker"].apply(ticker_to_cik)
companies = companies.dropna(subset=["cik"])

# Fixed seed so the same 100 companies are drawn on every run.
companies = companies.sample(n=100, random_state=1337).copy()

### Build request pipeline for 10-K filings

In [14]:
def _fetch_sec_json(url, request_headers, timeout=30):
    """GET *url* with the given headers and return the decoded JSON body."""
    response = requests.get(url, headers=request_headers, timeout=timeout)
    # Surface HTTP errors directly instead of failing later with an opaque
    # JSON decode error or KeyError on an error page.
    response.raise_for_status()
    return response.json()


def get_annual_report_list(cik, request_headers, as_of):
    """List a company's 10-K filings with a report date after *as_of*.

    Fetches the company's submissions JSON, then every additional filing
    page listed under ``filings.files``, and combines them into one table.

    Args:
        cik: CIK string inserted into the submissions URL
            (``.../submissions/CIK{cik}.json``).
        request_headers: HTTP headers sent with every request.
        as_of: Cut-off date; only filings whose ``reportDate`` is strictly
            later are kept. Anything ``pd.to_datetime`` accepts.

    Returns:
        A tuple ``(accession_numbers, annual_reports)`` where
        ``accession_numbers`` is a list of the ``accessionNumber`` values and
        ``annual_reports`` is the filtered DataFrame they were taken from.

    Raises:
        requests.HTTPError: If any of the requests returns an error status.
    """
    company_submissions = _fetch_sec_json(
        f"https://data.sec.gov/submissions/CIK{cik}.json", request_headers
    )
    filings = company_submissions["filings"]

    # Collect all pages first and concatenate once, rather than growing a
    # DataFrame inside the loop.
    frames = [pd.DataFrame(filings["recent"])]
    for filing_subset in filings.get("files", []):
        # Same pacing as the filing downloader, so the follow-up pages are
        # not requested back to back.
        time.sleep(0.1)
        filing_subset_json = _fetch_sec_json(
            f"https://data.sec.gov/submissions/{filing_subset['name']}",
            request_headers,
        )
        frames.append(pd.DataFrame(filing_subset_json))
    annual_reports = pd.concat(frames)

    annual_reports = annual_reports.drop_duplicates(subset=["accessionNumber"])
    annual_reports = annual_reports[annual_reports["form"] == "10-K"].copy()
    annual_reports["reportDate"] = pd.to_datetime(annual_reports["reportDate"])
    cutoff = pd.to_datetime(as_of)
    annual_reports = annual_reports[annual_reports["reportDate"] > cutoff]
    return annual_reports.accessionNumber.to_list(), annual_reports

In [15]:
def get_filing_by_acc_nr_cik(acc_nr, cik, request_headers):
    """Download the text file of one filing and return its body.

    Args:
        acc_nr: Accession number with dashes; the dashes are stripped for the
            directory part of the URL and kept for the file name.
        cik: CIK inserted into the archive URL.
        request_headers: HTTP headers sent with the request.

    Returns:
        The response body as text. On a non-OK status the failure is printed
        and the (error) body is still returned, so callers always get a
        string.
    """
    # Short pause before every request to pace the crawl.
    time.sleep(0.1)
    url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{acc_nr.replace('-', '')}/{acc_nr}.txt"
    # A timeout keeps one stalled connection from hanging the whole crawl.
    r = requests.get(url, headers=request_headers, timeout=30)
    if not r.ok:
        # Best-effort: report which request failed, then fall through.
        print(f"Request for {url} failed with HTTP status {r.status_code}")
        print(r.text)
    return r.text

In [16]:
def download_all_filings_by_cik(cik, request_headers, path, as_of):
    """Download every 10-K filing of a company reported after *as_of*.

    Each filing is written to ``<path>/<accession number>.html`` as UTF-8
    text.

    Args:
        cik: CIK of the company.
        request_headers: HTTP headers sent with every request.
        path: Directory the filings are written to; created if missing.
        as_of: Cut-off date forwarded to ``get_annual_report_list``.

    Returns:
        The list of accession numbers that were downloaded.
    """
    reports, _ = get_annual_report_list(cik, request_headers, as_of)
    if path:
        os.makedirs(path, exist_ok=True)
    for report in reports:
        # Fetch before opening the target so that a failed request does not
        # leave an empty file behind.
        filing_text = get_filing_by_acc_nr_cik(report, cik, request_headers)
        with open(os.path.join(path, f"{report}.html"), "w", encoding='utf-8') as file:
            file.write(filing_text)
    return reports

In [17]:
def get_annual_reports(cik, request_headers, as_of):
    """Return only the DataFrame half of ``get_annual_report_list``'s result."""
    return get_annual_report_list(cik, request_headers, as_of)[1]

In [None]:
# Download the filings of every sampled company and checkpoint the
# accession numbers per CIK to disk after each company.
acc_nr_store = {}
for index, data in tqdm(companies.iterrows(), total=len(companies)):
    cik = data["cik"]
    # Skip CIKs that were already handled (e.g. when two rows share a CIK).
    if cik in acc_nr_store:
        continue
    acc_nr_store[cik] = download_all_filings_by_cik(cik, request_headers, "filings", "2000-01-01")
    # Context manager so the checkpoint file is flushed and closed on every
    # iteration instead of leaking an open handle.
    with open("metastore/acc_nr_store.json", "w", encoding="utf-8") as store_file:
        json.dump(acc_nr_store, store_file)
    time.sleep(1)

  0%|          | 0/100 [00:00<?, ?it/s]

In [25]:
# Get the annual-report metadata of every sampled company and checkpoint it
# per CIK to disk after each company.
cik_store = {}
for index, data in tqdm(companies.iterrows(), total=len(companies)):
    cik = data["cik"]
    # Skip CIKs that were already handled (e.g. when two rows share a CIK).
    if cik in cik_store:
        continue
    # Each table is stored as its own JSON string inside the outer JSON file.
    cik_store[cik] = get_annual_reports(cik, request_headers, "2000-01-01").reset_index(drop=True).to_json()
    # Context manager so the checkpoint file is flushed and closed on every
    # iteration instead of leaking an open handle.
    with open("metastore/cik_store.json", "w", encoding="utf-8") as store_file:
        json.dump(cik_store, store_file)
    time.sleep(1)

  0%|          | 0/100 [00:00<?, ?it/s]