In [1]:
import requests
import json
import pandas as pd
import os
from lxml import etree
import re
from download_10k import get_all_tickers, download_report

# Let pandas show up to 100 characters per cell so long string values are
# truncated less aggressively when a DataFrame is displayed.
pd.options.display.max_colwidth = 100


In [2]:
# Load the notebook settings. Later cells read the keys
# 'annual_reports_html_save_directory' and '10k_df_pkl_pathfn' from this dict.
# NOTE(review): the path is absolute and hard-coded; confirm it is valid in
# every environment this notebook is run in.
config_file = "/app/config.json"
# JSON is UTF-8 by specification, so do not rely on the locale's default encoding.
with open(config_file, encoding="utf-8") as json_file:
    config_dict = json.load(json_file)

In [None]:

# Fetch the ticker universe via get_all_tickers (imported from download_10k).
# NOTE(review): this cell has no execution count and the following cell
# reassigns ticker_list, so on a top-to-bottom run this result is discarded.
ticker_list = get_all_tickers()

In [3]:
# Tickers to process in this run. This assignment replaces any value that an
# earlier cell gave to ticker_list; the trailing comment keeps a longer
# alternative list for reference.
ticker_list = ["NVDA", "INTC"] # ["NVDA", "CMG", "INTC", "ABNB"]

In [4]:
def get_edgar_info_cik(ticker):
    """Look up the 10-digit CIK for ``ticker`` on the EDGAR company browse page.

    The page is requested with the ticker in the ``CIK`` query parameter, and
    every ``<span class="companyName">`` element is searched for a
    ``CIK=<10 digits>`` fragment. The first match wins.

    Args:
        ticker: Ticker symbol, interpolated into the EDGAR query URL as-is.

    Returns:
        The CIK as a 10-character, zero-padded string of digits.

    Raises:
        ValueError: If the response status is not 200, or if no
            ``companyName`` span contains a CIK.
        requests.exceptions.RequestException: If the request itself fails or
            exceeds the timeout.
    """
    url = f"https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={ticker}&type=10-k&dateb=&owner=include&count=100&search_text="
    headers = {'User-Agent': 'Mozilla/5.0'}
    # A timeout keeps an unresponsive server from hanging the notebook forever.
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        raise ValueError('Response not 200. Broken for: {}'.format(url))

    # Parse the HTML content
    root = etree.HTML(response.content)

    # NOTE(review): etree.HTML may yield None for an empty document; treat
    # that the same as "no company span found".
    company_spans = (
        root.xpath('//span[@class="companyName"]') if root is not None else [])

    cik_pattern = r'CIK\=(\d{10})'
    for company_span in company_spans:
        company_info = etree.tostring(
            company_span, encoding='unicode', method='html')
        match = re.search(cik_pattern, company_info)
        if match:
            cik_number = match.group(1)
            print(f"{ticker} - CIK Number: {cik_number}")
            return cik_number

    # Reached only when every span was inspected (or none existed) without a
    # match, so the caller never receives an implicit None.
    raise ValueError("CIK number not found in the input string.")
    
def _get_sec_json(url):
    """Fetch ``url`` and return its body decoded as JSON.

    Raises:
        ValueError: If the response status is not 200, or if the body is not
            valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
        requests.exceptions.RequestException: If the request itself fails or
            exceeds the timeout.
    """
    response = requests.get(
        url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
    # Fail with the URL in the message instead of a confusing JSON decode
    # error on an HTML error page.
    if response.status_code != 200:
        raise ValueError('Response not 200. Broken for: {}'.format(url))
    return json.loads(response.content)


def get_sec_sub(cik):
    """Download the SEC submissions JSON for a company, including overflow files.

    The main document is fetched from
    ``https://data.sec.gov/submissions/CIK<10-digit cik>.json``. For every
    entry listed under ``filings.files``, the referenced JSON file is fetched
    too and stored on that entry under the key ``"request_response_json"``.

    Args:
        cik: The company's CIK; anything ``int()`` accepts (for example a
            zero-padded string).

    Returns:
        The decoded main submissions JSON (a dict), with each ``filings.files``
        entry augmented in place as described above.

    Raises:
        ValueError: If any response status is not 200 or a body is not JSON.
        requests.exceptions.RequestException: If a request fails or times out.
    """
    # 'd' instead of the locale-aware 'n', so no locale can inject separators.
    sec_sub_cik_url = f"https://data.sec.gov/submissions/CIK{int(cik):010d}.json"

    print(f"Getting JSON info from SEC via {sec_sub_cik_url}")
    company_sec_sub_json = _get_sec_json(sec_sub_cik_url)

    # Any other filing -> files to retrieve?
    for other_filing in company_sec_sub_json["filings"].get("files") or []:
        other_filing_fn = other_filing["name"]
        sec_sub_cik_url = (
            f"https://data.sec.gov/submissions/{other_filing_fn}")

        print(f"Getting filing JSON info from SEC via {sec_sub_cik_url}")
        other_filing["request_response_json"] = _get_sec_json(sec_sub_cik_url)

    return company_sec_sub_json

def get_ticker_10k_df(sec_sub_json):
    """Build a DataFrame of a company's 10-K filings from its submissions JSON.

    Rows come from ``filings.recent`` plus, for each entry in
    ``filings.files`` (if present), the data stored under that entry's
    ``"request_response_json"`` key.

    Args:
        sec_sub_json: Submissions dict containing ``filings.recent`` and,
            optionally, ``filings.files`` entries that each carry a
            ``"request_response_json"`` value accepted by ``pd.DataFrame``.

    Returns:
        The rows whose ``form`` equals ``"10-K"`` and whose
        ``primaryDocument`` is not the empty string. Index labels are unique
        positions in the combined (pre-filter) frame.
    """
    filings = sec_sub_json["filings"]

    # Collect every piece first and concatenate once: repeated concat inside
    # the loop copies the growing frame each time and, without ignore_index,
    # leaves duplicate index labels.
    filing_frames = [pd.DataFrame(filings["recent"])]
    for other_filing in filings.get("files") or []:
        filing_frames.append(
            pd.DataFrame(other_filing["request_response_json"]))
    ticker_sub_df = pd.concat(filing_frames, ignore_index=True)

    # Keep only 10-K filings that actually reference a primary document.
    is_10k = ticker_sub_df["form"] == "10-K"
    has_primary_document = ticker_sub_df["primaryDocument"] != ""
    return ticker_sub_df[is_10k & has_primary_document]

In [7]:
# Collect the 10-K filing metadata (plus a download URL per filing) for every
# ticker that does not yet have a directory on disk.
ticker_10k_frames = []
for ticker in ticker_list:
    # An existing per-ticker directory is treated as "already downloaded".
    # NOTE(review): skipped tickers are absent from all_10k_df and therefore
    # from anything later derived from it; confirm that is intended.
    check_saved_path = os.path.join(
        config_dict['annual_reports_html_save_directory'], ticker)
    if os.path.exists(check_saved_path):
        continue

    # Try to find the CIK
    ticker_cik = get_edgar_info_cik(ticker)

    # Get sec submissions
    ticker_sec_sub_json = get_sec_sub(ticker_cik)

    # Get 10-k submissions
    ticker_10k_df = get_ticker_10k_df(ticker_sec_sub_json)

    # The archive URL uses the accession number without dashes.
    acc_num_formatted = ticker_10k_df["accessionNumber"].str.replace(
        "-", "", regex=False)
    doc_url = (
        f"https://www.sec.gov/Archives/edgar/data/{ticker_cik}/"
        + acc_num_formatted + "/" + ticker_10k_df["primaryDocument"])

    # Add the derived columns, then the company's ticker (first) and CIK (last).
    ticker_10k_df = ticker_10k_df.assign(
        accNumFormatted=acc_num_formatted, docUrl=doc_url, cik=ticker_cik)
    ticker_10k_df.insert(loc=0, column="ticker", value=ticker)

    ticker_10k_frames.append(ticker_10k_df)

# Concatenate once instead of growing the frame inside the loop. As before,
# all_10k_df stays None when no ticker needed processing.
all_10k_df = (
    pd.concat(ticker_10k_frames, ignore_index=True)
    if ticker_10k_frames else None)
    

    

NVDA - CIK Number: 0001045810
Getting JSON info from SEC via https://data.sec.gov/submissions/CIK0001045810.json
Getting filing JSON info from SEC via https://data.sec.gov/submissions/CIK0001045810-submissions-001.json
INTC - CIK Number: 0000050863
Getting JSON info from SEC via https://data.sec.gov/submissions/CIK0000050863.json
Getting filing JSON info from SEC via https://data.sec.gov/submissions/CIK0000050863-submissions-001.json
Getting filing JSON info from SEC via https://data.sec.gov/submissions/CIK0000050863-submissions-002.json


In [8]:
def download_10k_html(row):
    """Download one 10-K document and return the local path it was saved to.

    The file is written to
    ``<annual_reports_html_save_directory>/<ticker>/<reportDate><ext>``, where
    ``<ext>`` is the extension of the document named in ``row["docUrl"]``
    (empty if that name has none). The ticker directory is created if missing.

    Args:
        row: Mapping-like row providing ``"ticker"``, ``"reportDate"`` and
            ``"docUrl"``.

    Returns:
        The path passed to ``download_report`` as the save location.
    """
    save_path_directory = os.path.join(
        config_dict['annual_reports_html_save_directory'],
        row["ticker"]) # , "10k", row["reportDate"])

    # NOTE(review): the collection cell skips tickers whose directory exists,
    # and this directory is created before the download runs; a failed
    # download may therefore cause the ticker to be skipped on the next run.
    os.makedirs(save_path_directory, exist_ok=True)

    # splitext only looks at the final path component, so a document name
    # without a dot yields '' rather than a slice of the host name and path.
    file_extension = os.path.splitext(row["docUrl"])[1]
    save_fn = os.path.join(
        save_path_directory, f"{row['reportDate']}{file_extension}")

    print((
        f'Downloading 10k for {row["ticker"]} - {row["reportDate"]}: {row["docUrl"]}'))
    download_report(row["docUrl"], save_fn)
    return save_fn

    

# all_10k_df is None when the collection cell skipped every ticker (all of
# them already had a directory on disk); there is nothing to download then.
if all_10k_df is None:
    print("No new 10-K filings to download; skipping.")
else:
    # Download every filing; each call returns the path it saved to.
    rval = all_10k_df.apply(download_10k_html, axis=1)

    # Put location of where we saved the HTML files into the dataframe
    all_10k_df = all_10k_df.assign(savePath10kHtml=rval)


Downloading 10k for NVDA - 2023-01-29: https://www.sec.gov/Archives/edgar/data/0001045810/000104581023000017/nvda-20230129.htm
Downloading 10k for NVDA - 2022-01-30: https://www.sec.gov/Archives/edgar/data/0001045810/000104581022000036/nvda-20220130.htm
Downloading 10k for NVDA - 2021-01-31: https://www.sec.gov/Archives/edgar/data/0001045810/000104581021000010/nvda-20210131.htm
Downloading 10k for NVDA - 2020-01-26: https://www.sec.gov/Archives/edgar/data/0001045810/000104581020000010/nvda-2020x10k.htm
Downloading 10k for NVDA - 2019-01-27: https://www.sec.gov/Archives/edgar/data/0001045810/000104581019000023/nvda-2019x10k.htm
Downloading 10k for NVDA - 2018-01-28: https://www.sec.gov/Archives/edgar/data/0001045810/000104581018000010/nvda-2018x10k.htm
Downloading 10k for NVDA - 2017-01-29: https://www.sec.gov/Archives/edgar/data/0001045810/000104581017000027/nvda-2017x10k.htm
Downloading 10k for NVDA - 2016-01-31: https://www.sec.gov/Archives/edgar/data/0001045810/000104581016000205/nv

In [9]:
# Persist the filing metadata to the pickle path named in the config.
# all_10k_df is None when no ticker was processed in this run; skip the export
# in that case instead of failing on a None dereference.
if all_10k_df is None:
    print("No 10-K filings were collected; skipping pickle export.")
else:
    all_10k_df.to_pickle(config_dict["10k_df_pkl_pathfn"])