In [1]:
#installing sec edgar using pip - we will collect our 10-Ks from here
pip install sec-edgar-downloader

Collecting sec-edgar-downloaderNote: you may need to restart the kernel to use updated packages.

  Downloading sec_edgar_downloader-5.0.3-py3-none-any.whl.metadata (11 kB)
Collecting pyrate-limiter>=3.6.0 (from sec-edgar-downloader)
  Downloading pyrate_limiter-3.7.1-py3-none-any.whl.metadata (25 kB)
Downloading sec_edgar_downloader-5.0.3-py3-none-any.whl (14 kB)
Downloading pyrate_limiter-3.7.1-py3-none-any.whl (28 kB)
Installing collected packages: pyrate-limiter, sec-edgar-downloader
Successfully installed pyrate-limiter-3.7.1 sec-edgar-downloader-5.0.3


In [7]:
# Importing the necessary libraries, including the Downloader from sec-edgar-downloader, which lets us store the 10-Ks on our local computer.
# Beautiful Soup is used to parse the HTML.

import os
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sec_edgar_downloader import Downloader
from tqdm import tqdm

# Creating a function that takes in a Wikipedia URL and extracts the stock tickers listed in the table.
# S&P pages use a class="wikitable", so we find the table and then convert it to a DataFrame.
# We also replace '.' with '-' because EDGAR uses dash format.

def fetch_sp_tickers(wiki_url):
    """Return the ticker symbols from the first ``wikitable`` on a Wikipedia page.

    Parameters
    ----------
    wiki_url : str
        URL of a page whose first table with class ``wikitable`` has a
        ``Symbol`` column.

    Returns
    -------
    list of str
        The values of the ``Symbol`` column with every '.' replaced by '-'.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    ValueError
        If the page contains no table with class ``wikitable``.
    """
    # Local import so the function carries its own dependency.
    from io import StringIO

    # A timeout keeps a stalled connection from hanging the notebook forever.
    resp = requests.get(wiki_url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")
    table = soup.find('table', {'class': 'wikitable'})
    if table is None:
        raise ValueError(f"No table with class 'wikitable' found at {wiki_url}")

    # pandas deprecated passing literal HTML strings to read_html; wrapping the
    # markup in StringIO is the supported form and avoids the warning.
    df = pd.read_html(StringIO(str(table)))[0]
    return df['Symbol'].str.replace('.', '-', regex=False).tolist()

# Wikipedia constituent lists for the S&P 500, S&P 400 and S&P 600
# (together roughly 1500 companies)
wiki_urls = [
    "https://en.wikipedia.org/wiki/List_of_S&P_500_companies",
    "https://en.wikipedia.org/wiki/List_of_S&P_400_companies",
    "https://en.wikipedia.org/wiki/List_of_S&P_600_companies",
]

# Accumulate the symbols of every page in a set so duplicates collapse as we go
unique_symbols = set()
for url in wiki_urls:
    unique_symbols.update(fetch_sp_tickers(url))

# Final result: alphabetically sorted list of distinct tickers
tickers = sorted(unique_symbols)
print(f"Total unique tickers: {len(tickers)}")  # around 1500 expected from the three lists

#Downloading the 10-Ks by feeding our tickers
#using our email since sec-edgar requires one

# Tickers whose download raised an error; kept so they can be inspected or
# retried after the run instead of only appearing in the printed log.
failed_tickers = []

# NOTE(review): in sec-edgar-downloader 5.x the first argument appears to be the
# requester/company name and the second the contact e-mail, not an output folder.
# Confirm against the package documentation.
dl = Downloader("sec-edgar-filings", "shahk83@mcmaster.ca")
for ticker in tqdm(tickers):
    out_dir = os.path.join("sec-edgar-filings", ticker, "10-K")
    # Resume support: skip tickers that already have a non-empty 10-K folder.
    # isdir (rather than exists) avoids os.listdir failing on a stray file.
    # Caveat: a ticker whose download was interrupted part-way also has a
    # non-empty folder and is therefore treated as complete.
    if os.path.isdir(out_dir) and os.listdir(out_dir):
        continue
    try:
        dl.get("10-K", ticker)
    except Exception as e:
        # Best effort: record the failure and carry on with the next ticker.
        failed_tickers.append(ticker)
        print(f"❌ {ticker} error: {e}")
    # Extra pause between tickers to stay well under SEC EDGAR's request limit (10/s).
    # The package depends on pyrate-limiter, so it presumably throttles itself
    # as well. TODO confirm.
    time.sleep(0.5)

# One-line summary so failures are not lost between the progress-bar updates.
if failed_tickers:
    print(f"{len(failed_tickers)} tickers failed: {', '.join(failed_tickers)}")


  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]


Total unique tickers: 1505


 14%|█▍        | 212/1505 [46:16<3:57:55, 11.04s/it] 

❌ BSIG error: Ticker 'BSIG' is invalid and cannot be mapped to a CIK. Please enter a valid ticker or CIK.


 24%|██▎       | 354/1505 [1:13:27<3:47:33, 11.86s/it]

❌ CSWI error: Ticker 'CSWI' is invalid and cannot be mapped to a CIK. Please enter a valid ticker or CIK.


 28%|██▊       | 424/1505 [1:26:08<3:32:52, 11.82s/it]

❌ DRQ error: Ticker 'DRQ' is invalid and cannot be mapped to a CIK. Please enter a valid ticker or CIK.


 49%|████▉     | 739/1505 [2:30:10<2:07:25,  9.98s/it]

❌ JBT error: Ticker 'JBT' is invalid and cannot be mapped to a CIK. Please enter a valid ticker or CIK.


 80%|████████  | 1205/1505 [4:01:45<1:04:56, 12.99s/it]

❌ SGH error: Ticker 'SGH' is invalid and cannot be mapped to a CIK. Please enter a valid ticker or CIK.


 81%|████████  | 1218/1505 [4:04:56<1:22:04, 17.16s/it]

❌ SJW error: Ticker 'SJW' is invalid and cannot be mapped to a CIK. Please enter a valid ticker or CIK.


 91%|█████████ | 1363/1505 [4:33:49<25:28, 10.76s/it]  

❌ UCBI error: Ticker 'UCBI' is invalid and cannot be mapped to a CIK. Please enter a valid ticker or CIK.


100%|██████████| 1505/1505 [5:01:45<00:00, 12.03s/it]


In [None]:
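# Our output shows that we gathered 1505 unique tickers from the Wikipedia lists and requested the 10-K filings for each of them from SEC EDGAR.
# Seven tickers (BSIG, CSWI, DRQ, JBT, SGH, SJW, UCBI) could not be mapped to a CIK, so their filings could not be obtained.
# Running this code took us about 5 hours.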