In [2]:
# Required libraries for file system operations, regular expressions, and data manipulation
import os
import re
import pandas as pd

# Purpose of this cell: for every ticker folder under ROOT_DIR, pick the most recent
# 10-K filing and write one row per ticker (ticker, year, folder, file path) to a CSV.

# Location where the 10-Ks are stored. The optional SEC_EDGAR_FILINGS_DIR environment
# variable overrides the default so the notebook is not tied to one machine's layout.
ROOT_DIR = os.environ.get(
    "SEC_EDGAR_FILINGS_DIR",
    r"C:\Users\kusha\SEP769 - Deep Learning Project\sec-edgar-filings",
)

# Filing folder names carry a two-digit year between dashes; each ticker has multiple 10-Ks
year_pattern = re.compile(r'-(\d{2})-')

# File expected inside every filing folder, and the CSV this cell produces
SUBMISSION_FILENAME = "full-submission.txt"
OUTPUT_CSV = "selected_10K_paths.csv"

# Fixed column order; also guarantees a header row when no filing is found at all
RECORD_COLUMNS = ["ticker", "year", "accession_folder", "file_path"]

# Two-digit years below this pivot are read as 20YY, all others as 19YY
CENTURY_PIVOT = 50

# Collects the details of the most recent usable 10-K filing for each ticker
selected_records = []

# Sorted so the CSV row order does not depend on the arbitrary order os.listdir returns
for ticker in sorted(os.listdir(ROOT_DIR)):
    ticker_dir = os.path.join(ROOT_DIR, ticker, "10-K")

    # Skip stray files in ROOT_DIR and tickers without a 10-K directory.
    # isdir (rather than exists) avoids calling os.listdir on a regular file.
    if not os.path.isdir(ticker_dir):
        continue

    # (full_year, folder_name) for every filing folder whose name contains a year
    filing_years = []
    for f in os.listdir(ticker_dir):
        match = year_pattern.search(f)
        if match:
            year = int(match.group(1))
            full_year = 2000 + year if year < CENTURY_PIVOT else 1900 + year
            filing_years.append((full_year, f))

    # Most recent year first; filings from the same year are ordered by folder name (descending)
    filing_years.sort(reverse=True)

    # Record the newest filing that actually contains the submission file, so the CSV
    # never points downstream steps at a path that does not exist.
    for year, folder in filing_years:
        full_path = os.path.join(ticker_dir, folder, SUBMISSION_FILENAME)
        if not os.path.isfile(full_path):
            continue
        selected_records.append({
            "ticker": ticker,
            "year": year,
            "accession_folder": folder,
            "file_path": full_path
        })
        break  # only one filing per ticker

# Convert the collected records to a pandas DataFrame for easier downstream processing
df = pd.DataFrame(selected_records, columns=RECORD_COLUMNS)

# Save the resulting DataFrame to a CSV file for later use (e.g., during extraction or model input)
df.to_csv(OUTPUT_CSV, index=False)
print(f"Saved selected 10-K file paths to {OUTPUT_CSV}")

Saved selected 10-K file paths to selected_10K_paths.csv


In [None]:
# Summary
# This notebook gathers the path of one 10-K filing per ticker and stores the paths in a CSV file.
# The reason for running it is to get exactly one 10-K per company.
# We also ensure that we get the latest filing by sorting each ticker's filings by year and selecting the top result.
# We will now use this CSV file to extract our 10-K data.