In [2]:
#importing the necessary libraries

import pandas as pd
import re
import os
from tqdm import tqdm

# Load selected file paths - the filings themselves live in a separate local folder structure
# to keep the repo GitHub friendly, essentially making sure our 50GB filings data is not uploaded online.
#
# The CSV location defaults to the original local path, but can be overridden by setting the
# SELECTED_10K_PATHS_CSV environment variable so the notebook can run on another machine
# without editing this cell.
selected_paths_csv = os.environ.get(
    "SELECTED_10K_PATHS_CSV",
    r"C:\Users\kusha\Desktop\financial-risk-analyzer\data\selected_10K_paths.csv",
)

df = pd.read_csv(selected_paths_csv)

# Helper that extracts one section from a 10-K filing using regex.
# We use it below for Item 1A (Risk Factors) and Item 7 (MD&A).
def _search_ignore_case(pattern, string):
    """Case-insensitive ``re.search`` accepting a pattern string or a compiled pattern."""
    if isinstance(pattern, re.Pattern):
        # re.search() rejects a flags argument for compiled patterns, so rebuild
        # the pattern with IGNORECASE added to its existing flags.
        return re.compile(pattern.pattern, pattern.flags | re.IGNORECASE).search(string)
    return re.search(pattern, string, flags=re.IGNORECASE)


def extract_section(text, item_start, item_end_candidates):
    """Return the text between a section header and the next section header.

    Parameters
    ----------
    text : str
        Raw filing text to search.
    item_start : str
        Regex for the header of the wanted section. Matching is
        case-insensitive; the section begins right after the FIRST match.
    item_end_candidates : iterable of str
        Regexes for headers that may terminate the section, in priority order.
        The first candidate in this order that matches anywhere after the start
        header is used (this is not necessarily the nearest header). If none
        matches, the section runs to the end of ``text``.

    Returns
    -------
    str
        The section body with surrounding whitespace stripped, or "" when
        ``text`` is not a string, the start header is not found, or a pattern
        is invalid. Failures are reported as "" rather than raised because the
        function is run over a large number of filings.
    """
    if not isinstance(text, str):
        return ""

    try:
        # Match case-insensitively on the ORIGINAL text. Searching text.upper()
        # and then slicing text is unsafe: upper() can change the string length
        # (e.g. "ß" -> "SS"), which shifts every offset after such a character.
        start_match = _search_ignore_case(item_start, text)
        if not start_match:
            return ""

        # the section actually begins after the header
        start_index = start_match.end()

        # Everything after the header; sliced once and reused for every candidate.
        tail = text[start_index:]

        # Default: no terminating header found, keep everything up to the end.
        end_index = len(text)
        for item_end in item_end_candidates:
            end_match = _search_ignore_case(item_end, tail)
            if end_match:
                # match offsets are relative to `tail`, so shift back into `text`
                end_index = start_index + end_match.start()
                break  # candidates are in priority order: first hit wins

        return text[start_index:end_index].strip()

    except (re.error, TypeError):
        # invalid regex, or patterns/candidates of an unusable type:
        # keep the best-effort contract and report "nothing extracted"
        return ""

# A section must be longer than this many characters to count towards a "complete" filing.
MIN_SECTION_CHARS = 500

# initializing our storing list and the counters used for the summary:
#   complete - both sections are longer than MIN_SECTION_CHARS
#   partial  - at least one section is non-empty, but the filing is not "complete"
#              (this includes filings where both were found but one is short)
#   empty    - neither section was found, or the file could not be processed
#   missing  - the file path is unusable or the file no longer exists on disk
records = []
complete, partial, empty, missing = 0, 0, 0, 0

#running a loop over each 10-K available (only the three columns we need)
for ticker, year, file_path in tqdm(
    zip(df['ticker'], df['year'], df['file_path']), total=len(df)
):
    #skipping if file is not available/ moved - counted so the summary adds up to the total.
    #a non-string path (e.g. NaN from the CSV) is treated the same way instead of crashing.
    if not isinstance(file_path, str) or not os.path.exists(file_path):
        missing += 1
        continue

    #reading the entire 10-K file
    try:
        # undecodable bytes are dropped rather than failing the whole file
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            full_text = f.read()

        # Extract "Item 1A: Risk Factors" section using regex
        item_1a_text = extract_section(
            full_text,
            r"ITEM\s+1A[\s\.\:\-]*RISK\s+FACTORS",   # making our code robust to spacing/punctuation
            [r"ITEM\s+1B", r"ITEM\s+2", r"ITEM\s+3"]
        )

        # Extract "Item 7: MD&A" section using similar flexible pattern
        item_7_text = extract_section(
            full_text,
            r"ITEM\s+7[\s\.\:\-]*MANAGEMENT[’'\s]+S\s+DISCUSSION",  #ensures we dont overlook our section
            [r"ITEM\s+7A", r"ITEM\s+8", r"ITEM\s+9"]
        )

        #using this classifier logic to determine if outputs are complete, partial, or empty
        if len(item_1a_text) > MIN_SECTION_CHARS and len(item_7_text) > MIN_SECTION_CHARS:
            complete += 1
        elif item_1a_text or item_7_text:
            partial += 1
        else:
            empty += 1

        # storing the data we extracted
        records.append({
            "ticker": ticker,
            "year": year,
            "file_path": file_path,
            "item_1a_text": item_1a_text,
            "item_7_text": item_7_text
        })

    except Exception as e:
        # any failure while reading/processing the file: report it, count the filing
        # as empty and move on (no record is stored for it)
        print(f"Error reading {file_path}: {e}")
        empty += 1

#saving our output to csv file (explicit columns so an empty run still writes a header)
out_df = pd.DataFrame(
    records,
    columns=["ticker", "year", "file_path", "item_1a_text", "item_7_text"],
)
out_df.to_csv("10K_extracted_sections.csv", index=False)

#printing our results
total = len(df)
print(f"\n✅ Extraction complete. Saved to 10K_extracted_sections.csv")
print(f"📄 Total files: {total}")
print(f"✔️  Complete (both 1A and 7): {complete}")
print(f"⚠️  Partial (only one): {partial}")
print(f"❌  Empty/missing both: {empty}")
print(f"⏭️  Skipped (file not found): {missing}")

100%|██████████| 1493/1493 [08:06<00:00,  3.07it/s]



✅ Extraction complete. Saved to 10K_extracted_sections.csv
📄 Total files: 1493
✔️  Complete (both 1A and 7): 244
⚠️  Partial (only one): 773
❌  Empty/missing both: 476


In [1]:
#summary of this file

# This script extracts the Item 1A (Risk Factors) and Item 7 (MD&A) sections from 1493 SEC 10-K filings.
# It saves the extracted text into a structured CSV for our LangChain integration.
# The extraction uses regex patterns to locate section boundaries within the raw filing text.
# Out of 1493 filings, only 244 had both sections extracted successfully (each longer than 500 characters).
# The large remainder (773 partial, 476 empty) indicates challenges with inconsistent formatting across companies.