In [1]:
import pandas as pd
import re
import os
from tqdm import tqdm

# Table of filings to process; the loop further down reads the
# 'ticker', 'year' and 'file_path' columns from each row.
SELECTED_PATHS_CSV = "selected_10K_paths.csv"
df = pd.read_csv(SELECTED_PATHS_CSV)

# Helper: extract a section from text using regex
def extract_section(text, item_start, item_end_candidates):
    """Return the text between a section heading and the following heading.

    The first match of ``item_start`` in ``text`` marks the heading; the
    section body starts right after that match. ``item_end_candidates`` are
    tried in the order given, and the first pattern that matches anywhere
    after the heading ends the section (list order is a priority, not a
    distance ranking). If no candidate matches, the section runs to the end
    of ``text``.

    Matching is case-insensitive and is done directly on ``text`` instead of
    on an upper-cased copy. ``str.upper()`` can change the length of a string
    (for example "ß" becomes "SS"), so offsets found in an upper-cased copy
    may point at the wrong characters of the original.

    NOTE: only the first occurrence of ``item_start`` is used, so if the same
    heading appears earlier in the document (e.g. in a table of contents),
    that earlier occurrence is the one extracted.

    Args:
        text: Full document text.
        item_start: Regex (string or compiled) matching the section heading.
        item_end_candidates: Iterable of regexes for the headings that may
            follow the section, in priority order.

    Returns:
        The stripped section body, or "" when the heading is not found or
        the inputs cannot be searched (non-string text, invalid regex).
    """
    try:
        start_match = _compile_ignorecase(item_start).search(text)
        if not start_match:
            return ""

        # Search a slice (not an offset into the full text) so that end
        # patterns treat the start of the section body as start-of-input.
        remainder = text[start_match.end():]
        for item_end in item_end_candidates:
            end_match = _compile_ignorecase(item_end).search(remainder)
            if end_match:
                return remainder[:end_match.start()].strip()

        return remainder.strip()

    except (re.error, TypeError):
        # Best-effort helper: unusable input yields an empty section rather
        # than interrupting the caller's batch loop.
        return ""


def _compile_ignorecase(pattern):
    """Compile a string or already-compiled regex with IGNORECASE added."""
    if isinstance(pattern, re.Pattern):
        return re.compile(pattern.pattern, pattern.flags | re.IGNORECASE)
    return re.compile(pattern, re.IGNORECASE)

# Heading patterns for the two sections pulled from each filing, and the
# headings that may follow each one (tried in list order by extract_section).
ITEM_1A_START = r"ITEM\s+1A[\.\: ]+RISK\s+FACTORS"
ITEM_1A_ENDS = [r"ITEM\s+1B", r"ITEM\s+2", r"ITEM\s+3"]
ITEM_7_START = r"ITEM\s+7[\.\: ]+MANAGEMENT['’]S\s+DISCUSSION.*?"
ITEM_7_ENDS = [r"ITEM\s+7A", r"ITEM\s+8", r"ITEM\s+9"]

# Storage
records = []
missing_files = 0  # filings whose file_path does not exist on disk

# Loop over each filing
for _, row in tqdm(df.iterrows(), total=len(df)):
    ticker = row['ticker']
    year = row['year']
    file_path = row['file_path']

    try:
        # Open directly instead of pre-checking with os.path.exists(): the
        # check sat outside the try, so a non-string path (e.g. NaN from an
        # empty CSV cell) raised TypeError and aborted the whole run before
        # anything was saved.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            full_text = f.read()

        # Extract Item 1A
        item_1a_text = extract_section(full_text, ITEM_1A_START, ITEM_1A_ENDS)

        # Extract Item 7
        item_7_text = extract_section(full_text, ITEM_7_START, ITEM_7_ENDS)

        records.append({
            "ticker": ticker,
            "year": year,
            "file_path": file_path,
            "item_1a_text": item_1a_text,
            "item_7_text": item_7_text
        })

    except FileNotFoundError:
        # Missing files are still skipped, but counted so the gap is visible.
        missing_files += 1

    except Exception as e:
        # Per-file boundary: report the failure and keep processing the rest.
        print(f"Error reading {file_path}: {e}")

# Save to CSV
out_df = pd.DataFrame(records)
out_df.to_csv("10K_extracted_sections.csv", index=False)
print("✅ Extraction complete. Saved to 10K_extracted_sections.csv")
if missing_files:
    print(f"Skipped {missing_files} filing(s) whose file was not found.")

100%|██████████| 1493/1493 [03:17<00:00,  7.55it/s]


✅ Extraction complete. Saved to 10K_extracted_sections.csv
