# LLM 10K Document Analysis
#### By: Jeffrey Potvin 
#### Summary:

This script automates the extraction of new product information from SEC 10-K filings for a set of S&P 500 companies. It performs the following steps:

1. **Load Data:**  
   Reads a CSV file of company data (including CIK, ticker, and market cap) produced by the WIKI_COMPANY_SCRAPE_CSV.py script.
   **Note:**
   The CSV is provided to save time during testing, but you can pull a different subset of companies using the provided WIKI_COMPANY_SCRAPE_CSV.py script.
2. **Fetch Filings:**  
   Retrieves the latest 10-K filing for each company from the SEC EDGAR database.

3. **Clean Text:**  
   Cleans the raw filing text by removing HTML, URLs, extraneous numbers, and boilerplate language to prepare it for LLM analysis.

4. **Extract Products:**  
   Sends the cleaned text to the GPT-4O-MINI model via the OpenAI client to extract new product names and descriptions in JSON format. The script then parses the JSON output.

5. **Save Output:**  
   Writes the extracted product information along with company details to an output CSV file.

**Note:**  
This script handles failed HTTP responses, JSON parsing errors, and API errors by logging them and moving on to the next company instead of stopping.


In [None]:
import requests
import re
import os
import pandas as pd
from bs4 import BeautifulSoup
import openai
import json
import csv

# -------------------- SETTINGS -------------------- #
# User-editable configuration for the whole script. The two placeholder values
# below (API key and User-Agent) must be replaced before running.
# NOTE(review): the API key is hard-coded here; avoid committing a real key.
openai.api_key = "YOUR API KEY"                                             # Replace with your OPENAI API KEY
HEADERS = {"User-Agent": "YOUR NAME (YOUREMAIL@example.com)"}               # Replace with your name and email; sent with every SEC request
SEC_API_BASE = "https://data.sec.gov/submissions/"                          # Base URL for the per-company submissions JSON
SEC_ARCHIVES_BASE = "https://www.sec.gov/Archives/edgar/data/"              # Base URL for the filing documents themselves
OUTPUT_DIR = "10k_cleaned_files"                                            # Directory to save cleaned 10-K text files for reference
CSV_FILE = "sp500_companies.csv"                                            # CSV file with S&P 500 company data created with WIKI_COMPANY_SCRAPE_CSV.py
NUM_TO_PULL = 100                                                           # Number of companies to process, taken from the top of the CSV (descending by market cap)
EXTRACTED_PRODUCTS_CSV = "extracted_products.csv"                           # Output CSV file for extracted products
# -------------------------------------------------- #


# Initialize the OpenAI client used by extract_new_products()
from openai import OpenAI
client = OpenAI(api_key=openai.api_key)

# -----------------------------------------------------------------
# 1) LOAD CSV & PREPARE SUBSET
# -----------------------------------------------------------------
# Define function to load the top companies by market cap from the CSV file
def load_top_companies_by_marketcap(csv_file=CSV_FILE, n=NUM_TO_PULL):
    """Load the company CSV and return its first ``n`` rows.

    No sorting is done here: the rows are returned in file order, so the
    result is only "top by market cap" if the CSV is already ordered that
    way (presumably by the scraper script -- verify against it).

    Args:
        csv_file: Path to the company CSV. It must contain a ``Ticker``
            column; a ``CIK`` column, if present, is read as text.
        n: Number of leading rows to keep.

    Returns:
        A DataFrame holding the first ``n`` rows with ``Ticker`` upper-cased.
    """
    # Reading CIK as text keeps any leading zeros present in the file.
    companies = pd.read_csv(csv_file, dtype={"CIK": str})
    normalized = companies.assign(Ticker=lambda frame: frame["Ticker"].str.upper())
    return normalized.head(n)

# -----------------------------------------------------------------
# 2) GET LATEST 10-K FILING
# -----------------------------------------------------------------
# Define function to get the most recent 10-K filing for a CIK
def get_most_recent_10k_filing(cik):
    """Find the first 10-K listed in a company's recent SEC EDGAR filings.

    Failures (network error, non-200 response, malformed payload) are logged
    and reported as ``(None, None)`` so that one bad company does not abort
    the whole run.

    Args:
        cik: The company's Central Index Key. It is zero-padded to 10 digits
            for the submissions API request; the archive URL is built from
            the value as given.

    Returns:
        A ``(filing_url, filing_date)`` tuple for the first entry in the
        "recent" filings list whose form is exactly ``"10-K"``, or
        ``(None, None)`` if there is none or the lookup failed.
    """
    cik = str(cik).strip()
    # The submissions endpoint is addressed by the 10-digit, zero-padded CIK.
    url = f"{SEC_API_BASE}CIK{cik.zfill(10)}.json"

    try:
        # A timeout keeps a stalled connection from hanging the whole batch.
        response = requests.get(url, headers=HEADERS, timeout=30)
    except requests.RequestException as e:
        print(f"Could not load data for CIK={cik}: {e}")
        return None, None

    if response.status_code != 200:
        print(f"Could not load data for CIK={cik} (status={response.status_code}).")
        return None, None

    try:
        recent = response.json()["filings"]["recent"]
        forms = recent["form"]
        accessions = recent["accessionNumber"]
        primary_docs = recent["primaryDocument"]
        filing_dates = recent["filingDate"]
    except (ValueError, KeyError, TypeError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"Unexpected submissions payload for CIK={cik}: {e}")
        return None, None

    # The four lists are parallel: position i in each describes the same filing.
    for form, accession, primary_doc, filing_date in zip(
        forms, accessions, primary_docs, filing_dates
    ):
        if form == "10-K":
            # Archive paths use the accession number without dashes.
            accession_path = accession.replace("-", "")
            filing_url = f"{SEC_ARCHIVES_BASE}{cik}/{accession_path}/{primary_doc}"
            return filing_url, filing_date
    return None, None

# -----------------------------------------------------------------
# 3) CLEAN FILING TEXT
# -----------------------------------------------------------------
# Define function to clean the text of a 10-K filing using BeautifulSoup and regex
def clean_filing_text(url):
    """Download a 10-K filing and reduce it to plain ASCII text for the LLM.

    The HTML is stripped with BeautifulSoup, then a series of regex passes
    removes URLs, long digit runs, XBRL-style tags and boilerplate words. If
    an "Item 1. ... Business" or "Item 7. ... Management's Discussion"
    heading is found, everything before its first occurrence is dropped.

    Args:
        url: Full URL of the filing's primary document.

    Returns:
        The cleaned text as a single-spaced string of printable ASCII, or
        ``None`` if the download returned a non-200 status or any error
        occurred (errors are printed, not raised).
    """
    try:
        # A timeout keeps a stalled connection from hanging the whole batch.
        response = requests.get(url, headers=HEADERS, timeout=30)
        if response.status_code != 200:
            print(f"Filing fetch failed: HTTP {response.status_code}")
            return None

        soup = BeautifulSoup(response.text, "html.parser")
        # Script and style bodies are not filing content; drop them first.
        for script_or_style in soup(["script", "style"]):
            script_or_style.extract()

        text = soup.get_text(separator="\n")
        # Remove URLs
        text = re.sub(r"http[s]?://\S+", "", text)
        # Remove 4 and higher digit numbers to avoid confusing the LLM
        text = re.sub(r"\b\d{4,}\b", "", text)
        # Remove certain tags
        text = re.sub(r"\b(document|entity|dei|us-gaap|iso4217|xbrli|country):\S+\b", "", text, flags=re.IGNORECASE)

        # Attempt to find the start of the main content. Filings spell
        # "Management's" with either a straight or a typographic apostrophe,
        # so both are accepted.
        match = re.search(r"(Item 1\..*?Business|Item 7\..*?Management['’]s Discussion)",
                          text, re.DOTALL | re.IGNORECASE)
        if match:
            text = text[match.start():]

        # Remove boilerplate words
        text = re.sub(r"\b(pursuant to|section|reporting period|incorporated by reference|exhibit|schedule|part ii|part iii)\b",
                      "", text, flags=re.IGNORECASE)
        # Remove excess whitespace
        text = re.sub(r"\s+", " ", text).strip()
        # Remove nonprintable characters (anything outside printable ASCII)
        text = re.sub(r"[^\x20-\x7E]+", "", text)
        return text
    except Exception as e:
        # Deliberate best-effort: any failure skips this filing, not the run.
        print(f"Error in clean_filing_text: {e}")
        return None

# -----------------------------------------------------------------
# 4) USE GPT-4O-MINI TO EXTRACT NEW PRODUCTS
# -----------------------------------------------------------------
# Define function to extract new products from a 10-K text using GPT-4O-Mini
def extract_new_products(text):
    """Ask the chat model for newly introduced products mentioned in a 10-K.

    The model is instructed to reply with a JSON array of objects carrying
    ``product_name`` and ``product_description``. The reply is stripped of
    any markdown code fence, parsed, and normalized. Malformed entries are
    skipped individually so one bad item does not discard the rest.

    Args:
        text: Cleaned 10-K text to send to the model.

    Returns:
        A list of ``{"product_name": str, "product_description": str}``
        dicts, one per entry with a non-empty name. An empty list is
        returned when nothing is found, the reply is empty or is not a JSON
        list, or the API call or parsing fails (failures are printed, not
        raised).
    """
    # Define the prompts for the GPT-4O-Mini model
    system_prompt = (
        "You are a world-class information extraction assistant. "
        "Given an excerpt of a 10-K SEC filing, identify references to any newly introduced "
        "or recently launched products. Return a JSON array of objects, each with "
        "'product_name' and 'product_description'. If no new products are found, return an empty JSON array: []."
    )
    user_prompt = f"Extract any new or recently introduced products from the following 10-K text:\n\n{text}"

    content = None
    try:
        # Call the OpenAI API to generate completions
        completion = client.chat.completions.create(
            model="gpt-4o-mini",  # Change this model name as needed
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.0,
        )
        content = completion.choices[0].message.content

        # Debugging: Print the raw GPT output
        print("DEBUG: Raw GPT output:")
        print(content)

        # Handle an empty reply explicitly rather than through an exception.
        if not content or not content.strip():
            print("DEBUG: GPT returned no content.")
            return []

        # Strip first so a fence preceded by whitespace is still recognized.
        content = content.strip()
        if content.startswith("```"):
            # Drop the opening fence line (e.g. ```json) ...
            lines = content.splitlines()[1:]
            # ... and the closing fence line if it is present.
            if lines and lines[-1].strip() == "```":
                lines = lines[:-1]
            content = "\n".join(lines).strip()

        # Parse the cleaned content as JSON
        extracted_products = json.loads(content)
        if not isinstance(extracted_products, list):
            print("DEBUG: Parsed JSON is not a list.")
            return []

        final_products = []
        for item in extracted_products:
            # Skip entries that are not objects instead of failing the batch.
            if not isinstance(item, dict):
                continue
            # `or ""` maps JSON null to an empty string; str() guards against
            # non-string values such as numbers.
            product_name = str(item.get("product_name") or "").strip()
            product_description = str(item.get("product_description") or "").strip()
            if product_name:
                final_products.append({
                    "product_name": product_name,
                    "product_description": product_description
                })
        return final_products

    except json.JSONDecodeError as jde:
        print(f"JSON Decode Error: {jde}")
        print("Raw content from GPT after cleaning:", content)
        return []
    except Exception as e:
        print(f"GPT Extraction Error: {e}")
        return []

# -----------------------------------------------------------------
# 5) MAIN
# -----------------------------------------------------------------
# Define the main function to process the S&P 500 companies
def main():
    """Run the pipeline end to end for the configured set of companies.

    For each company loaded from the CSV: look up its latest 10-K, clean the
    filing text, save that text under OUTPUT_DIR, ask the model for new
    products, and append any results to EXTRACTED_PRODUCTS_CSV (pipe
    delimited). Companies with no filing or no usable text are skipped.
    """
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    # Start the results file fresh with just the header row; product rows
    # are appended per company further down.
    header = ["Company Name", "Stock Name", "Filing Time", "New Product", "Product Description"]
    with open(EXTRACTED_PRODUCTS_CSV, mode="w", newline="", encoding="utf-8") as results_file:
        csv.writer(results_file, delimiter='|').writerow(header)

    companies = load_top_companies_by_marketcap(CSV_FILE, NUM_TO_PULL)
    print(f"Processing {len(companies)} S&P 500 companies from {CSV_FILE}...")

    for position, company in companies.iterrows():
        ticker, company_name, cik = company["Ticker"], company["Company"], company["CIK"]
        market_cap = company.get("MarketCap", None)
        print(f"\n=== {position+1}: {ticker} ({company_name}) - MarketCap: {market_cap} ===")

        filing_url, filing_date = get_most_recent_10k_filing(cik)
        if not filing_url:
            print("  No 10-K found. Skipping...")
            continue
        print(f"  Found 10-K: {filing_url} (Date: {filing_date})")

        cleaned_text = clean_filing_text(filing_url)
        if not cleaned_text:
            print(f"  No cleaned text for {ticker}.")
            continue

        # Keep a local copy of what was sent to the model, for reference.
        output_path = os.path.join(OUTPUT_DIR, f"10k_{ticker}.txt")
        with open(output_path, "w", encoding="utf-8") as text_file:
            text_file.write(cleaned_text)
        print(f"  ✔ Saved cleaned text to: {output_path}")

        # Extract new products using GPT
        products = extract_new_products(cleaned_text)
        if not products:
            print("  No new products extracted from GPT.")
            continue

        print(f"  ✔ Found {len(products)} new product(s). Writing to CSV...")
        with open(EXTRACTED_PRODUCTS_CSV, mode="a", newline="", encoding="utf-8") as results_file:
            csv.writer(results_file, delimiter='|').writerows(
                [
                    company_name,
                    ticker,
                    filing_date,
                    product["product_name"],
                    product["product_description"],
                ]
                for product in products
            )


if __name__ == "__main__":
    main()

Processing 100 S&P 500 companies from sp500_companies.csv...

=== 1: AAPL (Apple Inc.) - MarketCap: 3259495350272 ===
  Found 10-K: https://www.sec.gov/Archives/edgar/data/0000320193/000032019324000123/aapl-20240928.htm (Date: 2024-11-01)
  ✔ Saved cleaned text to: 10k_cleaned_files/10k_AAPL.txt
DEBUG: Raw GPT output:
```json
[
    {
        "product_name": "iPhone 16",
        "product_description": "The latest model in the iPhone line, featuring advanced technology and capabilities."
    },
    {
        "product_name": "iPhone 16 Plus",
        "product_description": "A larger version of the iPhone 16, offering enhanced display and battery life."
    },
    {
        "product_name": "iPhone 16 Pro",
        "product_description": "A premium model in the iPhone 16 series, equipped with advanced features and specifications."
    },
    {
        "product_name": "iPhone 16 Pro Max",
        "product_description": "The top-tier model in the iPhone 16 series, featuring the largest displa