In [None]:
# Import necessary libraries

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import os
import spacy
import nltk
import ollama
from difflib import SequenceMatcher

# Download the NLTK "punkt" tokenizer data.
# NOTE(review): no other visible code in this file calls nltk; sentence
# splitting below goes through the spaCy pipeline (`doc.sents`). Confirm
# whether this download is still needed.

nltk.download("punkt")
# Load the spaCy pipeline stored under output/model-best (path is relative to
# the current working directory; presumably a custom-trained NER model).
ner_model = spacy.load("output/model-best")
# Ensure the pipeline has a sentencizer component, inserted first, so that
# `doc.sents` is available when texts are processed later in this file.
if not ner_model.has_pipe("sentencizer"):
    ner_model.add_pipe("sentencizer", first=True)


# Load my S&P 500 company data with CIKs and tickers
sp500_df = pd.read_csv("sp500_with_cik_cleaned.csv")
# Store CIKs as strings left-padded with zeros to 10 characters; this value is
# interpolated into the data.sec.gov submissions URL further down.
sp500_df['CIK'] = sp500_df['CIK'].astype(str).str.zfill(10)

# Headers sent with every HTTP request made in this file.
# NOTE(review): the User-Agent is a placeholder; presumably it should carry
# real contact details before the pipeline is run against sec.gov.
HEADERS = {"User-Agent": "Your Name your.email@example.com"}

# Maps company name -> list of product names already accepted for it; read by
# is_duplicate() and appended to by extract_product_info().
extracted_products = {}

# Check for duplicate product names
def is_duplicate(name, company, threshold=0.88):
    """Tell whether *name* closely matches a product already stored for *company*.

    The comparison is case-insensitive and uses ``SequenceMatcher.ratio()``
    against every name in ``extracted_products[company]``.

    Args:
        name: Candidate product name.
        company: Key into the module-level ``extracted_products`` dict.
        threshold: Minimum similarity ratio (0..1) that counts as a duplicate.

    Returns:
        True if any stored name reaches *threshold*, otherwise False (also
        when nothing is stored for *company*).
    """
    known_names = extracted_products.get(company, [])
    return any(
        SequenceMatcher(None, name.lower(), known.lower()).ratio() >= threshold
        for known in known_names
    )

# Logs results
def log(msg):
    """Print *msg* to stdout behind a cyan (ANSI escape 96) ``[LOG]`` tag."""
    line = f"\033[96m[LOG]\033[0m {msg}"
    print(line)

# Retrieves the index-page URL of the most recent 8-K filing via the EDGAR submissions API
def get_most_recent_8k_index_url(cik, timeout=30):
    """Return the EDGAR index-page URL for the first 8-K in a company's recent filings.

    Args:
        cik: Central Index Key as a digit string (the caller zero-pads it to
            10 characters); it is interpolated into the submissions URL and
            converted with ``int()`` for the archive path.
        timeout: Seconds to wait for the HTTP response. Without a timeout
            ``requests`` can block indefinitely on a stalled connection.

    Returns:
        The ``...-index.htm`` URL of the first entry whose form type is
        exactly ``"8-K"`` in the order the API lists them (assumed to be
        newest-first -- confirm against the EDGAR API docs), or ``None`` when
        the request fails, the status is not 200, the payload is malformed,
        or no 8-K is listed.
    """
    url = f"https://data.sec.gov/submissions/CIK{cik}.json"
    try:
        res = requests.get(url, headers=HEADERS, timeout=timeout)
    except requests.RequestException as exc:
        # A single unreachable company should not abort the whole run.
        log(f"Submissions request failed for CIK {cik}: {exc}")
        return None
    if res.status_code != 200:
        return None

    try:
        recent = res.json()['filings']['recent']
        forms = recent['form']
        accessions = recent['accessionNumber']
    except (ValueError, KeyError, TypeError) as exc:
        log(f"Unexpected submissions payload for CIK {cik}: {exc}")
        return None

    # 'form' and 'accessionNumber' are parallel lists: entry i of each
    # describes the same filing.
    for form, acc in zip(forms, accessions):
        if form == "8-K":
            acc_no_dash = acc.replace("-", "")
            return f"https://www.sec.gov/Archives/edgar/data/{int(cik)}/{acc_no_dash}/{acc}-index.htm"
    return None

# Extracts Ex. 99.1 url from the 8-K index page
def extract_ex99_url(index_url, timeout=30):
    """Find the EX-99.1 exhibit link on an 8-K filing index page.

    Scans every table row of the page; a row qualifies when it has at least
    four cells and the fourth cell's text contains "EX-99.1"
    (case-insensitive). The link is taken from the third cell.

    Args:
        index_url: URL of the filing's ``-index.htm`` page.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        Absolute URL of the exhibit (site-relative hrefs are prefixed with
        ``https://www.sec.gov``; any other href is returned unchanged), or
        ``None`` when the page cannot be fetched or no matching row has a link.
    """
    try:
        res = requests.get(index_url, headers=HEADERS, timeout=timeout)
    except requests.RequestException as exc:
        log(f"Index request failed for {index_url}: {exc}")
        return None
    if not res.ok:
        # An error page is not a filing index; don't try to parse it.
        return None

    soup = BeautifulSoup(res.text, "html.parser")
    for row in soup.find_all("tr"):
        cols = row.find_all("td")
        if len(cols) < 4 or "EX-99.1" not in cols[3].text.upper():
            continue
        link = cols[2].find("a", href=True)
        if link:
            href = link['href']
            return f"https://www.sec.gov{href}" if href.startswith("/") else href
    return None

# Gets the text that is within the url
def fetch_text_from_url(url, timeout=30):
    """Download *url* and return its visible text with HTML markup removed.

    Args:
        url: Page to fetch.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        The page text, with text nodes stripped and joined by single spaces,
        or ``""`` when the response is not OK or anything goes wrong while
        fetching or parsing.
    """
    try:
        res = requests.get(url, headers=HEADERS, timeout=timeout)
        if not res.ok:
            return ""
        soup = BeautifulSoup(res.text, "html.parser")
        return soup.get_text(separator=" ", strip=True)
    except Exception as exc:
        # Still best-effort, but `Exception` (unlike a bare `except:`) lets
        # KeyboardInterrupt / SystemExit propagate, and the cause is logged.
        log(f"Failed to fetch {url}: {exc}")
        return ""

# Fallback for when EX-99.1 is missing: scan the other HTML documents linked from the index page
def fallback_text_from_index(index_url, timeout=30):
    """Return the text of the first linked HTML page that mentions an announcement keyword.

    Follows every ``<a href>`` on the index page whose href ends in ``.htm``
    or ``.html``, in document order, and returns the first page whose
    lower-cased text contains one of: launch, introduce, announce, unveil,
    expand.

    Args:
        index_url: URL of the filing's ``-index.htm`` page.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        The matching page's text, or ``""`` when the index page cannot be
        fetched or no linked page matches.
    """
    keywords = ("launch", "introduce", "announce", "unveil", "expand")

    try:
        res = requests.get(index_url, headers=HEADERS, timeout=timeout)
    except requests.RequestException as exc:
        log(f"Index request failed for {index_url}: {exc}")
        return ""

    soup = BeautifulSoup(res.text, "html.parser")
    for link in soup.find_all("a", href=True):
        href = link['href']
        if not href.endswith((".htm", ".html")):
            continue
        full_url = f"https://www.sec.gov{href}" if href.startswith("/") else href
        try:
            page = requests.get(full_url, headers=HEADERS, timeout=timeout)
            text = BeautifulSoup(page.text, "html.parser").get_text(separator=" ", strip=True)
        except Exception as exc:
            # One bad link (e.g. a relative href requests cannot resolve)
            # must not stop the scan; move on to the next candidate.
            log(f"Skipping {full_url}: {exc}")
            continue
        lowered = text.lower()  # lower-case once, not once per keyword
        if any(kw in lowered for kw in keywords):
            return text
    return ""

# Retrieve the filing date from the index page
def get_filing_date(index_url, timeout=30):
    """Read the filing date shown on an EDGAR filing index page.

    Args:
        index_url: URL of the filing's ``-index.htm`` page.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        The ``YYYY-MM-DD`` string that follows the label "Filing Date" in the
        page text, or ``"N/A"`` when the page cannot be fetched/parsed or the
        label is not found.
    """
    try:
        res = requests.get(index_url, headers=HEADERS, timeout=timeout)
        soup = BeautifulSoup(res.text, "html.parser")
        match = re.search(r"Filing Date\s+(\d{4}-\d{2}-\d{2})", soup.get_text())
    except Exception as exc:
        # Best-effort lookup: report the failure instead of hiding it, and
        # let KeyboardInterrupt / SystemExit through (a bare except did not).
        log(f"Could not read filing date from {index_url}: {exc}")
        return "N/A"
    return match.group(1) if match else "N/A"

# Get the product name and description from the text and use trigger words and validation via Ollama's prompt
def extract_product_info(text, company_name, ticker):
    """Extract one newly announced product and a short description from *text*.

    Steps:
      1. Split *text* into sentences with the spaCy pipeline and keep those
         that contain a trigger word (substring match, so "release" also
         matches "news release") and are between 21 and 299 characters long.
      2. NER pass: run the pipeline on each candidate sentence; for every
         ``PRODUCT`` entity that is not a near-duplicate, ask the LLM to
         confirm it, then ask for a description and return.
      3. Regex fallback: if the NER pass returned nothing, look for
         "<launch verb> [the ]<Capitalised phrase>" and validate the same way.

    Args:
        text: Plain text of the filing exhibit.
        company_name: Used in the prompts and as the key into
            ``extracted_products`` for duplicate tracking.
        ticker: Currently unused; kept so existing callers keep working.

    Returns:
        ``(product, description)``. ``description`` is ``"N/A"`` when the NER
        pass accepted a product but the description call failed; both values
        are ``"N/A"`` when no product was accepted.
    """
    doc = ner_model(text)
    trigger_words = ["launch", "introduce", "unveil", "announce", "expand", "roll out", "release"]

    sentences = [
        sent.text.strip() for sent in doc.sents
        if any(w in sent.text.lower() for w in trigger_words)
        and 20 < len(sent.text) < 300
    ]
    log(f"🧠 {len(sentences)} candidate sentences found")

    # Pass 1: PRODUCT entities found by the NER model, validated by the LLM.
    for sentence in sentences:
        log(f"[CANDIDATE] {sentence}")
        sent_doc = ner_model(sentence)
        for ent in sent_doc.ents:
            if ent.label_ != "PRODUCT":
                continue
            product = ent.text.strip()
            if is_duplicate(product, company_name):
                log(f"⏩ Skipping duplicate: {product}")
                continue

            # Validation via LLM
            validation_prompt = f"""
Is "{product}" a product, service, or platform introduced by {company_name} in the following sentence?

"{sentence}"

Only respond with Yes or No.
""".strip()
            try:
                # NOTE(review): loose substring check -- any reply containing
                # "yes" anywhere counts as acceptance.
                if "yes" not in _ollama_reply(validation_prompt).lower():
                    log(f"❌ LLM Rejected: {product}")
                    continue
            except Exception as e:
                log(f"LLM validation error: {e}")
                continue

            # The product is accepted at this point, so record it now; before
            # this fix a failed description call returned the product without
            # registering it for duplicate detection.
            extracted_products.setdefault(company_name, []).append(product)

            # Description
            description_prompt = f"""
Write a one-sentence product description (under 180 characters):

Company: {company_name}
Product: {product}
Context: {sentence}

Only provide the description text.
""".strip()
            try:
                description = _ollama_reply(description_prompt).strip()[:180]
            except Exception as e:
                log(f"LLM description error: {e}")
                description = "N/A"
            return product, description

    # Pass 2: fallback if no NER product
    for sentence in sentences:
        match = re.search(r"(?:launched|introduced|unveiled|released|announced) (the )?([A-Z][a-zA-Z0-9\- ]{3,50})", sentence)
        if not match:
            continue
        product = match.group(2).strip()
        if is_duplicate(product, company_name):
            continue

        fallback_prompt = f"""
Is "{product}" a product, service, or platform introduced by {company_name} in this sentence?

Sentence: "{sentence}"

Only answer Yes or No.
""".strip()
        try:
            if "yes" not in _ollama_reply(fallback_prompt).lower():
                continue
            description_prompt = f"""
Write a short description (under 180 characters) of the product mentioned below.

Company: {company_name}
Product: {product}
Context: {sentence}
""".strip()
            description = _ollama_reply(description_prompt).strip()[:180]
        except Exception as e:
            # In the fallback pass any LLM failure simply moves on to the
            # next sentence, but the reason is now logged.
            log(f"LLM fallback error: {e}")
            continue
        extracted_products.setdefault(company_name, []).append(product)
        return product, description

    return "N/A", "N/A"


def _ollama_reply(prompt, model="llama3.2:1b"):
    """Send *prompt* to Ollama as a single user message and return the reply text."""
    resp = ollama.chat(model=model, messages=[{"role": "user", "content": prompt}])
    return resp["message"]["content"]

# Save the results in a csv file
def save_to_csv_if_valid(data, filename="extracted_data.csv"):
    """Append one result row to a CSV file unless it has no real product name.

    Rows are appended in place rather than reading the whole file back and
    rewriting it. The read/rewrite round-trip was slow (the full file was
    rewritten on every save) and lossy: ``pd.read_csv`` parses cells such as
    ``"N/A"`` as missing values, so earlier rows were rewritten with empty
    cells.

    Args:
        data: Mapping of column name -> value; must contain "Product Name".
            Every call is assumed to pass the same keys in the same order,
            since appended rows are written positionally under the header.
        filename: Destination CSV path. The header is written only when the
            file does not exist yet or is empty.

    Returns:
        None. Rows whose product name is empty/None or "N/A" (any case) are
        skipped with a log message.
    """
    product_name = data["Product Name"]
    if not product_name or str(product_name).upper() == "N/A":
        log("Skipping invalid product.")
        return

    needs_header = not os.path.exists(filename) or os.path.getsize(filename) == 0
    pd.DataFrame([data]).to_csv(filename, mode="a", header=needs_header, index=False)
    print(f"💾 Saved: {data['Product Name']} → {filename}")

    
# run a pipeline to automate the entire process
def run_full_pipeline():
    """Process every company in ``sp500_df`` and save any product that is found.

    For each row: look up the latest 8-K index page, pull text from the
    EX-99.1 exhibit (or from the fallback scan when there is none), extract a
    product and description, fetch the filing date, print the result and hand
    it to ``save_to_csv_if_valid``. Companies with no 8-K or no usable text
    are skipped. The loop pauses 1.2 seconds after each company that reaches
    the save step.
    """
    for _, record in sp500_df.iterrows():
        cik = record['CIK']
        company = record['Company Name']
        ticker = record['Ticker']
        print(f"\n📄 Processing {company} ({ticker})")

        index_url = get_most_recent_8k_index_url(cik)
        if not index_url:
            log("No recent 8-K")
            continue

        ex99_url = extract_ex99_url(index_url)
        source_label = ex99_url if ex99_url else 'Fallback'
        log(f"EX-99.1: {source_label}")

        # Prefer the exhibit itself; otherwise scan the other linked pages.
        if ex99_url:
            text = fetch_text_from_url(ex99_url)
        else:
            text = fallback_text_from_index(index_url)
        if not text:
            log("No usable text")
            continue

        product, description = extract_product_info(text, company, ticker)
        filing_date = get_filing_date(index_url)

        result = dict(zip(
            ("Company Name", "Ticker Symbol", "Filing Date", "Product Name", "Product Description"),
            (company, ticker, filing_date, product, description),
        ))

        print("📊 Extracted Result:")
        for field, value in result.items():
            print(f"{field}: {value}")

        save_to_csv_if_valid(result)
        time.sleep(1.2)

# Run the pipeline for every company in sp500_df. This performs the network
# requests and LLM calls and writes accepted rows to the default
# extracted_data.csv.
run_full_pipeline()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kylaemmitt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



📄 Processing 3M (MMM)
[96m[LOG][0m EX-99.1: Fallback
[96m[LOG][0m 🧠 1 candidate sentences found
[96m[LOG][0m [CANDIDATE] Upon
the authorization by you of the release of the Securities, the several Underwriters propose to offer the Securities for sale upon the
terms and conditions set forth in the Pricing Prospectus and the Prospectus.
📊 Extracted Result:
Company Name: 3M
Ticker Symbol: MMM
Filing Date: 2025-03-13
Product Name: N/A
Product Description: N/A
[96m[LOG][0m Skipping invalid product.

📄 Processing A. O. Smith (AOS)
[96m[LOG][0m EX-99.1: https://www.sec.gov/Archives/edgar/data/91142/000009114225000002/a12312024exhibit991.htm
[96m[LOG][0m 🧠 3 candidate sentences found
[96m[LOG][0m [CANDIDATE] Reconciliations from GAAP measures to non-GAAP measures are provided in the financial information included in this news release.
[96m[LOG][0m [CANDIDATE] Forward-looking Statements This release contains statements that the Company believes are “forward-looking statements” 