In [1]:
import requests
import re
from bs4 import BeautifulSoup
import nltk
import time
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from transformers import pipeline
from keybert import KeyBERT
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from rouge_score import rouge_scorer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np

In [2]:
# NLTK data files: fetch the 'punkt' package up front. nltk.sent_tokenize is
# called later in this notebook and presumably relies on this data
# (NOTE(review): confirm the package name against the installed NLTK version).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nirmitsachde/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
class SECFilingAnalyzer:
    """Look up a company's 10-K filings through the SEC EDGAR submissions API.

    Args:
        cik: The company's Central Index Key, as a string or an integer.
            It is normalised to a ten-character, zero-padded string because
            the submissions endpoint embeds the CIK in that form.
    """

    def __init__(self, cik):
        self.headers = {
            'User-Agent': 'Student pamperedrebel@gmail.com',
            'Accept': 'application/json',
            'Host': 'data.sec.gov'
        }
        # Accept 320193, "320193" or "0000320193" and always store the padded
        # form; an already padded CIK is left unchanged.
        self.cik = str(cik).strip().zfill(10)

    def get_filing_links(self, num_years=5):
        """Fetch 10-K filing links from the SEC EDGAR database.

        Args:
            num_years: Maximum number of 10-K filings to return, taken from
                the start of the "recent" filings list in the response.

        Returns:
            A list of ``(report_date, url)`` tuples, where ``url`` points at
            the full-text ``.txt`` submission. An empty list is returned (and
            the error is printed) when the request or the parsing fails.
        """
        # Short pause before every request so repeated calls stay polite.
        time.sleep(0.1)
        url = f'https://data.sec.gov/submissions/CIK{self.cik}.json'
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            recent = response.json()['filings']['recent']
            rows = zip(recent['reportDate'], recent['accessionNumber'], recent['form'])
            # The archive folder is the accession number without dashes; the
            # document itself keeps the dashed accession number as file name.
            k10_filings = [
                (date, f'https://www.sec.gov/Archives/edgar/data/{self.cik}/'
                       f'{acc.replace("-", "")}/{acc}.txt')
                for date, acc, form in rows
                if form == '10-K'
            ]
        except Exception as e:
            # Best effort by design: report the problem and let callers
            # continue with "no filings".
            print(f"Error fetching SEC data: {str(e)}")
            return []
        return k10_filings[:num_years]

In [4]:
class DocumentParser:
    """Download an SEC filing and turn it into cleaned sentences.

    The parser fetches a document, extracts the text that follows a
    company-specific start marker, strips noise (URLs, markup, XBRL tag
    names, page footers), rewrites Roman numerals as integers and splits the
    result into sentences.
    """

    # Noise removed from the filing text before sentence splitting.
    # The XBRL patterns consume the whole qualified name (``dei:EntityName``);
    # a lazy ``.*?\b`` would stop right after the colon and leave the name in.
    _NOISE_PATTERNS = tuple(re.compile(pattern) for pattern in (
        r"http\S+|www\S+",
        r"<.*?>",
        r"\bdei:\w+",
        r"\becd:\w+",
        r"\bus-gaap:\w+",
        r"Apple Inc\. \| \d+ Form \d+-K \| \d+",
    ))

    # Candidate tokens: upper-case words built only from Roman numeral letters.
    _ROMAN_TOKEN = re.compile(r'\b[XIVLCDM]+\b')
    # Well-formed Roman numerals (1..3999); used to reject tokens like "LLC".
    _ROMAN_NUMERAL = re.compile(
        r'M{0,3}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})'
    )
    _ROMAN_VALUES = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}

    def __init__(self):
        self.headers = {
            'User-Agent': 'Student pamperedrebel@gmail.com',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
            'Host': 'www.sec.gov'
        }

    def fetch_document(self, url):
        """Fetch an SEC document and parse it with BeautifulSoup's XML parser.

        Args:
            url: Address of the filing to download.

        Returns:
            The parsed ``BeautifulSoup`` document.

        Raises:
            ValueError: If the HTTP request fails; the original
                ``requests`` exception is attached as the cause.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=30)
            response.raise_for_status()
            return BeautifulSoup(response.content, "xml")
        except requests.exceptions.RequestException as e:
            # Chain the cause so the underlying network error stays visible.
            raise ValueError(f"Error fetching document: {e}") from e

    def extract_text_after_marker(self, soup, start_marker):
        """Return the document text that follows the first ``start_marker``.

        Args:
            soup: Parsed document exposing ``get_text``.
            start_marker: Literal string that marks where the useful text begins.

        Returns:
            The stripped text after the marker, or ``""`` when the marker
            does not occur in the document.
        """
        full_text = soup.get_text(separator="\n", strip=True)
        marker_index = full_text.find(start_marker)
        if marker_index == -1:
            return ""
        return full_text[marker_index + len(start_marker):].strip()

    @classmethod
    def _roman_to_int(cls, roman):
        """Convert a Roman numeral string to its integer value."""
        total = 0
        previous = 0
        for char in roman:
            current = cls._ROMAN_VALUES.get(char, 0)
            if current > previous:
                # Subtractive pair (e.g. IV): undo the earlier addition of the
                # smaller symbol and subtract it once more.
                total += current - 2 * previous
            else:
                total += current
            previous = current
        return total

    @classmethod
    def _replace_roman(cls, match):
        """Regex callback: rewrite well-formed Roman numerals, keep other tokens."""
        token = match.group()
        if cls._ROMAN_NUMERAL.fullmatch(token) is None:
            # Not a real numeral (e.g. "LLC"); leave the word untouched.
            return token
        return str(cls._roman_to_int(token))

    def clean_text(self, text):
        """Clean filing text and split it into sentences.

        Removes URLs, markup, XBRL qualified names and the Apple page footer,
        rewrites well-formed Roman numerals as integers, then tokenises the
        text with ``nltk.sent_tokenize``.

        Args:
            text: Raw text extracted from the filing.

        Returns:
            A list of sentences that contain more than five
            whitespace-separated words.
        """
        for pattern in self._NOISE_PATTERNS:
            text = pattern.sub('', text)

        text = self._ROMAN_TOKEN.sub(self._replace_roman, text)

        sentences = nltk.sent_tokenize(text)
        # Very short fragments are dropped.
        return [sentence for sentence in sentences if len(sentence.split()) > 5]

    def extract_tables_as_sentences(self, soup):
        """Turn ``ix:nonNumeric`` elements into simple descriptive sentences.

        Args:
            soup: Parsed document exposing ``find_all``.

        Returns:
            One sentence per ``ix:nonNumeric`` element whose text is non-empty
            and is not made up solely of digits, dashes and whitespace.
        """
        tabular_sentences = []
        for ix_elem in soup.find_all('ix:nonNumeric', recursive=True):
            name = ix_elem.get('name', 'Unknown')
            context = ix_elem.get('contextRef', 'Unknown')
            value = ix_elem.get_text(strip=True)
            # Skip blanks and purely numeric/dash values.
            if value and not re.match(r"^[-\d\s]*$", value):
                tabular_sentences.append(
                    f"{name} (context: {context}) has a value of {value}."
                )
        return tabular_sentences

In [5]:
def summarize_text_traditional(text, sentence_count=2):
    """Build a summary of ``text`` with sumy's ``LsaSummarizer``.

    The text is parsed with an English tokenizer, the summarizer is asked for
    ``sentence_count`` sentences, and the selected sentences are joined with
    single spaces.
    """
    document = PlaintextParser.from_string(text, Tokenizer("english")).document
    lsa = LsaSummarizer()
    selected = [str(sentence) for sentence in lsa(document, sentence_count)]
    return ' '.join(selected)

In [6]:
# Summarization pipelines keyed by device index, so the model is loaded once
# per device instead of once per call.
_SUMMARIZER_CACHE = {}


def _get_summarizer(device):
    """Return the distilbart summarization pipeline for ``device``, building it on first use."""
    if device not in _SUMMARIZER_CACHE:
        _SUMMARIZER_CACHE[device] = pipeline(
            "summarization",
            model="sshleifer/distilbart-cnn-12-6",
            device=device,
            truncation=True,
        )
    return _SUMMARIZER_CACHE[device]


def summarize_text_modern(text_chunk, max_length=200):
    """Summarize text using a distilbart model.

    Args:
        text_chunk: Text to summarize.
        max_length: Upper bound passed to the pipeline as ``max_length``;
            ``min_length`` is set to half of it.

    Returns:
        The ``summary_text`` of the first result returned by the pipeline.
    """
    # Use the first CUDA device when available, otherwise the CPU (-1).
    device = 0 if torch.cuda.is_available() else -1
    summarizer = _get_summarizer(device)
    summary_list = summarizer(
        text_chunk,
        max_length=max_length,
        min_length=int(max_length / 2),
        do_sample=False,
    )
    return summary_list[0]['summary_text']

In [7]:
def summarize_large_text(text, chunk_size=500, max_chunks=3):
    """Summarize a large text by summarizing its leading chunks.

    The text is split into sentences, grouped into chunks of ``chunk_size``
    sentences, and only the first ``max_chunks`` chunks are summarized; the
    chunk summaries are joined with single spaces.

    Args:
        text: Text to summarize.
        chunk_size: Number of sentences per chunk.
        max_chunks: Number of leading chunks to summarize. ``None``
            summarizes every chunk.

    Returns:
        The combined summary, or ``""`` when the text has no sentences.
    """
    sentences = nltk.sent_tokenize(text)
    chunks = [' '.join(sentences[i:i + chunk_size]) for i in range(0, len(sentences), chunk_size)]
    # Only the chunks that end up in the result are sent to the model;
    # summarizing the remaining chunks would be discarded work.
    selected_chunks = chunks[:max_chunks]
    return ' '.join(summarize_text_modern(chunk, max_length=200) for chunk in selected_chunks)

In [8]:
def extract_keywords_traditional(text, num_keywords=5):
    """Return single-word keywords for ``text`` from a TF-IDF vectorizer.

    A ``TfidfVectorizer`` with English stop words and at most
    ``num_keywords`` features is fitted on the single document, and its
    feature names are returned.
    """
    tfidf = TfidfVectorizer(stop_words='english', max_features=num_keywords)
    # Fitting populates the vocabulary; the resulting matrix is not needed.
    tfidf.fit_transform([text])
    return tfidf.get_feature_names_out()

In [9]:
# Holds the shared KeyBERT instance so the underlying model is loaded only once.
_KEYBERT_CACHE = {}


def extract_keywords_modern(text, num_keywords=5):
    """Extract single-word keywords using KeyBERT.

    Args:
        text: Document to extract keywords from.
        num_keywords: Number of keywords requested (``top_n``).

    Returns:
        A list with the first element of each entry returned by
        ``KeyBERT.extract_keywords``.
    """
    if 'default' not in _KEYBERT_CACHE:
        # Building KeyBERT is costly; reuse one instance across calls.
        _KEYBERT_CACHE['default'] = KeyBERT()
    kw_model = _KEYBERT_CACHE['default']
    keywords = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 1),
        stop_words='english',
        top_n=num_keywords,
    )
    return [keyword[0] for keyword in keywords]

In [10]:
class EvaluationMetrics:
    """Score summaries and keywords against a reference text.

    Holds a ROUGE scorer (rouge1, rouge2, rougeL with stemming) and an
    ``all-MiniLM-L6-v2`` sentence-embedding model that are created once and
    reused by every method.
    """

    def __init__(self):
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

    def calculate_rouge_scores(self, reference, summary):
        """Calculate ROUGE F-measures for a summary.

        Args:
            reference: Reference text.
            summary: Candidate summary.

        Returns:
            A dict with the keys ``rouge1_f1``, ``rouge2_f1`` and ``rougeL_f1``.
        """
        scores = self.rouge_scorer.score(reference, summary)
        return {
            'rouge1_f1': scores['rouge1'].fmeasure,
            'rouge2_f1': scores['rouge2'].fmeasure,
            'rougeL_f1': scores['rougeL'].fmeasure
        }

    def calculate_semantic_similarity(self, reference, summary):
        """Calculate the cosine similarity of the two texts' sentence embeddings.

        Args:
            reference: Reference text.
            summary: Candidate summary.

        Returns:
            The cosine similarity as a plain ``float``.
        """
        ref_embedding = self.sentence_model.encode([reference])[0]
        sum_embedding = self.sentence_model.encode([summary])[0]
        similarity = cosine_similarity([ref_embedding], [sum_embedding])[0][0]
        return float(similarity)

    def evaluate_keywords(self, text, extracted_keywords):
        """Evaluate keyword relevance using embedding similarity to the text.

        Args:
            text: Document the keywords were extracted from.
            extracted_keywords: Sequence of keyword strings (a list or a
                NumPy array).

        Returns:
            A dict with ``avg_relevance``, ``max_relevance`` and
            ``min_relevance`` as plain floats. All three are ``0.0`` when no
            keywords are supplied.
        """
        # Normalise to a list so arrays and other sequences behave alike.
        keywords = list(extracted_keywords)
        if not keywords:
            # np.max / np.min raise on empty input; report "no relevance".
            return {'avg_relevance': 0.0, 'max_relevance': 0.0, 'min_relevance': 0.0}

        text_embedding = self.sentence_model.encode([text])[0]
        keyword_embeddings = self.sentence_model.encode(keywords)
        similarities = cosine_similarity([text_embedding], keyword_embeddings)[0]
        return {
            'avg_relevance': float(np.mean(similarities)),
            'max_relevance': float(np.max(similarities)),
            'min_relevance': float(np.min(similarities))
        }

In [11]:
def _analyze_company(company_name, info, include_traditional=False):
    """Fetch and summarize the most recent 10-K filing of one company.

    Args:
        company_name: Display name, used in error messages.
        info: Dict with the company's ``cik`` and ``start_marker``.
        include_traditional: When true, also run the LSA summary and the
            TF-IDF keywords in addition to the model-based ones.

    Returns:
        A list with one dict per successfully processed filing. Each dict
        holds ``date``, ``cleaned_text`` and ``methods``; ``methods`` maps
        ``'traditional'`` / ``'modern'`` to ``{'summary', 'keywords'}``.
        Filings that fail are reported on stdout and skipped.
    """
    analyzer = SECFilingAnalyzer(info["cik"])
    parser = DocumentParser()
    analyses = []

    for date, link in analyzer.get_filing_links(1):
        try:
            soup = parser.fetch_document(link)
            raw_text = parser.extract_text_after_marker(soup, info["start_marker"])

            cleaned_text = ' '.join(parser.clean_text(raw_text))
            tabular_sentences = ' '.join(parser.extract_tables_as_sentences(soup))
            all_text = cleaned_text + ' ' + tabular_sentences

            methods = {}
            if include_traditional:
                methods['traditional'] = {
                    'summary': summarize_text_traditional(all_text, sentence_count=2),
                    'keywords': list(extract_keywords_traditional(all_text, num_keywords=5)),
                }
            methods['modern'] = {
                'summary': summarize_large_text(all_text, chunk_size=500),
                'keywords': list(extract_keywords_modern(all_text, num_keywords=5)),
            }
            analyses.append({'date': date, 'cleaned_text': cleaned_text, 'methods': methods})
        except Exception as e:
            # Best effort: one broken filing must not abort the whole report.
            print(f"Error processing {company_name}'s {date} filing: {str(e)}")

    return analyses


def _evaluate_analysis(evaluator, analysis):
    """Compute evaluation metrics for every method present in one analysis.

    The first 1000 characters of the filing's own cleaned text serve as the
    reference for ROUGE and semantic similarity; the full cleaned text is
    used for keyword relevance.
    """
    cleaned_text = analysis['cleaned_text']
    reference = cleaned_text[:1000]
    return {
        method: {
            'rouge_scores': evaluator.calculate_rouge_scores(reference, output['summary']),
            'semantic_similarity': evaluator.calculate_semantic_similarity(reference, output['summary']),
            'keyword_metrics': evaluator.evaluate_keywords(cleaned_text, output['keywords']),
        }
        for method, output in analysis['methods'].items()
    }


def main():
    """Analyze the latest 10-K filings of three companies and write an HTML report.

    Apple is summarized with both the traditional and the modern method,
    Amazon and NVIDIA with the modern method only. Each produced summary and
    keyword set is evaluated against the text of the filing it came from, and
    everything is written to ``sec_filing_analysis.html``.
    """
    # Local import: only the report rendering needs HTML escaping.
    from html import escape

    # defining CIKs and start markers for 3 chosen companies
    companies = {
        "Apple": {"cik": "0000320193", "start_marker": "Unless otherwise stated, all information presented herein is based on the Company’s fiscal calendar"},
        "Amazon": {"cik": "0001018724", "start_marker": "This Annual Report on Form 10-K and the documents incorporated herein by reference"},
        "NVIDIA": {"cik": "0001045810", "start_marker": "Forward-Looking Statements"}
    }

    # Process every company once and keep its results together, so the
    # evaluation below never reads another company's text or summaries.
    analyses = {
        company_name: _analyze_company(company_name, info, include_traditional=(company_name == "Apple"))
        for company_name, info in companies.items()
    }

    html_parts = ["""
    <html>
    <head>
        <meta charset="utf-8">
        <title>SEC Filing Analysis</title>
        <style>
            table { width: 100%; border-collapse: collapse; }
            th, td { border: 1px solid black; padding: 8px; text-align: left; vertical-align: top; width: 33.33%; }
            th { background-color: #f2f2f2; }
            td { max-height: 160px; overflow-y: auto; white-space: pre-wrap; }
        </style>
    </head>
    <body>
        <h1>SEC Filing Analysis Results</h1>
        
        <!-- Apple Section -->
        <h2>Apple Inc.</h2>
        <p><a href="apple 10k 2024.pdf" target="_blank">apple 10k 2024.pdf</a></p>
        <table>
            <tr>
                <th>Type</th>
                <th>Summary</th>
                <th>Keywords</th>
            </tr>
    """]

    # Apple: one row per method. Generated text is escaped because it comes
    # from the filing and from model output, not from this program.
    for analysis in analyses["Apple"]:
        for label, method in (("Traditional", "traditional"), ("Modern", "modern")):
            output = analysis['methods'][method]
            html_parts.append(f"""
                <tr>
                    <td style="height:160px;">{label}</td>
                    <td style="height:160px;">{escape(output['summary'])}</td>
                    <td style="height:160px;">{escape(', '.join(output['keywords']))}</td>
                </tr>
            """)

    html_parts.append("""
        </table>

        <!-- Amazon and NVIDIA Section -->
        <h2>Amazon and NVIDIA Comparison</h2>
        <table>
            <tr>
                <th>Company</th>
                <th>Modern Summary</th>
                <th>Modern Keywords</th>
            </tr>
    """)

    # Amazon and NVIDIA: modern method only
    for company_name in ["Amazon", "NVIDIA"]:
        for analysis in analyses[company_name]:
            output = analysis['methods']['modern']
            html_parts.append(f"""
                    <tr>
                        <td style="height:160px;">{escape(company_name)}</td>
                        <td style="height:160px;">{escape(output['summary'])}</td>
                        <td style="height:160px;">{escape(', '.join(output['keywords']))}</td>
                    </tr>
            """)

    # Close the comparison table before the next section starts.
    html_parts.append("""
        </table>
    """)

    evaluator = EvaluationMetrics()
    evaluation_results = {}

    # Only the most recent filing is requested per company, so there is at
    # most one analysis (and therefore one metrics entry) per company.
    for company_name, company_analyses in analyses.items():
        for analysis in company_analyses:
            try:
                evaluation_results[company_name] = _evaluate_analysis(evaluator, analysis)
            except Exception as e:
                print(f"Error evaluating {company_name}'s {analysis['date']} filing: {str(e)}")

    # Add evaluation results to HTML output
    html_parts.append("""
    <h2>Evaluation Metrics</h2>
    <table>
        <tr>
            <th>Company</th>
            <th>Method</th>
            <th>ROUGE-1</th>
            <th>ROUGE-2</th>
            <th>ROUGE-L</th>
            <th>Semantic Similarity</th>
            <th>Keyword Relevance</th>
        </tr>
    """)

    for company, results in evaluation_results.items():
        for method, metrics in results.items():
            html_parts.append(f"""
            <tr>
                <td>{escape(company)}</td>
                <td>{escape(method)}</td>
                <td>{metrics['rouge_scores']['rouge1_f1']:.3f}</td>
                <td>{metrics['rouge_scores']['rouge2_f1']:.3f}</td>
                <td>{metrics['rouge_scores']['rougeL_f1']:.3f}</td>
                <td>{metrics['semantic_similarity']:.3f}</td>
                <td>{metrics['keyword_metrics']['avg_relevance']:.3f}</td>
            </tr>
            """)

    html_parts.append("""
    </table>
    </body>
    </html>
    """)

    # Explicit UTF-8 so non-ASCII characters from the filings are written
    # the same way on every platform and match the declared charset.
    with open("sec_filing_analysis.html", "w", encoding="utf-8") as file:
        file.write(''.join(html_parts))

In [12]:
# Run the whole pipeline when this code is executed as the main module
# (which includes running this cell in the notebook).
if __name__ == "__main__":
    main()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
