In [77]:
import pathlib
import textwrap
from bs4 import BeautifulSoup
import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown

def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [37]:
GOOGLE_API_KEY = 'AIzaSyCOUhHFqksZF5X2lILrTzoh0OYfN4J2Pb4'
genai.configure(api_key=GOOGLE_API_KEY)

In [38]:
for m in genai.list_models():
    if 'generateContent' in m.supported_generation_methods:
        print(m.name)

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-1.5-pro-latest
models/gemini-pro
models/gemini-pro-vision


In [39]:
model = genai.GenerativeModel('gemini-1.0-pro')

In [95]:
import pandas as pd
import re

def read_txt_file(file_path):
    with open(file_path, 'r') as file:
        text = file.read()
    return text

def get_text_from_candidates(response):
    """Extracts only the text content from the response.candidates list.

    Args:
      response: The response object from the Gemini API.

    Returns:
      A list containing the text content from each candidate.
    """
    text_list = []
    for candidate in response.candidates:
        text_list.append(candidate.content.parts[0].text)  # Access the text part
    return text_list

def extract_company_info(txt):
    # Find the index where HTML starts
    html_start_index = txt.find('<HTML>')
    
    if html_start_index == -1:
        raise ValueError("HTML not found in the text.")
    
    # Split text into two parts: before HTML and HTML part
    before_html = txt[:html_start_index]
    html_part = txt[html_start_index:]
    
    # Store the two parts in a list
    text_parts = [before_html, html_part]
    
    # Label the first element as company information text
    company_info_text = text_parts[0]
    
    response = model.generate_content(company_info_text +'\n' + "Using the above info Tell me only the name(like ABC) of the company?" + "\n" 
                                      + "When is the filing is filed convert in year-month-day(Answer like Filing Date : 2004-05-10) format(only date nothing else)?")
    
    to_markdown(response.text)
    text_list = get_text_from_candidates(response)
    
    answers = []
    filing_date = []

    # Extract company using regular expression (assuming company name is present)
    match = re.search(r"(?:Company|Company Name): (.*?)\n",text_list[0], flags=re.IGNORECASE)  # Case-insensitive
    if match:
        answers.append(f"Company: {match.group(1)}")

    # Extract filing date using regular expression
    match = re.search(r"Filing Date : (\d{4}-\d{2}-\d{2})", text_list[0])
    if match:
        filing_date.append(match.group(1))
# You can add more regular expressions to extract other important information  
    
    return text_parts, answers, filing_date,text_list

file_path = "sec-edgar-filings/NVDA/10-K/0001012870-02-002262/full-submission.txt"
text = read_txt_file(file_path)

text_parts, ans, date,response = extract_company_info(text)

In [101]:
print(f"10-K Filing of {ans[0]} is filed on {date[0]}")

10-K Filing of Company: NVIDIA is filed on 2002-05-14


In [41]:
def combine_text(text):
    """Combines a single text string into paragraphs, handling non-breaking spaces and empty lines.

    Args:
      text: A string containing the text snippets.

    Returns:
      A string with the combined text, where paragraphs are separated by newlines
      and non-breaking spaces are replaced with regular spaces.
    """

    # Process the single text string
    processed_text = ""
    current_paragraph = ""
    for line in text.splitlines():  # Split text into lines
    # Skip empty lines
        if not line.strip():
            continue

        # Replace non-breaking spaces with regular spaces
        line = line.replace(u'\xa0', ' ')

        # Check for triple newlines to mark paragraph breaks
        if line == "\n\n\n":
            if current_paragraph:
                processed_text += current_paragraph.strip() + "\n\n"  # Add paragraph separator with double newlines
            current_paragraph = ""
        else:
            current_paragraph += line.strip() + " "  # Add text with space for proper separation

    # Append the last paragraph (if any)
    if current_paragraph:
        processed_text += current_paragraph.strip()

    return processed_text

In [42]:
# Parse HTML content
soup = BeautifulSoup(text_parts[1], 'html.parser')

# Extract all text from HTML
all_text = soup.get_text()

all_text = combine_text(all_text)

In [64]:
import nltk

sentences = nltk.sent_tokenize(all_text)
len(sentences)

1184

In [65]:
sentences[:10]

['Prepared by R.R.',
 'Donnelley Financial -- Form 10-K UNITED STATES  SECURITIES AND EXCHANGE COMMISSION  Washington, D.C. 20549      FORM 10-K x ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934   For the fiscal year ended January 27, 2002    OR ¨ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934     Commission file number: 0-23985    NVIDIA CORPORATION  (Exact name of registrant as specified in its charter) Delaware 94-3177549 (State or Other Jurisdiction of Incorporation or Organization) (I.R.S.',
 'Employer Identification No.)',
 '2701 San Tomas Expressway  Santa Clara, CA 95050  (408) 486-2000  (Address, including zip code, and telephone number, including area code, of principal executive offices)    Securities registered pursuant to Section 12(b) of the Act:  None    Securities registered pursuant to Section 12(g) of the Act:  Common stock, $.001 par value per share    Indicate by check mark whether the re

In [46]:
def process_batch(batch, sections, section_dictionary):
    """Processes a batch of sentences and classifies them into sections."""

    prompt = f"Classify the following paragraph:\n{'. '.join(batch)}\n" \
          f"Possible sections: {' | '.join(sections)}\n" \
          f"If none of the above, classify as 'Other'"
    response = model.generate_content(prompt)

    for section_name, classified_sentence in zip(get_text_from_candidates(response), batch):
        if section_name in sections:
            section_dictionary[section_name].append(classified_sentence)
        else:
            section_dictionary["Others"].append(classified_sentence)

def classify_report_sections(report_text):
    """Classifies sentences in an annual report into sections using heuristics.

    Args:
    report_text: A string containing the annual report text.

    Returns:
    A dictionary where keys are section names and values are lists of sentences in that section.
    """

    sections = [
      "Business", "Risk Factors", "Selected Financial Data",
      "Management Discussion", "Financial Statements"
    ]
    sentences = nltk.sent_tokenize(report_text)
    section_dictionary = {section: [] for section in sections}
    section_dictionary["Others"] = []

    batch_size = 20  # Adjust batch size as needed (consider API limits)
    batch = []
    for sentence in sentences:
        batch.append(sentence)
        if len(batch) == batch_size:
            process_batch(batch, sections, section_dictionary)
            batch = []  # Clear batch after processing

    # Process remaining sentences (if any)
    if batch:
        process_batch(batch, sections, section_dictionary)

    return section_dictionary

sections_text = classify_report_sections(all_text)

In [61]:
for section,sentences in sections_text.items():
    sections_text[section] = " ".join(sentences)

In [102]:
def summarize_with_gemini(text):
    """
    This function calls the Gemini API (replace with your actual API endpoint)
    to summarize the text and extract important things it covers.

    Args:
      text: The text to be summarized.

    Returns:
      A dictionary containing the summary and important things covered.
    """
    prompt1 = f"Given text \n{text} \n Summarize it"
    prompt2 = f"Given text \n{text} \n Note down important things in New Line and in short"
    summary = model.generate_content(prompt1)
    important_things = model.generate_content(prompt2)
    
    summary = get_text_from_candidates(summary)
    important_things = get_text_from_candidates(important_things)
    
    return summary,important_things

In [105]:
summary = {'Business' : [], 'Financial Statements' : [], 'Management Discussion' : [],
           'Risk Factors' : [],'Selected Financial Data' : [],'Others' : []}

for section, text in summary.items():
    section_text = sections_text[section]
    summary_section, imp_of_section = summarize_with_gemini(section_text)
    text.append([summary_section,imp_of_section])
    
    if section == 'Business':
        prompt1 = f'From given paragraph : \n{section_text}\n What is the main product of the company?'
        prompt2 = f'From given paragraph : \n{section_text}\n How do they plan to expanding business?'
        prompt3 = f'From given paragraph : \n{section_text}\n Is the business profitable?'
        prompt4 = f'From given paragraph : \n{section_text}\n Is there market for it?'
        prompt5 = f'From given paragraph : \n{section_text}\n How much is the competition in the market for the product?'
        
        prompt_list = [prompt1,prompt2,prompt3,prompt4,prompt5]
        
        for prompt in prompt_list:
            response1 = model.generate_content(prompt)
            text.append(get_text_from_candidates(response1))
            
            time.sleep(1)
            
    elif section == 'Financial Statements':
        prompt1 = f'From given paragraph : \n{section_text}\n What is the important numbers associated with companies finance here?'
        prompt2 = f'From given paragraph : \n{section_text}\n How is the result of the companies fincances Good, Moderate, Bad?'
        prompt3 = f'From given paragraph : \n{section_text}\n Is there growth in investment or Capital Expenditure of Company?'
        
        prompt_list = [prompt1,prompt2,prompt3]
        
        for prompt in prompt_list:
            response1 = model.generate_content(prompt)
            text.append(get_text_from_candidates(response1))
            
            time.sleep(1)
    
    else:
        prompt1 = f'From given paragraph : \n{section_text}\n Give insight of the company work?'
        prompt2 = f'From given paragraph : \n{section_text}\n Give me small details about management of Company'
        prompt3 = f'From given paragraph : \n{section_text}\n Give me details of risk associated'
        
        prompt_list = [prompt1,prompt2,prompt3]
        
        for prompt in prompt_list:
            response1 = model.generate_content(prompt)
            text.append(get_text_from_candidates(response1))
            
            time.sleep(1)