# Installs

In [1]:
%%capture
#For raw data
!pip install -U sec-edgar-downloader
!pip install beautifulsoup4

# #For embeddings
!pip install tiktoken
!pip install chromadb

# #For LLM Insights
!pip install openai
!pip install langchain
!pip install langchain-openai
!pip install gradio

# Raw Data Processing

In [42]:
# Setup sec-edgar
from sec_edgar_downloader import Downloader

def download_filings(ticker):
  """
  Downloads all available 10-K filings for a given company ticker

  Args:
      ticker: Ticker of company
  """

  secdl = Downloader("DelhiTechnologicalUniversity","keshavnath_me20a9_47@dtu.ac.in")
  # Download specifics
  DOCTYPE = "10-K"
  AFTER = "1995-01-01"
  BEFORE = "2024-01-01"
  # Download 10-K filing
  try:
    secdl.get(DOCTYPE,ticker,after=AFTER,before=BEFORE)
  except:
    print(f"Could not find given ticker {ticker}")
  print("Downloaded all available filings")

In [43]:
# Text cleaning and singling out the actual 10-K filing
from bs4 import BeautifulSoup
import re

def get_text_content(text_path):
  """
  Extracts textual content from the 10-K HTML file using Beautiful Soup.
  1) Open .text file just as a text file even though its actually an XML because its a flawed XML
  2) The first contained document is an HTML of the 10-K filing, followed by embedded zip files, images, excel sheets, etc. (not useful)
  3) Instead of trying to create an XML Tree, we can simply find the end of the first </DOCUMENT> tag, end the input stream, and open it as an HTML
  4) Use bs4 HTML Parser to extract the actual textual content from the HTML (i.e the filing)
  
  - Future work: Try to use other embedded docs especially the excels and the tables

  Args:
      text_path: Path to SEC-EDGAR .txt file.

  Returns:
      String containing the extracted text content of 10-K Filing.
  """

  html_text=""
  token = "</DOCUMENT>"
  # The above variable contains the value < / D O C U M E N T > (w/o spaces) (also mentioned in the docstring: first _ tag) but apparently github doesn't show that token so I'm adding this comment for clarity

  try:
    with open(text_path, 'r') as f:
      # text_content = f.read()
      for line in f:
        html_text += line
        if token in html_text:
          break
  except FileNotFoundError:
    print(f"Error: File not found at {text_path}")
    exit()

  soup = BeautifulSoup(html_text, 'html.parser')

  # Exclude specific tags and attributes
  excluded_tags = ['img', 'script', 'style', 'link', 'object', 'form', 'button', 'td','embed','iframe']
  for tag in excluded_tags:
    for element in soup.find_all(tag):
      element.decompose()  # Remove the entire element

  # Extract text from remaining elements
  text = ''.join(node.string.strip() for node in soup.find_all(string=True))
  return text

In [44]:
# Create dictionary of year:filing
# Used to extract the relevant downloaded filing during RAG
import os

def create_yearly_dict(ticker):
  """
  Creates a mapping of year: filing path

  Args:
      Company ticker

  Returns:
      Dictionary that maps year to filing path
  """
  filings = {}
  checkdir = f'./sec-edgar-filings/{ticker}/10-K' #Default save directory template of sec-edgar
  for filename in os.listdir(checkdir):
    filing_folder = os.path.join(checkdir, filename)
    full_path = os.path.join(filing_folder,'full-submission.txt')
    year = filename.split('-')[1] # Files are in the format 123xx-YY-456xx
    year=int(year)
    if (year<=24): # Filenames only contain the last 2 digits of the year so we need to convert them to full years
      year+=2000
    else:
      year+=1900
    filings[year]=full_path

  return filings

#OpenAI Setup

In [45]:
import os
os.environ["OPENAI_API_KEY"] = 'openai_api_key' # Replace with API key if want to reproduce

In [46]:
# Create chat model
from langchain_openai import ChatOpenAI

chat = ChatOpenAI(temperature=0) # Temperature 0 for reproducability

In [47]:
# In case questions are repeated
from langchain.cache import InMemoryCache
import langchain
langchain.llm_cache = InMemoryCache()
langchain.chat_cache = InMemoryCache()

In [48]:
from langchain.prompts import HumanMessagePromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate, AIMessagePromptTemplate
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain_core.prompts.few_shot import FewShotPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser, PydanticOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import List

In [49]:
from langchain.document_loaders import TextLoader
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Document Embedding

In [50]:
def extract_years(prompt):
  """
  Extracts all the years from a prompt
  Important so that we don't unnecessarily waste credits embedding info we don't need. This is a naive method as it relies on mentions of each year.
  - Future work: use better parser (check for words like 'between' to generate list).

  Args:
      prompt: Input request

  Returns:
      A list of extracted years in orer of mention
  """
  pattern = r"\d{4}"
  extracted_years = re.findall(pattern, prompt)
  print(f"Found years:{extracted_years}")
  return extracted_years

In [34]:
!mkdir embeddings

mkdir: cannot create directory ‘embeddings’: File exists


In [51]:
# Embedding chunks of all 10-K filings

CHUNK_SIZE = 200
MAX_DOCS = 15
# Total of 3000 tokens given to context, and max 15 different years

embedding_function = OpenAIEmbeddings()

def embed_filings(ticker,filings_dict,extracted_years):
  """
  Breaks and embeds chunks of 10-K filings per year using OpenAI and ChromaDB

  Args:
      ticker: Ticker of company
      filings_dict: Mapping of filings generated from create_yearly_dict
  """

  text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=CHUNK_SIZE) # Recursive to ensure chunk size similarity as break tokens (\n, \n\n, ' ' etc) are inconsistent in HTML

  for year in extracted_years:
    year = int(year)
    filepath = filings_dict[year]
    clean_text = get_text_content(filepath)
    doc = Document(page_content=clean_text,metadata={"source":"local"}) # Create langchain document from text

    docs = text_splitter.split_documents([doc])
    db = Chroma.from_documents(docs, embedding_function, persist_directory=f'./embeddings/{ticker}/{year}')
    print(f"Completed embedding of filings for {ticker}:{year}")

# Prompting and Document Retrieval

In [52]:
# Pydantic output format so we can generate plots automatically
# The model can sometimes make mistakes with values in this format (compared w/ raw text output) but its necessary for plots and analysis
class Insight(BaseModel):
  title: str = Field(description="Title of the insight/metric to be reported (e.g. Net Income, Gross Margin as Percentage of Net Sales etc.)")
  datapoints: List[float] = Field(description="List of numerical (float) data points and correpsonding years (e.g. [10,20,30]) in ascending order of year")
  years: List[str] = Field(description="List of corresponding year for each datapoint (e.g. [2002,2003,2004]) in ascending order")
  unit: str = Field(description="Unit of measurement for the values (e.g. Million USD, Percentage, etc.)")

class ModelOutput(BaseModel):

  insights: List[Insight] = Field(description="List of insights extracted from the given context")
  summary: str = Field(description="Textual summary of the insights (e.g. These figures indicate a decline in revenue, gross margin, net income, and earnings per share from 1995 to 1996, along with an increase in restructuring costs. The company faced challenges in maintaining profitability and operational efficiency during this period.)")

  class Config:
    arbitrary_types_allowed = True

parser = PydanticOutputParser(pydantic_object=ModelOutput)

In [53]:
def generate_request(ticker,actual_prompt=None):
  """
  Generates the required prompt from an input promp, context, system prompt and output format

  Args:
      ticker: Company ticker
      actual_prompt: Requested prompt (if not given, a default prompt is run)

  Returns:
      request: Fully-formed ChatPrompt request
  """

  extracted_years = extract_years(actual_prompt)
  docs_per_year = MAX_DOCS//len(extracted_years) # Try to fill up 3000 tokens with context

  relevant_docs = {}

  for extracted_year in extracted_years:
    extracted_year=int(extracted_year)
    dbl = Chroma(persist_directory=f'./embeddings/{ticker}/{extracted_year}', embedding_function=embedding_function) # Open embeddings of required year
    retriever = dbl.as_retriever()

    search_kwargs = {"k":docs_per_year}
    picked = retriever.get_relevant_documents(actual_prompt,search_kwargs=search_kwargs) #Thought of using MultiQueryRetriever but wanted to minimise credit usage

    combined_context = ''.join([pick.page_content for pick in picked])
    relevant_docs[extracted_year] = combined_context

  # Basic Prompt engineering - I didn't really try many other system prompts so this could definitely be approved
  system_template = "You are an expert at carefully reading and analyzing the SEC 10-K filings of a company. You are able to identify key figures about the company from reading portions of these annual reports."
  system_template+= " You make sure to report the correct value, sign and unit for each metric/insight."
  system_prompt = SystemMessagePromptTemplate.from_template(system_template)

  prompt_template = actual_prompt + "\n"
  for extracted_year in relevant_docs.keys():
    prompt_template += f"Content from {extracted_year}:{relevant_docs[extracted_year]}\n\n"

  prompt_template+="{instructions}"

  human_prompt = HumanMessagePromptTemplate.from_template(prompt_template)

  chat_prompt = ChatPromptTemplate.from_messages([system_prompt,
                                                  human_prompt]) #Zero shot as of now, but few-shot would certainly make it better
  request = chat_prompt.format_prompt(instructions=parser.get_format_instructions())

  print(f"Generated prompts")

  return request

In [54]:
def send_request(request):
  """
  Sends request to ChatGPT and gets Pydantic object

  Args:
      request: Request from prompt function
  """
  result = chat.invoke(request)
  result_object = parser.parse(result.content) #Create pydantic object from json

  print(f"LLM called and output generated")

  return result_object

In [55]:
import matplotlib.pyplot as plt

def plotter(result_object):
  """
  Creates output plots from result object

  Args:
      result_object: Pydantic Object generated from send_request

  Returns:
      Generated plots
  """

  rows = (len(result_object.insights) + 1) // 2  # Adjust rows for a 2-column layout
  cols = min(2, len(result_object.insights))  # Adjust columns for a maximum of 2 plots per row
  fig, axes = plt.subplots(rows, cols, figsize=(12, 8)) # Easier to pass a single fig to Gradio so add all figs as subplots into a single fig

  for i, insight in enumerate(result_object.insights):
    ax = axes.flat[i]
    title = str(insight.title)
    unit = str(insight.unit)
    datapoints = insight.datapoints
    years = insight.years

    ax.bar(years, datapoints) # Bar plot
    ax.plot(years, datapoints, marker='o', color='green')  # Line plot
    for year, datapoint in zip(years,datapoints):
      ax.text(year, datapoint, f"{datapoint} {unit}", ha='center', va='bottom', fontsize=8)
    ax.set_xlabel("Year")
    ax.set_ylabel(f"{title} ({unit})")
    ax.set_title(f"{title} in {', '.join(years)}")
    ax.grid(True)

  fig.suptitle(f"Insights ({len(result_object.insights)} total)", fontsize=14)  # Add overall title
  plt.tight_layout()  # Adjust spacing between subplots

  print(f"Completed plot generation")

  # return plt
  return fig

In [66]:
saved_tickers = []

def pipeline(ticker: str, prompt=None):
  """
  Runs the pipeline from start to finish
  - Future work: check for ticker correctness and other error management

  Args:
    ticker: Company Ticker
    prompt: Insight you want to generate (defaults to earning-based insight)

  Returns:
    (plots: plt object of generated plots,
     summary: Textual summary of insights)
  """

  if not prompt:
    prompt = "Can you tell me the differences in company revenue and some other key metrics between the years 1995 and 1996 of this company?"
    # Default insight request if user can't think of one

  if (ticker not in saved_tickers):
    saved_tickers.append(ticker)
    download_filings(ticker) # No need to redownload the filings if we have already used this ticker in the current session

  filings_dict = create_yearly_dict(ticker)

  embed_filings(ticker,filings_dict,extract_years(prompt))

  request = generate_request(ticker,prompt)
  result_object = send_request(request)
  plots = plotter(result_object)
  summary = str(result_object.summary)+"\n"

  return plots,summary

In [None]:
# Basic gradio interface built into the notebook

import gradio as gr

demo = gr.Blocks()

with demo:
  gr.Markdown("Type a company ticker and request some insight, this will return an insight summary and some plots.")
  gr.Markdown("You can leave the prompt blank to obtain a default insight: 'Can you tell me the differences in company revenue and some other key metrics between the years 1995 and 1996 of this company?'")

  gr.Interface(
      fn=pipeline,
      inputs=["text", "text"],
      outputs=[gr.Plot(), "text"]
      )

demo.launch(share=True,debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://09675ed24b574b91ec.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Found years:['1995', '1996']
Completed embedding of filings for AAPL:1995
Completed embedding of filings for AAPL:1996
Found years:['1995', '1996']
Generated prompts
LLM called and output generated
Completed plot generation
