<a href="https://colab.research.google.com/github/kevalshah90/llms/blob/main/10k_chunk_optimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initial setup

In [None]:
import getpass
import os

# Prompt for the OpenAI API key without echoing it, then publish it through
# the process environment.
os.environ.update({"OPENAI_API_KEY": getpass.getpass()})

In [None]:
!pip install -U -q langchain openai unstructured==0.12.5 xmltodict instructor

# Download TSLA 10-K

In [None]:
from langchain_community.document_loaders import UnstructuredURLLoader

# Location of Tesla's FY2023 10-K on SEC EDGAR.
url = "https://www.sec.gov/Archives/edgar/data/1318605/000162828024002390/tsla-20231231.htm"

# NOTE: replace the placeholder User-Agent with your own name and e-mail;
# EDGAR may reject requests that do not identify the requester.
loader = UnstructuredURLLoader(urls=[url], headers={'User-Agent': 'yourname yourname@yourorg.com'})
tsla_10k = loader.load()

# By default the loader logs a failed URL and skips it instead of raising, so
# a rejected request would otherwise only surface later as an IndexError on
# `tsla_10k[0]`. Fail here with an actionable message instead.
if not tsla_10k:
    raise RuntimeError(
        f"No document could be loaded from {url}. "
        "Check the loader's logged error and the User-Agent header."
    )

# Define LLM code

In [None]:
# System prompt sent with every classification request. It tells the model to
# label a chunk of filing text with the 10-K Item(s) it belongs to, using the
# short labels ("Item 1", "Item 1A", ...) or an empty list when unsure.
prompt = """
You are an expert at extracting text data from SEC filings like 10-Ks, 10-Qs, etc.
Your sole job is to take a chunk of text and classify which Item in the SEC filing it belongs to.
The goal is to identify and extract the text corresponding to the following Items:
- Item 1. Business
- Item 1A. Risk Factors
- Item 2. Properties
- Item 3. Legal Proceedings
- Item 7. Management's Discussion and Analysis of Financial Condition and Results of Operations
- Item 7A. Quantitative and Qualitative Disclosures About Market Risk
- Item 8. Financial Statements and Supplementary Data

Please adhere to the following guidelines:
1. Be accurate, factual, and concise with your responses.
2. Your response MUST be grounded in truth and the data that is present in the text. Do not make any assumptions or inferences beyond what is explicitly stated in the text.
3. If the chunk of text does not clearly belong to any of the specified Items, return an empty list []. Do not attempt to guess or assign an Item if there is insufficient evidence in the text.
4. If the chunk of text belongs to multiple Items, return a list of all applicable Items, like ["Item 1", "Item 1A"].
5. Do not provide any explanations or additional information in your response. Only return a list of Items or an empty list.

Your response should be formatted as a valid JSON list, like ["Item 1", "Item 1A"] or [].
"""

In [None]:
from pydantic import BaseModel
from typing import List, Optional
import anthropic
from openai import OpenAI
import instructor

# Build a default OpenAI client (it reads OPENAI_API_KEY from the environment)
# and patch it with instructor so that `chat.completions.create` can be given
# a `response_model`.
base_client = OpenAI()
openai_client = instructor.patch(base_client)


class Item(BaseModel):
    # A single Item label the model assigned to a chunk.
    # `name` is later compared against the keys of `items_map`
    # (e.g. "Item 1", "Item 1A"); it may be None.
    name: Optional[str]

class ItemResponse(BaseModel):
    # Structured reply requested from the model via `response_model`:
    # every Item the chunk was classified under; an empty list means none.
    items: List[Item]


def classify_items(chunk: str) -> "ItemResponse":
    """Classify which 10-K Items a chunk of filing text belongs to.

    Sends the module-level system ``prompt`` plus ``chunk`` to the chat model
    through the instructor-patched ``openai_client`` and has the reply parsed
    into an ``ItemResponse``.

    Args:
        chunk: Text of one chunk of the filing.

    Returns:
        The parsed ``ItemResponse``. Its ``items`` list holds the Items the
        model assigned to the chunk and may be empty.
    """
    response = openai_client.chat.completions.create(
        model="gpt-4-0125-preview",
        # Ask for the reply to be parsed and validated as an ItemResponse
        # instead of being returned as raw text.
        response_model=ItemResponse,
        messages=[
            {
                "role": "system",
                "content": prompt,
            },
            {
                "role": "user",
                "content": f"Extract the items from the following chunk: {chunk}",
            },
        ],
    )
    return response


# Chunk TSLA 10-K

In [None]:
from langchain.text_splitter import TokenTextSplitter

# You can play around with the chunk_size and chunk_overlap to see what works best!
chunk_size = 4096
chunk_overlap = 400

# Token-based splitter configured with the sizes above
token_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# Flatten the filing onto one line: every newline becomes a single space
raw_filing = tsla_10k[0].page_content
filing_text = " ".join(raw_filing.split("\n"))

# Cut the flattened text into overlapping chunks
chunks = token_splitter.split_text(filing_text)

# Run Item extraction

In [None]:
from collections import defaultdict

"""
Define the output dictionary.
Key: Item name.
Value: Chunks of text that may belong to the Item.

You can modify to include more (or less) Items, if you wish.
"""
# One independent, initially empty bucket of chunks per label.
items_map = {
    label: []
    for label in (
        "Item 1",
        "Item 1A",
        "Item 2",
        "Item 3",
        "Item 7",
        "Item 7A",
        "Item 8",
        "unknown",
    )
}

In [None]:
import time

start_time = time.time()

# Classify every chunk and file it under each expected Item it was assigned.
for index, chunk in enumerate(chunks):
    # Use LLM to classify the possible Items for the chunk
    response = classify_items(chunk)

    # Get the possible Items
    items = response.items

    # Print output for debugging
    print(f"Chunk {index}. Items: {items}")

    # Keep only the labels we are expecting. `dict.fromkeys` drops repeats
    # while preserving order, so a label returned twice by the model does not
    # store the same chunk twice. A `None` name is simply not a key.
    matched_names = [
        name
        for name in dict.fromkeys(item.name for item in items)
        if name in items_map
    ]

    for name in matched_names:
        items_map[name].append(chunk)

    # Chunks with no recognised Item go to the "unknown" bucket, which would
    # otherwise stay empty and leave unclassified text out of the output.
    # `setdefault` keeps this working if that key was removed from the map.
    if not matched_names:
        items_map.setdefault("unknown", []).append(chunk)

    # Prevent rate limiting
    time.sleep(1)

end_time = time.time()
print(f"Time taken: {end_time - start_time} seconds")

# Save the output locally as a .json file

In [None]:
# Name of the local JSON file the extracted Items are meant to be saved under.
filename = "tsla_2023_10-k_items.json"

In [None]:
import json
from google.colab import files

# Assuming you have populated the items_map dictionary with the extracted text

# Convert the items_map dictionary to a JSON string
json_data = json.dumps(items_map, indent=2)

# Write the JSON string to the file named by the `filename` variable.
# (Previously the literal string "filename" was used, so the output was saved
# and downloaded as a file called "filename" with no .json extension.)
with open(filename, "w", encoding="utf-8") as file:
    file.write(json_data)

# Download the JSON file from Google Colab
files.download(filename)