In [1]:
import nest_asyncio

nest_asyncio.apply()

import os
import openai
# Reuse an OPENAI_API_KEY that is already exported instead of overwriting it
# with an empty string. Prompt for one only when none is configured, so the
# secret never has to be written into the notebook itself.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
openai.api_key = os.environ["OPENAI_API_KEY"]

In [2]:
from llama_index import ServiceContext
from llama_index.llms import OpenAI
from llama_index.schema import MetadataMode

In [3]:
# Shared model handle: passed to the metadata extractors and to the
# service context used by the sub-question generator.
llm = OpenAI(
    model="gpt-3.5-turbo",
    temperature=0.1,
    max_tokens=512,
)


In [4]:
from llama_index.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
    EntityExtractor,
    BaseExtractor,
)
from llama_index.text_splitter import TokenTextSplitter

# Splitter configured with chunk_size=512 and chunk_overlap=128, breaking on
# single spaces.
text_splitter = TokenTextSplitter(
    chunk_size=512,
    chunk_overlap=128,
    separator=" ",
)


class CustomExtractor(BaseExtractor):
    """Example extractor that merges two existing metadata fields into one.

    For every node it emits a ``{"custom": ...}`` dict whose value is the
    node's ``document_title`` and ``excerpt_keywords`` metadata joined by a
    newline. Both keys are read with plain indexing, so a ``KeyError`` is
    raised when either is absent. Presumably they are written by
    ``TitleExtractor`` and ``KeywordExtractor`` running earlier in the
    pipeline -- confirm before enabling this extractor.
    """

    def extract(self, nodes):
        """Return one ``{"custom": ...}`` dict per node, in input order."""
        return [
            {
                "custom": (
                    node.metadata["document_title"]
                    + "\n"
                    + node.metadata["excerpt_keywords"]
                )
            }
            for node in nodes
        ]

    async def aextract(self, nodes):
        """Async counterpart of :meth:`extract`.

        ``BaseExtractor`` declares ``aextract`` as its abstract hook in recent
        llama_index releases, so a subclass that only overrides ``extract``
        cannot be instantiated there. NOTE(review): confirm against the
        installed llama_index version. The work is purely in-memory, so this
        simply delegates to the synchronous implementation.
        """
        return self.extract(nodes)


# Extractors that are actually applied; the commented-out entries are
# optional extras that can be switched on individually.
extractors = [
    TitleExtractor(llm=llm, nodes=5),
    QuestionsAnsweredExtractor(llm=llm, questions=3),
    # EntityExtractor(prediction_threshold=0.5),
    # SummaryExtractor(summaries=["prev", "self"], llm=llm),
    # KeywordExtractor(keywords=10, llm=llm),
    # CustomExtractor()
]

# Transformation list: the splitter first, then every extractor above.
transformations = [text_splitter, *extractors]

In [5]:
from llama_index import SimpleDirectoryReader


In [6]:
# Fetch the two sample filings into ./data. The Dropbox files uber.pdf and
# lyft.pdf are saved under generic names (10k-132.pdf, 10k-vFinal.pdf).
!mkdir -p data
!wget -O "data/10k-132.pdf" "https://www.dropbox.com/scl/fi/6dlqdk6e2k1mjhi8dee5j/uber.pdf?rlkey=2jyoe49bg2vwdlz30l76czq6g&dl=1"
!wget -O "data/10k-vFinal.pdf" "https://www.dropbox.com/scl/fi/qn7g3vrk5mqb18ko4e5in/lyft.pdf?rlkey=j6jxtjwo8zbstdo4wz3ns8zoj&dl=1"

--2024-01-25 11:46:17--  https://www.dropbox.com/scl/fi/6dlqdk6e2k1mjhi8dee5j/uber.pdf?rlkey=2jyoe49bg2vwdlz30l76czq6g&dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.13.18, 2620:100:6057:18::a27d:d12
Connecting to www.dropbox.com (www.dropbox.com)|162.125.13.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://uc219789a86fee09b4a53fe8595a.dl.dropboxusercontent.com/cd/0/inline/CMC6JaAyXXJrvX1AFSvZebpmKoMTfMLsHil6IjvHVeWhfby_c4G-gycy30uM9Hoo4L8kkGMPo3B81Bto2xMWTxnOF8nqjRdOn1T-wE_FqjOVu197WWx5HXwJ6F-ASKn_Bws/file?dl=1# [following]
--2024-01-25 11:46:18--  https://uc219789a86fee09b4a53fe8595a.dl.dropboxusercontent.com/cd/0/inline/CMC6JaAyXXJrvX1AFSvZebpmKoMTfMLsHil6IjvHVeWhfby_c4G-gycy30uM9Hoo4L8kkGMPo3B81Bto2xMWTxnOF8nqjRdOn1T-wE_FqjOVu197WWx5HXwJ6F-ASKn_Bws/file?dl=1
Resolving uc219789a86fee09b4a53fe8595a.dl.dropboxusercontent.com (uc219789a86fee09b4a53fe8595a.dl.dropboxusercontent.com)... 162.125.13.15, 2620:100:6057:15::a27d:d0f
Connect

In [7]:
# The file name is intentionally uninformative -- a situation that may be
# common in production settings.
uber_docs = SimpleDirectoryReader(
    input_files=["data/10k-132.pdf"]
).load_data()
# Keep only the first three loaded entries plus entries 63-68.
uber_front_pages = uber_docs[:3]
uber_content = uber_docs[63:69]
uber_docs = [*uber_front_pages, *uber_content]


In [8]:
# Show every attribute stored on the first loaded Uber document.
vars(uber_docs[0])

{'id_': '1109a751-48cc-4463-b608-6cc2edec1824',
 'embedding': None,
 'metadata': {'page_label': '1',
  'file_name': '10k-132.pdf',
  'file_path': 'data/10k-132.pdf',
  'file_type': 'application/pdf',
  'file_size': 2829436,
  'creation_date': '2024-01-25',
  'last_modified_date': '2024-01-25',
  'last_accessed_date': '2024-01-23'},
 'excluded_embed_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'excluded_llm_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'relationships': {},
 'text': '2019\nAnnual  \nReport',
 'start_char_idx': None,
 'end_char_idx': None,
 'text_template': '{metadata_str}\n\n{content}',
 'metadata_template': '{key}: {value}',
 'metadata_seperator': '\n'}

In [9]:
from llama_index.ingestion import IngestionPipeline

# Build an ingestion pipeline from the splitter + extractor list and run it
# over the selected Uber documents; the result is kept in `uber_nodes`.
pipeline = IngestionPipeline(transformations=transformations)

uber_nodes = pipeline.run(documents=uber_docs)

100%|██████████| 1/1 [00:01<00:00,  1.04s/it]
100%|██████████| 1/1 [00:00<00:00,  2.22it/s]
100%|██████████| 3/3 [00:01<00:00,  2.88it/s]
100%|██████████| 1/1 [00:00<00:00,  1.22it/s]
100%|██████████| 3/3 [00:00<00:00,  3.73it/s]
100%|██████████| 3/3 [00:00<00:00,  4.65it/s]
100%|██████████| 2/2 [00:00<00:00,  2.79it/s]
100%|██████████| 3/3 [00:00<00:00,  5.05it/s]
100%|██████████| 3/3 [00:00<00:00,  3.64it/s]
100%|██████████| 20/20 [00:16<00:00,  1.18it/s]


In [10]:
# Inspect the `document_title` metadata stored on the first Uber node.
uber_nodes[0].metadata["document_title"]

'"Unveiling the Multifaceted Terrain of 2019: An All-Encompassing Annual Report"'

In [11]:
# Same lookup for the third Uber node, for comparison with the first one.
uber_nodes[2].metadata["document_title"]

'Form 10-K Annual Report for Uber Technologies, Inc. for the fiscal year ended December 31, 2019: Securities Act Filing Requirements, Company Classification, and Summary of Financial Information for an Emerging Growth Company.'

In [12]:
# The file name is intentionally uninformative -- a situation that may be
# common in production settings.
lyft_docs = SimpleDirectoryReader(input_files=["data/10k-vFinal.pdf"]).load_data()
# Keep only the first three loaded entries plus entries 68-72.
lyft_front_pages = lyft_docs[:3]
lyft_content = lyft_docs[68:73]
lyft_docs = [*lyft_front_pages, *lyft_content]

In [13]:
from llama_index.ingestion import IngestionPipeline

# Create a new pipeline with the same transformation list and run it over the
# selected Lyft documents; the result is kept in `lyft_nodes`.
pipeline = IngestionPipeline(transformations=transformations)

lyft_nodes = pipeline.run(documents=lyft_docs)

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  1.59it/s]
100%|██████████| 3/3 [00:00<00:00,  3.03it/s]
100%|██████████| 1/1 [00:00<00:00,  1.16it/s]
100%|██████████| 2/2 [00:00<00:00,  4.92it/s]
100%|██████████| 3/3 [00:00<00:00,  3.58it/s]
100%|██████████| 3/3 [00:00<00:00,  3.67it/s]
100%|██████████| 4/4 [00:00<00:00,  5.53it/s]
100%|██████████| 3/3 [00:01<00:00,  2.92it/s]
100%|██████████| 20/20 [00:11<00:00,  1.71it/s]


In [14]:
# Show the full metadata dict attached to the third Lyft node.
lyft_nodes[2].metadata


{'page_label': '2',
 'file_name': '10k-vFinal.pdf',
 'file_path': 'data/10k-vFinal.pdf',
 'file_type': 'application/pdf',
 'file_size': 3416577,
 'creation_date': '2024-01-25',
 'last_modified_date': '2024-01-25',
 'last_accessed_date': '2024-01-25',
 'document_title': 'Lyft, Inc. Annual Report on Form 10-K for the Fiscal Year Ended December 31, 2020',
 'questions_this_excerpt_can_answer': '1. Has Lyft, Inc. filed all the required reports under Section 13 or 15(d) of the Securities Exchange Act of 1934 in the past 12 months?\n2. Has Lyft, Inc. submitted all the Interactive Data Files required under Rule 405 of Regulation S-T in the past 12 months?\n3. Is Lyft, Inc. considered a large accelerated filer according to the definitions provided in Rule 12b-2 of the Exchange Act?'}

In [15]:
from llama_index.question_gen.llm_generators import LLMQuestionGenerator
from llama_index.question_gen.prompts import DEFAULT_SUB_QUESTION_PROMPT_TMPL

# Service context pairing the shared LLM with the notebook's text splitter.
service_context = ServiceContext.from_defaults(llm=llm, text_splitter=text_splitter)

# The custom instruction is prepended to the stock sub-question prompt. It is
# spelled out line by line so that the trailing spaces and indentation that
# are part of the prompt text stay visible.
question_gen = LLMQuestionGenerator.from_defaults(
    service_context=service_context,
    prompt_template_str=(
        "\n"
        "        Follow the example, but instead of giving a question, always prefix the question \n"
        "        with: 'By first identifying and quoting the most relevant sources, '. \n"
        "        "
        + DEFAULT_SUB_QUESTION_PROMPT_TMPL
    ),
)

In [16]:
from copy import deepcopy

# Work on deep copies so the enriched `uber_nodes` / `lyft_nodes` stay intact.
nodes_no_metadata = deepcopy(uber_nodes) + deepcopy(lyft_nodes)
for node in nodes_no_metadata:
    # Keep only the page label and file name; every other key is dropped.
    node.metadata = {
        key: value
        for key, value in node.metadata.items()
        if key in ("page_label", "file_name")
    }
# Preview one stripped node, rendered in MetadataMode.LLM.
print(
    "LLM sees:\n",
    nodes_no_metadata[9].get_content(metadata_mode=MetadataMode.LLM),
)

LLM sees:
 [Excerpt from document]
page_label: 66
Excerpt:
-----
62 2019 Compared to 2018 
Adjusted EBITDA loss increased $878 million, or 48%, primar ily attributable to continued investments within our non-
Rides offerings and an increase in corpor ate overhead as we grow the business. Th ese investments drove an increase in our 
Adjusted EBITDA loss margin as a percentage of  Adjusted Net Revenue of (3)% to (21)%. 
Components of Results of Operations 
The following discussion on trends in our components of results of operations excludes IPO related impacts as well 
as the Driver appreciation award of $299 million, both of which occurred during the second quarter of 2019. The Driver 
appreciation award was accounted for as a Driver incentive.  For additional information about our IPO, see Note 1 - 
Description of Business and Summary of Significant Accoun ting Policies to our consolidated financial statements 
included in Part II, Item 8, “Financial  Statements and Supplementary Data

In [17]:
from llama_index import VectorStoreIndex
from llama_index.query_engine import SubQuestionQueryEngine
from llama_index.tools import QueryEngineTool, ToolMetadata

In [18]:
# Baseline vector index over the metadata-stripped nodes, with gpt-4 as the
# LLM of its service context.
index_no_metadata = VectorStoreIndex(
    nodes=nodes_no_metadata,
    service_context=ServiceContext.from_defaults(
        llm=OpenAI(model="gpt-4"),
    ),
)
# Query engine configured with similarity_top_k=10.
engine_no_metadata = index_no_metadata.as_query_engine(similarity_top_k=10)

In [19]:
# Sub-question engine for the baseline: a single tool wrapping
# `engine_no_metadata`, driven by the customised question generator.
final_engine_no_metadata = SubQuestionQueryEngine.from_defaults(
    question_gen=question_gen,
    use_async=True,
    query_engine_tools=[
        QueryEngineTool(
            metadata=ToolMetadata(
                description="financial information on companies",
                name="sec_filing_documents",
            ),
            query_engine=engine_no_metadata,
        ),
    ],
)


In [20]:
from llama_index import QueryBundle

# Baseline run: send the comparison question to the engine built from the
# metadata-stripped nodes and print the synthesized answer text.
response_no_metadata = final_engine_no_metadata.query(
    """
    What was the cost due to research and development v.s. sales and marketing for uber and lyft in 2019 in millions of USD?
    Give your answer as a JSON.
    """
)
print(response_no_metadata.response)

Generated 4 sub questions.
[1;3;38;2;237;90;200m[sec_filing_documents] Q: What was the cost due to research and development for Uber in 2019 in millions of USD
[0m[1;3;38;2;90;149;237m[sec_filing_documents] Q: What was the cost due to sales and marketing for Uber in 2019 in millions of USD
[0m[1;3;38;2;11;159;203m[sec_filing_documents] Q: What was the cost due to research and development for Lyft in 2019 in millions of USD
[0m[1;3;38;2;155;135;227m[sec_filing_documents] Q: What was the cost due to sales and marketing for Lyft in 2019 in millions of USD
[0m[1;3;38;2;90;149;237m[sec_filing_documents] A: The cost due to sales and marketing for Uber in 2019 was $4,626 million.
[0m[1;3;38;2;155;135;227m[sec_filing_documents] A: The cost due to sales and marketing for Lyft in 2019 was $814.122 million.
[0m[1;3;38;2;237;90;200m[sec_filing_documents] A: The cost due to research and development for Uber in 2019 was $1,505,640 in thousands, which is equivalent to $1,505.64 million.


In [21]:
# Preview the node at position 9 of the combined, metadata-enriched node
# list, rendered in MetadataMode.LLM.
print(
    "LLM sees:\n",
    [*uber_nodes, *lyft_nodes][9].get_content(metadata_mode=MetadataMode.LLM),
)

LLM sees:
 [Excerpt from document]
page_label: 66
file_path: data/10k-132.pdf
document_title: Financial Performance, Revenue Generation, Cost of Revenue, and Operations and Support Expenses in the Annual Report on Form 10-K: Entities and Themes.
questions_this_excerpt_can_answer: 1. What were the factors that contributed to the increase in Adjusted EBITDA loss in 2019 compared to 2018?
2. How does the company generate its revenue and what is its revenue recognition policy?
3. What are the components included in the cost of revenue, exclusive of depreciation and amortization, for the company?
Excerpt:
-----
62 2019 Compared to 2018 
Adjusted EBITDA loss increased $878 million, or 48%, primar ily attributable to continued investments within our non-
Rides offerings and an increase in corpor ate overhead as we grow the business. Th ese investments drove an increase in our 
Adjusted EBITDA loss margin as a percentage of  Adjusted Net Revenue of (3)% to (21)%. 
Components of Results of Oper

In [22]:
# Vector index over the metadata-enriched Uber and Lyft nodes, with gpt-4 as
# the LLM of its service context.
index = VectorStoreIndex(
    nodes=uber_nodes + lyft_nodes,
    service_context=ServiceContext.from_defaults(
        llm=OpenAI(model="gpt-4"),
    ),
)
# Query engine configured with similarity_top_k=10.
engine = index.as_query_engine(similarity_top_k=10)

In [23]:
# Sub-question engine over the metadata-enriched index. The tool name and
# description are kept character-for-character identical to the ones used for
# `final_engine_no_metadata`, so the two runs differ only in node metadata.
final_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=[
        QueryEngineTool(
            query_engine=engine,
            metadata=ToolMetadata(
                name="sec_filing_documents",
                description="financial information on companies",
            ),
        )
    ],
    question_gen=question_gen,
    use_async=True,
)

In [24]:
# Metadata-enriched run: send the same comparison question to `final_engine`
# and print the synthesized answer text.
response = final_engine.query(
    """
    What was the cost due to research and development v.s. sales and marketing for uber and lyft in 2019 in millions of USD?
    Give your answer as a JSON.
    """
)
print(response.response)
# Correct answer:
# {"Uber": {"Research and Development": 4836, "Sales and Marketing": 4626},
#  "Lyft": {"Research and Development": 1505.6, "Sales and Marketing": 814 }}

Generated 4 sub questions.
[1;3;38;2;237;90;200m[sec_filing_documents] Q: What was the cost due to research and development for Uber in 2019 in millions of USD
[0m[1;3;38;2;90;149;237m[sec_filing_documents] Q: What was the cost due to sales and marketing for Uber in 2019 in millions of USD
[0m[1;3;38;2;11;159;203m[sec_filing_documents] Q: What was the cost due to research and development for Lyft in 2019 in millions of USD
[0m[1;3;38;2;155;135;227m[sec_filing_documents] Q: What was the cost due to sales and marketing for Lyft in 2019 in millions of USD
[0m[1;3;38;2;90;149;237m[sec_filing_documents] A: The cost due to sales and marketing for Uber in 2019 was $814,122 thousand, which is equivalent to $814.122 million.
[0m[1;3;38;2;11;159;203m[sec_filing_documents] A: The cost due to research and development for Lyft in 2019 was $1,505 million.
[0m[1;3;38;2;237;90;200m[sec_filing_documents] A: The cost due to research and development for Uber in 2019 was $4,836 million.
[0m