In [2]:
# Enable autoreload (mode 2) so edits to imported project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): the star import hides which names this notebook takes from
# src.utils (load_llm_config is presumably one of them) — prefer explicit imports.
from src.utils import *
# Presumably configures the LLM settings used by the cells below; the
# implementation is not visible from this notebook — confirm in src/utils.
load_llm_config()

In [3]:
import asyncio
import os

import nest_asyncio
import Stemmer
from dotenv import load_dotenv
from llama_index.core import (
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage
)
from llama_index.core.extractors import (
    TitleExtractor, 
    QuestionsAnsweredExtractor)
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.retrievers.bm25 import BM25Retriever
from llama_parse import LlamaParse

nest_asyncio.apply()

In [4]:
# load document
load_dotenv()
pdf_path = '../data/tesla.pdf'
parser = LlamaParse(
    api_key = os.getenv('LLAMA_CLOUD_API_KEY'),
    show_progress = True,
    n_workers = 8
)
documents = await parser.aload_data(pdf_path)

Started parsing the file under job_id 5381e174-900e-4e4f-b40d-2a82c7b73e6d
.

# Indexing

In [5]:

# Split the parsed documents into sentence-aware chunks (512-token chunks,
# 20-token overlap) and collect the resulting nodes.
chunker = SentenceSplitter(chunk_size=512, chunk_overlap=20)
pipeline = IngestionPipeline(transformations=[chunker])
nodes = pipeline.run(
    documents=documents,
    num_workers=8,
)



In [8]:
# Build a vector index over the chunked nodes, then write it to disk so it can
# be reloaded later without rebuilding.
index = VectorStoreIndex(nodes)
index.storage_context.persist(
    persist_dir='../data/index_storage',
)

In [9]:
# Reload the persisted index from disk; `index` now refers to the stored copy.
storage_context = StorageContext.from_defaults(
    persist_dir='../data/index_storage',
)
index = load_index_from_storage(storage_context=storage_context)

# Retrieval

In [10]:
# Keyword (BM25) retriever over the nodes held in the index docstore, using an
# English stemmer; the built retriever is persisted to disk afterwards.
english_stemmer = Stemmer.Stemmer('english')
bm25_retriever = BM25Retriever.from_defaults(
    docstore=index.docstore,
    similarity_top_k=10,
    stemmer=english_stemmer,
    language='en',
)
bm25_retriever.persist('../data/bm25_retriever')

DEBUG:bm25s:Building index from IDs objects


Finding newlines for mmindex:   0%|          | 0.00/1.47M [00:00<?, ?B/s]

## Create retriever engine

In [11]:
from llama_index.core.retrievers import VectorIndexRetriever, QueryFusionRetriever
from llama_index.core.query_engine import TransformQueryEngine, RetrieverQueryEngine

In [12]:
# Dense (embedding) retriever over the vector index, returning the 10 most
# similar nodes per query.
# `nodes` is intentionally not passed: VectorIndexRetriever has no such
# parameter (it was swallowed by **kwargs), and referencing it made this cell
# depend on the ingestion cell even when the index is loaded from disk.
index_retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=10,
)

In [13]:
# Hybrid retriever: runs the original query plus LLM-generated variants
# (num_queries=3 in total) against both the BM25 and the vector retriever,
# then fuses the results and keeps the top 5 nodes.
hybrid_retriever = QueryFusionRetriever(
    retrievers=[bm25_retriever, index_retriever],
    similarity_top_k=5,
    num_queries=3,
    # BM25 scores and embedding similarities are on different scales, so the
    # default score-based "simple" fusion lets one retriever dominate the
    # ranking. Reciprocal rank fusion combines by rank position instead.
    mode="reciprocal_rerank",
    use_async=True,
    verbose=True,
)


In [14]:
# Query engine built on the hybrid retriever; defined before the helper that
# uses it so the cell reads top to bottom.
hybrid_retriever_engine = RetrieverQueryEngine.from_args(hybrid_retriever)


async def get_response(query):
    """Send `query` to the hybrid query engine and return its response.

    Args:
        query: The question text passed to the engine's async `aquery`.

    Returns:
        Whatever `hybrid_retriever_engine.aquery` returns for the query.
    """
    return await hybrid_retriever_engine.aquery(query)

In [15]:
reposne = await get_response('What is revenue of Tesla?')
print(reposne)

Generated queries:
Tesla annual revenue 2023
Tesla quarterly earnings report
The revenue of Tesla is not provided in the given information.


In [16]:
reposne = await get_response('What is Automotive sales in 2023')
print(reposne)

Generated queries:
Automotive industry sales statistics for 2023
Current trends in automotive sales for 2023
Automotive sales in 2023 were $78.509 billion.


In [17]:
reposne = await get_response('What is challenge for Tesla?')
print(reposne)

Generated queries:
Challenges faced by Tesla in the automotive industry
Current obstacles and issues for Tesla in 2023
Tesla faces several challenges, including increased volatility as they expand and adjust operations, competition in the sustainable transportation market, potential impacts on supplier liquidity and allocation plans, changes in government and economic incentives for electric vehicles, and the need to maintain efficient delivery and servicing capabilities as production volumes grow. Additionally, they are dealing with legal issues related to allegations of systemic race discrimination and hostile work environment, as well as increased scrutiny and changing expectations regarding their environmental, social, and governance (ESG) practices.


In [18]:
reposne = await get_response('What is best sale car in 2023')
print(reposne)

Generated queries:
Best cars to buy in 2023
Top car deals and discounts in 2023
The information provided does not specify which car was the best-selling in 2023.


In [19]:
reposne = await get_response('Summarize the document')
print(reposne)

Generated queries:
Summarize the key points of the document
Provide a brief overview of the document's content
The document outlines various aspects of a company's operations and policies. It defines key terms such as the effective date and erroneously awarded compensation. The company designs, manufactures, sells, and leases electric vehicles and energy systems, emphasizing performance, safety, and cost reduction. It operates in two segments: automotive and energy generation and storage. The document also includes details about warranties, performance guarantees, and certifications related to financial reporting and compliance. Additionally, it lists various exhibits and agreements, including a credit agreement and a clawback policy.
