In [1]:
import os

from transformers.utils.chat_template_utils import returns_re

os.chdir('..')
print(os.getcwd())

/Users/Placebo/MyTechProjects/LLM-Learning/codes/RAG


In [2]:
%load_ext autoreload
%autoreload 2
from src.utils import *
load_llm_config()

In [3]:
from dotenv import load_dotenv
from llama_parse import LlamaParse
from llama_index.core import (
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import (
    TitleExtractor, 
    QuestionsAnsweredExtractor)
from llama_index.core.ingestion import IngestionPipeline
from llama_index.retrievers.bm25 import BM25Retriever
import Stemmer
import nest_asyncio
nest_asyncio.apply()
import asyncio

In [12]:
# load document
load_dotenv()
pdf_path = '../data/tesla.pdf'
parser = LlamaParse(
    api_key = os.getenv('LLAMA_CLOUD_API_KEY'),
    show_progress = True,
    n_workers = 8
)
documents = await parser.aload_data(pdf_path)

Started parsing the file under job_id 8261cf44-8379-48af-b1c4-b70e3426f792


# Indexing

In [14]:

pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=512, chunk_overlap=20),
    ]
)
nodes = pipeline.run(documents=documents, num_workers=8)



AttributeError: 'list' object has no attribute 'persist'

In [13]:
# # run llm based
# llm_pipeline = IngestionPipeline(
#     transformations=[
#         TitleExtractor(nodes=5),
#         QuestionsAnsweredExtractor(questions=8)
#     ]
# )
# nodes = llm_pipeline.arun(nodes=nodes, num_workers=10)

In [17]:
index = VectorStoreIndex(nodes=nodes)

In [18]:
index.storage_context.persist(persist_dir='../data/index_storage')

In [4]:
# load
storage_context = StorageContext.from_defaults(persist_dir='../data/index_storage')
index = load_index_from_storage(storage_context)

# Retrieval

In [6]:
bm25_retriever = BM25Retriever.from_defaults(
    docstore = index.docstore,
    similarity_top_k = 10,
    stemmer = Stemmer.Stemmer('english'),
    language = 'en'
)
bm25_retriever.persist('../data/bm25_retriever')

Finding newlines for mmindex:   0%|          | 0.00/892k [00:00<?, ?B/s]

## Create retriever engine

In [9]:
from llama_index.core.retrievers import VectorIndexRetriever, QueryFusionRetriever
from llama_index.core.query_engine import TransformQueryEngine, RetrieverQueryEngine

In [15]:
index_retriever = VectorIndexRetriever(
    index = index,
    nodes = nodes,
    similarity_top_k=10
)

In [17]:
hybrid_retriever = QueryFusionRetriever(
    retrievers = [bm25_retriever, index_retriever],
    similarity_top_k=5,
    num_queries=3,
    use_async=True,
    verbose=True
)


In [24]:
async def get_response(query):
    response = await hybrid_retriever_engine.aquery(query)
    return response 

hybrid_retriever_engine = RetrieverQueryEngine.from_args(hybrid_retriever)

In [26]:
reposne = await get_response('What is revenue of Tesla?')
print(reposne)

Generated queries:
1. Tesla annual revenue 2023
2. Tesla quarterly earnings report
3. Tesla financial performance analysis
Tesla recognized total revenues of $96.77 billion in 2023.


In [27]:
reposne = await get_response('What is Automotive sales in 2023')
print(reposne)

Generated queries:
1. Automotive sales statistics for 2023
2. Trends in car sales for 2023
3. Global automotive market performance in 2023
Automotive sales in 2023 were $78,509 million.


In [28]:
reposne = await get_response('What is challenge for Tesla?')
print(reposne)

Generated queries:
1. What are the main challenges faced by Tesla in 2023?
2. How is Tesla addressing production and supply chain issues?
3. What are the financial challenges impacting Tesla's growth?
Tesla faces several challenges, including the need to ramp up efficient and cost-effective manufacturing capabilities, manage supply chain complexities, and address potential delays in production and regulatory approvals. Additionally, they must ensure responsible sourcing of materials, maintain high safety standards, and manage risks associated with product recalls and warranty claims. Labor union activities and global component shortages, such as the semiconductor shortage, also pose significant challenges.


In [29]:
reposne = await get_response('What is best sale car in 2023')
print(reposne)

Generated queries:
1. Top-rated cars for sale in 2023
2. Best-selling cars of 2023
3. Most popular cars to buy in 2023
The best sale car in 2023 is the Cybertruck, a full-size electric pickup truck with a stainless steel exterior that combines the utility and strength of a truck with the speed of a sports car.


In [30]:
reposne = await get_response('Summarize the document')
print(reposne)

Generated queries:
1. How to effectively summarize a document
2. Techniques for summarizing lengthy documents
3. Tools for automatic document summarization
The document outlines several key areas:

1. **Changes in and Disagreements with Accountants**: There are no changes or disagreements with accountants on accounting and financial disclosure.

2. **Controls and Procedures**: Management, including the CEO and CFO, evaluated the effectiveness of disclosure controls and procedures, concluding they were effective as of December 31, 2023. They acknowledge that controls can only provide reasonable assurance due to resource constraints and the need for management judgment.

3. **Management’s Report on Internal Control over Financial Reporting**: Management is responsible for maintaining adequate internal control over financial reporting.

4. **Recent Accounting Pronouncements**: 
   - **ASU No. 2023-07**: Issued in November 2023, it updates reportable segment disclosure requirements and is 