In [4]:
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
import nest_asyncio
import os
from llama_index.core import PromptTemplate

In [11]:
# Jupyter already runs an asyncio event loop, and the synchronous load_data()
# path below needs to run async parsing inside it. Patch the loop here so this
# cell works on a fresh kernel instead of relying on an earlier, unseen call.
nest_asyncio.apply()

# LlamaParse is imported in the notebook's import cell.
parser = LlamaParse(
    api_key=os.environ.get("LLAMA_CLOUD_API_KEY"),  # read from the LLAMA_CLOUD_API_KEY environment variable
    result_type="text",  # "markdown" and "text" are available
    verbose=True,
)

# Hand every .pdf found in the folder to the LlamaParse parser.
file_extractor = {".pdf": parser}
documents_dir = SimpleDirectoryReader(
    "./data/Test_Slides", file_extractor=file_extractor, filename_as_id=True,
).load_data()

Started parsing the file under job_id cac11eca-76e5-4b88-9a37-52c01075d0a0
Started parsing the file under job_id cac11eca-9c5b-4eff-b5d8-e96821b02f49
Started parsing the file under job_id cac11eca-b682-40a2-b4f4-aa8529868fe0
Started parsing the file under job_id cac11eca-1492-4995-9465-7fbc0ce3effb
Started parsing the file under job_id cac11eca-55ac-4094-9ab3-7134933f34cd


In [12]:
# Sanity check: show the first parsed document (its ID and the start of its text).
print(documents_dir[0])

Doc ID: f:\Study\Github_Repos\DR\llama-index\data\Test_Slides\L01-f23.pdf_part_0
Text: 0   63542     EE542    Loculro I:Intodctbn     Lecture 1:
Introduction     Internet and Cloud Computing       Young Cho
Department of Electrical Engineering     University of Southern
California                                             1 --- Goals
Goals      Internet       ◦ Computer Networking       ◦ Internet
protocols       ◦ Network...


In [72]:
async def load_pdfs_directory(folder_path):
    """Parse every PDF file directly inside ``folder_path`` with LlamaParse.

    Files are handled one at a time in sorted filename order so the result is
    deterministic. The ``.pdf`` extension check is case-insensitive, and
    directory entries that are not regular files are skipped.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A flat list of the documents the parser returned for all PDFs. Each
        document's metadata is set to ``{"filename": ..., "category": "PDF file"}``.
        A file for which the parser returns no documents contributes nothing.

    Raises:
        OSError: If ``folder_path`` cannot be listed (propagated from ``os.listdir``).
    """
    parser = LlamaParse(
        api_key=os.environ.get("LLAMA_CLOUD_API_KEY"),  # read from the LLAMA_CLOUD_API_KEY environment variable
        result_type="text",  # "markdown" and "text" are available
        verbose=True,
    )
    documents = []
    # os.listdir gives no ordering guarantee; sort so runs are reproducible.
    for filename in sorted(os.listdir(folder_path)):
        # Accept ".pdf" in any letter case (e.g. "SLIDES.PDF").
        if not filename.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            continue
        print(f'Loading file {file_path}')
        parsed = await parser.aload_data(file_path)
        # Tag every returned document, not just the first one; an empty result
        # is simply skipped instead of raising IndexError.
        for document in parsed:
            document.metadata = {"filename": filename, "category": "PDF file"}
        documents.extend(parsed)
    return documents

In [73]:
# Example usage
folder_path = '.\\data\\EE542_Slides'
documents_single = await load_pdfs_directory(folder_path)

Loading file .\data\EE542_Slides\L01-f23.pdf
Started parsing the file under job_id cac11eca-7f04-4d5b-ae7a-8de92d056a86
Loading file .\data\EE542_Slides\L02-f23.pdf
Started parsing the file under job_id cac11eca-c889-4968-aae9-7579aba8beca
Loading file .\data\EE542_Slides\L03-f23.pdf
Started parsing the file under job_id cac11eca-4ced-4fee-bea2-3e206e4d9c84
Loading file .\data\EE542_Slides\L04-f23.pdf
Started parsing the file under job_id cac11eca-e7a1-4a7b-80c7-011561d90a39
Loading file .\data\EE542_Slides\L05-f23.pdf
Started parsing the file under job_id cac11eca-5604-4c3e-8a98-f1d8b82aceaa
Loading file .\data\EE542_Slides\L06-f23.pdf
Started parsing the file under job_id cac11eca-48e8-476e-aad5-1142aed2b89f
Loading file .\data\EE542_Slides\L07-f23.pdf
Started parsing the file under job_id cac11eca-8a0b-4d78-ac76-6bdd326a78f4
Loading file .\data\EE542_Slides\L08-f23.pdf
Started parsing the file under job_id cac11eca-5a12-416c-83ca-9deafd64fb6f
Loading file .\data\EE542_Slides\L09-f23

### Creating and Saving the index locally

In [74]:
# Build a vector index over the parsed slide documents, then write its storage
# to disk under index/pdf.
index = VectorStoreIndex.from_documents(documents=documents_single)
index.storage_context.persist(persist_dir="index/pdf")

### Query Pipeline

In [75]:
from llama_index.core.query_pipeline import QueryPipeline
from llama_index.core import PromptTemplate
from pydantic import BaseModel, Field
from llama_index.core.output_parsers import PydanticOutputParser
from llama_index.llms.openai import OpenAI
from llama_index.core.memory import ChatMemoryBuffer

In [76]:
# NOTE: pydantic includes the class docstring in the generated JSON schema as
# its description, and that schema is appended to the prompt by the output
# parser below, so the docstring must describe the answer object accurately.
class AnswerFormat(BaseModel):
    """An answer to the user's question, with the PDF file and page where it was found."""

    answer: str = Field(..., description="Your Answer to the given question")
    file_name: str = Field(..., description="PDF file's file name where the answer can be found, fill in with empty string if you couldn't find it")
    page_number: int = Field(..., description="Page number where the answer can be found, fill in with 0 if you couldn't find it")

# Parser that turns the model's JSON reply into an AnswerFormat instance.
output_parser = PydanticOutputParser(output_cls=AnswerFormat)

# Base instructions with a {question} placeholder; output_parser.format()
# appends the JSON format instructions derived from AnswerFormat.
json_prompt_str = output_parser.format(
    "Please answer the following question according to the context given to you, "
    "don't answer if you can't find it in the context.\n"
    "{question}\n"
    "Then please output with the following JSON format:\n"
)

In [103]:
llm = OpenAI(model="gpt-4o")
json_prompt_tmpl = PromptTemplate(json_prompt_str)
# Only needed by the chat-engine alternative sketched below.
memory = ChatMemoryBuffer.from_defaults(token_limit=3000)
# Pass the model explicitly: previously `llm` was created but never handed to
# anything, so the query engine did not use it.
query_engine = index.as_query_engine(llm=llm, similarity_top_k=10)
# Alternative (disabled): a context chat engine that keeps conversation memory.
# chat_engine = index.as_chat_engine(
#     chat_mode="context",
#     memory=memory,
#     system_prompt=(
#         "You are a chatbot assistant helping a user with their questions. "
#         "You are only allowed to provide information that is inside the context given to you. "
#     ),
# )
# Pipeline: fill the prompt template -> query the index -> parse the reply
# into an AnswerFormat object.
p = QueryPipeline(chain=[json_prompt_tmpl, query_engine, output_parser], verbose=True)

In [100]:
# Run the pipeline for one question and show each parsed field on its own line.
output = p.run(question="Internet Security History")
print(output.answer, output.file_name, output.page_number, sep="\n")

[1;3;38;2;155;135;227m> Running module 6e19b24e-a2e6-4db2-a609-8e0b4b4f7ca2 with input: 
question: Internet Security History

[0m[1;3;38;2;155;135;227m> Running module 5c604f80-b64b-43f9-bf5c-358db07963b3 with input: 
input: Please answer the following question according to the context given to you, don't answer if you can't find it in the context.
Internet Security History
Then please output with the following JSON forma...

[0m[1;3;38;2;155;135;227m> Running module c603e46b-4e43-4dbc-a7f6-27bee8b6bc7f with input: 
input: {
  "answer": "Robert Tappan Morris launched a worm that had a major impact, causing targeted machines to slow dramatically from resource starvation. He was arrested, found guilty, and sentenced to co...

[0mRobert Tappan Morris launched a worm that had a major impact, causing targeted machines to slow dramatically from resource starvation. He was arrested, found guilty, and sentenced to community service and a fine. Today, he is a tenured professor at MIT and 