In [2]:
import fitz  # PyMuPDF
from PIL import Image
import io
import base64
from typing import Any, Dict, List, Optional
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core.query_pipeline import QueryPipeline, InputComponent, ArgPackComponent
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.openai import OpenAI
from llama_index.postprocessor.colbert_rerank import ColbertRerank
from llama_index.core.llms import ChatMessage
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.query_pipeline import CustomQueryComponent
from llama_index.core.schema import NodeWithScore
from pydantic import BaseModel
from llama_index.core.bridge.pydantic import Field
from llama_index.core.output_parsers import PydanticOutputParser

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from typing import Dict, Any

class AnswerFormat(BaseModel):
    """Object representing a single knowledge pdf file."""
    answer: str = "None"
    file_name: str = "None"
    page_number: int = 0

    @classmethod
    def schema(cls, by_alias: bool = True) -> Dict[str, Any]:
        schema = super().model_json_schema(by_alias)
        properties = schema.get("properties", {})

        # Manually adding descriptions
        properties["answer"]["description"] = "Your Answer to the given query"
        properties["file_name"]["description"] = "PDF file's file name where the answer can be found, fill in with empty string if you couldn't find it"
        properties["page_number"]["description"] = "Page number where the answer can be found, fill in with 0 if you couldn't find it"

        return schema

In [4]:
storage_context = StorageContext.from_defaults(persist_dir="./index/pdf")
index = load_index_from_storage(storage_context)

In [5]:
from llama_index.core.output_parsers import PydanticOutputParser

input_component = InputComponent()
output_parser = PydanticOutputParser(AnswerFormat)
prompt_str = """\
You are given a context with course materials in pdf files format, and you have access to these file's names, page numbers, and the content of the files. 
The file name is displayed at the beginning of every context chunk, and the page number is displayed as a integer number at the end of every page, and pages are separated by a dashed line.
Answer the following questions: {query_str}
And then find the file name and page number where the answer is mentioned.
"""
json_prompt_str = output_parser.format(prompt_str)
llm = OpenAI(
    model="gpt-4o",
    temperature=0.2,
)
retriever = index.as_retriever(similarity_top_k=10)
reranker = ColbertRerank(top_n=3)
DEFAULT_CONTEXT_PROMPT = json_prompt_str + (
    "Here is some context that may be relevant:\n"
    "-----\n"
    "{node_context}\n"
    "-----\n"
)
output_parser = PydanticOutputParser(AnswerFormat)


In [6]:
print(json_prompt_str)

You are given a context with course materials in pdf files format, and you have access to these file's names, page numbers, and the content of the files. 
The file name is displayed at the beginning of every context chunk, and the page number is displayed as a integer number at the end of every page, and pages are separated by a dashed line.
Answer the following questions: {query_str}
And then find the file name and page number where the answer is mentioned.



Here's a JSON schema to follow:
{{"description": "Object representing a single knowledge pdf file.", "properties": {{"answer": {{"default": "None", "title": "Answer", "type": "string", "description": "Your Answer to the given query"}}, "file_name": {{"default": "None", "title": "File Name", "type": "string", "description": "PDF file's file name where the answer can be found, fill in with empty string if you couldn't find it"}}, "page_number": {{"default": 0, "title": "Page Number", "type": "integer", "description": "Page number 

In [7]:
print(DEFAULT_CONTEXT_PROMPT)

You are given a context with course materials in pdf files format, and you have access to these file's names, page numbers, and the content of the files. 
The file name is displayed at the beginning of every context chunk, and the page number is displayed as a integer number at the end of every page, and pages are separated by a dashed line.
Answer the following questions: {query_str}
And then find the file name and page number where the answer is mentioned.



Here's a JSON schema to follow:
{{"description": "Object representing a single knowledge pdf file.", "properties": {{"answer": {{"default": "None", "title": "Answer", "type": "string", "description": "Your Answer to the given query"}}, "file_name": {{"default": "None", "title": "File Name", "type": "string", "description": "PDF file's file name where the answer can be found, fill in with empty string if you couldn't find it"}}, "page_number": {{"default": 0, "title": "Page Number", "type": "integer", "description": "Page number 

In [8]:
class Response(CustomQueryComponent):
    llm: OpenAI = Field(..., description="OpenAI LLM")
    system_prompt: Optional[str] = Field(
        default=None, description="System prompt to use for the LLM"
    )
    context_prompt: str = Field(
        default=DEFAULT_CONTEXT_PROMPT,
        description="Context prompt to use for the LLM",
    )

    def _validate_component_inputs(
        self, input: Dict[str, Any]
    ) -> Dict[str, Any]:
        return input

    @property
    def _input_keys(self) -> set:
        # Removed "chat_history" from the input keys
        return {"nodes", "query_str"}

    @property
    def _output_keys(self) -> set:
        return {"response"}

    def _prepare_context(
        self,
        # Removed chat_history parameter
        nodes: List[NodeWithScore],
        query_str: str,
    ) -> List[ChatMessage]:
        node_context = ""
        for idx, node in enumerate(nodes):
            node_text = node.get_content(metadata_mode="llm")
            node_context += f"Context Chunk {idx}:\n{node_text}\n\n"

        formatted_context = self.context_prompt.format(
            node_context=node_context, query_str=query_str
        )
        user_message = ChatMessage(role="user", content=formatted_context)

        # print(formatted_context)

        # Removed appending to chat_history
        context = [user_message]

        if self.system_prompt is not None:
            # Adjusted to use context instead of chat_history
            context = [
                ChatMessage(role="system", content=self.system_prompt)
            ] + context

        return context

    def _run_component(self, **kwargs) -> Dict[str, Any]:
        # Removed chat_history from kwargs
        nodes = kwargs["nodes"]
        query_str = kwargs["query_str"]

        prepared_context = self._prepare_context(
            # Adjusted call to _prepare_context
            nodes, query_str
        )
        
        response = self.llm.chat(prepared_context)
        print(prepared_context)
        return {"response": response}

    async def _arun_component(self, **kwargs: Any) -> Dict[str, Any]:
        # Removed chat_history from kwargs
        nodes = kwargs["nodes"]
        query_str = kwargs["query_str"]

        prepared_context = self._prepare_context(
            # Adjusted call to _prepare_context
            nodes, query_str
        )

        response = await self.llm.achat(prepared_context)

        return {"response": response}


In [9]:
response_component = Response(
    llm=llm,
    system_prompt=(
        "You are a Q&A system. You will be provided with the previous chat history, "
        "as well as possibly relevant context, to assist in answering a user message."
    )
)

In [10]:
import pandas as pd

def pretty_print(df):
    return display(HTML(df.to_html().replace("\\n", "<br>")))

def visualize_retrieved_nodes(nodes) -> None:
    result_dicts = []
    for node in nodes:
        result_dict = {"Score": node.score, "Text": node.node.get_text()}
        result_dicts.append(result_dict)

    pretty_print(pd.DataFrame(result_dicts))

In [11]:
pipeline = QueryPipeline(
    modules={
        "input": input_component,
        "query_retriever": retriever,
        "reranker": reranker,
        "response_component": response_component,
        "output_parser": output_parser
    },
    verbose=True,
)

pipeline.add_link("input", "query_retriever", src_key="query_str")
pipeline.add_link("query_retriever", "reranker", dest_key="nodes")
pipeline.add_link(
    "input", "reranker", src_key="query_str", dest_key="query_str"
)
pipeline.add_link("reranker", "response_component", dest_key="nodes")
pipeline.add_link("input", "response_component", dest_key="query_str")
pipeline.add_link("response_component", "output_parser")

In [12]:
response = pipeline.run(query_str="What is the first lecture about?")

[1;3;38;2;155;135;227m> Running module input with input: 
query_str: What is the first lecture about?

[0m[1;3;38;2;155;135;227m> Running module query_retriever with input: 
input: What is the first lecture about?

[0m[1;3;38;2;155;135;227m> Running module reranker with input: 
query_str: What is the first lecture about?
nodes: [NodeWithScore(node=TextNode(id_='fabd5f98-a782-4c1d-beff-910648178d11', embedding=None, metadata={'filename': 'L01-f23.pdf', 'category': 'PDF file'}, excluded_embed_metadata_keys=[], excluded_llm_met...

[0m[1;3;38;2;155;135;227m> Running module response_component with input: 
query_str: What is the first lecture about?
nodes: [NodeWithScore(node=TextNode(id_='d70fab61-c911-49f4-8d64-8d81f893f1a2', embedding=None, metadata={'filename': 'L24-f23.pdf', 'category': 'PDF file'}, excluded_embed_metadata_keys=[], excluded_llm_met...

[0m[ChatMessage(role=<MessageRole.SYSTEM: 'system'>, content='You are a Q&A system. You will be provided with the previous chat

In [13]:
response

AnswerFormat(answer='The first lecture is about an introduction to Internet and Cloud Computing.', file_name='L01-f23.pdf', page_number=1)

In [None]:
from pyvis.network import Network

net = Network(notebook=True, cdn_resources="in_line", directed=True)
net.from_nx(pipeline.dag)
net.show("rag_dag.html")