In [1]:
from langchain_community.document_loaders import PyPDFLoader

In [2]:
# Point the loader at the user-guide PDF; the path is relative to the notebook's working directory.
loader = PyPDFLoader('../data/user-guide/Collection Manager User Guide.pdf')

In [3]:
# Read the PDF into a list of Document objects; the metadata printed further down
# shows that each entry records its 'source' file and 'page' number.
pages = loader.load_and_split()

In [4]:
# Peek at the first three entries to sanity-check the extracted text.
pages[:3]

[Document(page_content='Collection Manager User Guide v 1.0\nXperi Machine Learning Program\nExported on 05/09/2024', metadata={'source': '../data/user-guide/Collection Manager User Guide.pdf', 'page': 0}),
 Document(page_content='Xperi Machine Learning Program – Collection Manager User Guide v 1.0\n2Table of Contents\n1 Table of Content ............................................................................................... 7\n2 Concepts ........................................................................................................... 8\n2.1 Introduction to the Collection Manager ........................................................... 8\n2.2 The Collection Manager Entities ...................................................................... 8\n2.2.1 The Collection Entity. ........................................................................................ 8\n2.2.2 The Collection Version Entity. .................................................................

In [5]:
# Show the metadata of (up to) the first three pages. Iterating over a slice instead
# of indexing 0..2 avoids an IndexError when the PDF yields fewer than three pages.
for page in pages[:3]:
    print(page.metadata)

{'source': '../data/user-guide/Collection Manager User Guide.pdf', 'page': 0}
{'source': '../data/user-guide/Collection Manager User Guide.pdf', 'page': 1}
{'source': '../data/user-guide/Collection Manager User Guide.pdf', 'page': 2}


Since each page is still quite long, we break the pages into smaller chunks.
We keep a small overlap between consecutive chunks so that a sentence falling on a chunk boundary is not lost.

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter that cuts pages into smaller chunks, keeping a small overlap between
# neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=20,
    chunk_size=1000,
)

# Apply the splitter to every loaded page.
documents = text_splitter.split_documents(pages)

In [7]:
# Compare the number of loaded pages with the number of chunks after splitting.
print(len(pages), "vs", len(documents))

75 vs 133


In [46]:
import os
from dotenv import load_dotenv
# Load variables from the local .env file into the process environment, then read
# the OpenAI key from it (None when the variable is not set).
load_dotenv(".env")
openai_api_key = os.environ.get("OPENAI_API_KEY")

In [9]:
from langchain_openai import OpenAIEmbeddings

# Embedding client, authenticated with the key read from the environment.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

In [10]:
from langchain_community.vectorstores import Chroma

# Embed every chunk and index it in a Chroma vector store.
vector = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
)

In [12]:
from langchain_openai import ChatOpenAI

# Chat model client. No model name is passed here; the responses recorded further
# down report 'gpt-3.5-turbo' as the model that answered.
llm = ChatOpenAI(openai_api_key=openai_api_key)

In [13]:
from langchain_core.output_parsers import StrOutputParser

# String output parser instance. NOTE(review): in the visible cells this name is only
# referenced from a commented-out line of the RAG chain.
output_parser = StrOutputParser()

In [14]:
# Expose the vector store as a retriever so it can be composed into chains with `|`.
retriever = vector.as_retriever()

### User's follow-up question => LLM => reformulated question (with history)

In [17]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# System instruction for the question-rewriting step: turn a follow-up question into
# a standalone one without answering it. (Fixes the "refeence" typo that was being
# sent to the model verbatim.)
instruction_to_system = """
Given a chat history and the latest user question which might reference context in the chat history, formulate a standalone
question which can be understood without the chat history. Do NOT answer the question, just formulate it.
"""

# Message order: system instruction, then the prior turns, then the latest question.
question_maker_prompt = ChatPromptTemplate.from_messages(
  [
    ("system", instruction_to_system),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{question}"),
  ]
)

# Prompt -> chat model -> plain string (the reformulated question).
question_chain = question_maker_prompt | llm | StrOutputParser()

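Target chain shape (illustrative sketch only, not an executable cell; `prompt`, `model` and `parser` are placeholders):

chain = ({ "context": itemgetter("question") | retriever, "question": itemgetter("question") } | prompt | model | parser)  

chain.invoke({"question": "..."}) -> {"response": "...", "context": "..."}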

In [20]:
from langchain_core.messages import AIMessage, HumanMessage

# Try the reformulation step on its own. The history here holds only the same
# question the user is asking (no assistant turn yet).
question_chain.invoke(
  {
    "question": "What is the Collection Manager?",
    "chat_history": [
      HumanMessage(content="What is the Collection Manager?"),
    ],
  }
)


'Can you provide more context or specify which collection manager you are referring to?'

In [24]:
# System prompt for the answering step; `{context}` receives the retrieved documents.
# Adjacent string literals are used instead of backslash continuations inside a
# triple-quoted string: the original continuation glued two sentences together with
# no separating space ("...Collection Manager.If you don't know..."). The
# "assistent" typo is fixed as well.
qa_system_prompt = (
  "\n"
  "You are an assistant helping a user with the Collection Manager. "
  "The user has asked a question about the Collection Manager. "
  "If you don't know the answer, you can ask the user for more information.\n"
  "\n"
  "{context}\n"
)
# Backward-compatible alias for the original (misspelled) variable name.
qa_sytem_prompt = qa_system_prompt

# Message order: system prompt with context, then the prior turns, then the question.
qa_prompt = ChatPromptTemplate.from_messages(
  [
    ("system", qa_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{question}"),
  ]
)

In [25]:
def contextualized_question(input: dict):
  """Choose what is used as the retrieval query for one chat turn.

  Args:
    input: Chain input; must contain "question" and may contain "chat_history".

  Returns:
    The `question_chain` runnable when "chat_history" is present and non-empty,
    otherwise the raw question string from `input["question"]`.

  Raises:
    KeyError: If there is no usable history and "question" is missing.
  """
  has_history = bool(input.get("chat_history"))
  # The runnable itself is returned, not its result. When this function is piped into
  # a chain, LangChain is expected to run a returned runnable on the same input.
  return question_chain if has_history else input["question"]

In [29]:
def format_docs(docs):
  """Render one "Document <i>: <label>" line per document, joined by newlines.

  The label is the document's 'title' metadata when present (unchanged behaviour).
  The PDF pages loaded in this notebook only carry 'source' and 'page' metadata, so
  indexing 'title' directly raised KeyError; the label now falls back to
  "<source> (page <n>)", then "<source>", then "untitled".

  Only labels are rendered; the documents' text content is not included.

  Args:
    docs: Iterable of objects exposing a `metadata` mapping.

  Returns:
    A newline-joined string, or "" for an empty input.
  """
  def _label(doc):
    # Tolerate documents whose metadata is None or empty.
    metadata = doc.metadata or {}
    if "title" in metadata:
      return metadata["title"]
    source = metadata.get("source")
    if source is None:
      return "untitled"
    page = metadata.get("page")
    return source if page is None else f"{source} (page {page})"

  return "\n".join(f"Document {i}: {_label(doc)}" for i, doc in enumerate(docs))

In [30]:
from langchain_core.runnables import RunnablePassthrough

# Pass the input dict through unchanged and add a "context" key holding the documents
# retrieved for the (possibly reformulated) question.
retriever_chain = RunnablePassthrough.assign(
  context=contextualized_question | retriever,
)

In [31]:
# Check the retrieval step alone: the result should echo the inputs and add "context".
retriever_chain.invoke(
  {
    "chat_history": [
      HumanMessage(content="What is the Collection Manager?"),
    ],
    "question": "can you explain more about the Collection Manager?",
  }
)

{'chat_history': [HumanMessage(content='What is the Collection Manager?')],
 'question': 'can you explain more about the Collection Manager?',
 'context': [Document(page_content='Xperi Machine Learning Program – Collection Manager User Guide v 1.0\n\xa0– 6Purpose\nThis space contains the user guide of the Collection Manager, it contains topics related to how to use \nthe Collection Manager and important concepts.\npage status :\xa0 DRAFT', metadata={'page': 5, 'source': '../data/user-guide/Collection Manager User Guide.pdf'}),
  Document(page_content='Xperi Machine Learning Program – Collection Manager User Guide v 1.0\n\xa0Collection Manager Web User Interface. \xa0– 20Clicking the right-top icon to open dropdown list menu:\n3.2 Collections Page\n3.2.1 List collections.\nThe collections page will show collections, with pagination, sortable, searchable feature:', metadata={'page': 19, 'source': '../data/user-guide/Collection Manager User Guide.pdf'}),
  Document(page_content='Xperi Mac

In [37]:
# Retrieval -> answering prompt -> chat model. No string output parser is attached:
# the cells below read `ai_msg.content` and append the message object itself to the
# chat history.
rag_chain = retriever_chain | qa_prompt | llm

In [38]:
# First user question for the chat; the next cell starts from an empty history.
question = "what is the Collection Manager, and how to use it?"

In [39]:
# Start a fresh conversation.
chat_history = []

# Ask the first question, then record both sides of the turn so that later questions
# can be reformulated against it.
ai_msg = rag_chain.invoke({"question": question, "chat_history": chat_history})
chat_history.append(HumanMessage(content=question))
chat_history.append(ai_msg)
ai_msg

AIMessage(content='The Collection Manager is a tool within the Xperi Machine Learning Program that allows users to manage collections of data. It helps users organize and categorize their data sets for easier access and analysis.\n\nTo use the Collection Manager, you can follow these steps based on the information provided in the user guide:\n\n1. Open the Collection Manager Web User Interface.\n2. Click on the right-top icon to open the dropdown list menu.\n3. Navigate to the Collections page to list collections.\n4. On the Collections page, you can view collections with pagination, sortable, and searchable features.\n5. To search for a specific collection, select the search criteria and then submit the search.\n\nIf you need more detailed instructions or specific information about a certain feature of the Collection Manager, please let me know!', response_metadata={'token_usage': {'completion_tokens': 157, 'prompt_tokens': 369, 'total_tokens': 526}, 'model_name': 'gpt-3.5-turbo', 'sy

In [40]:
# Print only the answer text so its line breaks render, without the message metadata.
print(ai_msg.content)

The Collection Manager is a tool within the Xperi Machine Learning Program that allows users to manage collections of data. It helps users organize and categorize their data sets for easier access and analysis.

To use the Collection Manager, you can follow these steps based on the information provided in the user guide:

1. Open the Collection Manager Web User Interface.
2. Click on the right-top icon to open the dropdown list menu.
3. Navigate to the Collections page to list collections.
4. On the Collections page, you can view collections with pagination, sortable, and searchable features.
5. To search for a specific collection, select the search criteria and then submit the search.

If you need more detailed instructions or specific information about a certain feature of the Collection Manager, please let me know!


In [42]:
# Follow-up turn: the history is now non-empty, so the question is first rewritten
# into a standalone form before retrieval.
question = "Can you explain more about the Collection Manager more?"

ai_msg = rag_chain.invoke({"question": question, "chat_history": chat_history})
chat_history.append(HumanMessage(content=question))
chat_history.append(ai_msg)
ai_msg

AIMessage(content='To provide you with more detailed information about the Collection Manager, could you please specify which aspects or functionalities of the Collection Manager you are interested in learning more about? This will help me tailor the explanation to address your specific questions or requirements more effectively.', response_metadata={'token_usage': {'completion_tokens': 49, 'prompt_tokens': 819, 'total_tokens': 868}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-aa3304c7-cacb-46f8-8086-0b4a3700e8fa-0', usage_metadata={'input_tokens': 819, 'output_tokens': 49, 'total_tokens': 868})

In [44]:
# Print only the text of the follow-up answer.
print(ai_msg.content)

To provide you with more detailed information about the Collection Manager, could you please specify which aspects or functionalities of the Collection Manager you are interested in learning more about? This will help me tailor the explanation to address your specific questions or requirements more effectively.
