### Necessary Packages

In [31]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-5.9.0-py3-none-any.whl (313 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.2/313.2 KB[0m [31m112.0 kB/s[0m eta [36m0:00:00[0m[36m0:00:01[0mm eta [36m0:00:01[0m
Installing collected packages: pypdf
Successfully installed pypdf-5.9.0


In [4]:
! pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain

Collecting langchain_community
  Using cached langchain_community-0.3.27-py3-none-any.whl (2.5 MB)
Collecting tiktoken
  Using cached tiktoken-0.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
Collecting langchain-openai
  Using cached langchain_openai-0.3.28-py3-none-any.whl (70 kB)
Collecting langchainhub
  Using cached langchainhub-0.1.21-py3-none-any.whl (5.2 kB)
Collecting chromadb
  Using cached chromadb-1.0.15-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.5 MB)
Collecting langchain
  Using cached langchain-0.3.27-py3-none-any.whl (1.0 MB)
Collecting SQLAlchemy<3,>=1.4
  Using cached sqlalchemy-2.0.41-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
Collecting langchain-core<1.0.0,>=0.3.66
  Using cached langchain_core-0.3.72-py3-none-any.whl (442 kB)
Collecting aiohttp<4.0.0,>=3.8.3
  Using cached aiohttp-3.12.14-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
Collecting numpy>=1.26.2
  Using cached nu

In [8]:
!pip install beautifulsoup4

Collecting beautifulsoup4
  Downloading beautifulsoup4-4.13.4-py3-none-any.whl (187 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m187.3/187.3 KB[0m [31m218.4 kB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m[36m0:00:01[0m:01[0m
Collecting soupsieve>1.2
  Downloading soupsieve-2.7-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.13.4 soupsieve-2.7


### LangChain and OpenAI API Configuration

In [10]:
import os

# LangSmith tracing configuration, exposed to the libraries through the process environment.
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
# Never overwrite a real key that was exported before the kernel started. The literal below
# is only a placeholder fallback: export LANGCHAIN_API_KEY in the environment (preferred)
# instead of pasting a secret into the notebook.
os.environ.setdefault('LANGCHAIN_API_KEY', 'langchain_api_key')

In [11]:
# Keep an OPENAI_API_KEY that is already exported in the environment; the literal is only a
# placeholder fallback and must not be replaced with a real secret inside the notebook.
os.environ.setdefault('OPENAI_API_KEY', 'model_api_key')

In [12]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [25]:
from langchain.document_loaders import PyPDFLoader

### INDEXING

In [32]:
# Load Documents
# Point the PDF loader at the source book and read it into `docs`, the list of
# documents that the splitting step consumes.
pdf_path = "/home/qasob/knowledge_model_composition/data/Book_Introduction to Robotics _JJ_Craig.pdf"
loader = PyPDFLoader(file_path=pdf_path)
docs = loader.load()

In [33]:
# Split: cut the loaded documents into 1000-character chunks that overlap by 200 characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

# Embed: index every chunk in a Chroma vector store using OpenAI embeddings.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
)

# Retriever view over the vector store (default search settings), used by the RAG chain.
retriever = vectorstore.as_retriever()

In [44]:
# Prompt
# NOTE: this cell only builds the multi-query (question rewriting) prompt. It does not assign
# the answer prompt `prompt`; the original hub pull is kept below for reference.
# prompt = hub.pull("rlm/rag-prompt") # create our own prompt
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""

prompt_perspectives = ChatPromptTemplate.from_template(template)


def _split_queries(text):
    """Turn the model's newline-separated reply into a clean list of query strings.

    The reply may contain blank lines (between alternatives or at the end). A plain
    ``text.split("\n")`` would turn those into empty queries, so blank lines are dropped
    and surrounding whitespace is stripped from every remaining line.
    """
    return [line.strip() for line in text.split("\n") if line.strip()]


# question -> rewriting prompt -> chat model -> plain string -> list of alternative queries
generate_queries = (
    prompt_perspectives
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
    | _split_queries
)

In [35]:
# LLM: chat model that writes the final answer; temperature 0 keeps sampling as
# deterministic as the API allows.
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
)

In [36]:
# Post-processing
def format_docs(docs):
    """Concatenate the ``page_content`` of each document into one context string.

    Args:
        docs: iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The page contents in their original order, separated by a blank line
        (two newline characters). An empty iterable yields an empty string.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

# Chain
# `prompt` is not assigned in any visible cell (its only definition is commented out in the
# Prompt cell), so on a fresh kernel this cell failed with NameError. Pull the RAG answer
# prompt here so the chain is self-contained; it is fed the `context` and `question` keys
# built by the mapping below.
prompt = hub.pull("rlm/rag-prompt")

# question -> {retrieved context, original question} -> prompt -> LLM -> plain string
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [39]:

# Question: run the full RAG chain on a sample query; the returned answer string is the
# cell's displayed output.
rag_chain.invoke(input="what is the book about?")

'The book "Introduction to Robotics: Mechanics and Control" by John J. Craig is suitable for senior undergraduate or first-year graduate-level courses. It covers material related to robotics, including mechanics, control theory, and computational aspects. The book evolved from class notes used at Stanford University and has been used at various institutions from 1986 through 2002.'