### Count number of tokens in a text string

In [2]:
question = "Which animal does my son like?"
document = "My son's favorite animal is giraffe."

In [3]:
import tiktoken

In [4]:
def num_tokens(string: str, encoding_name: str) -> int:
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

num_tokens(question, "cl100k_base")

7

### Use OpenAI embeddings

In [5]:
from langchain_openai import OpenAIEmbeddings
embd = OpenAIEmbeddings()
question_embd = embd.embed_query(question)
document_embd = embd.embed_query(document)
len(question_embd), len(document_embd)

(1536, 1536)

### Calculate cosine similarity

In [6]:
import numpy as np

def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

similarity = cosine_similarity(question_embd, document_embd)
print("Cosine Similarity:", similarity)

Cosine Similarity: 0.9100154358844044


### Load PDF - langchain community PyMuPDFLoader

In [7]:
from langchain_community.document_loaders import PyMuPDFLoader

In [8]:
loader = PyMuPDFLoader("../../resources/ncert/c7/geography/ch6-tropical-and-subtropical.pdf")
pages = loader.load_and_split()

In [None]:
print(pages[3])

In [10]:
pdf_docs = loader.load()

In [None]:
print(pdf_docs)

### Chunk - split document into chunk for indexing

In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300, 
    chunk_overlap=50)


splits = text_splitter.split_documents(pdf_docs)

In [None]:
print(splits[0])

In [14]:
len(splits)

21

### Indexing and retrieval - embed in vector store and retrieve

In [15]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=OpenAIEmbeddings())

retriever = vectorstore.as_retriever(search_kwargs={"k": 1})

In [16]:
docs = retriever.get_relevant_documents("Which animals are found in the rainforests?")

  warn_deprecated(


In [17]:
len(docs)

1

In [None]:
print(docs[0])

### Generation - context set manually, prompt set manually

In [19]:
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Prompt
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
prompt

ChatPromptTemplate(input_variables=['context', 'question'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template='Answer the question based only on the following context:\n{context}\n\nQuestion: {question}\n'))])

In [20]:
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

In [21]:
chain = prompt | llm

In [22]:
chain.invoke({"context":docs,"question":"Which animals are found in the rainforests?"})

AIMessage(content='Monkeys, sloth, ant-eating tapirs, crocodiles, snakes, pythons, anaconda, boa constrictor, various species of reptiles, and thousands of species of insects are found in the rainforests.', response_metadata={'token_usage': {'completion_tokens': 50, 'prompt_tokens': 484, 'total_tokens': 534}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-e237b69e-72f8-4035-815d-124c52df1ac3-0', usage_metadata={'input_tokens': 484, 'output_tokens': 50, 'total_tokens': 534})

### Generation - passing retriever as context to the chain and using hub prompt templates

In [23]:
from langchain import hub
prompt_hub_rag = hub.pull("rlm/rag-prompt")

In [24]:
prompt_hub_rag

ChatPromptTemplate(input_variables=['context', 'question'], metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [26]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [27]:
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("What do most people do for a living in the Ganga-Brahmaputra basin and why?")

'Most people in the Ganga-Brahmaputra basin are engaged in agriculture as their main occupation. This is because the plain area of the basin provides suitable land for human habitation with fertile soil, making it ideal for growing crops. The main crop grown in this region is paddy, as it requires sufficient water and is grown in areas with high rainfall. Additionally, other crops such as wheat, maize, sorghum, gram, and millets are also grown in the basin.'