### Count number of tokens in a text string

In [12]:
# Sample query/document pair used by the token-count and embedding cells.
document: str = "My son's favorite animal is giraffe."
question: str = "Which animal does my son like?"

In [26]:
import tiktoken

In [14]:
def num_tokens(string: str, encoding_name: str) -> int:
    """Return the number of tokens `string` encodes to.

    Args:
        string: Text to tokenize.
        encoding_name: Name of the tiktoken encoding to look up
            (e.g. "cl100k_base").

    Returns:
        Length of the token sequence produced by the encoding.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    return len(tokenizer.encode(string))

# Token count of the sample question under cl100k_base; as the cell's last
# expression, the value is displayed by the notebook.
num_tokens(question, "cl100k_base")

7

### Use OpenAI embeddings

In [15]:
from langchain_openai import OpenAIEmbeddings
# Embed the question and the document with a default-configured
# OpenAIEmbeddings client.
# NOTE(review): the document is embedded with embed_query as well; LangChain's
# embeddings interface also provides embed_documents for document text —
# confirm this is intentional.
embd = OpenAIEmbeddings()
question_embd = embd.embed_query(question)
document_embd = embd.embed_query(document)
# Last expression of the cell: the notebook displays both vector lengths.
len(question_embd), len(document_embd)

(1536, 1536)

### Calculate cosine similarity

In [16]:
import numpy as np

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Args:
        vec1: First vector (any array-like accepted by ``np.dot``).
        vec2: Second vector, with the same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``. When either vector has
        zero norm the ratio is undefined; ``0.0`` is returned in that case
        instead of letting the division produce NaN with a runtime warning.

    Raises:
        ValueError: Propagated from ``np.dot`` when the shapes are not aligned.
    """
    # Compute the dot product first so mismatched shapes still raise.
    dot_product = np.dot(vec1, vec2)
    magnitude = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if magnitude == 0:
        # At least one input is the zero vector: no direction to compare.
        return 0.0
    return dot_product / magnitude

# Compare the question embedding with the document embedding and report it.
similarity = cosine_similarity(question_embd, document_embd)
print("Cosine Similarity:", similarity)

Cosine Similarity: 0.9101006177228447


### Load PDF - langchain community PyMuPDFLoader

In [17]:
from langchain_community.document_loaders import PyMuPDFLoader

In [18]:
# Point the loader at the chapter PDF; the relative path is resolved against
# the directory the notebook is run from.
loader = PyMuPDFLoader("../../resources/ncert/c7/geography/ch6-tropical-and-subtropical.pdf")
# Load the PDF and split it in one call.
# NOTE(review): presumably this uses the loader's default text splitter, since
# none is passed — confirm against the LangChain loader docs.
pages = loader.load_and_split()

In [19]:
# Inspect one of the split documents (index 3) to see the extracted text.
print(pages[3])

page_content='42
OUR ENVIRONMENT
Fig. 6.6: Gradual Destruction of Forests
Do you know?
Slash and Burn is a
way of cultivating land
where farmers clear a
piece of land by
slashing or cutting
down trees and bushes.
These are then burnt,
which releases the
nutrients into the soil.
Now crops are grown in
this cleared field for a
few years.
After repeatedly
using the patch of
land, the soil looses
its nutrients. So it is
abandoned. Then they
clear another plot of
land to plant. In the
mean time young
trees grow in the old
field. In this way soil
fertility is restored.
People can then
return to it and start
cultivating it again.
tapioca, pineapple and sweet potato. As hunting and fishing
are uncertain it is the women who keep their families alive
by feeding them the vegetables they grow. They practice
“slash and burn agriculture”. The staple food is manioc,
also known as cassava that grows under the ground like
the potato. They also eat queen ants and egg sacs. Cash
crops like coffee, maize 

In [20]:
# Load the PDF again with load() instead of load_and_split(); pdf_docs is the
# input to the explicit chunking step.
pdf_docs = loader.load()

In [21]:
# Dump the loaded documents for inspection.
print(pdf_docs)



### Chunk - split documents into chunks for indexing

In [22]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Size chunks by tiktoken token count rather than by characters: a target of
# 300 tokens per chunk, with 50 tokens of overlap between neighbouring chunks.
# NOTE(review): no encoding_name is passed, so the splitter's default tiktoken
# encoding applies — confirm it matches the cl100k_base encoding used for
# token counting elsewhere in this notebook.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300,
    chunk_overlap=50,
)

# Break the loaded PDF documents into chunks ready for embedding.
splits = text_splitter.split_documents(pdf_docs)

In [23]:
# Show the first chunk to sanity-check the splitter output.
print(splits[0])

page_content='Renuka was excited. Shrikant Uncle was home after a
gap of nearly four months. He was a wildlife photographer
and travelled widely. Renuka’s interest in wildlife and
forests began at an early age, when her uncle introduced
her to books on nature. Pictures of distant lands and
people, who lived there, always fascinated her
.
Fig. 6.1: People from various parts of the world
“In these pictures Renuka, you can see people from
different parts of the world – some from dry deserts, some
from frozen lands and some from hot wet rainforests.”
“They look so different from me”, observed Renuka. “They
may look different, but they share the same basic needs
of life – food, clothing and shelter”, explained Shrikant Uncle.
“Their children do the same things as you probably do,
play games, quarrel sometimes and then make-up, sing,
dance and help the families with various things that need
to be done. They live closer to nature and very early in
their lives have learnt to care for nature. T

In [24]:
# Number of chunks produced by the splitter (displayed as the cell's value).
len(splits)

21

### Index - embed and store in vector db

In [25]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
# Embed every chunk with OpenAI embeddings and store the results in Chroma.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
)

# Wrap the store in a retriever for querying the indexed chunks.
retriever = vectorstore.as_retriever()