Packages

In [5]:
%pip install langchain_community tiktoken langchain-huggingface langchainhub chromadb langchain

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.2.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
%pip install pypdf





[notice] A new release of pip available: 22.2.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
%pip install sentence-transformers

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.2.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


API Keys

In [8]:
import os
import getpass

os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass.getpass("Enter your token: ")

Part 1: Overview

In [9]:
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint, HuggingFaceEmbeddings
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

### INDEXING ###
loader = PyPDFLoader("./data/sierra_emp_handbook.pdf")

docs = loader.load()

# Split 
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Embed
vectorstore = Chroma.from_documents(documents=splits, embedding= HuggingFaceEmbeddings())

retriever = vectorstore.as_retriever()


  from tqdm.autonotebook import tqdm, trange


In [10]:
### RETRIEVAL and GENERATIOn ###

# Prompt
prompt = hub.pull("rlm/rag-prompt")

# LLM
llm = HuggingFaceEndpoint(repo_id="mistralai/Mistral-7B-Instruct-v0.3", 
                          temperature=0.1, 
                          max_length=512, 
                          huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"]
                        )

                    max_length was transferred to model_kwargs.
                    Please make sure that max_length is what you intended.


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\KathiresanParamasiva\.cache\huggingface\token
Login successful


In [11]:
# Post Processing
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Chain
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Question
rag_chain.invoke("Give the Summary of Attendance Policy")

' The Attendance Policy at Shri Parvathy Tech Park expects employees to maintain good attendance habits, including arriving on time, using access cards to enter and exit, and giving proper notice for attendance problems. Excessive tardiness, absenteeism, and unauthorized departures may result in disciplinary action. All absence and permission should be sent in advance as an email to HR.'

Part 2: Indexing

In [12]:
# Documents
question = "What kind of pets do I like?"
document = "My favorite pet is a cat."

In [13]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name:str) -> int:
    """Return no. of tokens in a string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))

    return num_tokens

num_tokens_from_string(question, "cl100k_base")

8

In [14]:
embd = HuggingFaceEmbeddings()
query_result = embd.embed_query(question)
document_result = embd.embed_query(document)

query_result[:3]

len(query_result)

768

In [15]:
import numpy as np

def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

similarity = cosine_similarity(query_result, document_result)
print("Cosine Similarity: ", similarity)

Cosine Similarity:  0.5654733869335381


In [16]:
#### INDEXING ####

# Load document

from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("./data/sierra_emp_handbook.pdf")

file_docs = loader.load()

In [17]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)

# Make splits
splits = text_splitter.split_documents(file_docs)

In [18]:
# Vector Store or Index

vectorstore = Chroma.from_documents(documents=splits, embedding= HuggingFaceEmbeddings())

retriever = vectorstore.as_retriever()

Part 3: Retrieval

In [55]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})

In [62]:
docs = retriever.get_relevant_documents("Give the Summary of Leave Policy")

print(docs)

[Document(metadata={'page': 28, 'source': './data/sierra_emp_handbook.pdf'}, page_content='The purpose of the leave policy is to lay down the guidelines under which employees become'), Document(metadata={'page': 28, 'source': './data/sierra_emp_handbook.pdf'}, page_content='eligible for different types of leave and the rules and procedures for availing the leave.'), Document(metadata={'page': 1, 'source': './data/sierra_emp_handbook.pdf'}, page_content='22.  LEAVE POLICY  ................................ ................................'), Document(metadata={'page': 28, 'source': './data/sierra_emp_handbook.pdf'}, page_content='22.3. Types of Leaves:  \n \nEmployees are entitled to avail these types of leave as mentioned below:  \nEarned Leave (PL)  Twelve (12) days per year (1 day per month) -valid till 15th Dec \npay cycle  \nHospitalization Leave (SL)  Three (3) days per year – valid till 15th Dec pay cycle  \nNational Holidays  Ten (10) days per year – valid till 31st Dec 202 3 \n 

Part 4: Generation

In [50]:
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint, HuggingFaceEmbeddings
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate

# Prompt
template = """Answer the question based on only following the context:
{context} 

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

prompt


ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the question based on only following the context:\n{context} \n\nQuestion: {question}\n'), additional_kwargs={})])

In [41]:
# llm
llm = HuggingFaceEndpoint(repo_id="mistralai/Mistral-7B-Instruct-v0.3", 
                          temperature=0.1, 
                          max_length=512, 
                          huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"]
                        )

                    max_length was transferred to model_kwargs.
                    Please make sure that max_length is what you intended.


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\KathiresanParamasiva\.cache\huggingface\token
Login successful


In [60]:
# Chain 

chain = prompt | llm

In [64]:
# Run
chain.invoke({"context": docs, "question": "Give the Summary of Leave Policy"})

'\nAnswer: The Leave Policy lays down the guidelines for employees to become eligible for different types of leave, including Earned Leave (PL), Hospitalization Leave (SL), and National Holidays. The number of days for each leave type is specified, and the leaves are given on a pro rata basis in the first year of service. The rules and procedures for availing these leaves are specified in the Leave Rules framed and notified by the Company.'

In [66]:
from langchain import hub
prompt_hub_rag = hub.pull("rlm/rag-prompt")



In [67]:
prompt_hub_rag

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})])

In [69]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    |StrOutputParser()
)

rag_chain.invoke("Give the Summary of Leave Policy")

'\nAnswer: The Leave Policy lays down the guidelines for employees to become eligible for different types of leave, including Earned Leave (PL), Hospitalization Leave (SL), and National Holidays. The number of days for each leave type is specified, and the leaves are given on a pro rata basis in the first year of service. The rules and procedures for availing these leaves are specified in the Leave Rules framed and notified by the Company.'