Packages

In [2]:
%pip install langchain_community tiktoken langchain-huggingface langchainhub chromadb langchain pypdf

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.2.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


API Keys

In [24]:
import os
import getpass

os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass.getpass("Enter your HF token: ")

os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = getpass.getpass("Enter your LANGSMITH token: ")

Part 5: Multi Query

Index

In [4]:
### Indexing ###

# Load Doc
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("./data/sierra_emp_handbook.pdf")

load_docs = loader.load()


# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=300, chunk_overlap=50)

# Make Split
splits = text_splitter.split_documents(load_docs)

# Index
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
vectorstore = Chroma.from_documents(documents=splits, embedding=HuggingFaceEmbeddings())

retriever = vectorstore.as_retriever()


  from tqdm.autonotebook import tqdm, trange


Prompt

In [5]:
from langchain.prompts import ChatPromptTemplate

# Multi Query: Different Persepctives
template = """You are an AI language model assistant behalf of Sierra Support Centre Pvt Ltd. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from vector database.
By generating multiple perspective of the user question, your goal is to help the user overcome 
some of the limitations of the distance-based similarity search.
Provide these alternative questions seperated by newlines. Original question: {question}"""

prompt_perspectives = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser
from langchain_huggingface import HuggingFaceEndpoint

generate_queries = (
    prompt_perspectives
    | HuggingFaceEndpoint(repo_id="mistralai/Mistral-7B-Instruct-v0.3", 
                          huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"]
                        )
    | StrOutputParser()
    | (lambda x: x.split("\n"))
)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\KathiresanParamasiva\.cache\huggingface\token
Login successful


In [6]:
from langchain.load import dumps, loads

def get_unique_union(documents: list[list]):
    """ Unique Union of retrieved docs """

    # Flatters list of list, and convert each Document to the string
    falttern_docs = [dumps(doc) for sublist in documents for doc in sublist]

    # Get unique documents
    unique_docs = list(set(falttern_docs))

    # return
    return [loads(doc) for doc in unique_docs]

# Retrieve

retrieval_chain = generate_queries | retriever.map() | get_unique_union

# question = "What are the Leave Policies for Sierra Support Centre?"

# docs = retrieval_chain.invoke({"question": question})

# len(docs)

In [7]:
from operator import itemgetter
from langchain_huggingface import HuggingFaceEndpoint
from langchain_core.runnables import RunnablePassthrough


# RAG
template = """Answer the following question based on this context:
{context}

Question: {question}"""

prompt = ChatPromptTemplate.from_template(template)

llm = HuggingFaceEndpoint(repo_id="mistralai/Mistral-7B-Instruct-v0.3", 
                          huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"]
                        )

final_rag_chain = (
    {"context": retrieval_chain, "question": itemgetter("question")}
    | prompt
    | llm
    | StrOutputParser()
)

user_question = input("Enter Your Question: ")
final_rag_chain.invoke({"question": user_question})

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\KathiresanParamasiva\.cache\huggingface\token
Login successful


  return [loads(doc) for doc in unique_docs]


' mentioned in the document.\n\nAnswer: The leave policies mentioned in the document are:\n1. Annual Entitlement: 15 for Paid Leave (PL) and 10 for Hospitalization Leave (SL).\n2. Advance Approval: Yes for both Paid Leave (PL) and Hospitalization Leave (SL).\n3. Partial Day Leave: Allowed for both Paid Leave (PL) and Hospitalization Leave (SL).\n4. Carry Forward to next calendar year: Not allowed for both Paid Leave (PL) and Hospitalization Leave (SL).\n5. Rules on Leave: Earned & Hospitalization Leave (PL & SL) Published Holidays.'

Part 6: RAG-Fusion

Prompts

In [25]:
from langchain.prompts import ChatPromptTemplate

template = """You are helpful assistant that generate multiple search queries based on a single input query. \n
Generate multiple search queries related to: {question} \n
Output (4 queries):"""

prompt_rag_fusion = ChatPromptTemplate.from_template(template)

In [26]:
from langchain_core.output_parsers import StrOutputParser
from langchain_huggingface import HuggingFaceEndpoint

generate_queries = (
    prompt_rag_fusion
    | HuggingFaceEndpoint(repo_id="mistralai/Mistral-7B-Instruct-v0.3", 
                          huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"]
                        )
    | StrOutputParser()
    | (lambda x: x.split("\n"))
)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\KathiresanParamasiva\.cache\huggingface\token
Login successful


In [27]:
from langchain.load import dumps, loads

def reciprocal_rank_fusion(results: list[list], k=60):
    """ reciprocal_rank_fusion that takes multiple list of ranked documents and an optional parameter k 
    used in the RRF formula"""

    # Initialized a dictionary to hold fused score for each unique document
    fused_scores = {}

    # Iterate through eached list of ranked document
    for docs in results:

        # Iterate through each document in the list, with its rank (position in the list)
        for rank, doc in enumerate(docs):

            # Convert the document to string format to use as a key (assumed documents can be  serialized  to JSON)
            doc_str = dumps(doc)

            if doc_str not in fused_scores:
                fused_scores[doc_str] = 0

            # Retrieve the current score of the document, if any
            previous_score = fused_scores[doc_str]

            # Update the score if the document using RRF formula: 1 / (rank + k)
            fused_scores[doc_str] += 1 / (rank + k)

    # Sort the documents based on their fused scores in descending order to get the financial remarked results
    reranked_results = [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]

    # Return the reranked lists as a list of tuples, each containg the document and its fused score
    return reranked_results

retrieval_chain_rag_fusion = generate_queries | retriever.map() | reciprocal_rank_fusion

In [28]:
from langchain_core.runnables import RunnablePassthrough

# RAG
template = """You are helpful assistant, Answer the following question based on this context:
{context}

Question: {question}"""

prompt = ChatPromptTemplate.from_template(template)

final_rag_chain = (
    {"context": retrieval_chain_rag_fusion, "question": itemgetter("question")}
    | prompt
    | llm
    | StrOutputParser()
)

user_question = input("Enter your question: ")

final_rag_chain.invoke({"question": user_question})

'\n\nAssistant: The leave types available are Earned Leave (PL), Hospitalization Leave (SL), National Holidays, and Maternity leave (for female employees who have been employed with the company for more than one year).'

Part 7: Decomposition

In [38]:
from langchain_core.prompts import ChatPromptTemplate

# Decomposition
prompt = """You are a helpful assistant that generates multiple sub-questions related to an input question.
The goal is to break down the input into a set of sub-problems / sub-questions that can be answers in isolation.
Generate multiple search queries related to: {question}
Output (3 queries): """

prompt_decomposition = ChatPromptTemplate.from_template(prompt)


In [32]:
%pip install langchain_ai21

Collecting langchain_ai21
  Downloading langchain_ai21-0.1.9-py3-none-any.whl (17 kB)
Collecting ai21<3.0.0,>=2.14.1
  Downloading ai21-2.15.1-py3-none-any.whl (87 kB)
     ---------------------------------------- 87.6/87.6 kB 1.2 MB/s eta 0:00:00
Collecting ai21-tokenizer<1.0.0,>=0.12.0
  Downloading ai21_tokenizer-0.12.0-py3-none-any.whl (2.7 MB)
     ---------------------------------------- 2.7/2.7 MB 11.4 MB/s eta 0:00:00
Collecting sentencepiece<1.0.0,>=0.2.0
  Downloading sentencepiece-0.2.0-cp310-cp310-win_amd64.whl (991 kB)
     ---------------------------------------- 991.5/991.5 kB 15.6 MB/s eta 0:00:00
Installing collected packages: sentencepiece, ai21-tokenizer, ai21, langchain_ai21
Successfully installed ai21-2.15.1 ai21-tokenizer-0.12.0 langchain_ai21-0.1.9 sentencepiece-0.2.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.2.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [42]:
from langchain_huggingface import HuggingFaceEndpoint
from langchain_core.output_parsers import StrOutputParser

# LLM
llm = HuggingFaceEndpoint(repo_id="mistralai/Mistral-7B-Instruct-v0.3", 
                          huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"]
                        )

# Chain
generate_queries_decompsition = prompt_decomposition | llm | StrOutputParser() | (lambda x: x.split("\n"))

# Run
question = input("")
questions = generate_queries_decompsition.invoke({"question": question})

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\KathiresanParamasiva\.cache\huggingface\token
Login successful


In [37]:
questions

['', '- "Leave types available"', '- "Types of leaves available"', '']

Answer Recursively

In [None]:
# Prompt

template = """You are helpful assistant, Answer the following question based on this context:
{context}

Question: {question}"""
