# Install

In [1]:
!pip install chromadb langchain pypdf2 tiktoken streamlit python-dotenv
!pip install pypdf
!pip install openai

Collecting chromadb
  Downloading chromadb-0.4.20-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.7/507.7 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain
  Downloading langchain-0.0.351-py3-none-any.whl (794 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m794.3/794.3 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tiktoken
  Downloading tiktoken-0.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting streamlit
  Downloading streamlit-1.29.0-py2.py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8

A sample `.pdf` file is provided below for a quick test.

In [3]:
# Download the sample PDF from Google Drive, addressed by its file id.
!gdown 1jsF4rg70BZT9CJy-l4LXstyQThW_qXrz

Downloading...
From: https://drive.google.com/uc?id=1jsF4rg70BZT9CJy-l4LXstyQThW_qXrz
To: /content/Laws of the Game 2023_24.pdf
100% 66.1M/66.1M [00:01<00:00, 47.8MB/s]


# Import

In [2]:
import os
from dotenv import load_dotenv

from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
import chromadb
# Set logging for the queries
import logging
from langchain.retrievers.multi_query import MultiQueryRetriever

# Install the default root handler, then raise the multi-query retriever's
# logger to INFO so the generated sub-queries are shown.
logging.basicConfig()
_multi_query_log = logging.getLogger("langchain.retrievers.multi_query")
_multi_query_log.setLevel(logging.INFO)

# RetrievalQA

# Never hardcode the key in the notebook. Read it from a local `.env` file or
# the existing process environment, and only prompt (without echo) when it is
# still missing. An already-exported key is left untouched.
load_dotenv()
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")

# Document Loader

In [4]:
def load_documents(file_path):
    """
    Load a single `.pdf` or `.txt` file with the matching LangChain loader.
    Args:
        file_path:
            path to the file; the extension is matched case-insensitively
    Returns:
        Whatever `loader.load()` returns for the selected loader
        (`PyPDFLoader` for `.pdf`, `TextLoader` for `.txt`)
    Raises:
        ValueError: if `file_path` ends with neither `.pdf` nor `.txt`
    """
    lowered = file_path.lower()
    if lowered.endswith(".pdf"):
        loader = PyPDFLoader(file_path)
    elif lowered.endswith(".txt"):
        loader = TextLoader(file_path)
    else:
        # `splitext` already includes the leading dot, so do not add another.
        extension = os.path.splitext(file_path)[-1] or "no extension"
        raise ValueError(f"`file_path` expected `.pdf` or `.txt`, but got {extension}")
    return loader.load()

# Vector Database initialization

In [5]:
def load_chunk_persist_pdf(
        path,
        chunk_size: int = 1000,
        chunk_overlap: int = 10,
        saved_persist_directory: str = "./chroma_store",
) -> Chroma:
    """
    Load files, split them into chunks and store the chunks in a persisted
    Chroma vector database.
    Args:
        path:
            path to `.pdf` or `.txt` file or path to directory
            containing `.pdf` or `.txt` files (other directory entries
            are skipped)
        chunk_size:
            Maximum length of a chunk after a document is split
        chunk_overlap:
            Number of overlapping characters between the current and the
            previous chunk
        saved_persist_directory:
            path to the directory which saves vector database

    Returns:
        Chroma vector database
    Raises:
        ValueError: if `path` is a single unsupported file, or if no chunk
            could be produced from `path`
    TODO:
        - Test if document A and document B are unexpectedly combined by chunk_overlap
        - verify the metric of this vector database
    """
    import hashlib

    supported_suffixes = (".pdf", ".txt")
    if os.path.isdir(path):
        # Sort for a deterministic load order and skip sub-directories and
        # stray files instead of aborting the whole load on the first one.
        candidates = (os.path.join(path, name) for name in sorted(os.listdir(path)))
        file_paths = [
            candidate for candidate in candidates
            if os.path.isfile(candidate) and candidate.lower().endswith(supported_suffixes)
        ]
    else:
        file_paths = [path]

    documents = []
    for file_path in file_paths:
        documents.extend(load_documents(file_path))

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunked_documents = text_splitter.split_documents(documents)
    if not chunked_documents:
        raise ValueError(f"No `.pdf` or `.txt` content could be loaded from {path}")

    # Content-derived ids: indexing the same file again into the same persist
    # directory then targets the same records instead of appending duplicate
    # chunks. The occurrence counter keeps ids unique when one page contains
    # the same chunk text more than once.
    occurrences = {}
    chunk_ids = []
    for chunk in chunked_documents:
        key = f"{chunk.metadata.get('source')}|{chunk.metadata.get('page')}|{chunk.page_content}"
        occurrence = occurrences.get(key, 0)
        occurrences[key] = occurrence + 1
        chunk_ids.append(hashlib.sha256(f"{occurrence}|{key}".encode("utf-8")).hexdigest())

    # The former `chromadb.Client()` / `create_collection` step was removed:
    # its condition was inverted, and the in-memory client it used was never
    # passed to the vector store built below.
    vectordb = Chroma.from_documents(
        documents=chunked_documents,
        embedding=OpenAIEmbeddings(),
        ids=chunk_ids,
        persist_directory=saved_persist_directory
    )
    vectordb.persist()
    return vectordb


# Init LLM

In [6]:
def init_llm(
        model_name: str,
        temperature: float = 0.7
):
    """
    Create the chat model used by the pipelines.
    Args:
        model_name:
            name of the chat model, forwarded as `model_name`
        temperature:
            sampling temperature, forwarded as `temperature`
    Returns:
        A `ChatOpenAI` instance. No API key is passed explicitly here.
    """
    chat_options = {
        "model_name": model_name,
        "temperature": temperature,
    }
    return ChatOpenAI(**chat_options)

# Init RAG

In [7]:
def init_naive_rag(
        llm,
        retriever,
):
    """
    Build a `RetrievalQA` chain that also returns its source documents.
    Args:
        llm:
            language model passed to `RetrievalQA.from_chain_type`
        retriever:
            retriever the chain uses to fetch documents
    Returns:
        The chain created by `RetrievalQA.from_chain_type`
    """
    chain_options = dict(
        retriever=retriever,
        return_source_documents=True,
    )
    return RetrievalQA.from_chain_type(llm, **chain_options)

In [28]:
def rag_pipeline_wo_decompose(qa_chain, query):
    """
    Answer `query` with a retrieval QA chain and collect the cited pages.
    Args:
        qa_chain:
            callable invoked with `{"query": query}`; its result must expose
            the keys `"result"` and `"source_documents"`
        query:
            the user question
    Returns:
        Tuple `(answer, sources)` where `sources` maps each document source
        to the set of page labels (as strings) it was cited from. Documents
        without `page` metadata are recorded under the label `"n/a"`.
    """
    result = qa_chain({"query": query})
    answer = result["result"]
    source_documents = result["source_documents"]
    print("Full source: ", source_documents)

    sources = dict()
    for d in source_documents:
        # Not every document carries page metadata, so do not assume the
        # `page` (or `source`) key exists.
        source = d.metadata.get("source", "unknown")
        page = d.metadata.get("page", "n/a")
        sources.setdefault(source, set()).add(str(page))
    return answer, sources

In [57]:
def inference_wo_decompose(
        model_name="gpt-3.5-turbo",
        temperature=0.7):
    """
    Interactive chat over user-supplied documents using the plain
    `RetrievalQA` pipeline (no question decomposition).

    Prompts for a file or directory path, builds the vector database and the
    chain, then answers questions until the user types `:q`.
    Args:
        model_name:
            chat model name passed to `init_llm`
        temperature:
            sampling temperature passed to `init_llm`
    """
    def page_order(page):
        # Numeric page labels first, in numeric order; anything else after.
        label = str(page)
        return (0, int(label), "") if label.isdecimal() else (1, 0, label)

    # Strip surrounding whitespace so a pasted path with a trailing space or
    # newline still resolves.
    file_path = input("Please provide path to `.pdf` or `.txt` file or directory consists of multiple files: \n").strip()
    print(f"Processing documents from {file_path}")
    vectordb = load_chunk_persist_pdf(
        path=file_path
    )
    print("Vector database is initialized.")
    print("LLM: ", model_name)
    llm = init_llm(model_name=model_name, temperature=temperature)
    rag = init_naive_rag(
        llm=llm,
        retriever=vectordb.as_retriever(),
    )

    print(">>Bot: Welcome! Please ask me anything about the documents")
    print("Type `:q` to end the chat")
    while True:
        user_input = input(">>User:").strip()
        if user_input.lower().startswith(":q"):
            print(">>Bot: Bye!")
            break
        if not user_input:
            # Do not send an empty question to the model.
            continue
        answer, sources = rag_pipeline_wo_decompose(rag, user_input)
        print(">>Bot: ", answer)
        # Show pages as a sorted, comma-separated list rather than a set repr.
        sources_ = "\n".join(
            f"Source: {s} | Page: {', '.join(str(page) for page in sorted(p, key=page_order))}"
            for s, p in sources.items()
        )
        print(f">>Bot: {sources_}")

In [59]:
# Start the interactive chat with the plain RetrievalQA pipeline; it prompts
# for a document path and then loops until `:q` is entered.
inference_wo_decompose()

Please provide path to `.pdf` or `.txt` file or directory consists of multiple files: 
/content/Laws of the Game 2023_24.pdf
Processing documents from /content/Laws of the Game 2023_24.pdf
Collection already exists
Vector database is initialized.
LLM:  gpt-3.5-turbo
>>Bot: Welcome! Please ask me anything about the documents
Type `:q` to end the chat
>>User:Is a Handball a ‘Direct’ or ‘Indirect’ Kick?
Full source:  [Document(page_content='touched another player, an indirect free kick is awarded; if the kicker commits \na handball offence:\n•\u2002a direct free kick is awarded\n•\u2002 a penalty kick is awarded if the offence occurred inside the kicker’s penalty \narea, unless the kicker was the goalkeeper, in which case an indirect free kick \nis awardedThe Corner Kick', metadata={'page': 132, 'source': '/content/Laws of the Game 2023_24.pdf'}), Document(page_content='touched another player, an indirect free kick is awarded; if the kicker commits \na handball offence:\n•\u2002a direct f

# Decomposition (Addition)

In [37]:
from langchain.prompts import PromptTemplate

# Default prompt used by `get_multi_query_retriever` to turn one user question
# into several sub-questions, one per line. Its only template variable is
# `question`. The leading spaces on the continuation lines are part of the
# prompt text sent to the model.
# NOTE(review): the text refers to "functions and data sources", but nothing in
# this notebook supplies any to the model - consider trimming those sentences.
DECOMPOSE_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI assistant that specializes in breaking down complex questions into five different simpler, manageable sub-questions.
    When presented with a complex user question, your role is to generate a list of sub-questions that, when answered, will comprehensively address the original query.
    You have at your disposal a pre-defined set of functions and data sources to utilize in answering each sub-question.
    If a user question is straightforward, your task is to return the original question, identifying the appropriate function and data source to use for its solution.
    Please remember that you are limited to the provided functions and data sources, and that each sub-question should be a full question that can be answered using a single function and a single data source.
    Provide these alternative questions separated by newlines.
    Original question: {question}"""
)

def get_multi_query_retriever(llm, retriever, prompt=None):
    """
    Wrap `retriever` in a `MultiQueryRetriever` driven by `llm`.
    Args:
        llm:
            language model used to generate the queries
        retriever:
            underlying retriever that is wrapped
        prompt:
            query-generation prompt; falls back to `DECOMPOSE_PROMPT`
            when it is falsy
    Returns:
        The retriever built by `MultiQueryRetriever.from_llm`
    """
    chosen_prompt = prompt or DECOMPOSE_PROMPT
    return MultiQueryRetriever.from_llm(
        retriever=retriever,
        llm=llm,
        prompt=chosen_prompt,
    )

In [32]:
from langchain.chains import LLMChain

# Default answering prompt used by `get_qa_chain`. It takes two template
# variables: `contexts` (the retrieved text) and `query` (the user question),
# and instructs the model to reply "I don't know" when the contexts do not
# contain the answer. The leading spaces on the continuation lines are part of
# the prompt text sent to the model.
QA_PROMPT = PromptTemplate(
    input_variables=["query", "contexts"],
    template="""You are a helpful assistant who answers user queries using the
    contexts provided. If the question cannot be answered using the information
    provided say "I don't know".

    Contexts:
    {contexts}

    Question: {query}"""
)

def get_qa_chain(llm, prompt=None):
    """
    Build the `LLMChain` that answers a question from supplied contexts.
    Args:
        llm:
            language model the chain runs
        prompt:
            answering prompt; falls back to `QA_PROMPT` when it is falsy
    Returns:
        The constructed `LLMChain`
    """
    chosen_prompt = prompt or QA_PROMPT
    return LLMChain(llm=llm, prompt=chosen_prompt)

In [48]:
def rag_pipeline_w_decompose(query_chain, qa_chain, query):
    """
    Retrieve documents for `query`, answer it from them and collect the
    cited pages.
    Args:
        query_chain:
            retriever exposing `get_relevant_documents(query=...)`
        qa_chain:
            callable invoked with `inputs={"query": ..., "contexts": ...}`;
            its result must expose the key `"text"`
        query:
            the user question
    Returns:
        Tuple `(answer, sources)` where `sources` maps each document source
        to the set of page labels (as strings) it was cited from. Documents
        without `page` metadata are recorded under the label `"n/a"`.
    """
    docs = query_chain.get_relevant_documents(query=query)
    result = qa_chain(
        inputs={
            "query": query,
            "contexts": "\n===\n".join([d.page_content for d in docs])
        }
    )
    answer = result["text"]
    print("Full source: ", docs)

    sources = dict()
    for d in docs:
        # Not every document carries page metadata, so do not assume the
        # `page` (or `source`) key exists.
        source = d.metadata.get("source", "unknown")
        page = d.metadata.get("page", "n/a")
        sources.setdefault(source, set()).add(str(page))

    return answer, sources

In [49]:
def inference_w_decompose(
        model_name="gpt-3.5-turbo",
        temperature=0.7):
    """
    Interactive chat over user-supplied documents using question
    decomposition: a multi-query retriever fetches documents and a separate
    QA chain answers from them.

    Prompts for a file or directory path, builds the vector database and the
    chains, then answers questions until the user types `:q`.
    Args:
        model_name:
            chat model name passed to `init_llm`
        temperature:
            sampling temperature passed to `init_llm`
    """
    def page_order(page):
        # Numeric page labels first, in numeric order; anything else after.
        label = str(page)
        return (0, int(label), "") if label.isdecimal() else (1, 0, label)

    # Strip surrounding whitespace so a pasted path with a trailing space or
    # newline still resolves.
    file_path = input("Please provide path to `.pdf` or `.txt` file or directory consists of multiple files: \n").strip()
    print(f"Processing documents from {file_path}")
    vectordb = load_chunk_persist_pdf(
        path=file_path
    )
    print("Vector database is initialized.")
    print("LLM: ", model_name)
    llm = init_llm(model_name=model_name, temperature=temperature)

    multiquery = get_multi_query_retriever(llm, vectordb.as_retriever(), prompt=DECOMPOSE_PROMPT)
    qa_chain = get_qa_chain(llm, prompt=QA_PROMPT)

    print(">>Bot: Welcome! Please ask me anything about the documents")
    print("Type `:q` to end the chat")
    while True:
        user_input = input(">>User:").strip()
        if user_input.lower().startswith(":q"):
            print(">>Bot: Bye!")
            break
        if not user_input:
            # Do not send an empty question to the model.
            continue
        answer, sources = rag_pipeline_w_decompose(multiquery, qa_chain, user_input)
        print(">>Bot: ", answer)
        # Show pages as a sorted, comma-separated list rather than a set repr.
        sources_ = "\n".join(
            f"Source: {s} | Page: {', '.join(str(page) for page in sorted(p, key=page_order))}"
            for s, p in sources.items()
        )
        print(f">>Bot: {sources_}")

In [55]:
# Start the interactive chat with the decomposition pipeline; it prompts for a
# document path and then loops until `:q` is entered.
inference_w_decompose()

Please provide path to `.pdf` or `.txt` file or directory consists of multiple files: 
/content/Laws of the Game 2023_24.pdf
Processing documents from /content/Laws of the Game 2023_24.pdf
Collection already exists
Vector database is initialized.
LLM:  gpt-3.5-turbo
>>Bot: Welcome! Please ask me anything about the documents
Type `:q` to end the chat
>>User:Is a Handball a ‘Direct’ or ‘Indirect’ Kick?


INFO:langchain.retrievers.multi_query:Generated queries: ['1. What is the definition of a Handball in the context of sports?', '2. What are the different types of kicks in handball?', "3. How is a 'Direct' kick defined in handball?", "4. How is an 'Indirect' kick defined in handball?", "5. Is a handball considered a 'Direct' or 'Indirect' kick in handball?"]


Full source:  [Document(page_content='hand/arm being hit by the ball and being penalised\n•\u2002scores in the opponents’ goal: \n\u2002• directly from their hand/arm, even if accidental, including by the \ngoalkeeper\n\u2002• immediately after the ball has touched their hand/arm, even if accidental\n The goalkeeper has the same restrictions on handling the ball as any other \nplayer outside the penalty area.  If the goalkeeper handles the ball inside their \npenalty area when not permitted to do so, an indirect free kick is awarded but \nthere is no disciplinary sanction. However, if the offence is playing the ball a \nsecond time (with or without the hand/arm) after a restart before it touches \nanother player, the goalkeeper must be sanctioned if the offence stops a \npromising attack or denies an opponent or the opposing team a goal or an \nobvious goal-scoring opportunity.11No handball\nHandball\n1111No handball\nHandballHandball', metadata={'page': 99, 'source': '/content/Laws of

INFO:langchain.retrievers.multi_query:Generated queries: ['1. How do weather conditions affect the outcome of football matches?', '2. What are the specific external factors, other than weather conditions, that can influence the outcome of football matches?', '3. Can we quantify the impact of weather conditions on the outcome of football matches?', '4. Are there any historical data or studies that analyze the relationship between weather conditions and the outcome of football matches?', '5. Are there any strategies or tactics that teams can use to mitigate the impact of external factors on the outcome of football matches?']


Full source:  [Document(page_content='particular match  (see Law 5)\nSuspend  \nTo stop a match for a period of time with the intention of eventually restarting \nplay e.g. due to fog, heavy rain, thunderstorm, serious injury\nT\nTackle  \nA challenge for the ball with the foot (on the ground or in the air)\nTeam list  \nOfficial team document usually listing the players, substitutes and team officials', metadata={'page': 171, 'source': '/content/Laws of the Game 2023_24.pdf'}), Document(page_content='should also pay attention to:\n\u2002•player confrontations off the ball\n\u2002•possible offences in the area towards which play is moving\n\u2002•offences occurring after the ball is played awayPositioning, movement \nand teamwork', metadata={'page': 177, 'source': '/content/Laws of the Game 2023_24.pdf'}), Document(page_content='156Law 7 – The Duration of the Match (p. 77)\n3. Allowance for time lost\nAmended text\nAllowance is made by the referee in each half for all playing time lost