In [1]:
import sys
sys.path.append("..")

from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama

import warnings
warnings.filterwarnings("ignore")

In [2]:
# directory that holds the pdf documents to index; this is a placeholder value, point it at a real folder before running
DATA_PATH = "path/to/directory"

In [3]:
def load_docs(data_dir):

    """ Read every pdf found inside a directory into langchain documents.

    Args:
        data_dir (str): Path to the directory where the pdf documents are stored.

    Returns:
        list(Document): whatever PyPDFDirectoryLoader.load() produces. In this notebook's run that is
        one Document per pdf page, each holding the "page_content" plus metadata such as "source" and
        "page", i.e., [Document(page_content = "Tax information ...",
        metadata = {"source": "doc_1.pdf", "page": 10})].
    """

    # build the pypdfdirectoryloader from langchain and read the pdfs in a single expression
    return PyPDFDirectoryLoader(data_dir).load()

def split_docs_into_chunks(documents, chunk_size = 250, chunk_overlap = 25):

    """ Break documents into smaller, overlapping chunks using langchain's recursive character text splitter.

    Args:
        documents (list(Document)): documents to split, each holding "page content" and some
        metadata like "source" and "page number".
        chunk_size (int): chunk size handed to the splitter; length is measured with len, i.e. in
        characters. Defaults to 250.
        chunk_overlap (int): overlap between neighbouring chunks handed to the splitter, in
        characters. Defaults to 25.

    Returns:
        list(Document): the result of the splitter's split_documents call - same format as the
        input, but with content split into multiple chunks.
    """

    # gather the splitter settings in one place before building the splitter
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "is_separator_regex": False,
    }

    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)

def get_chunk_ids(chunks):

    """ A function that tags each chunk with an identifier like "docs/doc_name:page_num:chunk_index".
    The chunk index is zero-based and counted per page, so "docs/doc_1.pdf:10:5" refers to the chunk
    with index 5 (the sixth chunk) on page 10 of docs/doc_1.pdf.

    A running count is kept for every page, so ids stay unique even when chunks that belong to the
    same page are not next to each other in the list.

    Args:
        chunks (list(Document)): chunks whose metadata holds "source" and "page"; a missing key is
        rendered as "None" inside the id.

    Returns:
        list(Document): the same list object that was passed in; the metadata of every chunk is
        updated in place with an "id" entry.
    """

    # next free chunk index for every "source:page" seen so far
    next_index_by_page = {}

    for chunk in chunks:
        # get source and page number from metadata to create the page id
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        page_id = f"{source}:{page}"

        # take the next index for this page (0 for a page seen for the first time)
        chunk_index = next_index_by_page.get(page_id, 0)
        next_index_by_page[page_id] = chunk_index + 1

        # add the chunk id to the chunk meta-data
        chunk.metadata["id"] = f"{page_id}:{chunk_index}"

    return chunks

In [4]:

documents = load_docs(DATA_PATH)

In [5]:
len(documents)

13

In [6]:
documents[0:5]

[Document(page_content='UNO\nRULES\nRULES .ORGUNOTHE OFFICIAL\nTHE RULES OF UNO ARE SIMPLE.\nHOWEVER, WE HAVE MADE IT EVEN\nEASIER TO SURVEY THE RULES AND\nADDED AN FAQ AT THE END!\n \nCHECK OUT UNORULES.ORG FOR MORE!RULES\n.ORGUNO\nPRESENTS', metadata={'source': '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/UNO.pdf', 'page': 0}),
 Document(page_content='0-90-90-90-919 Red cards - 0 to 9\n19 Blue cards - 0 to 9\n19 Green cards - 0 to 9\n19 Yellow cards - 0 to 9\n2 Green Reverse cards\n2 Yellow Reverse cards\n2 Red Reverse cards\n2 Blue Reverse cards+2\n+2+2\n+2+2\n+2+2\n+2\n+4\n+4+4\n+4+4\n+4+4\n+4\n4 Wild cards 4 Wild Draw 4 cards\nEach player draws a card. The player with the highest point value is the\ndealer. Shuffle the deck. Each player is dealt seven cards.\nPlace the remaining cards facedown to form a DRAW pile. Turn over the\ntop card of the DRAW pile to begin a DISCARD pile. If the top card is a\nWild or Wild Draw 4, return it to the deck and pick anothe

In [7]:
# split the loaded documents into chunks using the default chunk size and overlap
chunks = split_docs_into_chunks(documents)
# get_chunk_ids writes the id into each chunk's metadata in place and returns the same list,
# so `chunks` and `chunks_with_ids` refer to the same objects
chunks_with_ids = get_chunk_ids(chunks)

In [8]:
chunks_with_ids[0:5]

[Document(page_content='UNO\nRULES\nRULES .ORGUNOTHE OFFICIAL\nTHE RULES OF UNO ARE SIMPLE.\nHOWEVER, WE HAVE MADE IT EVEN\nEASIER TO SURVEY THE RULES AND\nADDED AN FAQ AT THE END!\n \nCHECK OUT UNORULES.ORG FOR MORE!RULES\n.ORGUNO\nPRESENTS', metadata={'source': '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/UNO.pdf', 'page': 0, 'id': '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/UNO.pdf:0:0'}),
 Document(page_content='0-90-90-90-919 Red cards - 0 to 9\n19 Blue cards - 0 to 9\n19 Green cards - 0 to 9\n19 Yellow cards - 0 to 9\n2 Green Reverse cards\n2 Yellow Reverse cards\n2 Red Reverse cards\n2 Blue Reverse cards+2\n+2+2\n+2+2\n+2+2\n+2\n+4\n+4+4\n+4+4\n+4+4\n+4', metadata={'source': '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/UNO.pdf', 'page': 1, 'id': '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/UNO.pdf:1:0'}),
 Document(page_content='+2\n+4\n+4+4\n+4+4\n+4+4\n+4\n4 Wild cards 4 Wild Draw 4 ca

In [9]:
print(f"The number of loaded docs were {len(documents)} and they were split into {len(chunks)} chunks.")

The number of loaded docs were 13 and they were split into 120 chunks.


In [10]:
# hugging face sentence-transformers model; it embeds both the user query and the data loaded from pdfs
model_name = "sentence-transformers/all-MiniLM-l6-v2"

# model configuration options: run the computations on the CPU
model_kwargs = dict(device = "cpu")

# encoding options: 'normalize_embeddings' is switched off
encode_kwargs = dict(normalize_embeddings = False)

# build the HuggingFaceEmbeddings instance from the settings above
embeddings = HuggingFaceEmbeddings(
    model_name = model_name,
    model_kwargs = model_kwargs,
    encode_kwargs = encode_kwargs,
)

In [11]:
text = "This is a test document."
query_result = embeddings.embed_query(text)
query_result[:3]

[-0.038338553160429, 0.12346471846103668, -0.028642943128943443]

In [12]:
print(f"This embedding model creates vector embeddings for chunks in {len(query_result)} dimensions.")

This embedding model creates vector embeddings for chunks in 384 dimensions.


In [13]:
# creating a faiss vector database to run similarity searches for user queries
db = FAISS.from_documents(chunks_with_ids, embeddings)

In [14]:
question = "How many cards in an UNO deck?"

# fetch the 3 chunks closest to the question; every hit is unpacked below as a (document, score) pair
searchDocs = db.similarity_search_with_score(question, k = 3)

# the score is bound to `_score` instead of `_`: this loop runs at cell level, and assigning `_` in the
# notebook namespace would shadow IPython's "last output" variable for the rest of the session
for doc, _score in searchDocs:
    print(doc.page_content)
    print(f"Chunk ID - {doc.metadata['id']}")
    print("-----")

text.
Q: How many cards are in an Uno deck?
A: 112 (108 + 4 blank cards is the standard); however, some versions have more cards.
Q: How many cards do you start with?
A: 7 cards each. 
Q: How do you win at Uno?
Chunk ID - /Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/UNO.pdf:4:4
-----
the other cards, see directions that follow.7OFFICIAL UNO RULES
Classic Uno
CONTENTS
112 cards as follows:
76 Numbered cards 8 Skip cards
8 Reverse cards 8 Draw 2 cards
2 Blue Draw 2 Cards
2 Green Draw 2 Cards
2 Red Draw 2 Cards
Chunk ID - /Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/UNO.pdf:1:3
-----
2 Red Draw 2 Cards
2 Yellow Draw 2 Cards2 Yellow Skip cards
2 Red Skip cards
2 Green Skip cards
2 Blue Skip cards
Some Uno decks can also contain various numbers of these different
Chunk ID - /Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/UNO.pdf:1:4
-----


In [15]:
# prompt skeleton for the LLM: {context} is filled with the text of the retrieved chunks and
# {question} with the user's query when the template is formatted
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

In [16]:
query_text = "How many cards in an UNO deck?"

# search with this cell's own query_text rather than a `question` variable left over from an earlier cell,
# so the retrieved context always matches the question that is placed in the prompt
searchDocs = db.similarity_search_with_score(query_text, k = 3)
context = " ".join([doc.page_content for doc, _ in searchDocs])

# fill the template placeholders with the retrieved context and the query
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context = context, question = query_text)

In [17]:
print(prompt)

Human: 
Answer the question based only on the following context:

text.
Q: How many cards are in an Uno deck?
A: 112 (108 + 4 blank cards is the standard); however, some versions have more cards.
Q: How many cards do you start with?
A: 7 cards each. 
Q: How do you win at Uno? the other cards, see directions that follow.7OFFICIAL UNO RULES
Classic Uno
CONTENTS
112 cards as follows:
76 Numbered cards 8 Skip cards
8 Reverse cards 8 Draw 2 cards
2 Blue Draw 2 Cards
2 Green Draw 2 Cards
2 Red Draw 2 Cards 2 Red Draw 2 Cards
2 Yellow Draw 2 Cards2 Yellow Skip cards
2 Red Skip cards
2 Green Skip cards
2 Blue Skip cards
Some Uno decks can also contain various numbers of these different

---

Answer the question based on the above context: How many cards in an UNO deck?



In [18]:
# initialise the Ollama llama3 model -> need to run ollama serve from terminal before using Ollama
model = Ollama(model = "llama3")

In [19]:
def get_answer(question: str, k: int = 5) -> str:

    """ A function that retrieves the chunks most similar to the question from our faiss database,
    places them in the prompt as context and passes that enhanced query through the Ollama model to
    receive an answer to the base query.

    Args:
        question (str): a question asked by the user.
        k (int): number of chunks to retrieve and use as context. Defaults to 5.

    Returns:
        str: formatted text holding the model's response followed by the list of chunk ids
        ("Sources") that were supplied as context.
    """

    # retrieve the top k relevant document chunks (with their scores) from the database
    results = db.similarity_search_with_score(question, k = k)

    # unpack the (document, score) pairs once; the scores are not used
    retrieved_docs = [doc for doc, _score in results]

    # the context is the text of all retrieved chunks joined by a single space
    context = " ".join(doc.page_content for doc in retrieved_docs)

    # sources of the relevant docs, in this case the unique id we created for the chunks
    sources = [doc.metadata.get("id", None) for doc in retrieved_docs]

    # create a proper prompt from the template using langchain's chat prompt
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context = context, question = question)

    # get a response to the enhanced query from Ollama
    response_text = model.invoke(prompt)

    # the response contains the model's answer as well as the source chunks used to generate it
    return f"Response: {response_text}\n\nSources: {sources}"

In [20]:
# question 1 from query.txt
query = "How many cards in an UNO deck?"
answer = get_answer(question = query)

In [21]:
print(answer)

Response: According to the provided text, there are 112 cards in a standard Uno deck.

Sources: ['/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/UNO.pdf:4:4', '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/UNO.pdf:1:3', '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/UNO.pdf:1:4', '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/UNO.pdf:1:5', '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/UNO.pdf:3:6']


In [22]:
# question 2 from query.txt
query = "How do you win in UNO?"
answer = get_answer(question = query)

In [23]:
print(answer)

Response: According to the provided context, the first player to score 500 points wins the game. Additionally, if a player has no cards left at the end of each hand, they also win that round. However, the ultimate winner is determined by the player with the lowest points among all players who have reached 500 points or more.

Sources: ['/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/UNO.pdf:1:6', '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/UNO.pdf:3:11', '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/UNO.pdf:4:5', '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/UNO.pdf:3:2', '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/UNO.pdf:0:0']


In [24]:
# question 3 from query.txt
query = "How to play UNO?"
answer = get_answer(question = query)

In [25]:
print(answer)

Response: According to the provided context, here's how to play UNO:

1. If you draw a card you can play, play it.
2. Otherwise, play moves to the next person.

Before playing your next-to-last card, you must say "UNO".

That's it! Simple rules, indeed

Sources: ['/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/UNO.pdf:3:2', '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/UNO.pdf:4:11', '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/UNO.pdf:0:0', '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/UNO.pdf:1:3', '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/UNO.pdf:1:5']


In [26]:
# question 4 from query.txt
query = "How many players can play Monopoly?"
answer = get_answer(question = query)

In [27]:
print(answer)

Response: According to the context, 2 to 8 players can play Monopoly.

Sources: ['/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/MONOPOLY.pdf:0:0', '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/MONOPOLY.pdf:0:1', '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/MONOPOLY.pdf:2:0', '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/UNO.pdf:3:11', '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/MONOPOLY.pdf:7:9']


In [28]:
# question 5 from query.txt
query = "What are the speed die rules in Monopoly?"
answer = get_answer(question = query)

In [29]:
print(answer)

Response: According to the given context, the Speed Die rules in Monopoly are as follows:

1. Roll the Speed Die along with the two white dice on your turn.
2. Depending on what you rolled, do the following:
	* If you roll a combination that allows you to use the Speed Die (not specified what this is), you can start using it before others.

Note: The rest of the rules are described as "additional" and assume you already know how to play Monopoly according to the classic rules.

Sources: ['/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/MONOPOLY.pdf:0:2', '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/MONOPOLY.pdf:0:5', '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/MONOPOLY.pdf:0:1', '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/MONOPOLY.pdf:1:1', '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/MONOPOLY.pdf:1:5']


In [30]:
# question 6 from query.txt
query = "What is the Banker in Monopoly?"
answer = get_answer(question = query)

In [31]:
print(answer)

Response: According to the provided context, the Banker is a player who will also make a good Auctioneer. The Banker's main role is to keep their personal funds and issue as much more as needed by writing on any ordinary paper.

Sources: ['/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/MONOPOLY.pdf:2:1', '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/MONOPOLY.pdf:2:5', '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/MONOPOLY.pdf:0:0', '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/MONOPOLY.pdf:2:2', '/Users/anuragkotiyal/Desktop/Projects/Board Games Rulebook/library/MONOPOLY.pdf:2:3']
