# RAG using Langchain

## Packages loading & import

In [None]:
# !pip install langchain
# !pip install langchain_community
# !pip install langchain_huggingface
# !pip install langchain_text_splitters
# !pip install langchain_chroma
# !pip install rank-bm25
# !pip install huggingface_hub

In [84]:
# import os
# import json
# import bs4
import nltk
# import torch
import pickle
# import numpy as np

# from pyserini.index import IndexWriter
# from pyserini.search import SimpleSearcher
# from numpy.linalg import norm
# from rank_bm25 import BM25Okapi
# from nltk.tokenize import word_tokenize

from langchain_community.llms import Ollama
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain.vectorstores import Chroma
# from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings
# from langchain_community.embeddings import JinaEmbeddings
# from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter
from langchain.docstore.document import Document
from langchain_core.prompts import ChatPromptTemplate
# from langchain_community.document_loaders import WebBaseLoader
# from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer

from tqdm import tqdm
import warnings

warnings.filterwarnings("ignore")

In [3]:
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\X6959\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\X6959\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Hugging face login
- Please apply the model first: https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct
- If you haven't been granted access to this model, you can use other LLM model that doesn't have to apply.
- You must save the hf token otherwise you need to regenrate the token everytime.
- When using Ollama, no login is required to access and utilize the llama model.

In [4]:
from huggingface_hub import login

hf_token = "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"
login(token=hf_token, add_to_git_credential=True)

In [5]:
!huggingface-cli whoami

�ϥΤ����r�X��: 950
iwtba4188


## TODO1: Set up the environment of Ollama

### Introduction to Ollama
- Ollama is a platform designed for running and managing large language models (LLMs) directly **on local devices**, providing a balance between performance, privacy, and control.
- There are also other tools support users to manage LLM on local devices and accelerate it like *vllm*, *Llamafile*, *GPT4ALL*...etc.

### Launch colabxterm

In [6]:
# TODO1-1: You should install colab-xterm and launch it.
# Write your commands here.

In [None]:
# TODO1-2: You should install Ollama.
# You may need root privileges if you use a local machine instead of Colab.
# %xterm

In [8]:
# %xterm

In [9]:
# TODO1-3: Pull Llama3.2:1b via Ollama and start the Ollama service in the xterm
# Write your commands in the xterm

## Ollama testing
You can test your Ollama status with the following cells.

In [10]:
# Setting up the model that this tutorial will use
MODEL = "llama3.2:1b" # https://ollama.com/library/llama3.2:3b
EMBED_MODEL = "jinaai/jina-embeddings-v2-base-en"

In [11]:
# Initialize an instance of the Ollama model
llm = Ollama(model=MODEL)
# Invoke the model to generate responses
response = llm.invoke("What is the capital of Taiwan?")
print(response)

  llm = Ollama(model=MODEL)


The capital of Taiwan is Taipei.


## Build a simple RAG system by using LangChain

### TODO2: Load the cat-facts dataset and prepare the retrieval database

In [12]:
# !wget https://huggingface.co/ngxson/demo_simple_rag_py/resolve/main/cat-facts.txt

In [13]:
# TODO2-1: Load the cat-facts dataset (as `refs`, which is a list of strings for all the cat facts)
# Write your code here
with open("cat-facts.txt", "r", encoding="utf-8") as f:
    refs = f.readlines()

In [14]:
# from langchain_core.documents import Document
docs = [Document(page_content=doc, metadata={"id": i}) for i, doc in enumerate(refs)]

In [15]:
# Create an embedding model
model_kwargs = {'trust_remote_code': True}
encode_kwargs = {'normalize_embeddings': False}
embeddings_model = HuggingFaceEmbeddings(
    model_name=EMBED_MODEL,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [16]:
# TODO2-2: Prepare the retrieval database
# You should create a Chroma vector store.
# search_type can be “similarity” (default), “mmr”, or “similarity_score_threshold”
vector_store = Chroma.from_documents(
    documents=docs, embedding=embeddings_model
)
retriever = vector_store.as_retriever(
    search_type="mmr", search_kwargs={"k": 3, "fetch_k": 5}
)

### Prompt setting

In [28]:
# TODO3: Set up the `system_prompt` and configure the prompt.
system_prompt = (
    # "You are a cat expert. I will ask you some questions about cats. Please answer them to the best of your knowledge."  # shihtl> This prompt is generated by the GitHub Copilot.
    # shihtl> Prompt 1: 10
    # "You must answer the questions based your given document, i.e., the original texts in the cat-facts dataset must appear in your answers."
    # "This texts cannot be modified or paraphrased, giving the reference from the given documents."
    # "For efficiency, you must answer the questions in a single sentence."  # shihtl> This prompt is generated by the GitHub Copilot.
    # shihtl> Prompt 2: 8~9
    # "Answer the following questions about cats, in short, single sentence."
    # "Based on the given document, answer it without any modified or paraphrased, also giving reference in documents."
    # shihtl> Prompt 3: 8~10
    "You must answer the questions in a single sentense."
    "The original texts in the cat-facts dataset must appear in your answers."
    # shihtl> Prompt 4
    # ""

    # shihtl> Common part
    "Context: {context}"
)
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        # ("human", "Answering the following questions about cats."),
        ("human", "{input}"),
    ]
)

- For the vectorspace, the common algorithm would be used like Faiss, Chroma...(https://python.langchain.com/docs/integrations/vectorstores/) to deal with the extreme huge database.

In [29]:
# TODO4: Build and run the RAG system
# TODO4-1: Load the QA chain
# You should create a chain for passing a list of Documents to a model.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# TODO4-2: Create retrieval chain
# You should create retrieval chain that retrieves documents and then passes them on.
chain = create_retrieval_chain(retriever=retriever, combine_docs_chain=question_answer_chain)

In [19]:
# Question (queries) and answer pairs
# Please do not modify this cell.
queries = [
    "How much of a day do cats spend sleeping on average?",
    "What is the technical term for a cat's hairball?",
    "What do scientists believe caused cats to lose their sweet tooth?",
    "What is the top speed a cat can travel over short distances?",
    "What is the name of the organ in a cat's mouth that helps it smell?",
    "Which wildcat is considered the ancestor of all domestic cats?",
    "What is the group term for cats?",
    "How many different sounds can cats make?",
    "What is the name of the first cat in space?",
    "How many toes does a cat have on its back paws?"
]
answers = [
    "2/3",
    "Bezoar",
    "a mutation in a key taste receptor",
    ["31 mph", "49 km"],
    "Jacobson’s organ",
    "the African Wild Cat",
    "clowder",
    "100",
    ["Felicette", "Astrocat"],
    "four",
]

In [32]:
counts = 0
for i, query in enumerate(queries):
    # TODO4-3: Run the RAG system
    response = chain.invoke({"input": query})
    # print(response)
    print(f"Query: {query}\nResponse: {response['answer']}\n")
    # The following lines perform evaluations.
    # if the answer shows up in your response, the response is considered correct.
    if type(answers[i]) == list:
        for answer in answers[i]:
            if answer.lower() in response["answer"].lower():
                counts += 1
                break
    else:
        if answers[i].lower() in response["answer"].lower():
            counts += 1

# TODO5: Improve to let the LLM correctly answer the ten questions.
print(f"Correct numbers: {counts}")

Query: How much of a day do cats spend sleeping on average?
Response: On average, a nine-year-old cat spends approximately one-third of its waking hours sleeping.

Query: What is the technical term for a cat's hairball?
Response: A bezoar, specifically a "bezoar of fur" or simply a hairbezoar.

Query: What do scientists believe caused cats to lose their sweet tooth?
Response: Scientists believe that a mutation in the taste receptor responsible for detecting sweetness is likely the reason why cats do not have a sweet tooth like dogs.

Query: What is the top speed a cat can travel over short distances?
Response: The top speed of a cat is approximately 31 miles per hour or around 49 kilometers per hour.

Query: What is the name of the organ in a cat's mouth that helps it smell?
Response: Cats have an additional organ called Jacobson’s organ located in the upper surface of their mouth.

Query: Which wildcat is considered the ancestor of all domestic cats?
Response: The ancestor of modern d

### Data Collection

In [69]:
# Question (queries) and answer pairs
# Please do not modify this cell.
queries = [
    "How much of a day do cats spend sleeping on average?",
    "What is the technical term for a cat's hairball?",
    "What do scientists believe caused cats to lose their sweet tooth?",
    "What is the top speed a cat can travel over short distances?",
    "What is the name of the organ in a cat's mouth that helps it smell?",
    "Which wildcat is considered the ancestor of all domestic cats?",
    "What is the group term for cats?",
    "How many different sounds can cats make?",
    "What is the name of the first cat in space?",
    "How many toes does a cat have on its back paws?"
]
answers = [
    "2/3",
    "Bezoar",
    "a mutation in a key taste receptor",
    ["31 mph", "49 km"],
    "Jacobson’s organ",
    "the African Wild Cat",
    "clowder",
    "100",
    ["Felicette", "Astrocat"],
    "four",
]

In [76]:
# TODO3: Set up the `system_prompt` and configure the prompt.
rules = [
    "You must answer the questions in a single sentense.",
    "The original texts in the dataser must appear in your answers.",
    "Answering without any modified or paraphrased the dataset.",
]


In [80]:
from itertools import combinations, chain

for index, rule in enumerate(
    chain.from_iterable(combinations(rules, r) for r in range(1, len(rules) + 1))
):  # shihtl> This line of code is generated by the Microsoft Copilot.
    print(f"Rule {index + 1}: " + "\n".join(rule))

Rule 1: You must answer the questions in a single sentense.
Rule 2: The original texts in the dataser must appear in your answers.
Rule 3: Answering without any modified or paraphrased the dataset.
Rule 4: You must answer the questions in a single sentense.
The original texts in the dataser must appear in your answers.
Rule 5: You must answer the questions in a single sentense.
Answering without any modified or paraphrased the dataset.
Rule 6: The original texts in the dataser must appear in your answers.
Answering without any modified or paraphrased the dataset.
Rule 7: You must answer the questions in a single sentense.
The original texts in the dataser must appear in your answers.
Answering without any modified or paraphrased the dataset.


In [86]:
from itertools import combinations, chain


statistics = {}

for index, rule in enumerate(
    chain.from_iterable(combinations(rules, r) for r in range(1, len(rules) + 1))
):  # shihtl> This line of code is generated by the Microsoft Copilot.
    statistics[index] = [0] * 10

    system_prompt = "\n".join(rule) + "\n" + "Context: {context}"
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", "{input}"),
        ]
    )

    question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
    chain = create_retrieval_chain(
        retriever=retriever, combine_docs_chain=question_answer_chain
    )

    round_num = 1
    round_bar = tqdm(range(round_num), desc=f"Rule Combination {index + 1}")
    for round in round_bar:
        query_bar = tqdm(queries, desc=f"Question {round + 1}")
        for i, query in enumerate(query_bar):
            round_bar.set_postfix(
                {
                    "Correct": statistics,
                }
            )

            response = chain.invoke({"input": query})
            # print(f"Query: {query}\nResponse: {response['answer']}\n")

            if type(answers[i]) == list:
                for answer in answers[i]:
                    if answer.lower() in response["answer"].lower():
                        statistics[index][i] += 1
                        break
            else:
                if answers[i].lower() in response["answer"].lower():
                    statistics[index][i] += 1

    print(f"Correct numbers: {statistics}")
    with open("statistics.pickle", "wb") as file:
        pickle.dump(statistics, file)

Question 1: 100%|██████████| 10/10 [00:39<00:00,  3.99s/it]orrect={0: [0, 1, 1, 1, 1, 1, 1, 1, 1, 0]}]
Rule Combination 1: 100%|██████████| 1/1 [00:39<00:00, 39.87s/it, Correct={0: [0, 1, 1, 1, 1, 1, 1, 1, 1, 0]}]


Correct numbers: {0: [0, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


Question 1:  70%|███████   | 7/10 [00:40<00:17,  5.84s/it]Correct={0: [0, 1, 1, 1, 1, 1, 1, 1, 1, 1], 1: [1, 1, 1, 1, 0, 1, 1, 0, 0, 0]}]
Rule Combination 2:   0%|          | 0/1 [00:40<?, ?it/s, Correct={0: [0, 1, 1, 1, 1, 1, 1, 1, 1, 1], 1: [1, 1, 1, 1, 0, 1, 1, 0, 0, 0]}]


KeyboardInterrupt: 