In [None]:
# Install libraries
!pip install datasets sentence-transformers faiss-cpu accelerate rouge-score

In [1]:
# Import libraries
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from rouge_score import rouge_scorer

### Dataset
We will be using a random dataset I found online that contains FAQ about a service called PetBacker. If you are finished with this notebook and like to experiment some more try out some other (small) datasets. 

You can find other datasets [here](https://huggingface.co/datasets?size_categories=or:%28size_categories:n%3C1K,size_categories:1K%3Cn%3C10K%29&sort=trending).

#### Note
I like to use the `cache_dir` parameter so that models and datasets gets saved in the working directory. This is useful if you want to later delete them.

In [2]:
# Download the dataset
dataset = load_dataset("JLK-ptbk/faq", cache_dir="datasets", split="train")

### Models
We need 3 models to solve this problem:
- One to generate embeddings of the input sequences (also know as the retriever)
- A LLM to generate the output sequences
- A tokenizer to convert the output sequences to the desired format

You can find the other options for the LLM model [here](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?params=0%2C3&official=true) on the open LLM leaderboard.
If you want to try out other embedding models have a look at [this leaderboard](https://huggingface.co/spaces/mteb/leaderboard)

In [42]:
# Download the models
embedding_model = SentenceTransformer("all-MiniLM-L6-v2", cache_folder="models")

# Load the LLM and its tokenizer
model_name = "HuggingFaceTB/SmolLM-1.7B-Instruct"

llm = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir="models",
    device_map="cuda",
    trust_remote_code=True,
).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="models")

pipe = pipeline(
    "text-generation",
    model=llm,
    tokenizer=tokenizer,
)

Device set to use cuda


### Inspect the dataset
If you inspect the dataset you will see that it includes HTML and CSS tags. This is not useful for our RAG model, so we have to remove them.

It also seems to be a Python list saved a string, that consist of 2 items.

In [4]:
# Inspect the dataset
print(dataset[0])

{'Unnamed: 0': 0, 'index': 0, 'data': '[\'How much does it cost to be a pet sitter?\', \'<p>Starting out as a pet sitter and offering your services on PetBacker is free, and you can set your own rates. When you get a reservation, a 15-25% service fee is deducted from the total booking amount.</p>\\r\\n<p>PetBacker service fees are reduced in levels based on the number of jobs you have completed. For more information do refer to the <a href="/help-center/pet-service-providers/pet-service-provider-incentive-program">PetBacker incentive program</a> for the benefits you will receive.</p>\\r\\n<p>Any pet sitter/host suspected of soliciting payment outside of PetBacker will be suspended without notice. <br />Example: <br />1. Requesting customers to revise a booking from many days/visits/walks to one<br />2. Promoting other platforms<br />3. Requesting customers contact you outside by visiting Facebook, Google search, Map or other social media<br />4. Exchange of contacts or meet up before b

### Cleaning the data
We want the dataset to contain data in this format for each row:

`{'text': "Q: What methods of payment does PetBacker accept? A: The methods of payment vary by region..."}`

You can choose to create a new column or adjust the existing column that contains the data.

In [43]:
# NOTE: Feel free to import any libary for this step
from bs4 import BeautifulSoup
import ast

# Remove HTML tags
def remove_html_tags(text: str) -> str:
    # TODO: Remove HTML tags from the text and return the cleaned text
    return BeautifulSoup(text, "html.parser").get_text()

def apply_cleaning(row) -> dict:
    # Get the data from the row
    data = row["data"]

    # TODO: Deal with list formatting of the data
    data_list = ast.literal_eval(data)

    # Remove HTML tags
    data_list = [remove_html_tags(item) for item in data_list]

    # TODO: Normalize the whitespace
    data_list = [" ".join(item.split()) for item in data_list]

    # TODO: Format the text to be like "Q: ... A: ..."
    return {"text": f"Q: {data_list[0]} A: {data_list[1]}"}

# Delete all the unused columns (and the old data column)
cleaned_dataset = dataset.map(apply_cleaning, remove_columns=["Unnamed: 0", "index", "data"])

# Inspect the cleaned dataset
print(cleaned_dataset[1])

{'text': 'Q: What methods of payment does PetBacker accept? A: The methods of payment vary by region. Some of the payment methods PetBacker accepts are direct bank in, and major credit cards, debit cards via Paypal.'}


### Generating the embeddings
Now that the dataset is clean we will generate the embeddings using the `embedding_model`. We will be using SentenceTransformers for this task. You can find the documentation [here](https://sbert.net/docs/sentence_transformer/usage/usage.html).

This step is used to convert the data to vectors, so we can quickly lookup the most similar vectors to the input prompt.

In [44]:
# TODO: Generate the embeddings for the cleaned dataset
ds_with_embeddings = cleaned_dataset.map(lambda x : {"embeddings": embedding_model.encode(x["text"])})

# TODO: Set the "embeddings" column as FAISS index
# HINT: Have a look at the documentation: https://huggingface.co/docs/datasets/en/faiss_es
ds_with_embeddings.add_faiss_index(column="embeddings")

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['text', 'embeddings'],
    num_rows: 182
})

### Retrieving similar embeddings
The next step is to retrieve the most similar embeddings to the input prompt. This is done in the following steps:
- Convert the text to an embedding
- Retrieve the most similar embeddings
- Return the most similar text

In [46]:
# Create a function to get the right document for the prompt
def search(prompt: str, k: int = 3) -> tuple:
    # TODO: Embed the query
    embedded_prompt = embedding_model.encode(prompt)

    # TODO: Search the dataset for the nearest examples
    # HINT: Look at the documentation from the cell above
    _, retrieved_examples = ds_with_embeddings.get_nearest_examples("embeddings", embedded_prompt, k=k)

    # Return the retrieved examples
    return retrieved_examples["text"]

# Check if the search function works
search("rebook")

['Q: How do I rebook/extend booking with a sitter again? A: Follow the steps to rebook/extend your ongoing booking: 1) Navigate to the chat room with the booked sitter 3) Tap on "Rebook/Extend Booking" To rebook again your completed booking: 1) Navigate to the chat room with the booked sitter 3) Tap on "Book Again" 4) Fill in all required fields and submit your Request!',
 'Q: What is a repeat customer? A: A repeat customer is a customer who has booked your services with payment through PetBacker and books your services again with payment through PetBacker a customer who extends an existing booking with payment through PetBacker You might be interested in What are the terms of use?',
 'Q: How to redeem your payment? A: 1. Go to profile. 2. Tab on Payment Account. 3. Select the Earning History. 4. Then click on the redeem now button to redeem your money. You might be interested in What are the terms of use?']

### Formatting the prompt
With the function above we can lookup relevant information for a given prompt. The next step is to format the prompt in a way that the LLM can generate the correct output. We want to format the given prompt in a way like this:

`"Question: {User prompt} Context: {Retrieved text}"`

In [47]:
def format_prompt(prompt: str, k:int) -> str:
    """using the retrieved documents we will prompt the model to generate our responses"""
    # TODO: Use the search function to get the retrieved documents
    retrieved_docs = search(prompt, k)

    # TODO: remove the part of the retrieved documents that is not the answer
    # HINT: So we only want to return the part that is after "A: "
    retrieved_docs = [doc.split("A: ")[1] for doc in retrieved_docs]

    # TODO: Join the retrieved documents into a single string
    retrieved_docs = "\n".join(retrieved_docs)

    # TODO: Add the retrieved documents to the prompt as context
    # HINT: You can use the following format: "Question: prompt \n Context: retrieved docs"
    formatted_prompt = f"Question: {prompt} \nContext: {retrieved_docs}"

    return formatted_prompt

print(format_prompt("rebook", 3))

Question: rebook 
Context: Follow the steps to rebook/extend your ongoing booking: 1) Navigate to the chat room with the booked sitter 3) Tap on "Rebook/Extend Booking" To rebook again your completed booking: 1) Navigate to the chat room with the booked sitter 3) Tap on "Book Again" 4) Fill in all required fields and submit your Request!
A repeat customer is a customer who has booked your services with payment through PetBacker and books your services again with payment through PetBacker a customer who extends an existing booking with payment through PetBacker You might be interested in What are the terms of use?
1. Go to profile. 2. Tab on Payment Account. 3. Select the Earning History. 4. Then click on the redeem now button to redeem your money. You might be interested in What are the terms of use?


### System prompt
In the cell below you can specify your system prompt. This is the prompt that the LLM will use to generate the output. You can experiment with different prompts to see how the output changes.

In [52]:
# TODO: Create a system prompt
system_prompt = """You are an assistant for answering questions.
The user provides their question and your task is to give a fitting answer based on the given context.
Your answer should be informative and concise.
"""

In [56]:
# Prompt the model
def prompt_model(prompt: str, k: int = 2):
    formatted_prompt = format_prompt(prompt, k)

    # Use the system prompt and the formatted prompt to generate the response
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": formatted_prompt},
    ]

    # The parameters for the LLM
    generation_args = {
        "max_new_tokens": 500, # Lower = less output
        "return_full_text": False,
        "temperature": 0.5, # Feel free to change this
        "do_sample": True
    }
    output = pipe(messages, **generation_args)

    # We only care about the generated text
    return output[0]["generated_text"]

prompt = "What methods of payment does PetBacker accept?"
response = prompt_model(prompt)
print(response)

There are several payment methods that PetBacker accepts, including:

1. Direct Bank in: This is a popular payment method that allows customers to pay for pet-related services directly from their bank account.
2. Credit cards via PayPal: PetBacker also accepts credit card payments through PayPal, which is a secure payment gateway that protects both the customer and the merchant.
3. Major credit cards: PetBacker accepts major credit cards such as Visa, Mastercard, and American Express.
4. Debit cards via PayPal: Debit cards are also accepted, allowing customers to make payments directly from their bank account.
5. PayPal: PetBacker accepts PayPal payments, which are processed through the PayPal platform.
6. Checks: PetBacker accepts checks, which are processed through the PetBacker platform.

The terms of use for PetBacker vary by region, and customers should ensure that they comply with the terms and conditions of use before making any payments.


### Evaluating
Now that we can generate the output from the RAG system we can evaluate the results. You can use the ROUGE score to evaluate the results. The ROUGE score is a metric that measures the similarity between two texts. You can find more information about the ROUGE score [here](https://en.wikipedia.org/wiki/ROUGE_(metric)).

Be sure to take a look at the [documentation](https://github.com/google-research/google-research/tree/master/rouge) on how to calculate the ROUGE score.

In [57]:
# TODO: Set the most fitting response as the reference
# The reference is the correct answer to the prompt
reference = """
The methods of payment vary by region. Some of the payment methods PetBacker accepts are direct bank in, and major credit cards, debit cards via Paypal.
"""

# Evaluation with ROUGE
scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)

# Use the previous response or generate a new one
scores = scorer.score(response, reference)
print("ROUGE scores:", scores)

ROUGE scores: {'rouge1': Score(precision=0.9615384615384616, recall=0.16891891891891891, fmeasure=0.28735632183908044), 'rougeL': Score(precision=0.5769230769230769, recall=0.10135135135135136, fmeasure=0.1724137931034483)}


### Optional: Use different metrics to evaluate the results
A few common options are:
- BLEU
- METEOR
- Embedding-Based Evaluation

In [None]:
# TODO: install the libraries to implement the other metrics
# TODO: Implement the other metrics

### Optional: Use a better (quantized) model
We are using a relatively small model for the task above. If you are unhappy with the results and want better performance, consider using a larger model like [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct).

We will also use quantization to lower GPU usage.

In [None]:
!pip install bitsandbytes

In [None]:
# Optional: Use a larger (quantized) model with bitsandbytes
import torch
from transformers import BitsAndBytesConfig

model_id = "ibm-granite/granite-3.1-8b-instruct"

# use quantization to lower GPU usage
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
    quantization_config=bnb_config,
).to("cuda")

# TODO: Make a pipeline to use this model
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)


def prompt_model(prompt: str, k: int = 2):
    formatted_prompt = format_prompt(prompt, k)

    # Use the system prompt and the formatted prompt to generate the response
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": formatted_prompt},
    ]

    # The parameters for the LLM
    generation_args = {
        "max_new_tokens": 500,  # Lower = less output
        "return_full_text": False,
        "temperature": 0.5,  # Feel free to change this
        "do_sample": True,
    }
    output = pipe(messages, **generation_args)

    # We only care about the generated text
    return output[0]["generated_text"]


prompt = "What methods of payment does PetBacker accept?"
response = prompt_model(prompt)
print(response)

# TODO: Evaluate it against the previous model

tokenizer_config.json:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/777k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/442k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.48M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/790 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/29.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

In [None]:
!pip install gradio

In [None]:
# Optional: Make a web interface for the model using gradio
import gradio as gr

gr.Interface(...)