In [None]:
# Install libraries
%pip install datasets sentence-transformers faiss-cpu accelerate rouge-score bs4

In [6]:
# Import libraries
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from rouge_score import rouge_scorer

### Dataset
We will be using a random dataset I found online that contains FAQs about a service called PetBacker. If you have finished this notebook and would like to experiment some more, try out some other (small) datasets.

You can find other datasets [here](https://huggingface.co/datasets?size_categories=or:%28size_categories:n%3C1K,size_categories:1K%3Cn%3C10K%29&sort=trending).

#### Note
I like to use the `cache_dir` parameter so that models and datasets get saved in the working directory. This is useful if you want to delete them later.

In [7]:
# Download the "train" split of the PetBacker FAQ dataset.
# cache_dir stores the downloaded files in ./datasets (inside the working
# directory) so they are easy to find and delete later.
dataset = load_dataset("JLK-ptbk/faq", cache_dir="datasets", split="train")

### Models
We need 3 models to solve this problem:
- One to generate embeddings of the input sequences (also known as the retriever)
- A LLM to generate the output sequences
- A tokenizer to convert text into the tokens the LLM consumes, and to decode the generated tokens back into text

You can find the other options for the LLM model [here](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?params=0%2C3&official=true) on the open LLM leaderboard.
If you want to try out other embedding models have a look at [this leaderboard](https://huggingface.co/spaces/mteb/leaderboard)

In [None]:
# Download the models
# The embedding model (the retriever) turns text into vectors for similarity search.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2", cache_folder="models")

# Load the LLM and its tokenizer
model_name = "HuggingFaceTB/SmolLM-1.7B-Instruct"

llm = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir="models",
    # "auto" places the model on a GPU when one is available and falls back to
    # the CPU otherwise, so this cell also runs on machines without CUDA.
    # Device placement is fully handled here; no extra `.to("cuda")` is needed.
    device_map="auto",
    # NOTE(review): this allows model code shipped in the Hub repository to be
    # executed locally; consider dropping it if the model loads without it.
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="models")

# Text-generation pipeline that bundles the LLM with its tokenizer; used by
# `prompt_model` further down.
pipe = pipeline(
    "text-generation",
    model=llm,
    tokenizer=tokenizer,
)

### Inspect the dataset
If you inspect the dataset you will see that it includes HTML and CSS tags. This is not useful for our RAG model, so we have to remove them.

Each entry also seems to be a Python list saved as a string, consisting of 2 items.

In [None]:
# TODO: Inspect the dataset

### Cleaning the data
We want the dataset to contain data in this format for each row:

`{'text': "Q: What methods of payment does PetBacker accept? A: The methods of payment vary by region..."}`

You can choose to create a new column or adjust the existing column that contains the data.

In [3]:
# NOTE: Feel free to import any library for this step
from bs4 import BeautifulSoup
import ast

# Remove HTML tags
def remove_html_tags(text: str) -> str:
    """Strip HTML markup from ``text`` and return the remaining plain text.

    Exercise stub: until the TODO below is implemented, the bare ``return``
    yields ``None`` rather than the annotated ``str``.
    """
    # TODO: Remove HTML tags from the text and return the cleaned text
    return 

def apply_cleaning(row) -> dict:
    """Turn one raw dataset row into a single cleaned ``"text"`` field.

    Passed to ``dataset.map`` below. Reads ``row["data"]`` and returns a dict
    with the single key ``"text"``; the target format for its value is
    ``"Q: <question> A: <answer>"``.

    Exercise stub: the ``...`` placeholders must be replaced before this runs.
    """
    # Get the data from the row
    # presumably the string form of a two-item Python list — verify by
    # inspecting the dataset first
    data = row["data"]

    # TODO: Deal with list formatting of the data
    data_list = ...

    # Remove HTML tags
    data_list = [remove_html_tags(item) for item in data_list]

    # TODO: Normalize the whitespace

    # TODO: Format the text to be like "Q: ... A: ..."
    return {"text" : ...}

# Apply `apply_cleaning` to every row: each row gains the "text" key it returns.
# Delete all the unused columns (and the old data column)
cleaned_dataset = dataset.map(apply_cleaning, remove_columns=["Unnamed: 0", "index", "data"])

# Inspect the cleaned dataset: print the row at index 1 as a quick sanity check
print(cleaned_dataset[1])

### Generating the embeddings
Now that the dataset is clean we will generate the embeddings using the `embedding_model`. We will be using SentenceTransformers for this task. You can find the documentation [here](https://sbert.net/docs/sentence_transformer/usage/usage.html).

This step converts the data to vectors, so we can quickly look up the vectors most similar to the input prompt.

In [4]:
# TODO: Generate the embeddings for the cleaned dataset
# The lambda must return {"embeddings": <vector>} for the row it is given;
# replace the `...` placeholder with a call to `embedding_model`.
ds_with_embeddings = cleaned_dataset.map(lambda x : {"embeddings": ...})

# TODO: Set the "embeddings" column as FAISS index
# HINT: Have a look at the documentation: https://huggingface.co/docs/datasets/en/faiss_es


### Retrieving similar embeddings
The next step is to retrieve the most similar embeddings to the input prompt. This is done in the following steps:
- Convert the text to an embedding
- Retrieve the most similar embeddings
- Return the most similar text

In [None]:
# Create a function to get the right document for the prompt
def search(prompt: str, k: int = 3) -> list:
    """Return the ``"text"`` values of the dataset rows most similar to ``prompt``.

    Args:
        prompt: The user query to look up.
        k: Intended number of nearest examples to retrieve (defaults to 3).

    Returns:
        ``retrieved_examples["text"]`` — presumably a list of "Q: ... A: ..."
        strings, one per retrieved row; confirm once the TODOs are filled in.

    Exercise stub: both ``...`` placeholders must be replaced before this runs.
    """
    # TODO: Embed the query
    embedded_prompt = ...

    # TODO: Search the dataset for the nearest examples
    # HINT: Look at the documentation from the cell above
    scores, retrieved_examples = ...

    # Return the retrieved examples
    return retrieved_examples["text"]

# Check if the search function works
search("rebook")

### Formatting the prompt
With the function above we can look up relevant information for a given prompt. The next step is to format the prompt so that the LLM can generate the correct output. We want to format the given prompt like this:

`"Question: {User prompt} Context: {Retrieved text}"`

In [None]:
def format_prompt(prompt: str, k:int) -> str:
    """Build the LLM input by combining ``prompt`` with retrieved context.

    Args:
        prompt: The user's question.
        k: Number of documents to request from ``search``.

    Returns:
        The value of ``formatted_prompt``; the target format is
        ``"Question: {User prompt} Context: {Retrieved text}"``.

    Exercise stub: ``formatted_prompt`` is still the ``...`` placeholder until
    the TODOs below are implemented.
    """
    # Use the search function to get the retrieved documents
    retrieved_docs = search(prompt, k)

    # TODO: remove the part of the retrieved documents that is not the answer
    # HINT: So we only want to return the part that is after "A: "

    # TODO: Join the retrieved documents into a single string

    # TODO: Add the retrieved documents to the prompt as context
    # HINT: You can use the following format: "Question: prompt \n Context: retrieved docs"
    formatted_prompt = ...

    return formatted_prompt

# Quick check of the formatted prompt for a sample query with 3 documents
print(format_prompt("rebook", 3))

### System prompt
In the cell below you can specify your system prompt. This is the prompt that the LLM will use to generate the output. You can experiment with different prompts to see how the output changes.

In [None]:
# TODO: Create a system prompt
# This string is sent as the "system" message in `prompt_model`; replace the
# placeholder text with the instructions the LLM should follow.
system_prompt = """You are ..."""

In [None]:
# Prompt the model
def prompt_model(prompt: str, k: int = 3):
    """Generate an answer to ``prompt`` using retrieved documents as context.

    Args:
        prompt: The user's question.
        k: Number of documents passed on to ``format_prompt`` (defaults to 3).

    Returns:
        The ``"generated_text"`` entry of the first result returned by ``pipe``.
    """
    # Build the user message first (this triggers the retrieval step)
    user_content = format_prompt(prompt, k)

    # Chat-style conversation: system instructions followed by the user turn
    conversation = [
        dict(role="system", content=system_prompt),
        dict(role="user", content=user_content),
    ]

    # Run the LLM with the generation parameters
    completions = pipe(
        conversation,
        max_new_tokens=500,
        return_full_text=False,
        temperature=0.5,  # Feel free to change this
        do_sample=True,
    )

    # Only the generated text of the first completion is of interest
    first_completion = completions[0]
    return first_completion["generated_text"]

# Try the full RAG pipeline on a sample question; `response` is kept as a
# module-level variable so the evaluation cell below can reuse it.
prompt = "What methods of payment does PetBacker accept?"
response = prompt_model(prompt)
print(response)

### Evaluating
Now that we can generate the output from the RAG system we can evaluate the results. You can use the ROUGE score to evaluate the results. The ROUGE score is a metric that measures the similarity between two texts. You can find more information about the ROUGE score [here](https://en.wikipedia.org/wiki/ROUGE_(metric)).

Be sure to take a look at the [documentation](https://github.com/google-research/google-research/tree/master/rouge) on how to calculate the ROUGE score.

In [None]:
# TODO: Set the most fitting response as the reference
# The reference is the correct answer to the prompt
reference = ...

# TODO: Generate response (or use the response of the previous cell)
# NOTE: leaving this placeholder in place overwrites the `response` produced
# by the previous cell with `...`.
response = ...

# TODO: Evaluation with ROUGE
# `rouge_scorer` was imported at the top of the notebook for this step.
scorer = ...
scores = ...
print("ROUGE scores:", scores)

### Optional: Use different metrics to evaluate the results
A few common options are:
- BLEU
- METEOR
- Embedding-Based Evaluation

In [None]:
# TODO: install the libraries to implement the other metrics
# TODO: Implement the other metrics

### Optional: Use a better (quantized) model
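We are using a relatively small model for the task above. If you are unhappy with the results and want better performance, consider using a larger model like [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) or [granite-3.1-8b-instruct](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct), which is the one loaded in the code cell below.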

We will also use quantization to lower GPU usage. If you are doing this Masterclass without a GPU, you need to remove the quantization entirely (and possibly question whether it is a good idea to try this part of the Masterclass).

In [None]:
%pip install bitsandbytes

In [None]:
# Optional: Use a larger (quantized) model with bitsandbytes
import torch
from transformers import BitsAndBytesConfig

model_id = "ibm-granite/granite-3.1-8b-instruct"

# use quantization to lower GPU usage
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# cache_dir keeps this (large) download in ./models next to the other models,
# so it is easy to delete afterwards.
# NOTE: this rebinds the name `tokenizer`; the `pipe` built for the first model
# was created earlier and keeps its own reference to the original tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="models")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    cache_dir="models",
    torch_dtype=torch.bfloat16,
    # Device placement is handled by device_map; do not add `.to("cuda")`.
    # NOTE(review): transformers is expected to reject `.to()` on
    # bitsandbytes-quantized models — confirm for the installed version.
    device_map="auto",
    quantization_config=bnb_config,
)

# TODO: Make a pipeline to use this model

# TODO: Evaluate it against the previous model

In [None]:
%pip install gradio

In [None]:
# Optional: Make a web interface for the model using gradio
import gradio as gr

# Exercise stub: replace the `...` placeholder with the real arguments
# (for example a function that wraps `prompt_model`).
# HINT: presumably the interface also has to be launched before it is
# served — check the gradio documentation.
gr.Interface(...)