In [None]:
# Install libraries
!pip install datasets sentence-transformers faiss-cpu accelerate bitsandbytes gradio

In [6]:
# Import libraries
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import ast
import re
from typing import List
from bs4 import BeautifulSoup

### Dataset
We will be using a random dataset I found online that contains FAQs about a service called PetBacker. If you are finished with this notebook and would like to experiment some more, try out some other (small) datasets.

You can find other datasets [here](https://huggingface.co/datasets?size_categories=or:%28size_categories:n%3C1K,size_categories:1K%3Cn%3C10K%29&sort=trending).

#### Note
I like to use the `cache_dir` parameter so that models and datasets get saved in the working directory. This is useful if you want to delete them later.

In [7]:
# Fetch the "train" split of the FAQ dataset and cache it in the local "datasets" folder
dataset = load_dataset(
    path="JLK-ptbk/faq",
    split="train",
    cache_dir="datasets",
)

### Models
We need 3 components to solve this problem:
- An embedding model to generate embeddings of the input sequences
- An LLM to generate the output sequences
- A tokenizer to convert text into the token IDs the LLM expects, and to decode the generated token IDs back into text

You can find other options for the LLM [here](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?params=0%2C3&official=true) on the Open LLM Leaderboard.
If you want to try out other embedding models, have a look at [this leaderboard](https://huggingface.co/spaces/mteb/leaderboard).

In [None]:
# Download the models
embedding_model = SentenceTransformer("all-MiniLM-L6-v2", cache_folder="models")

# Load the LLM and its tokenizer
model_name = "HuggingFaceTB/SmolLM-1.7B-Instruct"

# `device_map="cuda"` already places the weights on the GPU while loading, so a
# follow-up `.to("cuda")` is redundant and has been removed.
# `trust_remote_code=True` has been removed as well: it allows code from the model
# repository to run locally, and this checkpoint is expected to load with the
# architecture classes shipped in `transformers` (re-enable only if loading fails).
llm = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir="models",
    device_map="cuda",
)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="models")

If you inspect the dataset, you will see that it includes HTML tags and CSS styling. These are not useful for our RAG pipeline, so we have to remove them.

In [8]:
# Inspect the dataset (first row)
dataset[0]

{'Unnamed: 0': 0, 'index': 0, 'data': '[\'How much does it cost to be a pet sitter?\', \'<p>Starting out as a pet sitter and offering your services on PetBacker is free, and you can set your own rates. When you get a reservation, a 15-25% service fee is deducted from the total booking amount.</p>\\r\\n<p>PetBacker service fees are reduced in levels based on the number of jobs you have completed. For more information do refer to the <a href="/help-center/pet-service-providers/pet-service-provider-incentive-program">PetBacker incentive program</a> for the benefits you will receive.</p>\\r\\n<p>Any pet sitter/host suspected of soliciting payment outside of PetBacker will be suspended without notice. <br />Example: <br />1. Requesting customers to revise a booking from many days/visits/walks to one<br />2. Promoting other platforms<br />3. Requesting customers contact you outside by visiting Facebook, Google search, Map or other social media<br />4. Exchange of contacts or meet up before b

### Cleaning the data
We want the dataset to contain data in this format for each row:

`"Q: What methods of payment does PetBacker accept? A: The methods of payment vary by region..."`

You can choose to create a new column or adjust the existing column that contains the data.

In [3]:
# Remove HTML tags
def remove_html_tags(text: str) -> str:
    """Strip HTML markup from `text` and return only the visible text.

    Every tag is replaced by a space so that words separated only by markup
    (for example ``<br />``) are not glued together. Collapsing the resulting
    runs of whitespace is left to the caller.
    """
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")


def apply_cleaning(row) -> dict:
    """Turn one raw dataset row into a single ``"Q: ... A: ..."`` string.

    Args:
        row: A dataset row whose ``"data"`` field is the string representation
            of a Python list. Assumes the layout ``[question, answer_html]``
            seen in the sample row -- TODO confirm this holds for every row.

    Returns:
        A dict with one key, ``"text"``, holding ``"Q: <question> A: <answer>"``.

    Raises:
        ValueError, SyntaxError: If ``row["data"]`` is not a valid Python literal.
    """
    # Get the data from the row
    data = row["data"]

    # Deal with the list formatting first. Parsing before stripping HTML keeps
    # the list literal intact (decoded entities or quotes could otherwise break
    # it) and turns escaped sequences such as "\\r\\n" into real whitespace.
    parts = ast.literal_eval(data)
    if not isinstance(parts, (list, tuple)):
        parts = [parts]

    # Remove HTML tags and normalize the whitespace of every element
    cleaned_parts = [
        re.sub(r"\s+", " ", remove_html_tags(str(part))).strip() for part in parts
    ]

    # Format the text to be like "Q: ... A: ..."
    question = cleaned_parts[0] if cleaned_parts else ""
    # Any elements after the question are treated as (parts of) the answer
    answer = " ".join(cleaned_parts[1:])
    return {"text": f"Q: {question} A: {answer}"}

# Delete all the unused columns (and the old data column)
cleaned_dataset = dataset.map(apply_cleaning, remove_columns=["Unnamed: 0", "index", "data"])

# Inspect the cleaned dataset (a trailing expression is rendered by the notebook itself)
cleaned_dataset[1]


### Generating the embeddings
Now that the dataset is clean we will generate the embeddings using the `embedding_model`

This step converts the data to vectors, so we can quickly look up the vectors most similar to the input prompt.

In [4]:
# Generate the embeddings for the cleaned dataset.
# With `batched=True` the function receives many rows at once, so the embedding
# model encodes a whole list of texts per call instead of one text at a time.
ds_with_embeddings = cleaned_dataset.map(
    lambda batch: {"embeddings": embedding_model.encode(batch["text"])},
    batched=True,
)

# Set the "embeddings" column as FAISS index
# Documentation: https://huggingface.co/docs/datasets/en/faiss_es
ds_with_embeddings.add_faiss_index(column="embeddings")


### Retrieving similar embeddings
The next step is to retrieve the most similar embeddings to the input prompt. This is done in the following steps:
- Convert the text to an embedding
- Retrieve the most similar embeddings
- Return the most similar text

In [None]:
# Create a function to get the right document for the prompt
def search(prompt: str, k: int = 3) -> List[str]:
    """Return the texts of the `k` dataset entries most similar to `prompt`.

    Relies on the notebook-level `embedding_model` and on `ds_with_embeddings`
    having a FAISS index on its "embeddings" column.

    Args:
        prompt: The user query to look up.
        k: How many nearest entries to retrieve.

    Returns:
        The "text" values of the retrieved entries.
    """
    # Embed the query (use the same model that produced the dataset embeddings)
    embedded_prompt = embedding_model.encode(prompt)

    # Search the dataset for the nearest examples; the scores are not needed here
    _scores, retrieved_examples = ds_with_embeddings.get_nearest_examples(
        "embeddings", embedded_prompt, k=k
    )

    # Return the retrieved examples
    return retrieved_examples["text"]

# Check if the search function works
search("rebook")

In [None]:
# Create a system prompt
# TODO: the text below is only a placeholder; replace it with the instructions
# the assistant should follow before prompting the model.
system_prompt = """You are ..."""

In [None]:
# Create more context: put the retrieved FAQ entries in front of the user's question
user_prompt = "How do I rebook a pet sitter?"  # example question, change freely
retrieved_context = "\n".join(search(user_prompt))
formatted_prompt = f"Context:\n{retrieved_context}\n\nQuestion: {user_prompt}"

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": formatted_prompt},
]

In [None]:
# Prompt the model

In [None]:
# Evaluate the model

In [None]:
# Optional: Use a larger (quantized) model with bitsandbytes

In [None]:
# Optional: Make a web interface for the model using gradio
import gradio as gr


def generate_response(query):
    """Answer `query` with the LLM, using retrieved FAQ entries as context.

    Uses the notebook-level `search`, `system_prompt`, `tokenizer` and `llm`.

    Args:
        query: The question typed into the web interface.

    Returns:
        The text generated by the model (without the prompt).
    """
    context = "\n".join(search(query))
    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"},
    ]

    # Apply the model's chat template and move the tensors to the model's device
    inputs = tokenizer.apply_chat_template(
        chat,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(llm.device)

    # Cap the answer length so the interface stays responsive
    output_ids = llm.generate(**inputs, max_new_tokens=256)

    # The output starts with the prompt tokens; decode only the newly generated part
    prompt_length = inputs["input_ids"].shape[-1]
    return tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)


gr.Interface(fn=generate_response, inputs="text", outputs="text").launch()