In [1]:
import os
import pickle
import numpy as np
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, CrossEncoder
import requests
from typing import TypedDict

In [2]:
device = "mps"

In [3]:
dataset = load_dataset("stanfordnlp/imdb")
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [4]:
dataset = dataset["train"]
dataset[0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [5]:
model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
model.to(device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [6]:
# embeddings = model.encode(dataset["text"], convert_to_tensor=True)
# embeddings.shape

In [7]:
# with open("imdb_embeddings.pkl", "wb") as f:
#     pickle.dump(embeddings, f)

In [8]:
with open("imdb_embeddings.pkl", "rb") as f:
    embeddings = pickle.load(f)

In [9]:
dataset = dataset.add_column("embeddings", embeddings.cpu().tolist())

In [10]:
dataset[0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [11]:
dataset.add_faiss_index(column="embeddings")

  0%|          | 0/25 [00:00<?, ?it/s]

Dataset({
    features: ['text', 'label', 'embeddings'],
    num_rows: 25000
})

In [12]:
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
reranker.model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 384, padding_idx=0)
      (position_embeddings): Embedding(512, 384)
      (token_type_embeddings): Embedding(2, 384)
      (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-5): 6 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): LayerNorm((384,), eps=1e-1

In [13]:
def retrieve(query, top_k=20, dataset=dataset, model=model):
    query_embedding = model.encode(query)

    _, samples = dataset.get_nearest_examples(
        "embeddings", query_embedding, k=top_k
    )

    return samples

def rerank(query, samples, top_k=5, reranker=reranker):
    reranker_input = [
        [query, text] for text in samples["text"]
]   
    
    reranker_scores = reranker.predict(reranker_input)

    ranking = np.argsort(reranker_scores)[::-1]

    return np.array(samples["text"])[ranking][:top_k]

In [14]:
query = "What are the films Paris is depicted in?"

retrieved = retrieve(query)
reranked = rerank(query, retrieved)
reranked

array(['Paris, je t\'aime (2006) is a film made up of 18 segments. You can do the math--18 segments in 120 minutes means each director had seven minutes to tell her or his story. The movie is based on the premise that you can, indeed, tell a story in that short amount of time. The premise works. Almost all of the segments are powerful, complete, and satisfying. Each presents a different aspect of the Parisian experience, and almost every director draws forth outstanding performances from a cast of great and near-great actors.<br /><br />There were so many powerful portrayals in this film that it\'s hard to pick one or two favorites. Probably the most memorable to me were Juliette Binoche as a grieving mother in the segment "Place des Victoires," Gena Rowlands as an aging beauty in "Quartier Latin," Catalina Sandino Moreno as a maid in the segment "Loin du 16ème" and Margo Martindale as a Colorado mail carrier who has learned to speak French so she can visit Paris ("14ème Arrondissement

In [15]:
CLARIN_KEY = os.environ["CLARIN_KEY"]
CLARIN_URL = "https://services.clarin-pl.eu/api/v1/oapi/chat/completions"

In [16]:
headers = {
    "Authorization": f"Bearer {CLARIN_KEY}",
    "accept": "application/json",
    "Content-Type": "application/json",
}

def send_request(message):
    payload = {
        "model": "mixtral-8x22B",
        "messages": [
            {
                "role": "user",
                "content": message
            },
        ],
    }

    try:
        response = requests.post(CLARIN_URL, json=payload, headers=headers)
    except requests.exceptions.RequestException as e:
        print(e)

    return response.json()

In [17]:
send_request("What are the films Paris is depicted in?")

{'id': 'chatcmpl-2213432322664aeeb25c6d0d93747b05',
 'choices': [{'finish_reason': 'stop',
   'index': 0,
   'logprobs': None,
   'message': {'content': ' Paris, France has been featured prominently in numerous films throughout the history of cinema due to its iconic landmarks, rich culture and romantic atmosphere. Here are some notable films that depict Paris:\n\n1. "Amélie" (2001) - Directed by Jean-Pierre Jeunet, this romantic comedy is set in the Montmartre district of Paris and follows a quirky waitress who sets out to improve the lives of those around her.\n\n2. "Midnight in Paris" (2011) - Directed by Woody Allen, this romantic comedy tells the story of a nostalgic screenwriter who magically travels back in time to the 1920s while visiting Paris, encountering famous historical figures and artists.\n\n3. "The Da Vinci Code" (2006) - Directed by Ron Howard, this thriller follows symbologist Robert Langdon as he travels to Paris to decipher clues surrounding the murder of the Louvr

In [37]:
class RagConfig(TypedDict):
    top_k_reranker: int
    top_k_retriever: int

prompt_template = """You are a helpful assistant. Answer the following QUERY utilizing your knowledge.

QUERY: {query}"""

prompt_template_rag = """You are a helpful assistant. Answer the following QUERY utilizing provided CONTEXT.

QUERY: {query}

CONTEXT: {context}"""

def get_answer(query, rag_config=None):
    if rag_config is None:
        prompt = prompt_template.format(query=query)

        response = send_request(prompt)
        answer = response["choices"][0]["message"]["content"]
        
        print(f"""===== NO RAG USED =====
        PROMPT: {prompt}
        ANSWER: {answer}
        """)

    else:
        retrieved = retrieve(query, top_k=rag_config["top_k_retriever"])
        reranked = rerank(query, retrieved, top_k=rag_config["top_k_reranker"])
        
        context = "\n".join([f"Document {i+1}: {document}" for i, document in enumerate(reranked)])
        prompt = prompt_template_rag.format(query=query, context=context)

        response = send_request(prompt)
        answer = response["choices"][0]["message"]["content"]

        print(f"""===== RAG USED =====
        PROMPT: {prompt}
        ANSWER: {answer}
        """)

In [34]:
get_answer("What are the films Paris is depicted in?")

===== NO RAG USED =====
        PROMPT: You are a helpful assistant. Answer the following QUERY utilizing your knowledge.

QUERY: What are the films Paris is depicted in?
        ANSWER:  Paris, the capital city of France, is one of the most iconic and frequently depicted cities in cinema. Its beautiful landmarks, rich history, and romantic atmosphere make it a favorite setting for filmmakers around the world. Here are some notable films that depict Paris:

1. Midnight in Paris (2011) - Directed by Woody Allen, this romantic comedy fantasy film follows a nostalgic screenwriter who is magically transported back to the 1920s in Paris, where he meets legendary writers, artists, and musicians.

2. Amélie (2001) - This whimsical romantic comedy directed by Jean-Pierre Jeunet tells the story of a shy waitress who decides to change the lives of those around her for the better while struggling with her own isolation.

3. Moulin Rouge! (2001) - Directed by Baz Luhrmann, this jukebox musical rom

In [39]:
rag_config: RagConfig = {
    "top_k_reranker": 50,
    "top_k_retriever": 5,
}

get_answer("What are the films Paris is depicted in?", rag_config=rag_config)

===== RAG USED =====
        PROMPT: You are a helpful assistant. Answer the following QUERY utilizing provided CONTEXT.

QUERY: What are the films Paris is depicted in?

CONTEXT: Document 1: ... for Paris is a moveable feast." Ernest Hemingway<br /><br />It is impossible to count how many great talents have immortalized Paris in paintings, novels, songs, poems, short but unforgettable quotes, and yes - movies. The celebrated film director Max Ophüls said about Paris, <br /><br />"It offered the shining wet boulevards under the street lights, breakfast in Montmartre with cognac in your glass, coffee and lukewarm brioche, gigolos and prostitutes at night. Everyone in the world has two fatherlands: his own and Paris." <br /><br />Paris is always associated with love and romance, and "Paris, Je T'Aime" which is subtitled "Petite romances," is a collection of short films, often sketches from 18 talented directors from all over the world. In each, we become familiar with one of the City of 