In [1]:
import torch
from datasets import load_dataset, Dataset
from sentence_transformers import SentenceTransformer, CrossEncoder
import pandas as pd
import random
from tqdm import tqdm

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
# device = "mps"
device

'mps'

In [3]:
dataset = load_dataset("sentence-transformers/squad", split="train")
dataset

Dataset({
    features: ['question', 'answer'],
    num_rows: 87599
})

In [4]:
dataset = dataset.select(range(10000))
# dataset = dataset.select(range(165, 200))
dataset

Dataset({
    features: ['question', 'answer'],
    num_rows: 10000
})

## Retrieval

In [5]:
model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
model.to(device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

#### Calculate embeddings

In [6]:
questions_emb = model.encode(dataset["question"], convert_to_tensor=True)
questions_emb.shape

torch.Size([10000, 768])

Answer are encoded differently because they repeat in the dataset:

In [7]:
# This is very slow because we are embedding the same answer text multiple times

# dataset2 = dataset.map(
#     lambda x: {
#         "question_emb": model.encode(x["question"], convert_to_tensor=True),
#         "answer_emb": model.encode(x["answer"], convert_to_tensor=True),
#     },
#     batched=True
# )

In [8]:
df = dataset.to_pandas()

df["answer_map"] = pd.factorize(df["answer"])[0]
unique_answers = df["answer"].drop_duplicates().tolist()
print("Num of unique answers:", len(unique_answers))

dataset = Dataset.from_pandas(df)

Num of unique answers: 1867


In [9]:
answers_emb = model.encode(unique_answers, convert_to_tensor=True)
answers_emb.shape

torch.Size([1867, 768])

#### Making queries

In [10]:
top_k = 5

In [11]:
def print_query_results(question_idx, scores, indices):
    print("\nQuestion:", dataset["question"][question_idx])
    print(f"Correct answer IDX: {dataset["answer_map"][question_idx]}\n")

    for i, (score, idx) in enumerate(zip(scores, indices)):
        print(f"Answer {i+1} IDX: {idx}")
        print(f"Score: {score.item()}")
        print(f"Text: {unique_answers[idx]}\n")
    
    print("=====================================")

In [12]:
def analyze_question(question_idx, top_k=5):
    question_emb = questions_emb[question_idx]

    similarity_scores = model.similarity(question_emb, answers_emb)[0]

    scores, indices = torch.topk(similarity_scores, k=top_k)

    print_query_results(question_idx, scores, indices)

In [13]:
analyze_question(0)


Question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Correct answer IDX: 0

Answer 1 IDX: 0
Score: 21.645751953125
Text: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.

Answer 2 IDX: 589
Score: 16.99161148071289
Text: The first documented visit by a European was in 1524 by Giovanni da Verrazzano, a Florentine explorer i

In [14]:
n = 5

random.seed(42)
random_indices = random.sample(range(len(dataset)), n)
random_indices

[1824, 409, 4506, 4012, 3657]

In [15]:
for idx in random_indices:
    analyze_question(idx)


Question: What condition did Frédéric describe the piano that arrived to him through many dangerous obstacles?
Correct answer IDX: 244

Answer 1 IDX: 244
Score: 23.215896606445312
Text: On 3 December, Chopin complained about his bad health and the incompetence of the doctors in Majorca: "Three doctors have visited me ... The first said I was dead; the second said I was dying; and the third said I was about to die." He also had problems having his Pleyel piano sent to him. It finally arrived from Paris in December. Chopin wrote to Pleyel in January 1839: "I am sending you my Preludes [(Op. 28)]. I finished them on your little piano, which arrived in the best possible condition in spite of the sea, the bad weather and the Palma customs." Chopin was also able to undertake work on his Ballade No. 2, Op. 38; two Polonaises, Op. 40; and the Scherzo No. 3, Op. 39.

Answer 2 IDX: 296
Score: 22.439037322998047
Text: Jones comments that "Chopin's unique position as a composer, despite the fact 

#### Add FAISS

In [16]:
answers_emb_dict = {
    "answer_emb": [emb.detach().cpu().numpy() for emb in answers_emb],
    "idx": [i for i, _ in enumerate(unique_answers)],
}

answers_emb_dataset = Dataset.from_dict(answers_emb_dict)

answers_emb_dataset

Dataset({
    features: ['answer_emb', 'idx'],
    num_rows: 1867
})

In [17]:
answers_emb_dataset.add_faiss_index(column="answer_emb")

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset({
    features: ['answer_emb', 'idx'],
    num_rows: 1867
})

In [18]:
def analyze_question_faiss(question_idx):
    question_emb = questions_emb[question_idx].detach().cpu().numpy()

    scores, samples = answers_emb_dataset.get_nearest_examples(
        "answer_emb", question_emb, k=top_k
    )

    print_query_results(question_idx, scores, samples["idx"])

In [19]:
analyze_question_faiss(0)


Question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Correct answer IDX: 0

Answer 1 IDX: 0
Score: 37.27167510986328
Text: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.

Answer 2 IDX: 589
Score: 45.78667449951172
Text: The first documented visit by a European was in 1524 by Giovanni da Verrazzano, a Florentine explorer

In [20]:
for idx in random_indices:
    analyze_question_faiss(idx)


Question: What condition did Frédéric describe the piano that arrived to him through many dangerous obstacles?
Correct answer IDX: 244

Answer 1 IDX: 217
Score: 35.46853256225586
Text: Frédéric François Chopin (/ˈʃoʊpæn/; French pronunciation: ​[fʁe.de.ʁik fʁɑ̃.swa ʃɔ.pɛ̃]; 22 February or 1 March 1810 – 17 October 1849), born Fryderyk Franciszek Chopin,[n 1] was a Polish and French (by citizenship and birth of father) composer and a virtuoso pianist of the Romantic era, who wrote primarily for the solo piano. He gained and has maintained renown worldwide as one of the leading musicians of his era, whose "poetic genius was based on a professional technique that was without equal in his generation." Chopin was born in what was then the Duchy of Warsaw, and grew up in Warsaw, which after 1815 became part of Congress Poland. A child prodigy, he completed his musical education and composed his earlier works in Warsaw before leaving Poland at the age of 20, less than a month before the outb

#### Calculate metrics (retrieval only)

In [21]:
similarity_scores = model.similarity(questions_emb, answers_emb)

_, retrieved_answers_cosine = torch.topk(similarity_scores, k=top_k)
retrieved_answers_cosine = retrieved_answers_cosine.tolist()

In [22]:
retrieved_answers_faiss = []

for i in tqdm(range(0, len(dataset))):
    question_emb = questions_emb[i].detach().cpu().numpy()

    _, samples = answers_emb_dataset.get_nearest_examples(
        "answer_emb", question_emb, k=top_k
    )

    retrieved_answers_faiss.append(samples["idx"])

100%|██████████| 10000/10000 [00:12<00:00, 793.15it/s]


In [23]:
from metrics import calc_recall, calc_mrr

In [24]:
y_true = dataset["answer_map"]

In [25]:
recall_cosine = calc_recall(y_true, retrieved_answers_cosine, num_classes=len(unique_answers))

recall_faiss = calc_recall(y_true, retrieved_answers_faiss, num_classes=len(unique_answers))

print(f"Recall@{top_k} Cosine: ", recall_cosine)
print(f"Recall@{top_k} Faiss: ", recall_faiss)

Recall@5 Cosine:  0.9423300644189855
Recall@5 Faiss:  0.9247229096034981


In [26]:
mrr_cosine = calc_mrr(y_true, retrieved_answers_cosine)

mrr_faiss = calc_mrr(y_true, retrieved_answers_faiss)

print(f"MRR Cosine: ", mrr_cosine)
print(f"MRR Faiss: ", mrr_faiss)

MRR Cosine:  0.8072366666666587
MRR Faiss:  0.7656399999999914


## Re-ranking

In [27]:
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [28]:
def print_rerank_results(
    question_idx: int,
    retriever_scores: torch.Tensor,
    retriever_indices: torch.Tensor,
    reranker_scores: torch.Tensor
):
    ranking_scores, ranking = torch.sort(reranker_scores, descending=True)

    sorted_retriever_scores = retriever_scores[ranking]
    sorted_retriever_indices = retriever_indices[ranking]
    
    print("\nQuestion:", dataset["question"][question_idx])
    print(f"Correct answer IDX: {dataset['answer_map'][question_idx]}")
    print("Original order: ", retriever_indices.tolist())
    print("Reranked order: ", sorted_retriever_indices.tolist())
    print()

    for (idx, retriever_score, reranker_score) in zip(sorted_retriever_indices, sorted_retriever_scores, ranking_scores):
        print(f"Answer IDX: {idx}")
        print(f"Original score: {retriever_score.item()}")
        print(f"Reranker score: {reranker_score.item()}")
        print(f"Text: {unique_answers[idx]}\n")

In [29]:
def analyze_question_with_rerank(question_idx):
    question = dataset["question"][question_idx]
    question_emb = questions_emb[question_idx]

    similarity_scores = model.similarity(question_emb, answers_emb)[0]

    scores, indices = torch.topk(similarity_scores, k=top_k)

    reranker_input = [
        [question, unique_answers[idx]] for idx in indices
    ]
    reranker_scores = reranker.predict(reranker_input)
    reranker_scores = torch.tensor(reranker_scores)

    print_rerank_results(question_idx, scores, indices, reranker_scores)

In [30]:
analyze_question_with_rerank(0)


Question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Correct answer IDX: 0
Original order:  [0, 589, 983, 38, 999]
Reranked order:  [0, 983, 38, 589, 999]

Answer IDX: 0
Original score: 21.645751953125
Reranker score: 5.316110134124756
Text: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.

Answer IDX: 983
Original score:

In [31]:
for idx in random_indices:
    analyze_question_with_rerank(idx)


Question: What condition did Frédéric describe the piano that arrived to him through many dangerous obstacles?
Correct answer IDX: 244
Original order:  [244, 296, 248, 217, 218]
Reranked order:  [244, 217, 248, 296, 218]

Answer IDX: 244
Original score: 23.215896606445312
Reranker score: -1.2244844436645508
Text: On 3 December, Chopin complained about his bad health and the incompetence of the doctors in Majorca: "Three doctors have visited me ... The first said I was dead; the second said I was dying; and the third said I was about to die." He also had problems having his Pleyel piano sent to him. It finally arrived from Paris in December. Chopin wrote to Pleyel in January 1839: "I am sending you my Preludes [(Op. 28)]. I finished them on your little piano, which arrived in the best possible condition in spite of the sea, the bad weather and the Palma customs." Chopin was also able to undertake work on his Ballade No. 2, Op. 38; two Polonaises, Op. 40; and the Scherzo No. 3, Op. 39.


In [32]:
def analyze_question_faiss_with_rerank(question_idx):
    question = dataset["question"][question_idx]
    question_emb = questions_emb[question_idx].detach().cpu().numpy()

    scores, samples = answers_emb_dataset.get_nearest_examples(
        "answer_emb", question_emb, k=top_k
    )

    reranker_input = [
        [question, unique_answers[idx]] for idx in samples["idx"]
    ]
    reranker_scores = reranker.predict(reranker_input)

    print_rerank_results(
        question_idx, torch.tensor(scores),
        torch.tensor(samples["idx"]),
        torch.tensor(reranker_scores)
    )

In [33]:
analyze_question_faiss_with_rerank(0)


Question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Correct answer IDX: 0
Original order:  [0, 589, 268, 38, 983]
Reranked order:  [0, 983, 38, 589, 268]

Answer IDX: 0
Original score: 37.27167510986328
Reranker score: 5.316110134124756
Text: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.

Answer IDX: 983
Original scor

In [34]:
for idx in random_indices:
    analyze_question_faiss_with_rerank(idx)


Question: What condition did Frédéric describe the piano that arrived to him through many dangerous obstacles?
Correct answer IDX: 244
Original order:  [217, 244, 296, 248, 232]
Reranked order:  [244, 217, 248, 296, 232]

Answer IDX: 244
Original score: 36.217437744140625
Reranker score: -1.2244844436645508
Text: On 3 December, Chopin complained about his bad health and the incompetence of the doctors in Majorca: "Three doctors have visited me ... The first said I was dead; the second said I was dying; and the third said I was about to die." He also had problems having his Pleyel piano sent to him. It finally arrived from Paris in December. Chopin wrote to Pleyel in January 1839: "I am sending you my Preludes [(Op. 28)]. I finished them on your little piano, which arrived in the best possible condition in spite of the sea, the bad weather and the Palma customs." Chopin was also able to undertake work on his Ballade No. 2, Op. 38; two Polonaises, Op. 40; and the Scherzo No. 3, Op. 39.
