In [None]:
# For Loading HuggingFace Dataset
import pandas as pd

# For Loading Retriever Model and Evaluation
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
# Tokenizer and encoder are loaded from the same checkpoint id so the two
# cannot drift apart if the checkpoint is ever swapped.
CONTRIEVER_CHECKPOINT = 'facebook/contriever'
tokenizer = AutoTokenizer.from_pretrained(CONTRIEVER_CHECKPOINT)
model = AutoModel.from_pretrained(CONTRIEVER_CHECKPOINT)

In [None]:
# Locations of the two GutenQA parquet files on the Hugging Face Hub:
# one holds the book chunks, the other the generated questions.
gutenqa_chunks_uri = "hf://datasets/LumberChunker/GutenQA/GutenQA.parquet"
gutenqa_questions_uri = "hf://datasets/LumberChunker/GutenQA/questions.parquet"

dataset = pd.read_parquet(gutenqa_chunks_uri, engine="pyarrow")
questions = pd.read_parquet(gutenqa_questions_uri, engine="pyarrow")

In [None]:
book_name = "A_Christmas_Carol_-_Charles_Dickens"

# Keep only the chunk rows of the selected book. The index is rebuilt as
# 0..n-1 because later cells address rows by position (argsort output / range).
is_target_book_chunk = dataset['Book Name'] == book_name
single_book_chunks = dataset[is_target_book_chunk].reset_index(drop=True)

# Keep only the generated questions of the selected book, re-indexed the same way.
is_target_book_question = questions['Book Name'] == book_name
single_book_qa = questions[is_target_book_question].reset_index(drop=True)

In [None]:
# Mean Pooling for Embeddings
def mean_pooling(token_embeddings, mask):
    """Average token embeddings over dim 1, ignoring masked-out positions.

    Args:
        token_embeddings: tensor of per-token vectors; assumed to be
            (batch, seq_len, hidden) — TODO confirm against the model output.
        mask: tensor with one entry per token (one fewer dim than
            ``token_embeddings``); non-zero marks a token to keep.

    Returns:
        Tensor with dim 1 reduced: the sum of kept token vectors divided by
        the number of kept tokens in each row.
    """
    # Broadcast the mask over the trailing embedding dimension.
    keep = mask.bool().unsqueeze(-1)
    zeroed = token_embeddings.masked_fill(~keep, 0.)
    kept_token_count = mask.sum(dim=1).unsqueeze(-1)
    return zeroed.sum(dim=1) / kept_token_count


# Apply tokenizer to book chunks and questions.
# Each call tokenizes the whole list as one padded batch of PyTorch tensors.
inputs_chunks = tokenizer(single_book_chunks["Chunk"].tolist(), padding=True, truncation=True, return_tensors='pt')
inputs_questions = tokenizer(single_book_qa["Question"].tolist(), padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings.
# This is inference only: without no_grad() autograd would retain the
# activations of every layer for every chunk, inflating memory for no benefit.
with torch.no_grad():
    outputs_chunks = model(**inputs_chunks)
    outputs_questions = model(**inputs_questions)

# Pool the first model output (per-token embeddings) into one vector per text,
# using the attention mask so padding tokens do not contribute to the mean.
embeddings_chunks = mean_pooling(outputs_chunks[0], inputs_chunks['attention_mask']).detach().cpu().numpy()
embeddings_questions = mean_pooling(outputs_questions[0], inputs_questions['attention_mask']).detach().cpu().numpy()

In [None]:
# The gold label is the substring stored in the "Chunk Must Contain" column.
# We check whether that substring appears in any of the retrieved chunks.
# If it does, the position of the first matching chunk in the 'relevance' list is set to 1 and all other positions to 0.
def find_index_of_match(answers, gold_label):
    """Build a binary relevance list marking the first chunk containing the gold label.

    Args:
        answers: list of retrieved chunk strings, in ranked order.
        gold_label: substring to look for; compared case-insensitively.

    Returns:
        A list of ``len(answers)`` ints: 1 at the position of the first chunk
        that contains ``gold_label``, 0 everywhere else (all zeros if no
        chunk matches).
    """
    needle = gold_label.lower()
    relevance = [0] * len(answers)
    for position, chunk in enumerate(answers):
        if needle in chunk.lower():
            # Only the first hit counts; later matches stay at 0.
            relevance[position] = 1
            break
    return relevance



def compute_DCG(rel):
    """Compute the Discounted Cumulative Gain of a ranked relevance list.

    Each position ``i`` (1-based) contributes ``(2**rel_i - 1) / log2(i + 1)``.

    Args:
        rel: sequence of relevance grades in ranked order.

    Returns:
        The summed DCG value (0 for an empty sequence).
    """
    dcg = 0
    for rank, gain in enumerate(rel, start=1):
        dcg = dcg + (np.power(2, gain) - 1) / np.log2(rank + 1)
    return dcg

In [None]:
def get_top_k(top_k, query_individual_embedding_numpy):
    """Return the text of the ``top_k`` chunks most similar to a query embedding.

    Similarity is the dot product between every row of the notebook-level
    ``embeddings_chunks`` array and the query vector. Chunk text is read from
    the notebook-level ``single_book_chunks`` frame, whose index must be
    0..n-1 so that argsort positions are valid row labels.

    Args:
        top_k: number of chunks to return. Values <= 0 yield an empty list.
        query_individual_embedding_numpy: embedding of a single query
            (the loop in this notebook passes one row of the question
            embeddings).

    Returns:
        List of chunk strings ordered from most to least similar; at most
        ``top_k`` entries (fewer if the book has fewer chunks).
    """
    # Guard: a slice like [-0:] would otherwise select *every* chunk,
    # and a negative top_k would select an arbitrary tail.
    if top_k <= 0:
        return []

    similarity = np.dot(embeddings_chunks, np.transpose(query_individual_embedding_numpy))
    # argsort is ascending, so reverse it and keep the leading top_k positions.
    top_indices = np.argsort(similarity, axis=0)[::-1][:top_k]

    return [single_book_chunks.at[chunk_idx, 'Chunk'] for chunk_idx in top_indices]

In [None]:
# Compute the mean DCG@k over all questions of the book, for each cutoff k below.
TOP_K_VALUES = [1, 2, 5, 10, 20]

DCG_k_sweep = []
for top_k in TOP_K_VALUES:
    DCG_list = []

    for question_idx in range(len(single_book_qa)):
        query_embedding = embeddings_questions[question_idx]
        # Retrieve the top_k chunks for this question, best match first.
        answers = get_top_k(top_k=top_k, query_individual_embedding_numpy=query_embedding)
        # The substring a retrieved chunk must contain to count as relevant.
        gold_label = single_book_qa.loc[question_idx, "Chunk Must Contain"]
        rel = find_index_of_match(answers=answers, gold_label=gold_label)
        DCG_list.append(compute_DCG(rel))

    # One averaged score per cutoff, in the same order as TOP_K_VALUES.
    DCG_k_sweep.append(np.mean(DCG_list))

# Print the DCG_k_sweep list
print(DCG_k_sweep)