In [None]:
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
import torch
import torch.nn.functional as F
from tqdm.auto import tqdm
from utils import get_data, get_batches
import numpy as np

In [None]:
# Dataset identifier passed to `load_dataset` in the data-loading cell.
DATASET_NAME = "tom-010/google_natural_questions_answerability"
# Device the encoder and the tokenized inputs are moved to via `.to(DEVICE)`.
DEVICE = "cpu"
# Batch size handed to `get_batches` for both the train and validation splits.
BATCH_SIZE = 128

In [None]:
def _qa_records(split):
    """Build `{"question", "context"}` dicts for one dataset split.

    The split's "answer" column is used as the context; rows whose answer is
    None are dropped.
    """
    return [
        {"question": question, "context": answer}
        for question, answer in zip(split["question"], split["answer"])
        if answer is not None
    ]


def _read_int_list(path):
    """Read a text file of whitespace-separated integers into a list."""
    with open(path, "r") as handle:
        return [int(token) for token in handle.read().split()]


dataset = load_dataset(DATASET_NAME)

train_data = _qa_records(dataset["train"])
valid_data = _qa_records(dataset["validation"])

# The index files list positions to exclude. Positions refer to the lists
# built above, i.e. after rows with a None answer have been removed.
# NOTE(review): `indices` is a set; whether `get_data` relies on its iteration
# order cannot be told from this notebook -- confirm in utils.
filtered = _read_int_list("filtered_array.txt")
indices = set(range(len(train_data))) - set(filtered)
queries_train, passages_train = get_data(indices, train_data)

filtered = _read_int_list("filtered_array_val.txt")
indices = set(range(len(valid_data))) - set(filtered)
queries_valid, passages_valid = get_data(indices, valid_data)

train_data_batched = get_batches(queries_train, passages_train, BATCH_SIZE)
valid_data_batched = get_batches(queries_valid, passages_valid, BATCH_SIZE)

In [None]:
def get_ndcg(scores, labels):
    """Compute nDCG for a single query.

    Args:
        scores: 1-D tensor of predicted relevance scores, one per candidate.
        labels: 1-D tensor of ground-truth relevance gains aligned with
            `scores` (binary or graded).

    Returns:
        DCG of the ranking induced by `scores` divided by the DCG of the ideal
        ranking, as a Python float. Returns 0.0 when there are no candidates
        or when no candidate has non-zero relevance, since the ratio is
        undefined there and a NaN would poison any later average.
    """
    if labels.numel() == 0:
        return 0.0

    # Work in float64 so the discounts match a NumPy-based computation.
    gains = labels.to(torch.float64)
    ranking = torch.argsort(scores, descending=True).to(gains.device)

    # Discount for 0-based rank i is 1 / log2(i + 2).
    ranks = torch.arange(gains.numel(), dtype=torch.float64, device=gains.device)
    discounts = 1.0 / torch.log2(ranks + 2.0)

    dcg = torch.dot(gains[ranking], discounts).item()
    # The ideal ranking orders candidates by true gain. For binary labels this
    # equals summing the first `sum(labels)` discounts.
    ideal_gains = torch.sort(gains, descending=True).values
    idcg = torch.dot(ideal_gains, discounts).item()

    if idcg == 0:
        return 0.0
    return dcg / idcg

In [None]:
# Load the tokenizer and the encoder from local checkpoint directories.
tokenizer = AutoTokenizer.from_pretrained("./2_tokenizer_rubert-tiny2")
model = AutoModel.from_pretrained("./2_new_rubert-tiny2")
model = model.to(DEVICE)
# Put the module in evaluation mode. As the last expression of the cell this
# also displays the model.
model.eval()

In [None]:
# In-batch retrieval evaluation: within each validation batch every question
# is scored against every context of the same batch, and the context at the
# same position as the question is treated as the only relevant one.
ndcg_scores = []
# Iterate through tqdm directly so the bar advances and closes by itself.
progressBar = tqdm(valid_data_batched, desc="Scoring validation batches")

for batch in progressBar:
    query = tokenizer(batch["question"], return_tensors="pt", truncation=True, padding=True).to(DEVICE)
    passage = tokenizer(batch["context"], return_tensors="pt", truncation=True, padding=True).to(DEVICE)

    with torch.no_grad():
        # NOTE(review): `.mean(dim=1)` averages over all token positions,
        # padding included, so an embedding depends on the longest sequence in
        # its batch. Kept as is to leave the metric unchanged; consider an
        # attention-mask-weighted mean if training did not pool this way.
        query_emb = model(**query).last_hidden_state.mean(dim=1)
        passage_emb = model(**passage).last_hidden_state.mean(dim=1)

        # Full question-by-context cosine-similarity matrix in one broadcasted
        # call: (Q, 1, D) against (1, P, D) reduces to (Q, P).
        scores = F.cosine_similarity(
            query_emb.unsqueeze(1), passage_emb.unsqueeze(0), dim=-1
        ).cpu()

    # Row i of the identity matrix marks context i as the relevant one for
    # question i.
    labels = torch.eye(len(batch["question"]), len(batch["context"]))
    for i in range(len(batch["question"])):
        ndcg_scores.append(get_ndcg(scores[i], labels[i]))

    # Only relevant when DEVICE is a CUDA device.
    torch.cuda.empty_cache()

if ndcg_scores:
    print(np.mean(ndcg_scores))
else:
    # np.mean of an empty list would print NaN with a warning.
    print("No validation batches to evaluate.")