In [1]:
!pip install pandarallel rank_bm25

ERROR: ld.so: object '/opt/conda/lib/libmkl_def.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_avx2.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_intel_lp64.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_intel_thread.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_def.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_avx2.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_core.so' from LD_PRE

In [2]:
import json
import os
import random
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from pandarallel import pandarallel
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModel

pandarallel.initialize(progress_bar=True)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


# 1) Importing query and collection data

In [3]:
PATH_COLLECTION_DATA = 'subtask_4b/subtask4b_collection_data.pkl' 
df_collection = pd.read_pickle(PATH_COLLECTION_DATA)

In [4]:
PATH_QUERY_TRAIN_DATA = 'subtask_4b/subtask4b_query_tweets_train.tsv'
PATH_QUERY_DEV_DATA = 'subtask_4b/subtask4b_query_tweets_dev.tsv' 
df_query_train = pd.read_csv(PATH_QUERY_TRAIN_DATA, sep = '\t')
df_query_dev = pd.read_csv(PATH_QUERY_DEV_DATA, sep = '\t')

# 2) Running the BM25 baseline
The following code runs a BM25 baseline.


In [5]:
from rank_bm25 import BM25Okapi

In [6]:
# Create the BM25 corpus
corpus = df_collection[:][['title', 'abstract']].apply(lambda x: f"{x['title']} {x['abstract']}", axis=1).tolist()
cord_uids = df_collection[:]['cord_uid'].tolist()
tokenized_corpus = [doc.split(' ') for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)

In [7]:
def get_top_cord_uids(query):
  text2bm25top = {}
  if query in text2bm25top.keys():
      return text2bm25top[query]
  else:
      tokenized_query = query.split(' ')
      doc_scores = bm25.get_scores(tokenized_query)
      indices = np.argsort(-doc_scores)[:100] # @k: how many docs shall the ranked list include?
      bm25_topk = [cord_uids[x] for x in indices]

      text2bm25top[query] = bm25_topk
      return bm25_topk


In [8]:
# Retrieve top100 candidates using the BM25 model

train_pkl_path = 'df_query_train_top100.pkl'
dev_pkl_path = 'df_query_dev_top100.pkl'

if not os.path.exists(train_pkl_path):
    df_query_train['bm25_topk'] = df_query_train['tweet_text'].parallel_apply(lambda x: get_top_cord_uids(x))
    df_query_train.to_pickle(train_pkl_path)
else:
    df_query_train = pd.read_pickle(train_pkl_path)

if not os.path.exists(dev_pkl_path):
    df_query_dev['bm25_topk'] = df_query_dev['tweet_text'].parallel_apply(lambda x: get_top_cord_uids(x))
    df_query_dev.to_pickle(dev_pkl_path)
else:
    df_query_dev = pd.read_pickle(dev_pkl_path)

In [9]:
df_query_train.head()

Unnamed: 0,post_id,tweet_text,cord_uid,normalized_tweet_text,cleaned_tweet_text,final_query,bm25_topk,in_topx
0,0,Oral care in rehabilitation medicine: oral vul...,htlvpvz5,oral care in rehabilitation medicine: oral vul...,oral care rehabilit medicin oral vulner oral m...,oral care rehabilit medicin oral vulner oral m...,"[htlvpvz5, h7hj64q5, trmwm9qq, 65gedo6u, rwgqk...",1.0
1,1,this study isn't receiving sufficient attentio...,4kfl29ul,this study isn't receiving sufficient attentio...,studi isnt receiv suffici attent reveal blackl...,studi isnt receiv suffici attent reveal blackl...,"[apqzyln2, asdcpvhx, 33znyrn8, ljcdfmbu, 296il...",0.0
2,2,"thanks, xi jinping. a reminder that this study...",jtwb17u8,"thanks, xi jinping. a reminder that this study...",thank xi jinp remind studi conclud nonpharmace...,thank xi jinp remind studi conclud nonpharmace...,"[jtwb17u8, veeavho5, mwj0xc3q, 8hkxbxz9, a0q61...",1.0
3,3,Taiwan - a population of 23 million has had ju...,0w9k8iy1,taiwan - a population of 23 million has had ju...,taiwan popul 23 million 600 case 7 death wides...,taiwan popul 23 million 600 case 7 death wides...,"[lsgm7y5t, l5ogbl5p, l4y7v729, x14iywtr, 0w9k8...",0.2
4,4,Obtaining a diagnosis of autism in lower incom...,tiqksd69,obtaining a diagnosis of autism in lower incom...,obtain diagnosi autism lower incom countri tak...,obtain diagnosi autism lower incom countri tak...,"[tiqksd69, b0dzhsrh, k7smwz6w, aqbhxv1f, 0u330...",1.0


In [10]:
# Evaluate retrieved candidates using MRR@k
def get_performance_mrr(data, col_gold, col_pred, list_k = [1, 5, 10]):
    d_performance = {}
    for k in list_k:
        data["in_topx"] = data.apply(lambda x: (1/([i for i in x[col_pred][:k]].index(x[col_gold]) + 1) if x[col_gold] in [i for i in x[col_pred][:k]] else 0), axis=1)
        d_performance[k] = data["in_topx"].mean()
    return d_performance

# 3) BERT Embeddings pre-computation

In [11]:
import torch.nn as nn

def get_device():
    if torch.cuda.is_available():
        return "cuda"
    elif torch.backends.mps.is_available():
        return "mps"
    else:
        return "cpu"

DEVICE = get_device()
print(f"Gonna run pytorch on {DEVICE}")

Gonna run pytorch on cuda


In [None]:
# get token embeddings of a specified text passage from some model
def get_token_embeddings(text, tokenizer, model, device):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    outputs = model(**inputs)
    token_embeddings = outputs.last_hidden_state.squeeze(0)
    attention_mask = inputs['attention_mask'].squeeze(0).bool()
    token_embeddings = token_embeddings[attention_mask] 
    return token_embeddings

# pre compute all the token embeddings of the documents
def build_and_save_doc_embeddings(
    docs_df,
    model_name,
    save_dir,
    device,
    max_len=512,
    batch_size=16
):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()
    
    save_path = Path("doc_embeddings_" + save_dir)
    save_path.mkdir(parents=True, exist_ok=True)
    metadata_path = save_path / "metadata.json"
    
    if metadata_path.exists():
        with open(metadata_path, "r") as f:
            metadata = json.load(f)
    else:
        metadata = {}
    
    print("Precomputing document embeddings.")
    
    texts = []
    doc_ids = []
    indices = []
    for i, row in tqdm(docs_df.iterrows(), total=len(docs_df)):
        doc_id = row.get("cord_uid", f"doc_{i}")
        text = str(row.get('title', '')) + " " + str(row.get('abstract', '')) + " Authors: " + str(row.get('authors', ''))
        texts.append(text)
        doc_ids.append(doc_id)
        indices.append(i)
    
    for start_idx in tqdm(range(0, len(texts), batch_size), total=len(texts), desc="Saving doc embeddings"):
        end_idx = min(start_idx + batch_size, len(texts))
        batch_texts = texts[start_idx:end_idx]
        batch_doc_ids = doc_ids[start_idx:end_idx]

        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=max_len).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        q_emb_batch = outputs.last_hidden_state  # [batch_size, L, D]
        attention_mask = inputs['attention_mask'].to(torch.bool)
        
        for i in range(len(batch_texts)):
            att_mask = attention_mask[i]
            embeddings = q_emb_batch[i][att_mask]
            doc_id = batch_doc_ids[i]

            file_path = Path(f"doc_embeddings_{save_dir}") / f"{doc_id}.pt"
            torch.save(embeddings, file_path)
            n_tokens = embeddings.shape[0]

            if doc_id not in metadata:
                metadata[doc_id] = {
                    "length": min(n_tokens, max_len),
                    "path": str(file_path)
                }

    metadata_path = Path(f"doc_embeddings_{save_dir}") / "metadata.json"
    with open(metadata_path, "w") as f:
        json.dump(metadata, f)
    
    return metadata

# either precompute or load precomputed doc embeddings
def get_precomputed_doc_embeddings(save_name):
    def split_at_slash(s):
        if '/' in s:
            return s.split('/', 1)
        else:
            return ['', s]
        
    if not os.path.exists("doc_embeddings_" + split_at_slash(save_name)[1] + "/metadata.json"):
        metadata = build_and_save_doc_embeddings(df_collection, "allenai/scibert_scivocab_uncased", save_name, DEVICE)
    else:
        with open("doc_embeddings_" + save_name + "/metadata.json", "r") as f:
            metadata = json.load(f)
    return metadata

In [13]:
doc_embeddings_allenai = get_precomputed_doc_embeddings("allenai")

In [14]:
doc_embeddings_allenai = {
    k: {'length': v['length'], 'path': v['path'].replace('all_embeddings', 'allenai')}
    for k, v in doc_embeddings_allenai.items()
}

# 4) Neural Re-Ranking with Conv-KNRM

Let's create interface for generating convolutional layers for n-grams, calculate the similarity match matrix, generating kernel pooling object and the reranking function itself

## 4.1 Define the neural network, train and rerank functions

#### NGram Convolutional Layer:

In [51]:
class NGramConvLayer(nn.Module):
    def __init__(self, input_dim, out_channels, kernel_size=3):
        super().__init__()
        self.conv_q = nn.Conv1d(input_dim, out_channels, kernel_size, padding=kernel_size//2)
        self.conv_d = nn.Conv1d(input_dim, out_channels, kernel_size, padding=kernel_size//2)

    def forward(self, q_emb, d_emb):
        q_emb_conv = q_emb.transpose(0, 1).unsqueeze(0)  # [1, D, L_q]
        d_emb_conv = d_emb.transpose(0, 1).unsqueeze(0)  # [1, D, L_d]

        q_conv = F.relu(self.conv_q(q_emb_conv)).transpose(1, 2)  # [1, L_q, out_channels]
        d_conv = F.relu(self.conv_d(d_emb_conv)).transpose(1, 2)  # [1, L_d, out_channels]

        return q_conv.squeeze(0), d_conv.squeeze(0)

#### Kernel Pooling Definition

In [52]:
class KernelPooling(nn.Module):
    def __init__(self, _mus, _sigmas):
        super().__init__()
        self.mus = torch.tensor(_mus).view(1, -1).to(DEVICE)  # shape: [1, num_kernels]
        self.sigmas = torch.tensor(_sigmas).view(1, -1).to(DEVICE)

    def forward(self, sim_matrix):
        # sim_matrix: [L_q, L_d]
        sim_matrix = sim_matrix.unsqueeze(0).unsqueeze(0)  # [1,1,L_q,L_d]
        kernel_vals = torch.exp(- (sim_matrix - self.mus.reshape(1, -1, 1, 1))**2 / (2 * self.sigmas.reshape(1, -1, 1, 1)**2))
        pooled = kernel_vals.sum(dim=3).sum(dim=2)  # shape: [1, num_kernels]
        return pooled.squeeze(0)  # shape: [num_kernels]

#### KNRM Definition

In [68]:
def compute_similarity_matrix(q_embs, d_embs):
    q_norm = F.normalize(q_embs, p=2, dim=1)
    d_norm = F.normalize(d_embs, p=2, dim=1)
    return torch.mm(q_norm, d_norm.T).to(DEVICE)

class KNRM(nn.Module):
    def __init__(self, _mus, _sigmas, use_conv, conv_channels=8, ngram_size=1):
        super().__init__()
        self.kernel_pool = KernelPooling(_mus, _sigmas)
        if use_conv:
            self.conv_layer = NGramConvLayer(input_dim=768, out_channels=conv_channels, kernel_size=ngram_size)
        else:
            self.conv_layer = None

        num_kernels = len(_mus)
        feature_dim = num_kernels * (conv_channels if use_conv else 1)
        self.scorer = nn.Linear(feature_dim, 1)

    def forward(self, q_emb, d_emb):
        if self.conv_layer:
            q_conv, d_conv = self.conv_layer(q_emb, d_emb)
            
            channels = q_conv.shape[1]
            all_features = []
            
            for c in range(channels):
                q_c = q_conv[:, c].unsqueeze(1)  # [L_q, 1]
                d_c = d_conv[:, c].unsqueeze(1)  # [L_d, 1]
                
                sim_c = compute_similarity_matrix(q_c, d_c)
                
                features_c = self.kernel_pool(sim_c)  # [num_kernels]
                all_features.append(features_c)
            
            pooled_features = torch.cat(all_features, dim=0)  # [num_kernels * channels]
        else:
            sim_matrix = compute_similarity_matrix(q_emb, d_emb)
            pooled_features = self.kernel_pool(sim_matrix)
        score = self.scorer(pooled_features)
        return score

#### KNRM Triplet Dataset

In [69]:
class KNRMTripletDataset(Dataset):
    def __init__(self, df_query, metadata, tokenizer, num_negatives=1):
        self.queries = []
        self.pos_ids = []
        self.neg_ids = []
        self.tokenizer = tokenizer
        self.metadata = metadata
        
        for _, row in df_query.iterrows():
            query = row['tweet_text']
            pos_id = row['cord_uid']
            candidates = row['bm25_topk']
            neg_candidates = [doc for doc in candidates if doc != pos_id]
            
            if neg_candidates:
                neg_ids = random.sample(neg_candidates, min(num_negatives, len(neg_candidates)))
                for neg_id in neg_ids:
                    self.queries.append(query)
                    self.pos_ids.append(pos_id)
                    self.neg_ids.append(neg_id)
    
    def __len__(self):
        return len(self.queries)
    
    def __getitem__(self, idx):
        return self.queries[idx], self.pos_ids[idx], self.neg_ids[idx]

#### KNRM training function

In [74]:
def knrm_train(mus, sigmas, use_conv, ngram_size, save_name, MARGIN=0.5, BATCH_SIZE=8, EPOCHS=6, LR=1e-3):
    model_name = "allenai/scibert_scivocab_uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    bert_model = AutoModel.from_pretrained(model_name).to(DEVICE)
    knrm = KNRM(mus, sigmas, use_conv, ngram_size=ngram_size).to(DEVICE)
    metadata = get_precomputed_doc_embeddings(model_name)

    train_dataset = KNRMTripletDataset(df_query_train, metadata, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    
    optimizer = torch.optim.Adam(knrm.parameters(), lr=LR)
    
    for epoch in range(EPOCHS):
        total_loss = 0.0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            queries, pos_ids, neg_ids = batch
            
            q_embs = []
            for query in queries:
                q_emb = get_token_embeddings(query, tokenizer, bert_model, DEVICE)
                q_embs.append(q_emb)

            score_pos_list = []
            score_neg_list = []

            for i in range(len(queries)):
                d_pos_emb = torch.load(metadata[pos_ids[i]]["path"]).to(DEVICE)[:metadata[pos_ids[i]]["length"]]
                d_neg_emb = torch.load(metadata[neg_ids[i]]["path"]).to(DEVICE)[:metadata[neg_ids[i]]["length"]]

                score_pos = knrm(q_embs[i], d_pos_emb)
                score_neg = knrm(q_embs[i], d_neg_emb)

                score_pos_list.append(score_pos)
                score_neg_list.append(score_neg)

            score_pos_batch = torch.stack(score_pos_list)
            score_neg_batch = torch.stack(score_neg_list)

            loss = F.relu(MARGIN + score_neg_batch - score_pos_batch).mean()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()

        print(f"Epoch {epoch+1} Loss: {total_loss:.4f}")

    torch.save(knrm.state_dict(), f"{save_name}.pt")
    return knrm

#### Embeddings Caching Layer

In [71]:
class EmbeddingsLoader:
    doc_embeddings = {}

    @classmethod
    def get_embeddings(cls):
        if cls.doc_embeddings:
            return cls.doc_embeddings
        else:
            for doc_id, data in tqdm(doc_embeddings_allenai.items(), desc="loading_doc_embeddings"):
                emb = torch.load(data["path"], map_location=DEVICE)
                cls.doc_embeddings[doc_id] = emb
            return cls.doc_embeddings

#### Reranking function

In [72]:
def rerank(df, knrm_model_name, knrm_model_instance, device):
    df[f'{knrm_model_name}_scores'] = [[]] * len(df)

    knrm_model_instance = knrm_model_instance.to(device)
    knrm_model_instance.load_state_dict(torch.load(f"{knrm_model_name}.pt"))
    knrm_model_instance.eval()

    model_name = "allenai/scibert_scivocab_uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)

    with torch.no_grad():
        for idx, row in tqdm(df.iterrows(), total=len(df), desc='calculating_scores_for_reranking'):
            tweet_text = row['tweet_text']
            pre_ranked_docs = row['bm25_topk']
            q_emb = get_token_embeddings(tweet_text, tokenizer, model, device=device)

            scores = []
            for doc in pre_ranked_docs:
                emb = EmbeddingsLoader().get_embeddings()[doc]
                length = doc_embeddings_allenai[doc]["length"]
                d_emb = emb[:length]

                score = knrm_model_instance(q_emb, d_emb).item()
                scores.append(score)

            df.at[idx, f'{knrm_model_name}_scores'] = scores

        def sort_docs_by_score(row):
            doc_ids = row['bm25_topk']
            scores = row[f'{knrm_model_name}_scores']
            sorted_docs = [doc for doc, _ in sorted(zip(doc_ids, scores), key=lambda x: x[1], reverse=True)]
            return sorted_docs

        df[f'{knrm_model_name}_topk'] = df.parallel_apply(sort_docs_by_score, axis=1)

    return df

## 4.2 Evaluation of different knrm models

### 4.2.1 Plain KNRM with 1-grams and 3 epochs

##### Train

In [41]:
mus = [-1.0, -0.5, 0.0, 0.5, 1.0]
sigmas = [0.1] * len(mus)
use_conv = False
ngram_size = 1
KNRM_MODEL_NAME = "knrm"
EPOCHS = 3

knrm_model = knrm_train(mus, sigmas, use_conv=use_conv, ngram_size=ngram_size, save_name=KNRM_MODEL_NAME, EPOCHS=EPOCHS)

Precomputing document embeddings.


  0%|          | 0/7718 [00:00<?, ?it/s]

  0%|          | 0/483 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/1607 [00:00<?, ?it/s]

Epoch 1 Loss: 12846.5115


Epoch 2:   0%|          | 0/1607 [00:00<?, ?it/s]

Epoch 2 Loss: 714.3910


Epoch 3:   0%|          | 0/1607 [00:00<?, ?it/s]

Epoch 3 Loss: 745.6552


##### Rerank

In [42]:
reranked_knrm_query_dev_df = rerank(df_query_dev, KNRM_MODEL_NAME, knrm_model, DEVICE)
reranked_knrm_query_dev_df.head()

# ---- KNRM Re-Ranking ----
results_dev = get_performance_mrr(reranked_knrm_query_dev_df, 'cord_uid', f'{KNRM_MODEL_NAME}_topk')
print("---- Re-Ranking Finetune: KNRM ----")
print(f"MRR@5 on dev set: {results_dev[5]}")

calculating_scores_for_reranking:   0%|          | 0/1400 [00:00<?, ?it/s]

loading_doc_embeddings:   0%|          | 0/7718 [00:00<?, ?it/s]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=88), Label(value='0 / 88'))), HBox…

---- Re-Ranking Finetune: KNRM ----
MRR@5 on dev set: 0.36279761904761904


### 4.2.2 Conv-KNRM with 1-grams and 3 epochs

##### Train

In [75]:
mus = [-1.0, -0.5, 0.0, 0.5, 1.0]
sigmas = [0.1] * len(mus)
use_conv = True
ngram_size = 1
CONV_KNRM_MODEL_NAME = "conv_knrm"
EPOCHS = 3

conv_knrm_model = knrm_train(mus, sigmas, use_conv=use_conv, ngram_size=ngram_size, save_name=CONV_KNRM_MODEL_NAME, EPOCHS=3)

Precomputing document embeddings.


  0%|          | 0/7718 [00:00<?, ?it/s]

  0%|          | 0/483 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/1607 [00:00<?, ?it/s]

Epoch 1 Loss: 39152.5070


Epoch 2:   0%|          | 0/1607 [00:00<?, ?it/s]

Epoch 2 Loss: 8364.2339


Epoch 3:   0%|          | 0/1607 [00:00<?, ?it/s]

Epoch 3 Loss: 8031.9155


##### Rerank

In [76]:
reranked_conv_knrm_query_dev_df = rerank(df_query_dev, CONV_KNRM_MODEL_NAME, conv_knrm_model, DEVICE)
reranked_conv_knrm_query_dev_df.head()

# ---- Conv-KNRM with 1-gram Re-Ranking ----
results_dev = get_performance_mrr(reranked_conv_knrm_query_dev_df, 'cord_uid', f'{CONV_KNRM_MODEL_NAME}_topk')
print("---- Re-Ranking Finetune: Conv-KNRM 1-gram ----")
print(f"MRR@5 on dev set: {results_dev[5]}")

calculating_scores_for_reranking:   0%|          | 0/1400 [00:00<?, ?it/s]

loading_doc_embeddings:   0%|          | 0/7718 [00:00<?, ?it/s]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=88), Label(value='0 / 88'))), HBox…

---- Re-Ranking Finetune: Conv-KNRM 1-gram ----
MRR@5 on dev set: 0.03085714285714286


### 4.2.3 Conv-KNRM with Bigrams and 3 epochs

##### Train

In [77]:
mus = [-1.0, -0.5, 0.0, 0.5, 1.0]
sigmas = [0.1] * len(mus)
use_conv = True
ngram_size = 2
CONV_KNRM_BIGRAM_MODEL_NAME = "conv_knrm_bigram"
EPOCHS = 3

conv_knrm_bigram_model = knrm_train(mus, sigmas, use_conv=use_conv, ngram_size=ngram_size, save_name=CONV_KNRM_BIGRAM_MODEL_NAME, EPOCHS=EPOCHS)


Precomputing document embeddings.


  0%|          | 0/7718 [00:00<?, ?it/s]

  0%|          | 0/483 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/1607 [00:00<?, ?it/s]

Epoch 1 Loss: 72564.8783


Epoch 2:   0%|          | 0/1607 [00:00<?, ?it/s]

Epoch 2 Loss: 9399.7255


Epoch 3:   0%|          | 0/1607 [00:00<?, ?it/s]

Epoch 3 Loss: 8109.0142


##### Rerank

In [78]:
reranked_conv_knrm_bigram_query_dev_df = rerank(df_query_dev, CONV_KNRM_BIGRAM_MODEL_NAME, conv_knrm_bigram_model, DEVICE)
reranked_conv_knrm_bigram_query_dev_df.head()

# ---- Conv-KNRM with Bigram Re-Ranking ----
results_dev = get_performance_mrr(reranked_conv_knrm_bigram_query_dev_df, 'cord_uid', f'{CONV_KNRM_BIGRAM_MODEL_NAME}_topk')
print("---- Re-Ranking Finetune: Conv-KNRM Bigram ----")
print(f"MRR@5 on dev set: {results_dev[5]}")

calculating_scores_for_reranking:   0%|          | 0/1400 [00:00<?, ?it/s]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=88), Label(value='0 / 88'))), HBox…

---- Re-Ranking Finetune: Conv-KNRM Bigram ----
MRR@5 on dev set: 0.02830952380952381


# 5) Evaluation
The following code evaluates the BM25 retrieval baseline on the query set using the Mean Reciprocal Rank score (MRR@5) and compares it to the knrm models discovered in section 4

In [None]:
# ---- BM25 Baseline ----
results_dev = get_performance_mrr(df_query_dev, 'cord_uid', 'bm25_topk')

print("---- BM25 Baseline ----")
print(f"MRR@5 on dev set: {results_dev[5]}")

# ---- KNRM Re-Ranking ----
results_dev = get_performance_mrr(reranked_knrm_query_dev_df, 'cord_uid', f'{KNRM_MODEL_NAME}_topk')
print("---- Re-Ranking Finetune: KNRM ----")
print(f"MRR@5 on dev set: {results_dev[5]}")

# ---- Conv-KNRM with 1-gram Re-Ranking ----
results_dev = get_performance_mrr(reranked_conv_knrm_query_dev_df, 'cord_uid', f'{CONV_KNRM_MODEL_NAME}_topk')
print("---- Re-Ranking Finetune: Conv-KNRM 1-gram ----")
print(f"MRR@5 on dev set: {results_dev[5]}")

# ---- Conv-KNRM with Bigram Re-Ranking ----
results_dev = get_performance_mrr(reranked_conv_knrm_bigram_query_dev_df, 'cord_uid', f'{CONV_KNRM_BIGRAM_MODEL_NAME}_topk')
print("---- Re-Ranking Finetune: Conv-KNRM Bigram ----")
print(f"MRR@5 on dev set: {results_dev[5]}")

## Results documentation

### 1) KNRM
Re-Ranking of top 100 BM25 results for each query:\
MRR@5: 0.025 :(

### 2) Conv-KNRM with 1-grams
Re-Ranking of top 100 BM25 results for each query:\
MRR@5:

### 3) Conv-KNRM with Bigrams
Re-Ranking of top 100 BM25 results for each query:\
MRR@5: