In [1]:
!pip install pandarallel rank_bm25

ERROR: ld.so: object '/opt/conda/lib/libmkl_def.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_avx2.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_core.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_intel_lp64.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_intel_thread.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_def.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_avx2.so' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/opt/conda/lib/libmkl_core.so' from LD_PRE

In [2]:
import torch
from torch.nn.functional import cosine_similarity
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModel

from tqdm.notebook import tqdm
import json
from pathlib import Path
import re
import random
import os
import numpy as np
import pandas as pd
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


# 1) Importing query and collection data

In [3]:
PATH_COLLECTION_DATA = 'subtask_4b/subtask4b_collection_data.pkl' 
df_collection = pd.read_pickle(PATH_COLLECTION_DATA)

In [4]:
PATH_QUERY_TRAIN_DATA = 'subtask_4b/subtask4b_query_tweets_train.tsv'
PATH_QUERY_DEV_DATA = 'subtask_4b/subtask4b_query_tweets_dev.tsv' 
df_query_train = pd.read_csv(PATH_QUERY_TRAIN_DATA, sep = '\t')
df_query_dev = pd.read_csv(PATH_QUERY_DEV_DATA, sep = '\t')

# 2) Running the BM25 baseline
The following code runs a BM25 baseline.


In [5]:
from rank_bm25 import BM25Okapi

In [6]:
# Create the BM25 corpus
corpus = df_collection[:][['title', 'abstract']].apply(lambda x: f"{x['title']} {x['abstract']}", axis=1).tolist()
cord_uids = df_collection[:]['cord_uid'].tolist()
tokenized_corpus = [doc.split(' ') for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)

In [7]:
def get_top_cord_uids(query):
  text2bm25top = {}
  if query in text2bm25top.keys():
      return text2bm25top[query]
  else:
      tokenized_query = query.split(' ')
      doc_scores = bm25.get_scores(tokenized_query)
      indices = np.argsort(-doc_scores)[:100] # @k: how many docs shall the ranked list include?
      bm25_topk = [cord_uids[x] for x in indices]

      text2bm25top[query] = bm25_topk
      return bm25_topk


In [8]:
# Retrieve top100 candidates using the BM25 model

train_pkl_path = 'df_query_train_top100.pkl'
dev_pkl_path = 'df_query_dev_top100.pkl'

if not os.path.exists(train_pkl_path):
    df_query_train['bm25_topk'] = df_query_train['tweet_text'].parallel_apply(lambda x: get_top_cord_uids(x))
    df_query_train.to_pickle(train_pkl_path)
else:
    df_query_train = pd.read_pickle(train_pkl_path)

if not os.path.exists(dev_pkl_path):
    df_query_dev['bm25_topk'] = df_query_dev['tweet_text'].parallel_apply(lambda x: get_top_cord_uids(x))
    df_query_dev.to_pickle(dev_pkl_path)
else:
    df_query_dev = pd.read_pickle(dev_pkl_path)

In [9]:
df_query_train.head()

Unnamed: 0,post_id,tweet_text,cord_uid,normalized_tweet_text,cleaned_tweet_text,final_query,bm25_topk,in_topx
0,0,Oral care in rehabilitation medicine: oral vul...,htlvpvz5,oral care in rehabilitation medicine: oral vul...,oral care rehabilit medicin oral vulner oral m...,oral care rehabilit medicin oral vulner oral m...,"[htlvpvz5, h7hj64q5, trmwm9qq, 65gedo6u, rwgqk...",1.0
1,1,this study isn't receiving sufficient attentio...,4kfl29ul,this study isn't receiving sufficient attentio...,studi isnt receiv suffici attent reveal blackl...,studi isnt receiv suffici attent reveal blackl...,"[apqzyln2, asdcpvhx, 33znyrn8, ljcdfmbu, 296il...",0.0
2,2,"thanks, xi jinping. a reminder that this study...",jtwb17u8,"thanks, xi jinping. a reminder that this study...",thank xi jinp remind studi conclud nonpharmace...,thank xi jinp remind studi conclud nonpharmace...,"[jtwb17u8, veeavho5, mwj0xc3q, 8hkxbxz9, a0q61...",1.0
3,3,Taiwan - a population of 23 million has had ju...,0w9k8iy1,taiwan - a population of 23 million has had ju...,taiwan popul 23 million 600 case 7 death wides...,taiwan popul 23 million 600 case 7 death wides...,"[lsgm7y5t, l5ogbl5p, l4y7v729, x14iywtr, 0w9k8...",0.2
4,4,Obtaining a diagnosis of autism in lower incom...,tiqksd69,obtaining a diagnosis of autism in lower incom...,obtain diagnosi autism lower incom countri tak...,obtain diagnosi autism lower incom countri tak...,"[tiqksd69, b0dzhsrh, k7smwz6w, aqbhxv1f, 0u330...",1.0


In [10]:
# Evaluate retrieved candidates using MRR@k
def get_performance_mrr(data, col_gold, col_pred, list_k = [1, 5, 10]):
    d_performance = {}
    for k in list_k:
        data["in_topx"] = data.apply(lambda x: (1/([i for i in x[col_pred][:k]].index(x[col_gold]) + 1) if x[col_gold] in [i for i in x[col_pred][:k]] else 0), axis=1)
        d_performance[k] = data["in_topx"].mean()
    return d_performance

In [11]:
# Evaluate retrieved candidates using MRR@k
results_train = get_performance_mrr(df_query_train, 'cord_uid', 'bm25_topk')
results_dev = get_performance_mrr(df_query_dev, 'cord_uid', 'bm25_topk')

# Printed MRR@k results
print(f"Results on the train set: {dict((k, float(v)) for k, v in results_train.items())}")
print(f"Results on the dev set: {dict((k, float(v)) for k, v in results_dev.items())}")

Results on the train set: {1: 0.5731735781529604, 5: 0.625250914183459, 10: 0.6308237901348459}
Results on the dev set: {1: 0.5657142857142857, 5: 0.616095238095238, 10: 0.6224325396825396}


In [12]:
df_query_dev.head()

Unnamed: 0,post_id,tweet_text,cord_uid,normalized_tweet_text,cleaned_tweet_text,final_query,bm25_topk,in_topx
0,16,covid recovery: this study from the usa reveal...,3qvh482o,covid recovery: this study from the usa reveal...,covid recoveri studi usa reveal proport case e...,covid recoveri studi usa reveal proport case e...,"[25aj8rj5, 66g5lpm6, o4vvlmr4, vmmwtdia, trrg1...",0.0
1,69,"""Among 139 clients exposed to two symptomatic ...",r58aohnu,"""among 139 clients exposed to two symptomatic ...",among 139 client expos two symptomat hair styl...,among 139 client expos two symptomat hair styl...,"[r58aohnu, p0kg6dyz, s2vckt2w, yrowv62k, g5hg3...",1.0
2,73,I recall early on reading that researchers who...,sts48u9i,i recall early on reading that researchers wor...,recal earli read research who examin coronavir...,recal earli read research who examin coronavir...,"[mkwgkkoi, gruir7aw, xavegbty, vx1hjh26, ntxuf...",0.0
3,93,You know you're credible when NIH website has ...,3sr2exq9,you know you're credible when national institu...,know your credibl nih websit paper 💃💃 someon p...,know your credibl nih websit paper 💃💃 someon p...,"[3sr2exq9, sv48gjkk, tx8ypqsm, z795y51f, k0f4c...",1.0
4,96,Resistance to antifungal medications is a grow...,ybwwmyqy,resistance to antifungal medications is a grow...,resist antifung medic grow issu global scope d...,resist antifung medic grow issu global scope d...,"[ybwwmyqy, ouvq2wpq, rs3umc1x, sxx3yid9, vabb2...",1.0


# 3) BERT Embeddings pre-computation

In [13]:
import torch.nn as nn

def get_device():
    if torch.cuda.is_available():
        return "cuda"
    elif torch.backends.mps.is_available():
        return "mps"
    else:
        return "cpu"

DEVICE = get_device()
print(f"Gonna run pytorch on {DEVICE}")

Gonna run pytorch on cuda


In [14]:
# get token embeddings of a specified text passage from some model
def get_token_embeddings(text, tokenizer, model, device):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    outputs = model(**inputs)
    token_embeddings = outputs.last_hidden_state.squeeze(0)
    attention_mask = inputs['attention_mask'].squeeze(0).bool()
    token_embeddings = token_embeddings[attention_mask] 
    return token_embeddings

# pre compute all the token embeddings of the documents
def build_and_save_doc_embeddings(
    docs_df,
    model_name,
    save_dir,
    device,
    max_len=512,
    batch_size=16
):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()
    
    save_path = Path("doc_embeddings_" + save_dir)
    save_path.mkdir(parents=True, exist_ok=True)
    metadata_path = save_path / "metadata.json"
    
    if metadata_path.exists():
        with open(metadata_path, "r") as f:
            metadata = json.load(f)
    else:
        metadata = {}
    
    print("Precomputing document embeddings.")
    
    texts = []
    doc_ids = []
    indices = []
    for i, row in tqdm(docs_df.iterrows(), total=len(docs_df)):
        doc_id = row.get("cord_uid", f"doc_{i}")
        text = str(row.get('title', '')) + " " + str(row.get('abstract', '')) + " Authors: " + str(row.get('authors', ''))
        texts.append(text)
        doc_ids.append(doc_id)
        indices.append(i)
    
    for start_idx in tqdm(range(0, len(texts), batch_size)):
        end_idx = min(start_idx + batch_size, len(texts))
        batch_texts = texts[start_idx:end_idx]
        batch_doc_ids = doc_ids[start_idx:end_idx]
        batch_indices = indices[start_idx:end_idx]
        
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=max_len).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        q_emb_batch = outputs.last_hidden_state  # [batch_size, L, D]
        attention_mask = inputs['attention_mask'].to(torch.bool)
        
        for i in range(len(batch_texts)):
            att_mask = attention_mask[i]
            embeddings = q_emb_batch[i][att_mask]
            doc_id = batch_doc_ids[i]

            file_path = Path(f"doc_embeddings_{save_dir}") / f"{doc_id}.pt"
            torch.save(embeddings, file_path)
            n_tokens = embeddings.shape[0]

            if doc_id not in metadata:
                metadata[doc_id] = {
                    "length": min(n_tokens, max_len),
                    "path": str(file_path)
                }

    metadata_path = Path(f"doc_embeddings_{save_dir}") / "metadata.json"
    with open(metadata_path, "w") as f:
        json.dump(metadata, f)
    
    return metadata

# either precompute or load precomputed doc embeddings
def get_precomputed_doc_embeddings(save_name):
    def split_at_slash(s):
        if '/' in s:
            return s.split('/', 1)
        else:
            return ['', s]
        
    if not os.path.exists("doc_embeddings_" + split_at_slash(save_name)[1] + "/metadata.json"):
        metadata = build_and_save_doc_embeddings(df_collection, "allenai/scibert_scivocab_uncased", save_name, DEVICE)
    else:
        with open("doc_embeddings_" + save_name + "/metadata.json", "r") as f:
            metadata = json.load(f)
    return metadata

In [15]:
doc_embeddings_allenai = get_precomputed_doc_embeddings("allenai")

In [16]:
doc_embeddings_allenai = {
    k: {'length': v['length'], 'path': v['path'].replace('all_embeddings', 'allenai')}
    for k, v in doc_embeddings_allenai.items()
}

# 4) Neural Re-Ranking

Let's create interface for generating convolutional layers for n-grams, calculate the similarity match matrix, generating kernel pooling object and the reranking function itself

### NGram Convolutional Layers class and Similarity Matrix computation:

In [17]:
class NGramConvLayer(nn.Module):
    def __init__(self, in_channels=1, out_channels=8, kernel_size=3):
        super().__init__()
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, padding=kernel_size//2)
    
    def forward(self, x):
        # x shape: [batch_size, 1, seq_len_q, seq_len_d]
        return F.relu(self.conv(x))

def compute_similarity_matrix(q_embs, d_embs):
    q_norm = F.normalize(q_embs, p=2, dim=1)
    d_norm = F.normalize(d_embs, p=2, dim=1)
    # cosine similarity matrix
    return torch.mm(q_norm, d_norm.T)

def process_query_doc(q_embs, d_embs, conv_layer=None):
    """
    q_embs: [L_q, D]
    d_embs: [L_d, D]
    conv_layer: nn.Module, optional - for modeling n-grams.
    """
    sim_matrix = compute_similarity_matrix(q_embs, d_embs)  # [L_q, L_d]
    if conv_layer:
        input_tensor = sim_matrix.unsqueeze(0).unsqueeze(0)  # shape: [1,1,L_q,L_d]
        conv_output = conv_layer(input_tensor)  # shape: [1,out_channels,L_q,L_d]
        pooled = conv_output.max(dim=2)[0].max(dim=2)[0]  # [out_channels]
        features = pooled
    else:
        features = sim_matrix.flatten()
    return features

def create_ngram_conv_layer(ngram_size):
    return NGramConvLayer(in_channels=1, out_channels=8, kernel_size=ngram_size)

#### Kernel Pooling Definition

In [18]:
class KernelPooling(nn.Module):
    def __init__(self, mus, sigmas):
        super().__init__()
        self.mus = torch.tensor(mus).view(1, -1)  # shape: [1, num_kernels]
        self.sigmas = torch.tensor(sigmas).view(1, -1)

    def forward(self, sim_matrix):
        # sim_matrix: [L_q, L_d]
        sim_matrix = sim_matrix.unsqueeze(0).unsqueeze(0)  # [1,1,L_q,L_d]

        mus = self.mus.to(sim_matrix.device)
        sigmas = self.sigmas.to(sim_matrix.device)

        kernel_vals = torch.exp(- (sim_matrix - mus.reshape(1, -1, 1, 1))**2 / (2 * sigmas.reshape(1, -1, 1, 1)**2))
        pooled = kernel_vals.sum(dim=3).sum(dim=2)  # shape: [1, num_kernels]
        return pooled.squeeze(0)  # shape: [num_kernels]

#### KNRM Definition

In [19]:
class KNRM(nn.Module):
    def __init__(self, mus, sigmas, conv_layer=None, conv_channels=8):
        super().__init__()
        self.kernel_pool = KernelPooling(mus, sigmas)
        self.conv_layer = conv_layer
        n_features = len(mus) * (conv_channels if conv_layer else 1)
        self.scorer = nn.Linear(n_features, 1)
        
    def forward(self, q_emb, d_emb):
        q_norm = F.normalize(q_emb, p=2, dim=1)
        d_norm = F.normalize(d_emb, p=2, dim=1)
        sim_matrix = torch.mm(q_norm, d_norm.T)
        pooled_features = self.kernel_pool(sim_matrix)
        score = self.scorer(pooled_features)
        return score

#### KNRM Triplet Dataset

In [20]:
class KNRMTripletDataset(Dataset):
    def __init__(self, df_query, metadata, tokenizer, num_negatives=1):
        self.queries = []
        self.pos_ids = []
        self.neg_ids = []
        
        for _, row in df_query.iterrows():
            query = row['tweet_text']
            pos_id = row['cord_uid']
            candidates = row['bm25_topk']
            neg_candidates = [doc for doc in candidates if doc != pos_id]
            
            if neg_candidates:
                neg_ids = random.sample(neg_candidates, min(num_negatives, len(neg_candidates)))
                for neg_id in neg_ids:
                    self.queries.append(query)
                    self.pos_ids.append(pos_id)
                    self.neg_ids.append(neg_id)
    
    def __len__(self):
        return len(self.queries)
    
    def __getitem__(self, idx):
        return self.queries[idx], self.pos_ids[idx], self.neg_ids[idx]

#### KNRM training function

In [21]:
def knrm_train(mus, sigmas, save_name, MARGIN=0.5, BATCH_SIZE=8, EPOCHS=6, LR=1e-3):
    device = DEVICE
    model_name = "allenai/scibert_scivocab_uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    bert_model = AutoModel.from_pretrained(model_name).to(device)
    knrm_model = KNRM(mus, sigmas).to(device)
    metadata = get_precomputed_doc_embeddings(model_name)

    train_dataset = KNRMTripletDataset(df_query_train, metadata, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    
    optimizer = torch.optim.Adam(knrm_model.parameters(), lr=LR)
    
    for epoch in range(EPOCHS):
        total_loss = 0.0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            queries, pos_ids, neg_ids = batch
            
            q_embs = []
            for query in queries:
                q_emb = get_token_embeddings(query, tokenizer, bert_model, device)
                q_embs.append(q_emb)
            
            score_pos_list = []
            score_neg_list = []
            
            for i in range(len(queries)):
                d_pos_emb = torch.load(metadata[pos_ids[i]]["path"]).to(device)[:metadata[pos_ids[i]]["length"]]
                d_neg_emb = torch.load(metadata[neg_ids[i]]["path"]).to(device)[:metadata[neg_ids[i]]["length"]]
                
                score_pos = knrm_model(q_embs[i], d_pos_emb)
                score_neg = knrm_model(q_embs[i], d_neg_emb)
                
                score_pos_list.append(score_pos)
                score_neg_list.append(score_neg)
            
            score_pos_batch = torch.stack(score_pos_list)
            score_neg_batch = torch.stack(score_neg_list)
            
            loss = F.relu(MARGIN + score_neg_batch - score_pos_batch).mean()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1} Loss: {total_loss:.4f}")
    
    torch.save(knrm_model.state_dict(), f"{save_name}.pt")
    return knrm_model

In [None]:
mus = [-1.0, -0.5, 0.0, 0.5, 1.0]
sigmas = [0.1] * len(mus)
knrm_model = knrm_train(mus, sigmas, "knrm", EPOCHS=3)

Precomputing document embeddings.


  0%|          | 0/7718 [00:00<?, ?it/s]

  0%|          | 0/483 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/1607 [00:00<?, ?it/s]

#### Embeddings Caching Layer

In [22]:
class EmbeddingsLoader():
    doc_embeddings = {}

    @classmethod
    def get_embeddings(cls):
        if cls.doc_embeddings:
            return cls.doc_embeddings
        else:
            for doc_id, data in tqdm(doc_embeddings_allenai.items(), desc="loading_doc_embeddings"):
                emb = torch.load(data["path"], map_location=DEVICE)
                cls.doc_embeddings[doc_id] = emb
            return cls.doc_embeddings

### Reranking

In [23]:
def rerank(df, save_name, device, mus=None, sigmas=None,
           ngram_size=1,  # 1 means no convolution, higher for n-grams
           conv_channels=8):  # number of convolution filters
    df[f'{save_name}_scores'] = [[]] * len(df)
    if mus is None:
        mus = [-1.0, -0.5, 0.0, 0.5, 1.0]
    if sigmas is None:
        sigmas = [0.1] * len(mus)

    knrm_model = KNRM(mus, sigmas).to(device)
    knrm_model.load_state_dict(torch.load(model_path))
    knrm_model.eval()

    model_name = "allenai/scibert_scivocab_uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)

    kernel_pool = KernelPooling(mus, sigmas).to(device)
    conv_layer = None
    if ngram_size > 1:
        conv_layer = NGramConvLayer(in_channels=1, out_channels=conv_channels, kernel_size=ngram_size).to(device)

    with torch.no_grad():
        for idx, row in tqdm(df.iterrows(), total=len(df), desc='calculating_scores_for_reranking'):
            tweet_text = row['tweet_text']
            pre_ranked_docs = row['bm25_topk']
            q_emb = get_token_embeddings(tweet_text, tokenizer, model, device=device)
            q_emb = q_emb.to(device)
            q_norm = q_emb / q_emb.norm(dim=1, keepdim=True)

            scores = []
            for doc in pre_ranked_docs:
                emb = EmbeddingsLoader().get_embeddings()[doc]
                length = doc_embeddings_allenai[doc]["length"]
                d_emb = emb[:length]
                d_emb = d_emb.to(device)
                d_norm = d_emb / d_emb.norm(dim=1, keepdim=True)

                if conv_layer:
                    features = process_query_doc(q_norm, d_norm, conv_layer)
                    score = features.sum().item()
                else:
                    sim_matrix = torch.mm(q_norm, d_norm.T)
                    pooled_features = kernel_pool(sim_matrix)
                    score = pooled_features.sum().item()
                scores.append(score)

            df.at[idx, f'{save_name}_scores'] = scores

        def sort_docs_by_score(row):
            doc_ids = row['bm25_topk']
            scores = row[f'{save_name}_scores']
            sorted_docs = [doc for doc, _ in sorted(zip(doc_ids, scores), key=lambda x: x[1], reverse=True)]
            return sorted_docs

        df[f'{save_name}_topk'] = df.parallel_apply(sort_docs_by_score, axis=1)

    return df

## 4.1) KNRM with plain BERT 

In [25]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [26]:
reranked_knrm_query_dev_df = rerank(df_query_dev, 'knrm', DEVICE)

calculating_scores_for_reranking:   0%|          | 0/1400 [00:00<?, ?it/s]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=88), Label(value='0 / 88'))), HBox…

In [27]:
reranked_knrm_query_dev_df.head()

Unnamed: 0,post_id,tweet_text,cord_uid,normalized_tweet_text,cleaned_tweet_text,final_query,bm25_topk,in_topx,knrm_scores,knrm_topk
0,16,covid recovery: this study from the usa reveal...,3qvh482o,covid recovery: this study from the usa reveal...,covid recoveri studi usa reveal proport case e...,covid recoveri studi usa reveal proport case e...,"[25aj8rj5, 66g5lpm6, o4vvlmr4, vmmwtdia, trrg1...",0.0,"[4837.27880859375, 9445.9326171875, 9120.74023...","[hfaiddki, qv31t2vh, mck3rgcm, es8l29ub, xndph..."
1,69,"""Among 139 clients exposed to two symptomatic ...",r58aohnu,"""among 139 clients exposed to two symptomatic ...",among 139 client expos two symptomat hair styl...,among 139 client expos two symptomat hair styl...,"[r58aohnu, p0kg6dyz, s2vckt2w, yrowv62k, g5hg3...",1.0,"[8609.88671875, 8009.35107421875, 8584.9609375...","[4e82s91a, pfvf8ujb, 6hl6rtsh, q5wiqpcb, yjm6a..."
2,73,I recall early on reading that researchers who...,sts48u9i,i recall early on reading that researchers wor...,recal earli read research who examin coronavir...,recal earli read research who examin coronavir...,"[mkwgkkoi, gruir7aw, xavegbty, vx1hjh26, ntxuf...",0.0,"[4086.349853515625, 3776.436767578125, 5343.52...","[nj94rv6f, 9miesbf1, o877uul1, o47v5vgw, vblfe..."
3,93,You know you're credible when NIH website has ...,3sr2exq9,you know you're credible when national institu...,know your credibl nih websit paper 💃💃 someon p...,know your credibl nih websit paper 💃💃 someon p...,"[3sr2exq9, sv48gjkk, tx8ypqsm, z795y51f, k0f4c...",1.0,"[9972.6298828125, 9163.21484375, 7944.3359375,...","[l9u3c1dg, 4jri92pu, mwj0xc3q, pq3n18ae, 0jwed..."
4,96,Resistance to antifungal medications is a grow...,ybwwmyqy,resistance to antifungal medications is a grow...,resist antifung medic grow issu global scope d...,resist antifung medic grow issu global scope d...,"[ybwwmyqy, ouvq2wpq, rs3umc1x, sxx3yid9, vabb2...",1.0,"[4706.14599609375, 6618.2578125, 6553.40722656...","[9h74xlvv, 2t1zzigc, 896uzyvv, j1kzjre0, bti9a..."


# 5) Evaluation
The following code evaluates the BM25 retrieval baseline on the query set using the Mean Reciprocal Rank score (MRR@5).

In [28]:
# Evaluate retrieved candidates using MRR@k
def get_performance_mrr(data, col_gold, col_pred, list_k = [1, 5, 10]):
    d_performance = {}
    for k in list_k:
        data["in_topx"] = data.apply(lambda x: (1/([i for i in x[col_pred][:k]].index(x[col_gold]) + 1) if x[col_gold] in [i for i in x[col_pred][:k]] else 0), axis=1)
        #performances.append(data["in_topx"].mean())
        d_performance[k] = data["in_topx"].mean()
    return d_performance

In [29]:
# ---- BM25 Baseline ----
results_train = get_performance_mrr(df_query_train, 'cord_uid', 'bm25_topk')
results_dev = get_performance_mrr(df_query_dev, 'cord_uid', 'bm25_topk')

print("---- BM25 Baseline ----")
print(f"Results on the train set: {results_train}")
print(f"Results on the dev set: {results_dev}")

---- BM25 Baseline ----
Results on the train set: {1: 0.5731735781529604, 5: 0.625250914183459, 10: 0.6308237901348459}
Results on the dev set: {1: 0.5657142857142857, 5: 0.616095238095238, 10: 0.6224325396825396}


In [30]:
# ---- KNRM Re-Ranking ----
model_name = "knrm"

results_dev = get_performance_mrr(reranked_knrm_query_dev_df, 'cord_uid', f'{model_name}_topk')
print("---- Re-Ranking Finetune: KNRM ----")
print(f"MRR@5 on dev set: {results_dev[5]}")

---- Re-Ranking Finetune: KNRM ----
MRR@5 on dev set: 0.024595238095238097


## Results documentation

### 1) KNRM
Re-Ranking of top 100 BM25 results for each query:\
MRR@5: 0.025 :(

### 2) Conv-KNRM with 1-Grams
Re-Ranking of top 100 BM25 results for each query:\
MRR@5: 

# 6) Exporting results to prepare the submission on Codalab

In [None]:
model_name = "bm25"

df_query_dev['preds'] = df_query_dev[f'{model_name}_topk'].parallel_apply(lambda x: x[:5])

In [None]:
df_query_dev[['post_id', 'preds']].to_csv('predictions.tsv', index=None, sep='\t')