# 1) Importing query and collection data

In [3]:
import numpy as np
import pandas as pd
import os

In [4]:
# Location of the pickled paper collection, relative to the notebook's working directory.
PATH_COLLECTION_DATA = 'subtask_4b/subtask4b_collection_data.pkl' 

# NOTE(review): unpickling can execute arbitrary code - only load this file from a trusted source.
df_collection = pd.read_pickle(PATH_COLLECTION_DATA)

In [5]:
# Tab-separated query tweets for the train and dev splits.
PATH_QUERY_TRAIN_DATA = 'subtask_4b/subtask4b_query_tweets_train.tsv'
PATH_QUERY_DEV_DATA = 'subtask_4b/subtask4b_query_tweets_dev.tsv' 

# NOTE(review): the preview of the train frame shows an 'Unnamed: 0' column, i.e. the files
# carry a saved index column - consider index_col=0 if that column is not wanted.
df_query_train = pd.read_csv(PATH_QUERY_TRAIN_DATA, sep = '\t')
df_query_dev = pd.read_csv(PATH_QUERY_DEV_DATA, sep = '\t')

In [6]:
# Preview the first rows of the training queries.
df_query_train.head()

Unnamed: 0,post_id,tweet_text,cord_uid
0,0,Oral care in rehabilitation medicine: oral vul...,htlvpvz5
1,1,this study isn't receiving sufficient attentio...,4kfl29ul
2,2,"thanks, xi jinping. a reminder that this study...",jtwb17u8
3,3,Taiwan - a population of 23 million has had ju...,0w9k8iy1
4,4,Obtaining a diagnosis of autism in lower incom...,tiqksd69


In [23]:
# Inspect one collection record by position to see which fields a paper carries;
# 7263 is just an arbitrary example row.
df_collection.iloc[7263]

cord_uid                                                     0d6sa9pe
source_x                                       Elsevier; Medline; PMC
title               Inoculum at the time of SARS-CoV-2 exposure an...
doi                                        10.1016/j.ijid.2020.06.035
pmcid                                                      PMC7293836
pubmed_id                                                    32553720
license                                                     els-covid
abstract            Abstract A relationship between the infecting ...
publish_time                                               2020-06-14
authors             Guallar, María Pilar; Meiriño, Rosa; Donat-Var...
journal                                              Int J Infect Dis
mag_id                                                            NaN
who_covidence_id                                                  NaN
arxiv_id                                                          NaN
label               

# 2) Running the baseline
The following code runs a BM25 baseline.


In [15]:
from rank_bm25 import BM25Okapi

In [16]:
# Build the BM25 index over the collection: every paper becomes one
# "<title> <abstract>" string, tokenised by splitting on single spaces.
corpus = [
    f"{title} {abstract}"
    for title, abstract in zip(df_collection['title'], df_collection['abstract'])
]
# cord_uids[i] is the identifier of the paper behind corpus[i].
cord_uids = df_collection['cord_uid'].tolist()
tokenized_corpus = [text.split(' ') for text in corpus]
bm25 = BM25Okapi(tokenized_corpus)

In [18]:
# Cache of BM25 results, keyed by (query, k). It lives outside the function so
# that it survives between calls, and it remembers which index produced its
# entries so that rebuilding `bm25` never serves stale rankings.
_bm25_cache = {'index': None, 'results': {}}


def get_top_cord_uids(query, k=50):
    """Return the cord_uids of the `k` best BM25 matches for `query`.

    Parameters
    ----------
    query : str
        Raw query text; it is tokenised by splitting on single spaces, the
        same way the corpus was tokenised.
    k : int, default 50
        How many documents the ranked list shall include.

    Returns
    -------
    list
        cord_uids ordered from highest to lowest BM25 score. A fresh list is
        returned on every call, so callers may modify it freely.

    Notes
    -----
    Reads the module-level `bm25` index and `cord_uids` list. Results are
    memoised per (query, k), so repeated queries are scored only once.
    """
    # Drop cached rankings if the index was rebuilt since the last call.
    if _bm25_cache['index'] is not bm25:
        _bm25_cache['index'] = bm25
        _bm25_cache['results'] = {}
    results = _bm25_cache['results']

    key = (query, k)
    if key not in results:
        tokenized_query = query.split(' ')
        doc_scores = bm25.get_scores(tokenized_query)
        # Negate the scores so that argsort yields the best documents first.
        indices = np.argsort(-doc_scores)[:k]
        results[key] = [cord_uids[x] for x in indices]

    # Hand out a copy so that callers cannot corrupt the cached ranking.
    return list(results[key])


In [8]:
# Retrieve top50 candidates using the BM25 model

train_pkl_path = 'df_query_train_top50.pkl'
dev_pkl_path = 'df_query_dev_top50.pkl'


def _load_or_retrieve_top50(df_queries, pkl_path):
    """Return the query frame with a 'bm25_topk' column, using `pkl_path` as cache.

    If `pkl_path` exists, the pickled frame is loaded and returned in place of
    `df_queries`. Otherwise the BM25 candidates are retrieved for every tweet,
    stored in a new 'bm25_topk' column of `df_queries` (in place), and the
    frame is pickled to `pkl_path` before being returned.
    """
    if os.path.exists(pkl_path):
        # NOTE(review): unpickling can execute arbitrary code - only reuse
        # cache files written by this notebook.
        return pd.read_pickle(pkl_path)

    df_queries['bm25_topk'] = df_queries['tweet_text'].apply(get_top_cord_uids)
    df_queries.to_pickle(pkl_path)
    return df_queries


df_query_train = _load_or_retrieve_top50(df_query_train, train_pkl_path)
df_query_dev = _load_or_retrieve_top50(df_query_dev, dev_pkl_path)

# 3) Neural Re-Ranking Approach

In [24]:
from transformers import AutoTokenizer, AutoModel
import torch
from torch.nn.functional import cosine_similarity
from tqdm import tqdm
import json
from pathlib import Path
import re

## 3.1) Baseline: ColBERT architecture with SciBERT
Use a pretrained SciBERT model to:
- embed each query-token
- embed each doc-token (can be pre-computed)

For each query-doc pair:
- calculate the match matrix: every (query-token, doc-token) pair gets a cosine similarity value
- aggregate the score: 
    - for each query token, take the maximum cosine similarity over all doc tokens
    - sum these maxima over all query tokens

In [10]:
# Shared SciBERT tokenizer and encoder used to embed the queries.
model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Put the encoder into inference mode explicitly, as build_and_save_doc_embeddings
# does for the document side. The call is chained so that the cell does not end
# in a bare expression that would print the whole model.
model = AutoModel.from_pretrained(model_name).eval()

In [11]:
def get_token_embeddings(text, tokenizer, model, device='cpu'):
    """Encode `text` and return one contextual embedding per attended token.

    The text is tokenised (truncated to at most 512 tokens), the resulting
    tensors are moved to `device`, and the model is run without gradient
    tracking. Only positions whose attention mask is set are kept.

    Returns a tensor of shape [num_tokens, hidden_dim].
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        # Drop the batch dimension: [1, seq_len, hidden_dim] -> [seq_len, hidden_dim]
        hidden_states = model(**encoded).last_hidden_state.squeeze(0)
        keep = encoded['attention_mask'].squeeze(0).bool()
        attended = hidden_states[keep]

    return attended


In [None]:
def _text_or_empty(value):
    """Return `value` as a string, mapping missing values (None / NaN / NA) to ''."""
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return ""
    return str(value)


def build_and_save_doc_embeddings(
    docs_df,
    model_name="allenai/scibert_scivocab_uncased",
    max_len=512,
    save_dir="doc_chunks/",
    device="cpu"
):
    """Embed every document token-wise and store one tensor file per document.

    For each row of `docs_df` the text "<title> <abstract> Authors: <authors>"
    is encoded with the given model; the token embeddings are zero-padded to
    `max_len` rows and saved as `<save_dir>/<cord_uid>.pt`. Missing fields
    (None / NaN) contribute an empty string instead of the literal "nan".

    Parameters
    ----------
    docs_df : pandas.DataFrame
        Rows are read via 'cord_uid', 'title', 'abstract' and 'authors'.
        A row without 'cord_uid' is stored under the id "doc_<row index>".
    model_name : str
        Name passed to AutoTokenizer / AutoModel `from_pretrained`.
    max_len : int
        Truncation length for the tokenizer and number of rows per saved tensor.
    save_dir : str
        Output directory; created if it does not exist.
    device : str
        Device on which the model is run.

    Returns
    -------
    dict
        Maps document id to {"title", "abstract", "authors", "length", "path"},
        where "length" is the number of real (non-padding) token rows.

    Notes
    -----
    Documents whose tensor file exists and that are already listed in
    `<save_dir>/metadata.json` are skipped, so a completed run can be extended.
    The metadata file is written only after the loop has finished.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()

    save_path = Path(save_dir)
    save_path.mkdir(parents=True, exist_ok=True)

    # Resume from an earlier run if its metadata is available.
    metadata_path = save_path / "metadata.json"
    if metadata_path.exists():
        with open(metadata_path, "r") as f:
            metadata = json.load(f)
    else:
        metadata = {}

    for i, row in tqdm(docs_df.iterrows(), total=len(docs_df)):
        doc_id = row.get("cord_uid", f"doc_{i}")
        file_path = save_path / f"{doc_id}.pt"

        if file_path.exists() and doc_id in metadata:
            continue

        # Normalise missing fields once, so that neither the embedded text nor
        # the JSON metadata ever contains NaN.
        title = _text_or_empty(row.get('title', ''))
        abstract = _text_or_empty(row.get('abstract', ''))
        authors = _text_or_empty(row.get('authors', ''))
        text = title + " " + abstract + " Authors: " + authors

        inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=max_len)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            output = model(**inputs)
            token_embeddings = output.last_hidden_state.squeeze(0)
            attention_mask = inputs['attention_mask'].squeeze(0).bool()
            token_embeddings = token_embeddings[attention_mask]

        n_tokens = token_embeddings.size(0)
        pad_len = max_len - n_tokens

        if pad_len > 0:
            # Zero rows bring every saved tensor to exactly max_len rows; the
            # true length is recorded in the metadata.
            padding = torch.zeros(
                pad_len,
                token_embeddings.size(1),
                dtype=token_embeddings.dtype,
                device=device,
            )
            token_embeddings = torch.cat([token_embeddings, padding], dim=0)
        else:
            token_embeddings = token_embeddings[:max_len]

        try:
            torch.save(token_embeddings.cpu(), file_path)
        except Exception as e:
            # Best effort: report the failed document and carry on with the rest.
            print(f"Fehler beim Speichern von {doc_id}: {e}")
            continue

        metadata[doc_id] = {
            "title": title,
            "abstract": abstract,
            "authors": authors,
            "length": min(n_tokens, max_len),
            "path": str(file_path)
        }

    with open(metadata_path, "w") as f:
        json.dump(metadata, f)

    return metadata

Pre-compute document embeddings:

In [None]:
# Reuse the embedding index from an earlier run when its metadata file is
# present; otherwise embed the whole collection now (slow on CPU).
if os.path.exists("doc_chunks/metadata.json"):
    with open("doc_chunks/metadata.json", "r") as f:
        metadata = json.load(f)
else:
    metadata = build_and_save_doc_embeddings(df_collection, device="cpu")

In [None]:
def colbert_score(query, doc_tensor, doc_len, tokenizer, model):
    """Late-interaction (max-sim) relevance score of one document for one query.

    Parameters
    ----------
    query : str or torch.Tensor
        Either the raw query text, which is embedded with
        `get_token_embeddings`, or an already computed
        [num_query_tokens, hidden_dim] tensor of query token embeddings.
        Passing the tensor avoids re-encoding the same query per document.
    doc_tensor : torch.Tensor
        Token embeddings of the document, possibly zero-padded.
    doc_len : int
        Number of leading rows of `doc_tensor` that are real tokens.
    tokenizer, model
        Used only when `query` is given as text.

    Returns
    -------
    float
        Sum over query tokens of the maximum cosine similarity to any document
        token; 0.0 if either side has no tokens.
    """
    if torch.is_tensor(query):
        q_emb = query
    else:
        q_emb = get_token_embeddings(query, tokenizer, model)

    d_emb = doc_tensor[:doc_len]
    if q_emb.size(0) == 0 or d_emb.size(0) == 0:
        # Nothing to match; max() over an empty dimension would raise.
        return 0.0

    # normalize() divides by max(norm, eps), so an all-zero row becomes a zero
    # vector instead of NaN and cannot poison the score.
    q_norm = torch.nn.functional.normalize(q_emb, dim=1)
    d_norm = torch.nn.functional.normalize(d_emb, dim=1)

    sim_matrix = torch.matmul(q_norm, d_norm.T)
    max_sim_per_q = sim_matrix.max(dim=1).values
    return max_sim_per_q.sum().item()


def rerank(df):
    """Re-rank the BM25 candidates of every query with the max-sim score.

    Reads the 'tweet_text' and 'bm25_topk' columns and the module-level
    `metadata`, `tokenizer` and `model`. Adds two columns to `df` in place and
    returns the same frame:

    - 'scibert_baseline_scores': one score per candidate, in 'bm25_topk' order
    - 'scibert_baseline_topk': the candidates sorted by descending score
      (ties keep their BM25 order)
    """
    all_scores = []
    all_sorted = []

    for _, row in df.iterrows():
        pre_ranked_docs = row['bm25_topk']
        # Embed the tweet once and reuse it for all of its candidates.
        query_emb = get_token_embeddings(row['tweet_text'], tokenizer, model)

        scores = []
        for doc in pre_ranked_docs:
            doc_meta = metadata[doc]
            emb = torch.load(doc_meta["path"])
            scores.append(colbert_score(query_emb, emb, doc_meta["length"], tokenizer, model))

        ranked = sorted(zip(pre_ranked_docs, scores), key=lambda pair: pair[1], reverse=True)
        all_scores.append(scores)
        all_sorted.append([doc for doc, _ in ranked])

    # Object-dtype Series keep each list as a single cell value.
    df['scibert_baseline_scores'] = pd.Series(all_scores, index=df.index, dtype=object)
    df['scibert_baseline_topk'] = pd.Series(all_sorted, index=df.index, dtype=object)
    return df

In [None]:
# Re-rank the dev queries only; rerank() adds its columns to the frame it is
# given and returns that same frame.
df_query_dev = rerank(df_query_dev)

## 3.2) ColBERT w/ fine-tuned SciBERT

## 3.3) ColBERT w/ fine-tuned SciBERT for docs and CTBERT for queries

# 4) Evaluation
The following code evaluates the BM25 retrieval baseline and the SciBERT re-ranking baseline on the query sets using the Mean Reciprocal Rank score at several cut-offs (MRR@1, MRR@5 and MRR@10).

In [19]:
# Evaluate retrieved candidates using MRR@k
def get_performance_mrr(data, col_gold, col_pred, list_k = [1, 5, 10]):
    """Compute the Mean Reciprocal Rank at several cut-offs.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per query. It is only read, never modified.
    col_gold : str
        Column holding the gold document id of each query.
    col_pred : str
        Column holding the ranked list of predicted document ids.
    list_k : iterable of int
        Cut-offs k for which MRR@k is computed.

    Returns
    -------
    dict
        Maps each k to MRR@k: the mean over all queries of 1 / rank of the gold
        id within the top k predictions, counting 0 when it is not among them.
        The value is NaN if `data` has no rows.
    """
    gold_ids = data[col_gold].tolist()
    rankings = data[col_pred].tolist()

    d_performance = {}
    for k in list_k:
        reciprocal_ranks = []
        for gold, ranked in zip(gold_ids, rankings):
            top_k = list(ranked[:k])
            if gold in top_k:
                reciprocal_ranks.append(1 / (top_k.index(gold) + 1))
            else:
                reciprocal_ranks.append(0)
        if reciprocal_ranks:
            d_performance[k] = np.mean(reciprocal_ranks)
        else:
            d_performance[k] = np.float64('nan')
    return d_performance


In [None]:
# ---- BM25 Baseline ----
results_train = get_performance_mrr(df_query_train, 'cord_uid', 'bm25_topk')
results_dev = get_performance_mrr(df_query_dev, 'cord_uid', 'bm25_topk')
# Printed MRR@k results in the following format: {k: MRR@k}
print(f"Baseline BM25 results on the train set: {results_train}")
print(f"Baseline BM25 results on the dev set: {results_dev}")

# ---- SciBERT Re-Ranking Baseline ----
# Only frames that went through rerank() have a 'scibert_baseline_topk' column.
# Evaluate the splits that have it and report the others as skipped, instead of
# failing with a KeyError.
if 'scibert_baseline_topk' in df_query_train.columns:
    results_train = get_performance_mrr(df_query_train, 'cord_uid', 'scibert_baseline_topk')
    # Printed MRR@k results in the following format: {k: MRR@k}
    print(f"SciBERT Re-Ranking baseline results on the train set: {results_train}")
else:
    print("SciBERT Re-Ranking baseline results on the train set: skipped (train queries have not been re-ranked)")

if 'scibert_baseline_topk' in df_query_dev.columns:
    results_dev = get_performance_mrr(df_query_dev, 'cord_uid', 'scibert_baseline_topk')
    print(f"SciBERT Re-Ranking baseline results on the dev set: {results_dev}")
else:
    print("SciBERT Re-Ranking baseline results on the dev set: skipped (dev queries have not been re-ranked)")

Results on the dev set: {1: np.float64(0.5057142857142857), 5: np.float64(0.5522738095238094), 10: np.float64(0.5522738095238094)}


# 5) Exporting results to prepare the submission on Codalab

In [21]:
# Keep the five best candidates per query as the submitted prediction.
# NOTE(review): this exports the BM25 ranking ('bm25_topk'), not the re-ranked
# 'scibert_baseline_topk' list - confirm that this is the intended submission.
df_query_dev['preds'] = df_query_dev['bm25_topk'].apply(lambda x: x[:5])

In [22]:
# Write the submission file: tab-separated, columns post_id and preds, no index column.
df_query_dev[['post_id', 'preds']].to_csv('predictions.tsv', index=None, sep='\t')