# 1) Importing query and collection data

In [2]:
import numpy as np
import pandas as pd
import os

In [3]:
# Pickled DataFrame with the candidate papers (cord_uid, title, abstract, ...).
# NOTE: unpickling executes arbitrary code, so only load files you trust.
PATH_COLLECTION_DATA = 'subtask_4b/subtask4b_collection_data.pkl'

df_collection = pd.read_pickle(filepath_or_buffer=PATH_COLLECTION_DATA)

In [4]:
# Tab-separated query tweets for the train and dev splits.
PATH_QUERY_TRAIN_DATA = 'subtask_4b/subtask4b_query_tweets_train.tsv'
PATH_QUERY_DEV_DATA = 'subtask_4b/subtask4b_query_tweets_dev.tsv'

# Both files are read the same way, so load them in one pass.
df_query_train, df_query_dev = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (PATH_QUERY_TRAIN_DATA, PATH_QUERY_DEV_DATA)
)

In [5]:
# Preview the first five rows of the training queries.
df_query_train.head(n=5)

Unnamed: 0,post_id,tweet_text,cord_uid
0,0,Oral care in rehabilitation medicine: oral vul...,htlvpvz5
1,1,this study isn't receiving sufficient attentio...,4kfl29ul
2,2,"thanks, xi jinping. a reminder that this study...",jtwb17u8
3,3,Taiwan - a population of 23 million has had ju...,0w9k8iy1
4,4,Obtaining a diagnosis of autism in lower incom...,tiqksd69


In [6]:
# Preview the first five rows of the paper collection.
df_collection.head(n=5)

Unnamed: 0,cord_uid,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,label,time,timet
162,umvrwgaw,PMC,Professional and Home-Made Face Masks Reduce E...,10.1371/journal.pone.0002618,PMC2440799,18612429,cc-by,BACKGROUND: Governments are preparing for a po...,2008-07-09,"van der Sande, Marianne; Teunis, Peter; Sabel,...",PLoS One,,,,umvrwgaw,2008-07-09,1215561600
611,spiud6ok,PMC,The Failure of R (0),10.1155/2011/527610,PMC3157160,21860658,cc-by,"The basic reproductive ratio, R (0), is one of...",2011-08-16,"Li, Jing; Blakeley, Daniel; Smith?, Robert J.",Comput Math Methods Med,,,,spiud6ok,2011-08-16,1313452800
918,aclzp3iy,PMC,Pulmonary sequelae in a patient recovered from...,10.4103/0970-2113.99118,PMC3424870,22919170,cc-by-nc-sa,The pandemic of swine flu (H1N1) influenza spr...,2012,"Singh, Virendra; Sharma, Bharat Bhushan; Patel...",Lung India,,,,aclzp3iy,2012-01-01,1325376000
993,ycxyn2a2,PMC,What was the primary mode of smallpox transmis...,10.3389/fcimb.2012.00150,PMC3509329,23226686,cc-by,The mode of infection transmission has profoun...,2012-11-29,"Milton, Donald K.",Front Cell Infect Microbiol,,,,ycxyn2a2,2012-11-29,1354147200
1053,zxe95qy9,PMC,"Lessons from the History of Quarantine, from P...",10.3201/eid1902.120312,PMC3559034,23343512,no-cc,"In the new millennium, the centuries-old strat...",2013-02-03,"Tognotti, Eugenia",Emerg Infect Dis,,,,zxe95qy9,2013-02-03,1359849600


# 2) Running the BM25 baseline
The following code runs a BM25 baseline.


In [15]:
from rank_bm25 import BM25Okapi

In [16]:
# Build the BM25 index: one "title abstract" string per paper, tokenised on
# single spaces. `cord_uids` is kept in the same order as the corpus so a
# corpus position can be mapped back to its paper id.
corpus = [
    f"{title} {abstract}"
    for title, abstract in zip(df_collection['title'], df_collection['abstract'])
]
cord_uids = df_collection['cord_uid'].tolist()
tokenized_corpus = [document.split(' ') for document in corpus]
bm25 = BM25Okapi(tokenized_corpus)

In [None]:
# Memo shared across calls. It remembers which index / id list it was built
# for, so rebuilding ``bm25`` or ``cord_uids`` discards stale rankings.
_BM25_TOPK_CACHE = {"index": None, "cord_uids": None, "rankings": {}}


def get_top_cord_uids(query, k=100):
    """Return the ``cord_uid`` values of the ``k`` best BM25 matches for ``query``.

    The previous version created its memo dict inside the function, so the
    dict was empty on every call and nothing was ever reused. The memo now
    lives at module level and is keyed by ``(query, k)``.

    Args:
        query: Query text; it is split on single spaces before scoring.
        k: Length of the ranked list. Defaults to 100, the cut-off that was
            previously hard-coded.

    Returns:
        A new list of ``cord_uid`` values, best match first.

    Relies on the module-level ``bm25`` index and the parallel ``cord_uids``
    list.
    """
    cache = _BM25_TOPK_CACHE
    if cache["index"] is not bm25 or cache["cord_uids"] is not cord_uids:
        # The index or the id list was rebuilt: cached rankings are invalid.
        cache["index"] = bm25
        cache["cord_uids"] = cord_uids
        cache["rankings"] = {}

    cache_key = (query, k)
    if cache_key not in cache["rankings"]:
        tokenized_query = query.split(' ')
        doc_scores = bm25.get_scores(tokenized_query)
        indices = np.argsort(-doc_scores)[:k]  # highest score first
        cache["rankings"][cache_key] = [cord_uids[x] for x in indices]

    # Hand out a copy so callers cannot mutate the cached ranking.
    return list(cache["rankings"][cache_key])


In [None]:
# Retrieve the top-100 BM25 candidates for every query (get_top_cord_uids
# ranks 100 documents per call) and cache the resulting frames on disk.

train_pkl_path = 'df_query_train_top100.pkl'
dev_pkl_path = 'df_query_dev_top100.pkl'


def _load_or_build_bm25_candidates(df_query, pkl_path):
    """Return the query frame with a ``bm25_topk`` column, cached at ``pkl_path``.

    If ``pkl_path`` exists, the pickled frame is returned and ``df_query`` is
    left untouched. Otherwise the column is computed on ``df_query`` in place,
    the frame is pickled to ``pkl_path`` and then returned.
    """
    if os.path.exists(pkl_path):
        return pd.read_pickle(pkl_path)
    df_query['bm25_topk'] = df_query['tweet_text'].apply(get_top_cord_uids)
    df_query.to_pickle(pkl_path)
    return df_query


df_query_train = _load_or_build_bm25_candidates(df_query_train, train_pkl_path)
df_query_dev = _load_or_build_bm25_candidates(df_query_dev, dev_pkl_path)

# 3) Neural Re-Ranking

In [7]:
from transformers import AutoTokenizer, AutoModel
import torch
from torch.nn.functional import cosine_similarity
from tqdm import tqdm
import json
from pathlib import Path
import re

  from .autonotebook import tqdm as notebook_tqdm


## 3.1) Baseline: ColBERT architecture with SciBERT
Use a pretrained SciBERT model to:
- embed each query-token
- embed each doc-token (can be pre-computed)

For each query-doc pair:
- calculate match-matrix: each query-token – doc-token pair gets cosine similarity value
- aggregate the score: 
    - for each query-token take max cosine similarity value with corresponding doc-tokens
    - sum over all of the max elements

In [None]:
# Checkpoint name handed to transformers' from_pretrained for the baseline.
model_name = "allenai/scibert_scivocab_uncased"
# Tokenizer and encoder are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [None]:
def get_token_embeddings(text, tokenizer, model, device='cpu'):
    """Encode ``text`` and return one embedding per attended token.

    Args:
        text: A single string; it is truncated to at most 512 tokens.
        tokenizer: Callable producing PyTorch tensors (``return_tensors='pt'``).
        model: Encoder whose output exposes ``last_hidden_state``.
        device: Device the tokenizer outputs are moved to. The model itself is
            not moved here.

    Returns:
        Tensor of shape [num_tokens, hidden_dim] holding the rows of the last
        hidden state whose attention-mask entry is non-zero.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    hidden_states = model(**encoded).last_hidden_state.squeeze(0)  # [seq_len, hidden_dim]
    keep = encoded['attention_mask'].squeeze(0).bool()
    return hidden_states[keep]  # [num_tokens, hidden_dim]


In [None]:
def build_and_save_doc_embeddings(
    docs_df,
    model_name,
    max_len=512,
    save_dir="doc_chunks/",
    device="cpu"
):
    """Encode every document once and store its token embeddings on disk.

    For each row the text "<title> <abstract> Authors: <authors>" is
    tokenised (truncated to ``max_len``), encoded without gradients and
    zero-padded to exactly ``max_len`` rows. The tensor is written to
    ``<save_dir>/<cord_uid>.pt``. Rows whose file already exists and whose id
    is already in the metadata are skipped.

    Args:
        docs_df: DataFrame read via ``title``, ``abstract``, ``authors`` and
            ``cord_uid`` (falls back to ``doc_<index>`` when absent).
        model_name: Checkpoint name/path for tokenizer and encoder.
        max_len: Token limit and padded length of every stored tensor.
        save_dir: Output directory; created when missing.
        device: Device used for encoding.

    Returns:
        Dict mapping doc id to ``title``, ``abstract``, ``authors``,
        ``length`` (number of real, non-padding token rows) and ``path``.
        The same dict is written to ``<save_dir>/metadata.json`` after the
        loop finishes.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()

    out_dir = Path(save_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Start from the metadata of an earlier run, if there is one.
    metadata_path = out_dir / "metadata.json"
    metadata = {}
    if metadata_path.exists():
        with open(metadata_path, "r") as f:
            metadata = json.load(f)

    for i, row in tqdm(docs_df.iterrows(), total=len(docs_df)):
        doc_id = row.get("cord_uid", f"doc_{i}")
        file_path = out_dir / f"{doc_id}.pt"

        already_encoded = file_path.exists() and doc_id in metadata
        if already_encoded:
            continue

        title, abstract, authors = (
            row.get(field, '') for field in ('title', 'abstract', 'authors')
        )
        text = f"{title!s} {abstract!s} Authors: {authors!s}"

        encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=max_len)
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state.squeeze(0)
            keep = encoded['attention_mask'].squeeze(0).bool()
            token_embeddings = hidden_states[keep]

        n_tokens = token_embeddings.size(0)
        missing_rows = max_len - n_tokens

        if missing_rows > 0:
            # Zero-pad so every stored tensor has exactly max_len rows.
            zeros = torch.zeros(missing_rows, token_embeddings.size(1), device=device)
            token_embeddings = torch.cat([token_embeddings, zeros], dim=0)
        else:
            token_embeddings = token_embeddings[:max_len]

        try:
            torch.save(token_embeddings.cpu(), file_path)
        except Exception as e:
            # Best effort: report the failed save and move on to the next doc.
            print(f"Fehler beim Speichern von {doc_id}: {e}")
        else:
            metadata[doc_id] = {
                "title": title,
                "abstract": abstract,
                "authors": authors,
                "length": min(n_tokens, max_len),
                "path": str(file_path)
            }

    with open(metadata_path, "w") as f:
        json.dump(metadata, f)

    return metadata

Pre-compute document embeddings:

In [None]:
# Reuse the stored metadata when it exists; otherwise encode the collection.
if os.path.exists("doc_chunks/metadata.json"):
    with open("doc_chunks/metadata.json", "r") as f:
        metadata = json.load(f)
else:
    metadata = build_and_save_doc_embeddings(df_collection, model_name=model_name, device="cpu")

Re-Rank BM25 pre-ranked list:

In [15]:
def rerank(df, metadata, tokenizer, model, name="scibert_baseline"):
    """Re-rank each query's BM25 candidates with ColBERT-style MaxSim scoring.

    For every row the tweet is encoded with ``get_token_embeddings``. Each
    candidate's stored token embeddings are loaded from
    ``metadata[doc]["path"]`` and cut to ``metadata[doc]["length"]`` rows.
    The score is the sum, over query tokens, of the maximum cosine similarity
    to any document token.

    Args:
        df: DataFrame with ``tweet_text`` and ``bm25_topk`` columns. It is
            modified in place.
        metadata: Dict mapping doc id to at least ``path`` and ``length``.
        tokenizer: Tokenizer forwarded to ``get_token_embeddings``.
        model: Encoder forwarded to ``get_token_embeddings``.
        name: Prefix of the two output columns.

    Returns:
        ``df`` with two added columns: ``<name>_scores`` (one score per
        candidate, in ``bm25_topk`` order) and ``<name>_topk`` (candidates
        sorted by descending score; ties keep their BM25 order).
    """
    scores_col = f"{name}_scores"
    topk_col = f"{name}_topk"

    all_scores = []
    all_rankings = []

    # Pure inference: skip building an autograd graph for every query.
    with torch.no_grad():
        for _, row in tqdm(df.iterrows(), total=len(df)):
            pre_ranked_docs = row['bm25_topk']

            q_emb = get_token_embeddings(row['tweet_text'], tokenizer, model)
            q_norm = q_emb / q_emb.norm(dim=1, keepdim=True)

            scores = []
            for doc in pre_ranked_docs:
                # The files hold plain tensors, so the restricted unpickler
                # suffices; it also avoids torch.load's pickle warning.
                emb = torch.load(metadata[doc]["path"], weights_only=True)
                d_emb = emb[:metadata[doc]["length"]]  # drop zero padding rows
                d_norm = d_emb / d_emb.norm(dim=1, keepdim=True)

                sim_matrix = torch.matmul(q_norm, d_norm.T)
                scores.append(sim_matrix.max(dim=1).values.sum().item())

            ranked_pairs = sorted(
                zip(pre_ranked_docs, scores), key=lambda pair: pair[1], reverse=True
            )
            all_scores.append(scores)
            all_rankings.append([doc for doc, _ in ranked_pairs])

    # Assign whole columns at once instead of writing lists cell by cell.
    df[scores_col] = pd.Series(all_scores, index=df.index, dtype=object)
    df[topk_col] = pd.Series(all_rankings, index=df.index, dtype=object)
    return df


In [17]:
# Score the dev queries with the baseline encoder.
df_query_dev = rerank(df=df_query_dev, metadata=metadata, tokenizer=tokenizer, model=model)

  emb = torch.load(metadata[doc]["path"])
100%|██████████| 1400/1400 [05:40<00:00,  4.11it/s]


In [22]:
# Score the train queries with the baseline encoder.
df_query_train = rerank(df=df_query_train, metadata=metadata, tokenizer=tokenizer, model=model)

  emb = torch.load(metadata[doc]["path"])
100%|██████████| 12853/12853 [50:30<00:00,  4.24it/s] 


In [26]:
# Persist the re-ranked frames; section 3.2 reloads them from these paths.
train_pkl_path, dev_pkl_path = (
    'df_query_train_reranked_scibert.pkl',
    'df_query_dev_reranked_scibert.pkl',
)

df_query_train.to_pickle(path=train_pkl_path)
df_query_dev.to_pickle(path=dev_pkl_path)

## 3.2) ColBERT w/ fine-tuned SciBERT

In [16]:
# Reload the frames written by the baseline re-ranking step.
train_pkl_path, dev_pkl_path = (
    'df_query_train_reranked_scibert.pkl',
    'df_query_dev_reranked_scibert.pkl',
)

df_query_dev = pd.read_pickle(filepath_or_buffer=dev_pkl_path)
df_query_train = pd.read_pickle(filepath_or_buffer=train_pkl_path)

In [17]:
from torch.utils.data import Dataset, DataLoader
import random
import torch.nn.functional as F

In [24]:
class ColBERTTripletDataset(Dataset):
    """(query, positive id, negative id) triplets for margin-based fine-tuning.

    For every row of ``df`` the gold document (``cord_uid``) is the positive.
    Negatives are drawn uniformly, with replacement, from the row's
    ``bm25_topk`` list after removing the positive. Rows with no remaining
    candidate contribute no triplets.

    Args:
        df: DataFrame with ``tweet_text``, ``cord_uid`` and ``bm25_topk``.
        metadata: Stored on the instance; not used while building triplets.
        tokenizer: Stored on the instance; not used while building triplets.
        num_negatives: Number of triplets generated per query.
        seed: Optional seed for negative sampling. ``None`` (default) keeps
            the previous behaviour of drawing from the global ``random``
            module. Any other value uses a private ``random.Random(seed)``
            so the triplets are reproducible.
    """

    def __init__(self, df, metadata, tokenizer, num_negatives=1, seed=None):
        self.data = []
        self.tokenizer = tokenizer
        self.metadata = metadata

        rng = random if seed is None else random.Random(seed)
        # Column-wise zip avoids the per-row Series that iterrows() builds.
        for query, pos, candidates in zip(df["tweet_text"], df["cord_uid"], df["bm25_topk"]):
            negatives = [doc for doc in candidates if doc != pos]
            if not negatives:
                continue
            for _ in range(num_negatives):
                self.data.append((query, pos, rng.choice(negatives)))

    def __len__(self):
        """Number of triplets."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(query, pos_id, neg_id)`` tuple at position ``idx``."""
        return self.data[idx]

def colbert_score_from_emb(q_emb, d_emb):
    """ColBERT MaxSim score between one query and one document.

    Args:
        q_emb: Query token embeddings, one row per token.
        d_emb: Document token embeddings, one row per token.

    Returns:
        0-dim tensor: the sum over query tokens of the maximum cosine
        similarity to any document token.

    Rows are L2-normalised with ``torch.nn.functional.normalize``, which
    clamps the norm to a small epsilon. An all-zero row (for example a padding
    row that was not sliced off) therefore contributes similarity 0 instead of
    turning the whole score into NaN.
    """
    q_norm = torch.nn.functional.normalize(q_emb, dim=1)
    d_norm = torch.nn.functional.normalize(d_emb, dim=1)
    sim_matrix = torch.matmul(q_norm, d_norm.T)  # [num_q_tokens, num_d_tokens]
    max_sim_per_q = sim_matrix.max(dim=1).values
    return max_sim_per_q.sum()

# hyperparameters
BATCH_SIZE = 8
EPOCHS = 2
LR = 2e-5
MARGIN = 0.2  # required score gap between positive and negative document
SEED = 42     # fixes negative sampling and batch shuffling
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Seed the RNGs used below (triplet sampling, DataLoader shuffling) so the
# run is reproducible.
random.seed(SEED)
torch.manual_seed(SEED)


def _load_doc_embedding(doc_id, metadata, device):
    """Load one stored document tensor, drop its padding rows, move to device."""
    entry = metadata[doc_id]
    # weights_only=True: the files hold plain tensors, so the restricted
    # unpickler suffices and torch.load's pickle warning goes away.
    # Slice before the transfer so only the real token rows are copied.
    return torch.load(entry["path"], weights_only=True)[:entry["length"]].to(device)


# create training triplets
train_dataset = ColBERTTripletDataset(df_query_train, metadata, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# optimizer
model.train()
model.to(DEVICE)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

for epoch in range(EPOCHS):
    total_loss = 0.0
    for queries, pos_ids, neg_ids in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        inputs = tokenizer(list(queries), return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
        q_emb_batch = model(**inputs).last_hidden_state  # [B, L, D]
        attention_mask = inputs["attention_mask"].bool()

        score_pos_list = []
        score_neg_list = []

        # Document embeddings are loaded from disk, so gradients only flow
        # through the query encoding.
        for token_embs, mask, pos_id, neg_id in zip(q_emb_batch, attention_mask, pos_ids, neg_ids):
            q_emb = token_embs[mask]  # [num_tokens, D], padding removed
            d_pos_emb = _load_doc_embedding(pos_id, metadata, DEVICE)
            d_neg_emb = _load_doc_embedding(neg_id, metadata, DEVICE)

            score_pos_list.append(colbert_score_from_emb(q_emb, d_pos_emb))
            score_neg_list.append(colbert_score_from_emb(q_emb, d_neg_emb))

        score_pos_batch = torch.stack(score_pos_list)
        score_neg_batch = torch.stack(score_neg_list)

        # Hinge loss: push the positive score at least MARGIN above the negative.
        loss = F.relu(MARGIN + score_neg_batch - score_pos_batch).mean()

        # Clear gradients first so leftovers from an interrupted earlier run
        # of this cell cannot leak into the first update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # total_loss is the sum of the per-batch mean losses, not an average.
    print(f"Epoch {epoch+1} Loss: {total_loss:.4f}")


  d_pos_emb = torch.load(metadata[pos_ids[i]]["path"]).to(DEVICE)[:metadata[pos_ids[i]]["length"]]
  d_neg_emb = torch.load(metadata[neg_ids[i]]["path"]).to(DEVICE)[:metadata[neg_ids[i]]["length"]]
Epoch 1: 100%|██████████| 1607/1607 [25:11<00:00,  1.06it/s]


Epoch 1 Loss: 80.1016


Epoch 2: 100%|██████████| 1607/1607 [25:14<00:00,  1.06it/s]

Epoch 2 Loss: 28.7130





In [25]:
# Write the fine-tuned encoder and its tokenizer into the same directory; the
# test-run cell reloads both from this name via from_pretrained.
model.save_pretrained("colbert_scibert_finetune-1")
tokenizer.save_pretrained("colbert_scibert_finetune-1")

('colbert_scibert_finetune-1/tokenizer_config.json',
 'colbert_scibert_finetune-1/special_tokens_map.json',
 'colbert_scibert_finetune-1/vocab.txt',
 'colbert_scibert_finetune-1/added_tokens.json',
 'colbert_scibert_finetune-1/tokenizer.json')

#### Test run:

In [26]:
# Rebind model_name / tokenizer / model to the fine-tuned checkpoint directory.
model_name = "colbert_scibert_finetune-1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Separate output directory for this checkpoint's embeddings, distinct from the
# "doc_chunks/" directory used by the baseline.
embed_dir = "doc_chunks-finetune-1/"

def get_token_embeddings(text, tokenizer, model, device='cpu'):
    """Encode ``text`` and return one embedding per attended token.

    Args:
        text: A single string; it is truncated to at most 512 tokens.
        tokenizer: Callable producing PyTorch tensors (``return_tensors='pt'``).
        model: Encoder whose output exposes ``last_hidden_state``.
        device: Device the tokenizer outputs are moved to. The model itself is
            not moved here.

    Returns:
        Tensor of shape [num_tokens, hidden_dim] holding the rows of the last
        hidden state whose attention-mask entry is non-zero.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    hidden_states = model(**encoded).last_hidden_state.squeeze(0)  # [seq_len, hidden_dim]
    keep = encoded['attention_mask'].squeeze(0).bool()
    return hidden_states[keep]  # [num_tokens, hidden_dim]

def build_and_save_doc_embeddings(
    docs_df,
    model_name,
    max_len=512,
    save_dir=embed_dir,
    device="cpu"
):
    """Encode every document once and store its token embeddings on disk.

    For each row the text "<title> <abstract> Authors: <authors>" is
    tokenised (truncated to ``max_len``), encoded without gradients and
    zero-padded to exactly ``max_len`` rows. The tensor is written to
    ``<save_dir>/<cord_uid>.pt``. Rows whose file already exists and whose id
    is already in the metadata are skipped.

    Args:
        docs_df: DataFrame read via ``title``, ``abstract``, ``authors`` and
            ``cord_uid`` (falls back to ``doc_<index>`` when absent).
        model_name: Checkpoint name/path for tokenizer and encoder.
        max_len: Token limit and padded length of every stored tensor.
        save_dir: Output directory; created when missing. Defaults to the
            module-level ``embed_dir`` as it was when this function was
            defined.
        device: Device used for encoding.

    Returns:
        Dict mapping doc id to ``title``, ``abstract``, ``authors``,
        ``length`` (number of real, non-padding token rows) and ``path``.
        The same dict is written to ``<save_dir>/metadata.json`` after the
        loop finishes.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()

    out_dir = Path(save_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Start from the metadata of an earlier run, if there is one.
    metadata_path = out_dir / "metadata.json"
    metadata = {}
    if metadata_path.exists():
        with open(metadata_path, "r") as f:
            metadata = json.load(f)

    for i, row in tqdm(docs_df.iterrows(), total=len(docs_df)):
        doc_id = row.get("cord_uid", f"doc_{i}")
        file_path = out_dir / f"{doc_id}.pt"

        already_encoded = file_path.exists() and doc_id in metadata
        if already_encoded:
            continue

        title, abstract, authors = (
            row.get(field, '') for field in ('title', 'abstract', 'authors')
        )
        text = f"{title!s} {abstract!s} Authors: {authors!s}"

        encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=max_len)
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state.squeeze(0)
            keep = encoded['attention_mask'].squeeze(0).bool()
            token_embeddings = hidden_states[keep]

        n_tokens = token_embeddings.size(0)
        missing_rows = max_len - n_tokens

        if missing_rows > 0:
            # Zero-pad so every stored tensor has exactly max_len rows.
            zeros = torch.zeros(missing_rows, token_embeddings.size(1), device=device)
            token_embeddings = torch.cat([token_embeddings, zeros], dim=0)
        else:
            token_embeddings = token_embeddings[:max_len]

        try:
            torch.save(token_embeddings.cpu(), file_path)
        except Exception as e:
            # Best effort: report the failed save and move on to the next doc.
            print(f"Fehler beim Speichern von {doc_id}: {e}")
        else:
            metadata[doc_id] = {
                "title": title,
                "abstract": abstract,
                "authors": authors,
                "length": min(n_tokens, max_len),
                "path": str(file_path)
            }

    with open(metadata_path, "w") as f:
        json.dump(metadata, f)

    return metadata

# pre-compute doc embeddings (or reuse the metadata of an earlier run)
if os.path.exists(embed_dir + "metadata.json"):
    with open(embed_dir + "metadata.json", "r") as f:
        metadata = json.load(f)
else:
    metadata = build_and_save_doc_embeddings(df_collection, model_name=model_name, device="cpu")

def rerank(df, metadata, tokenizer, model, name="scibert_finetune-1"):
    """Re-rank each query's BM25 candidates with ColBERT-style MaxSim scoring.

    For every row the tweet is encoded with ``get_token_embeddings``. Each
    candidate's stored token embeddings are loaded from
    ``metadata[doc]["path"]`` and cut to ``metadata[doc]["length"]`` rows.
    The score is the sum, over query tokens, of the maximum cosine similarity
    to any document token.

    Args:
        df: DataFrame with ``tweet_text`` and ``bm25_topk`` columns. It is
            modified in place.
        metadata: Dict mapping doc id to at least ``path`` and ``length``.
        tokenizer: Tokenizer forwarded to ``get_token_embeddings``.
        model: Encoder forwarded to ``get_token_embeddings``.
        name: Prefix of the two output columns.

    Returns:
        ``df`` with two added columns: ``<name>_scores`` (one score per
        candidate, in ``bm25_topk`` order) and ``<name>_topk`` (candidates
        sorted by descending score; ties keep their BM25 order).
    """
    scores_col = f"{name}_scores"
    topk_col = f"{name}_topk"

    all_scores = []
    all_rankings = []

    # Pure inference: skip building an autograd graph for every query.
    with torch.no_grad():
        for _, row in tqdm(df.iterrows(), total=len(df)):
            pre_ranked_docs = row['bm25_topk']

            q_emb = get_token_embeddings(row['tweet_text'], tokenizer, model)
            q_norm = q_emb / q_emb.norm(dim=1, keepdim=True)

            scores = []
            for doc in pre_ranked_docs:
                # The files hold plain tensors, so the restricted unpickler
                # suffices; it also avoids torch.load's pickle warning.
                emb = torch.load(metadata[doc]["path"], weights_only=True)
                d_emb = emb[:metadata[doc]["length"]]  # drop zero padding rows
                d_norm = d_emb / d_emb.norm(dim=1, keepdim=True)

                sim_matrix = torch.matmul(q_norm, d_norm.T)
                scores.append(sim_matrix.max(dim=1).values.sum().item())

            ranked_pairs = sorted(
                zip(pre_ranked_docs, scores), key=lambda pair: pair[1], reverse=True
            )
            all_scores.append(scores)
            all_rankings.append([doc for doc, _ in ranked_pairs])

    # Assign whole columns at once instead of writing lists cell by cell.
    df[scores_col] = pd.Series(all_scores, index=df.index, dtype=object)
    df[topk_col] = pd.Series(all_rankings, index=df.index, dtype=object)
    return df


100%|██████████| 7718/7718 [21:19<00:00,  6.03it/s]


In [27]:
# Score the dev queries with the fine-tuned encoder.
df_query_dev = rerank(df=df_query_dev, metadata=metadata, tokenizer=tokenizer, model=model)

  emb = torch.load(metadata[doc]["path"])
100%|██████████| 1400/1400 [05:25<00:00,  4.31it/s]


In [None]:
# Score the train queries with the fine-tuned encoder.
df_query_train = rerank(df=df_query_train, metadata=metadata, tokenizer=tokenizer, model=model)

## 3.3) ColBERT w/ fine-tuned SciBERT for docs and CTBERT for queries

# 4) Evaluation
The following code evaluates the BM25 retrieval baseline and the ColBERT re-rankers on the train and dev query sets using the Mean Reciprocal Rank score (MRR@k, reported for k = 1, 5 and 10).

In [29]:
# Evaluate retrieved candidates using MRR@k
def get_performance_mrr(data, col_gold, col_pred, list_k=(1, 5, 10)):
    """Compute MRR@k of the ranked lists in ``col_pred`` against ``col_gold``.

    For each row the reciprocal rank is ``1 / position`` of the gold id within
    the first ``k`` predictions (1-based, first occurrence), or 0 when the
    gold id is not among them. MRR@k is the mean over all rows.

    Args:
        data: DataFrame holding both columns. It is not modified (the previous
            version left a scratch ``in_topx`` column on it).
        col_gold: Name of the column with the gold document id.
        col_pred: Name of the column with the ranked list of predicted ids.
        list_k: Iterable of cut-offs to evaluate.

    Returns:
        Dict mapping each ``k`` to MRR@k as a float (NaN for an empty frame).
    """
    d_performance = {}
    for k in list_k:
        reciprocal_ranks = []
        for gold, ranked in zip(data[col_gold], data[col_pred]):
            top_k = list(ranked[:k])
            if gold in top_k:
                reciprocal_ranks.append(1 / (top_k.index(gold) + 1))
            else:
                reciprocal_ranks.append(0)
        d_performance[k] = float(pd.Series(reciprocal_ranks, dtype=float).mean())
    return d_performance

In [23]:
# ---- BM25 Baseline ----
# Each result dict maps k to MRR@k.
results_train = get_performance_mrr(data=df_query_train, col_gold='cord_uid', col_pred='bm25_topk')
results_dev = get_performance_mrr(data=df_query_dev, col_gold='cord_uid', col_pred='bm25_topk')
print(
    "---- BM25 Baseline ----",
    f"Results on the train set: {results_train}",
    f"Results on the dev set: {results_dev}",
    sep="\n",
)

---- BM25 Baseline ----
Results on the train set: {1: 0.5079747918773827, 5: 0.5508999196037242, 10: 0.5558827906275973}
Results on the dev set: {1: 0.505, 5: 0.5520357142857142, 10: 0.5574200680272109}


In [24]:
# ---- ColBERT Re-Ranking Baseline ----
# Each result dict maps k to MRR@k.
results_train = get_performance_mrr(data=df_query_train, col_gold='cord_uid', col_pred='scibert_baseline_topk')
results_dev = get_performance_mrr(data=df_query_dev, col_gold='cord_uid', col_pred='scibert_baseline_topk')
print(
    "---- Re-Ranking Baseline: ColBERT (SciBERT) ----",
    f"Results on the train set: {results_train}",
    f"Results on the dev set: {results_dev}",
    sep="\n",
)

---- Re-Ranking Baseline: ColBERT (SciBERT) ----
Results on the train set: {1: 0.5081303975725512, 5: 0.5548354469773593, 10: 0.5600201422927634}
Results on the dev set: {1: 0.53, 5: 0.569404761904762, 10: 0.5742800453514738}


In [30]:
# ---- ColBERT Re-Ranking Finetune-1 ----
# Only the dev set is scored here; the train-set lines stay commented out.
# The result dict maps k to MRR@k.
#results_train = get_performance_mrr(df_query_train, 'cord_uid', 'scibert_finetune-1_topk')
results_dev = get_performance_mrr(data=df_query_dev, col_gold='cord_uid', col_pred='scibert_finetune-1_topk')
print(
    "---- Re-Ranking Finetune-1: ColBERT (SciBERT) ----",
    #f"Results on the train set: {results_train}",
    f"Results on the dev set: {results_dev}",
    sep="\n",
)

---- Re-Ranking Finetune-1: ColBERT (SciBERT) ----
Results on the dev set: {1: 0.5978571428571429, 5: 0.6351428571428571, 10: 0.6379787414965986}


# 5) Exporting results to prepare the submission on Codalab

In [21]:
# Keep the five best BM25 candidates per query as the submitted predictions.
df_query_dev['preds'] = df_query_dev['bm25_topk'].map(lambda ranked: ranked[:5])

In [22]:
# Write post_id and predictions as a tab-separated file without the index.
df_query_dev.loc[:, ['post_id', 'preds']].to_csv(path_or_buf='predictions.tsv', sep='\t', index=None)