In [1]:
import torch
from torch.nn.functional import cosine_similarity
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModel

from tqdm.notebook import tqdm
import json
from pathlib import Path
import re
import random
import os
import numpy as np
import pandas as pd

# 1) Importing BM25 preranked docs for each query

In [4]:
# load document collection
# NOTE(review): pd.read_pickle unpickles the file, which can execute arbitrary
# code; only point this path at a file from a trusted source.
PATH_COLLECTION_DATA = '../subtask_4b/subtask4b_collection_data.pkl' 
# df_collection is read as a module-level global by get_precomputed_doc_embeddings.
df_collection = pd.read_pickle(PATH_COLLECTION_DATA)

In [6]:
# Load the BM25 pre-ranked query dataframes (train / dev / test), keeping each
# pickle path next to the frame that is read from it.
# NOTE(review): read_pickle unpickles arbitrary objects; use trusted files only.
train_pkl_path = '../df_query_train_top100.pkl'
df_query_train = pd.read_pickle(train_pkl_path)

dev_pkl_path = '../df_query_dev_top100.pkl'
df_query_dev = pd.read_pickle(dev_pkl_path)

test_pkl_path = '../df_query_test_top100.pkl'
df_query_test = pd.read_pickle(test_pkl_path)

# 2) ColBERT reranking

## 2.1) ColBERT finetuning

The following functions are meant to build BERT token embeddings from text. <br>
Additionally, the document embeddings get pre-computed or loaded if they already exist.

In [7]:
# get token embeddings of a specified text passage from some model
def get_token_embeddings(text, tokenizer, model, device='cpu'):
    """Encode ``text`` and return the hidden states of its attended tokens.

    Args:
        text: Passage to encode; it is truncated to at most 512 tokens.
        tokenizer: Callable returning a mapping of PyTorch tensors that
            includes an ``'attention_mask'`` entry.
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``last_hidden_state`` attribute.
        device: Device the tokenizer tensors are moved to before the forward
            pass. The model itself is not moved here.

    Returns:
        The rows of ``last_hidden_state`` (batch dimension squeezed away)
        whose attention-mask entry is non-zero.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    hidden_states = model(**encoded).last_hidden_state.squeeze(0)
    # Keep only positions the attention mask marks as real tokens.
    keep = encoded['attention_mask'].squeeze(0).bool()
    return hidden_states[keep]

# pre compute all the token embeddings of the documents
def build_and_save_doc_embeddings(
    docs_df,
    model_name,
    save_dir,
    max_len=512,
    device="cuda"
):
    """Embed every document once and cache the token embeddings on disk.

    For each row of ``docs_df`` the title, abstract and authors are joined into
    one passage, encoded with ``model_name`` and written to
    ``doc_embeddings_<save_dir>/<doc_id>.pt``. Each saved tensor is zero-padded
    to ``max_len`` rows; the number of real tokens is recorded under
    ``"length"`` in the metadata so readers can slice the padding off.

    Documents that already have both a file and a metadata entry are skipped,
    so an interrupted run can be resumed.

    Args:
        docs_df: DataFrame whose rows provide ``title``, ``abstract``,
            ``authors`` and (optionally) ``cord_uid``; rows without
            ``cord_uid`` get the id ``doc_<index>``.
        model_name: Name or path passed to ``AutoTokenizer`` / ``AutoModel``.
        save_dir: Suffix of the cache directory (``doc_embeddings_<save_dir>``).
        max_len: Truncation length and padded number of rows per document.
        device: Device used for the forward pass.

    Returns:
        dict mapping document id to ``title``, ``abstract``, ``authors``,
        ``length`` and ``path``; the same dict is written to
        ``metadata.json`` inside the cache directory.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()

    save_path = Path("doc_embeddings_" + save_dir)
    save_path.mkdir(parents=True, exist_ok=True)

    metadata_path = save_path / "metadata.json"
    if metadata_path.exists():
        with open(metadata_path, "r") as f:
            metadata = json.load(f)
    else:
        metadata = {}

    print("Precomputing document embeddings.")
    try:
        # Inference only: without no_grad every forward pass would keep an
        # autograd graph alive and the saved tensors would require grad.
        with torch.no_grad():
            for i, row in tqdm(docs_df.iterrows(), total=len(docs_df)):
                doc_id = row.get("cord_uid", f"doc_{i}")
                file_path = save_path / f"{doc_id}.pt"

                if file_path.exists() and doc_id in metadata:
                    continue

                text = str(row.get('title', '')) + " " + str(row.get('abstract', '')) + " Authors: " + str(row.get('authors', ''))

                inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=max_len)
                inputs = {k: v.to(device) for k, v in inputs.items()}

                output = model(**inputs)
                token_embeddings = output.last_hidden_state.squeeze(0)
                attention_mask = inputs['attention_mask'].squeeze(0).bool()
                token_embeddings = token_embeddings[attention_mask]

                n_tokens = token_embeddings.size(0)
                pad_len = max_len - n_tokens

                if pad_len > 0:
                    # new_zeros matches the dtype and device of the embeddings.
                    padding = token_embeddings.new_zeros(pad_len, token_embeddings.size(1))
                    token_embeddings = torch.cat([token_embeddings, padding], dim=0)
                else:
                    token_embeddings = token_embeddings[:max_len]

                try:
                    # Store CPU tensors so the cache can be written and read on
                    # any machine, independent of the ``device`` used here.
                    torch.save(token_embeddings.cpu(), file_path)
                except Exception as e:
                    print(f"Error saving document {doc_id}: {e}")
                    continue

                metadata[doc_id] = {
                    "title": row.get("title", ""),
                    "abstract": row.get("abstract", ""),
                    "authors": row.get("authors", ""),
                    "length": min(n_tokens, max_len),
                    "path": str(file_path)
                }
    finally:
        # Persist the index even if the loop is interrupted, so documents that
        # were already embedded are not recomputed on the next run.
        with open(metadata_path, "w") as f:
            json.dump(metadata, f)

    return metadata

# either precompute or load precomputed doc embeddings
def get_precomputed_doc_embeddings(save_name):
    """Return the document-embedding metadata for ``save_name``.

    The cache lives in ``doc_embeddings_<save_name>/metadata.json``, which is
    the location ``build_and_save_doc_embeddings`` writes to when called with
    ``save_dir=save_name``. The same path is used for the existence check and
    for loading, so names containing a slash (e.g. ``"org/model"``) hit the
    cache instead of triggering a rebuild on every call.

    Args:
        save_name: Model name or path; also used as the cache directory suffix.

    Returns:
        dict mapping document id to its metadata entry.
    """
    metadata_path = Path("doc_embeddings_" + save_name) / "metadata.json"

    if metadata_path.exists():
        with open(metadata_path, "r") as f:
            return json.load(f)

    # No cache yet: embed the global document collection with this model.
    return build_and_save_doc_embeddings(df_collection, model_name=save_name, save_dir=save_name, device="cuda")

These functions create the triplet dataset (query, pos_doc, neg_doc) from the training dataset and define the SciBERT finetuning procedure.

In [8]:
# creating training dataset by getting the positive and a random negative document for each query
class ColBERTTripletDataset(Dataset):
    """Triplets ``(query_text, positive_doc_id, negative_doc_id)`` for training.

    For every row of ``df`` the positive is ``cord_uid`` and the negatives are
    drawn uniformly at random (with replacement) from the row's ``bm25_topk``
    list, excluding the positive. Rows with no remaining candidate contribute
    no triplets. ``metadata`` and ``tokenizer`` are only stored on the instance.
    """

    def __init__(self, df, metadata, tokenizer, num_negatives=1):
        self.tokenizer = tokenizer
        self.metadata = metadata
        self.data = []

        for _, record in df.iterrows():
            query_text = record["tweet_text"]
            gold_id = record["cord_uid"]
            candidates = [doc_id for doc_id in record["bm25_topk"] if doc_id != gold_id]
            if not candidates:
                continue
            # One independent draw per requested negative.
            self.data.extend(
                (query_text, gold_id, random.choice(candidates))
                for _ in range(num_negatives)
            )

    def __len__(self):
        """Number of triplets."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``idx``-th ``(query, pos_id, neg_id)`` tuple."""
        return self.data[idx]

# basic ColBERT scoring i.e. match matrix aggregation
def colbert_score_from_emb(q_emb, d_emb):
    """Late-interaction (MaxSim) score between a query and a document.

    Both inputs are 2-D tensors with one row per token. Rows are L2-normalised,
    the cosine-similarity matrix between all query and document tokens is
    formed, and for every query token the best-matching document token is
    taken; the score is the sum of those maxima.

    ``F.normalize`` clamps the norm with a small epsilon, so an all-zero row
    (e.g. zero padding) becomes a zero vector with similarity 0 instead of
    producing 0/0 = NaN and turning the whole score into NaN.

    Args:
        q_emb: Query token embeddings, shape ``(n_query_tokens, dim)``.
        d_emb: Document token embeddings, shape ``(n_doc_tokens, dim)``.

    Returns:
        0-dim tensor holding the score (keeps the autograd graph of the inputs).
    """
    q_norm = F.normalize(q_emb, p=2, dim=1)
    d_norm = F.normalize(d_emb, p=2, dim=1)
    sim_matrix = torch.matmul(q_norm, d_norm.T)
    max_sim_per_q = sim_matrix.max(dim=1).values
    return max_sim_per_q.sum()

# finetuning some BERT-model to get higher ColBERT-score 
# for the positive document than for the negative (per query)
def bert_finetune(save_name, MARGIN=0.5, BATCH_SIZE=8, EPOCHS=6, LR=2e-5, num_negatives=1,
                  model_name="allenai/scibert_scivocab_uncased"):
    """Fine-tune a BERT encoder with a margin ranking loss on ColBERT scores.

    Queries from ``df_query_train`` are encoded with the model being trained.
    Document embeddings are read from the on-disk cache built for
    ``model_name`` and are not passed through the model here, so only the
    query side receives gradients. The loss per triplet is
    ``relu(MARGIN + score_neg - score_pos)``, averaged over the batch.

    Args:
        save_name: Directory the fine-tuned model and tokenizer are saved to.
        MARGIN: Required score gap between positive and negative document.
        BATCH_SIZE: Number of triplets per optimisation step.
        EPOCHS: Number of passes over the triplet dataset.
        LR: AdamW learning rate.
        num_negatives: Negatives sampled per query when building triplets.
        model_name: Baseline checkpoint to start from; defaults to SciBERT.

    Returns:
        None. The trained model and tokenizer are written to ``save_name``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    metadata = get_precomputed_doc_embeddings(model_name)

    # create training triplets
    train_dataset = ColBERTTripletDataset(df_query_train, metadata, tokenizer, num_negatives)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

    # optimizer
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    model.train()
    model.to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

    def load_doc_embedding(doc_id):
        """Load one cached document embedding without its zero padding."""
        entry = metadata[doc_id]
        # map_location lets caches written on a GPU be read on a CPU-only host.
        cached = torch.load(entry["path"], map_location="cpu")
        # Slice before the transfer so only real tokens are copied, and detach
        # because cached embeddings are constants for this training loop.
        return cached[:entry["length"]].detach().to(DEVICE)

    for epoch in range(EPOCHS):
        total_loss = 0.0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            queries, pos_ids, neg_ids = batch

            inputs = tokenizer(list(queries), return_tensors='pt', padding=True, truncation=True, max_length=512)
            inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
            outputs = model(**inputs)
            q_emb_batch = outputs.last_hidden_state  # [B, L, D]
            attention_mask = inputs["attention_mask"].bool()
            # Drop padded positions per query; lengths differ, hence a list.
            q_embs = [emb[mask] for emb, mask in zip(q_emb_batch, attention_mask)]

            score_pos_list = []
            score_neg_list = []

            for i in range(len(queries)):
                q_emb = q_embs[i]
                score_pos_list.append(colbert_score_from_emb(q_emb, load_doc_embedding(pos_ids[i])))
                score_neg_list.append(colbert_score_from_emb(q_emb, load_doc_embedding(neg_ids[i])))

            score_pos_batch = torch.stack(score_pos_list)
            score_neg_batch = torch.stack(score_neg_list)

            # Hinge loss: zero once the positive beats the negative by MARGIN.
            loss = F.relu(MARGIN + score_neg_batch - score_pos_batch).mean()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()

        # Sum of the per-batch mean losses for this epoch.
        print(f"Epoch {epoch+1} Loss: {total_loss:.4f}")

    model.save_pretrained(save_name)
    tokenizer.save_pretrained(save_name)

Finetuning SciBERT with specified parameters:

In [None]:
# Fine-tune with margin 0.5 (other hyperparameters at their defaults) and save the
# model and tokenizer to ./colB_sciB_marg05, which later cells load by this name.
bert_finetune("colB_sciB_marg05", MARGIN=0.5)

## 2.2) Finetuned ColBERT Reranking:

This function takes a dataframe containing queries and their pre-ranked document lists. It embeds each query with a forward pass through the finetuned SciBERT model, loads the precomputed document embeddings, and computes a match matrix for each query-document pair. Following the ColBERT approach, it then aggregates each matrix into a single matching score (sum over query tokens of the maximum similarity to any document token). Finally, it sorts each pre-ranked document list by the newly computed scores.

In [5]:
def rerank(df, metadata, tokenizer, model, save_name):
    """Re-rank each query's BM25 candidates by ColBERT MaxSim score.

    Adds two columns to ``df`` (in place) and returns it:
    ``<save_name>_scores`` holds one score per candidate in ``bm25_topk``
    order, and ``<save_name>_topk`` holds the candidate ids sorted by
    descending score (ties keep their BM25 order).

    Args:
        df: DataFrame with ``tweet_text`` and ``bm25_topk`` columns.
        metadata: dict mapping document id to an entry with ``"path"`` (saved
            embedding file) and ``"length"`` (number of real token rows).
        tokenizer: Tokenizer matching ``model``.
        model: Encoder used for the queries; its device is used for scoring.
        save_name: Prefix for the two output columns.

    Returns:
        The same DataFrame object with the two columns added.

    Raises:
        KeyError: If a candidate id is missing from ``metadata``.
    """
    device = next(model.parameters()).device
    scores_col = save_name + '_scores'
    topk_col = save_name + '_topk'

    # Load only the documents that actually occur as candidates, drop their
    # zero padding and normalise once; loading every padded embedding in
    # ``metadata`` can exhaust memory on a large collection.
    needed_ids = {doc for docs in df['bm25_topk'] for doc in docs}
    doc_embeddings = {}
    for doc_id in needed_ids:
        entry = metadata[doc_id]
        cached = torch.load(entry["path"], map_location="cpu")
        # clone() so the sliced view does not keep the padded storage alive.
        d_emb = cached[:entry["length"]].detach().clone()
        doc_embeddings[doc_id] = d_emb / d_emb.norm(dim=1, keepdim=True)

    all_scores = []
    all_rankings = []
    with torch.no_grad():
        for _, row in tqdm(df.iterrows(), total=len(df)):
            doc_ids = list(row['bm25_topk'])

            # Tokenizer tensors must live on the model's device; the helper's
            # default of 'cpu' would fail for a model on GPU.
            q_emb = get_token_embeddings(row['tweet_text'], tokenizer, model, device=device)
            q_norm = q_emb / q_emb.norm(dim=1, keepdim=True)

            scores = []
            for doc in doc_ids:
                d_norm = doc_embeddings[doc].to(device)
                sim_matrix = torch.matmul(q_norm, d_norm.T)
                # MaxSim: best document token per query token, summed.
                scores.append(sim_matrix.max(dim=1).values.sum().item())

            ranked = [doc for doc, _ in sorted(zip(doc_ids, scores), key=lambda pair: pair[1], reverse=True)]
            all_scores.append(scores)
            all_rankings.append(ranked)

    # Assign whole columns at once; this also works for an empty frame or a
    # non-unique index, where per-row ``df.at`` list assignment does not.
    df[scores_col] = pd.Series(all_scores, index=df.index, dtype=object)
    df[topk_col] = pd.Series(all_rankings, index=df.index, dtype=object)
    return df

In [6]:
# specify model for re-ranking
# (directory written by bert_finetune via save_pretrained)
model_name = "colB_sciB_marg05"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The model is not moved to another device here, so it stays wherever from_pretrained places it.
model = AutoModel.from_pretrained(model_name)

# pre-compute embeddings
# Document embeddings are cached per model name, so this builds (or loads) the cache
# in doc_embeddings_colB_sciB_marg05/ using the fine-tuned checkpoint.
metadata = get_precomputed_doc_embeddings(model_name)

In [None]:
# re-rank BM25 list for dev data
# Adds the columns f"{model_name}_scores" and f"{model_name}_topk" to df_query_dev.
df_query_dev = rerank(df_query_dev, metadata, tokenizer, model, model_name)

# 3) Evaluation
The following code evaluates the ColBERT re-ranked candidate lists on the dev queries using Mean Reciprocal Rank (MRR@k for k = 1, 5, 10); MRR@5 is reported below.

In [8]:
# Evaluate retrieved candidates using MRR@k
def get_performance_mrr(data, col_gold, col_pred, list_k=(1, 5, 10)):
    """Compute Mean Reciprocal Rank at several cut-offs.

    For each row, the reciprocal rank is ``1 / rank`` of the gold id within the
    first ``k`` predictions, or 0 if it is not among them. The input frame is
    only read; no helper column is written back to it.

    Args:
        data: DataFrame with one query per row.
        col_gold: Column holding the gold document id.
        col_pred: Column holding the ranked list of predicted document ids.
        list_k: Iterable of cut-offs ``k``.

    Returns:
        dict mapping each ``k`` to the mean reciprocal rank over all rows
        (NaN when ``data`` has no rows).
    """
    d_performance = {}
    for k in list_k:
        reciprocal_ranks = []
        for gold, preds in zip(data[col_gold], data[col_pred]):
            # list() so array-like predictions also support ``in`` / ``index``.
            top_k = list(preds[:k])
            reciprocal_ranks.append(1 / (top_k.index(gold) + 1) if gold in top_k else 0)
        d_performance[k] = pd.Series(reciprocal_ranks, dtype=float).mean()
    return d_performance

In [9]:
# ---- ColBERT Re-Ranking ----
model_name = "colB_sciB_marg05"

# Score the re-ranked lists in f"{model_name}_topk" (written by rerank) against the
# gold ids in 'cord_uid'; results_dev maps each cut-off k to its MRR@k.
results_dev = get_performance_mrr(df_query_dev, 'cord_uid', f'{model_name}_topk')
print("---- Finetuned ColBERT (SciBERT) Reranking ----")
print(f"MRR@5 on dev set: {results_dev[5]}")

---- Finetuned ColBERT (SciBERT) Reranking ----
MRR@5 on dev set: 0.6806309523809524


# 4) Exporting results to prepare the submission on Codalab

In [15]:
model_name = "colB_sciB_marg05"

# Only the dev frame is re-ranked earlier in the notebook, so on a fresh
# Restart & Run All the test frame has no f"{model_name}_topk" column yet.
# Re-rank the test queries here unless that has already been done.
if f'{model_name}_topk' not in df_query_test.columns:
    df_query_test = rerank(df_query_test, metadata, tokenizer, model, model_name)

# Keep the five best re-ranked documents per query for the submission.
df_query_test['preds'] = df_query_test[f'{model_name}_topk'].apply(lambda x: x[:5])
df_query_test["post_id"] = df_query_test["post_id"].astype(str)

In [16]:
# Write the submission file: tab-separated, two columns, no index column.
df_query_test[['post_id', 'preds']].to_csv(
    'predictions.tsv',
    sep='\t',
    index=None,
)