In [17]:
!pip install -q sentence-transformers pandas scikit-learn

In [18]:
!pip install -U sentence-transformers
from sentence_transformers import CrossEncoder

cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")




In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (

In [4]:
import pandas as pd
import transformers
import numpy as np
import torch



In [5]:
from google.colab import drive
drive.mount('/content/drive')

ValueError: mount failed

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:

file_path = '/content/drive/MyDrive/check_that/subtask4b_collection_data.pkl'


In [None]:
 df_collection = pd.read_pickle(file_path)

In [None]:

train_file_path = '/content/drive/MyDrive/check_that/subtask4b_query_tweets_train.tsv'
df_query_train = pd.read_csv(train_file_path, sep='\t')
dev_file_path = '/content/drive/MyDrive/check_that/subtask4b_query_tweets_dev.tsv'
df_query_dev = pd.read_csv(dev_file_path, sep='\t')

In [None]:
print(torch.__version__)
print(torch.cuda.is_available())
print("CUDA available:", torch.cuda.is_available())


In [None]:
df_query_train.head()


In [None]:
df_query_dev.head()

In [None]:
df_collection.head()


In [None]:
df_collection['paper_text'] = df_collection['title'] + ". " + df_collection['abstract']
train_query_list = df_query_train['tweet_text']
dev_query_list = df_query_dev['tweet_text']

# Encode Corpus and Queries with SentenceTransformer and Normalize Embeddings

In [None]:
from sentence_transformers import SentenceTransformer
from torch.nn.functional import normalize

bi_encoder = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")


corpus_embeddings = bi_encoder.encode(df_collection['paper_text'].tolist(), convert_to_tensor=True, show_progress_bar=True)

train_query_embeddings = bi_encoder.encode(train_query_list, convert_to_tensor=True, show_progress_bar=True)
dev_query_embeddings = bi_encoder.encode(dev_query_list, convert_to_tensor=True, show_progress_bar=True)
from torch.nn.functional import normalize

corpus_embeddings = normalize(corpus_embeddings, p=2, dim=1)
train_query_embeddings = normalize(train_query_embeddings, p=2, dim=1)
dev_query_embeddings = normalize(dev_query_embeddings, p=2, dim=1)



In [None]:
ground_truth_train = dict(zip(df_query_train['post_id'], df_query_train['cord_uid']))
query_ids_train = df_query_train['post_id'].tolist()

ground_truth_dev = dict(zip(df_query_dev['post_id'], df_query_dev['cord_uid']))
query_ids_dev = df_query_dev['post_id'].tolist()


# Cosine similartiy for query development

In [None]:
from torch.nn.functional import cosine_similarity

top_k = 5
top_k_results_train = []
top_k_results_dev = []


for query_vec in train_query_embeddings:
    cos_scores = cosine_similarity(query_vec.unsqueeze(0), corpus_embeddings).squeeze(0)

    if cos_scores.ndim == 0:
        cos_scores = cos_scores.unsqueeze(0)

    top_k_val = min(top_k, cos_scores.shape[0])
    top_results = torch.topk(cos_scores, k=top_k_val)
    top_k_results_train.append(top_results.indices.tolist())


for query_vec in dev_query_embeddings:
    cos_scores = cosine_similarity(query_vec.unsqueeze(0), corpus_embeddings).squeeze(0)

    if cos_scores.ndim == 0:
        cos_scores = cos_scores.unsqueeze(0)

    top_k_val = min(top_k, cos_scores.shape[0])
    top_results = torch.topk(cos_scores, k=top_k_val)
    top_k_results_dev.append(top_results.indices.tolist())

top_k_cord_uids_train = [
    df_collection['cord_uid'].iloc[doc_indices].tolist() for doc_indices in top_k_results_train
]

top_k_cord_uids_dev = [
    df_collection['cord_uid'].iloc[doc_indices].tolist() for doc_indices in top_k_results_dev
]



In [None]:

def get_cord_uids_from_indices(indices, df_collection):
    return df_collection['cord_uid'].iloc[indices].tolist()


top_k_results_train_with_ids = [
    get_cord_uids_from_indices(doc_ids, df_collection) for doc_ids in top_k_results_train
]

top_k_results_dev_with_ids = [
    get_cord_uids_from_indices(doc_ids, df_collection) for doc_ids in top_k_results_dev
]


train_actual_vs_predicted = [
    {
        'actual': ground_truth_train.get(query_ids_train[i]),
        'predicted': top_k_results_train_with_ids[i]
    }
    for i in range(len(query_ids_train))
]

dev_actual_vs_predicted = [
    {
        'actual': ground_truth_dev.get(query_ids_dev[i]),
        'predicted': top_k_results_dev_with_ids[i]
    }
    for i in range(len(query_ids_dev))
]



In [None]:
def compute_mrr(predictions, ground_truth_dict, query_ids):
    total_score = 0.0


    for i, query_id in enumerate(query_ids):
        predicted_docs = predictions[i]
        relevant_doc = ground_truth_dict.get(query_id)

        if not relevant_doc:
            continue

        for rank, doc_id in enumerate(predicted_docs, start=1):
            if doc_id == relevant_doc:
                total_score += 1 / rank
                break


    return total_score / len(query_ids)

def compute_recall_at_k(predictions, ground_truth_dict, query_ids):
    total_hits = 0

    for i, query_id in enumerate(query_ids):
        predicted_docs = predictions[i]
        relevant_doc = ground_truth_dict.get(query_id)

        if not relevant_doc:
            continue

        if relevant_doc in predicted_docs:
            total_hits += 1


    recall = total_hits / len(query_ids)
    return recall



# Evaluation the pretrained model

In [None]:
for k in [1, 5, 10]:
    trimmed_predictions_train = [doc_ids[:k] for doc_ids in top_k_results_train_with_ids]

    mrr_score_train = compute_mrr(
        predictions=trimmed_predictions_train,
        ground_truth_dict=ground_truth_train,
        query_ids=query_ids_train
    )

    print(f"MRR@{k} for Train Set: {mrr_score_train:.4f}")

for k in [1, 5, 10]:
    trimmed_predictions_dev = [doc_ids[:k] for doc_ids in top_k_results_dev_with_ids]

    mrr_score_dev = compute_mrr(
        predictions=trimmed_predictions_dev,
        ground_truth_dict=ground_truth_dev,
        query_ids=query_ids_dev
    )

    print(f"MRR@{k} for Dev Set: {mrr_score_dev:.4f}")

for k in [1, 5, 10]:
    trimmed_predictions_train = [doc_ids[:k] for doc_ids in top_k_results_train_with_ids]
    recall_score_train = compute_recall_at_k(
        predictions=trimmed_predictions_train,
        ground_truth_dict=ground_truth_train,
        query_ids=query_ids_train,
    )
    print(f"Recall@{k} for Train Set: {recall_score_train:.4f}")

for k in [1, 5, 10]:
    trimmed_predictions_dev = [doc_ids[:k] for doc_ids in top_k_results_dev_with_ids]
    recall_score_dev = compute_recall_at_k(
        predictions=trimmed_predictions_dev,
        ground_truth_dict=ground_truth_dev,
        query_ids=query_ids_dev,
    )
    print(f"Recall@{k} for Dev Set: {recall_score_dev:.4f}")


| Metric       | Train Set | Dev Set  |
|--------------|-----------|----------|
| MRR@1        | 0.4312    | 0.4271   |
| MRR@5        | 0.4943    | 0.4921   |
| MRR@10       | 0.4943    | 0.4921   |
| Recall@1     | 0.4312    | 0.4271   |
| Recall@5     | 0.5959    | 0.6043   |
| Recall@10    | 0.5959    | 0.6043   |


## Reranking Top-K Candidates with CrossEncoder


In [None]:
from tqdm import tqdm

def rerank_query(tweet_text, candidate_paper_texts):
    pairs = [(tweet_text, doc_text) for doc_text in candidate_paper_texts]
    scores = cross_encoder.predict(pairs)
    sorted_indices = torch.argsort(torch.tensor(scores), descending=True)
    return sorted_indices.tolist()

reranked_top_k_results_train = []
reranked_top_k_results_dev = []

top_k = 5

cord_uid_to_text = dict(zip(df_collection['cord_uid'], df_collection['paper_text']))

# --- Rerank Train Queries ---
for i, query_text in tqdm(enumerate(train_query_list), total=len(train_query_list), desc="Reranking Train Queries"):
    candidate_doc_ids = top_k_results_train[i]
    candidate_texts = [cord_uid_to_text[df_collection['cord_uid'].iloc[idx]] for idx in candidate_doc_ids]
    new_order = rerank_query(query_text, candidate_texts)
    reranked_candidates = [candidate_doc_ids[idx] for idx in new_order]
    reranked_top_k_results_train.append(reranked_candidates)

# --- Rerank Dev Queries ---
for i, query_text in tqdm(enumerate(dev_query_list), total=len(dev_query_list), desc="Reranking Dev Queries"):
    candidate_doc_ids = top_k_results_dev[i]
    candidate_texts = [cord_uid_to_text[df_collection['cord_uid'].iloc[idx]] for idx in candidate_doc_ids]
    new_order = rerank_query(query_text, candidate_texts)
    reranked_candidates = [candidate_doc_ids[idx] for idx in new_order]
    reranked_top_k_results_dev.append(reranked_candidates)




In [None]:
def get_cord_uids_from_indices(indices, df_collection):
    return df_collection['cord_uid'].iloc[indices].tolist()

reranked_top_k_results_train_with_ids = [
    get_cord_uids_from_indices(doc_ids, df_collection) for doc_ids in reranked_top_k_results_train
]

reranked_top_k_results_dev_with_ids = [
    get_cord_uids_from_indices(doc_ids, df_collection) for doc_ids in reranked_top_k_results_dev
]


## Evaluate MRR and Recall@K after ReRanking on Train and Dev Sets


In [None]:
for k in [1, 5, 10]:
    trimmed_predictions_train = [doc_ids[:k] for doc_ids in reranked_top_k_results_train_with_ids]
    trimmed_predictions_dev = [doc_ids[:k] for doc_ids in reranked_top_k_results_dev_with_ids]

    mrr_score_train = compute_mrr(
        predictions=trimmed_predictions_train,
        ground_truth_dict=ground_truth_train,
        query_ids=query_ids_train
    )

    mrr_score_dev = compute_mrr(
        predictions=trimmed_predictions_dev,
        ground_truth_dict=ground_truth_dev,
        query_ids=query_ids_dev
    )

    print(f"MRR@{k} for Train Set after ReRanking: {mrr_score_train:.4f}")
    print(f"MRR@{k} for Dev Set after ReRanking: {mrr_score_dev:.4f}")


for k in [1, 5, 10]:
    trimmed_predictions_train = [doc_ids[:k] for doc_ids in reranked_top_k_results_train_with_ids]
    trimmed_predictions_dev = [doc_ids[:k] for doc_ids in reranked_top_k_results_dev_with_ids]

    recall_score_train = compute_recall_at_k(
        predictions=trimmed_predictions_train,
        ground_truth_dict=ground_truth_train,
        query_ids=query_ids_train
    )

    recall_score_dev = compute_recall_at_k(
        predictions=trimmed_predictions_dev,
        ground_truth_dict=ground_truth_dev,
        query_ids=query_ids_dev
    )

    print(f"Recall@{k} for Train Set after ReRanking: {recall_score_train:.4f}")
    print(f"Recall@{k} for Dev Set after ReRanking: {recall_score_dev:.4f}")


| Metric      | k  | Train Set | Dev Set |
|-------------|----|-----------|---------|
| MRR         | 1  | 0.5014    | 0.5243  |
| MRR         | 5  | 0.5400    | 0.5576  |
| MRR         | 10 | 0.5400    | 0.5576  |
| Recall      | 1  | 0.5014    | 0.5243  |
| Recall      | 5  | 0.5959    | 0.6043  |
| Recall      | 10 | 0.5959    | 0.6043  |


## Creating Training Data with Positive and Negative Pairs


In [None]:
from sentence_transformers import InputExample
import random

train_examples = []

all_paper_texts = list(cord_uid_to_text.values())

for idx, row in df_query_train.iterrows():
    tweet_text = row['tweet_text']
    correct_cord_uid = row['cord_uid']
    correct_paper_text = cord_uid_to_text.get(correct_cord_uid, "")


    train_examples.append(InputExample(texts=[tweet_text, correct_paper_text], label=1.0))


    while True:
        negative_paper_text = random.choice(all_paper_texts)
        if negative_paper_text != correct_paper_text:
            break

    train_examples.append(InputExample(texts=[tweet_text, negative_paper_text], label=0.0))



In [None]:
from torch.utils.data import DataLoader

batch_size = 64

def simple_collate_fn(batch):
    return batch

train_dataloader = DataLoader(
    train_examples,
    shuffle=True,
    batch_size=batch_size,
    collate_fn=simple_collate_fn
)


model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
model = CrossEncoder(model_name, num_labels=1)

## Optimization and Fine-Tuning of CrossEncoder for Semantic Relevance Matching


In [None]:

from torch import nn
from torch.optim import AdamW



device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

learning_rate = 2e-5
optimizer = AdamW(model.parameters(), lr=learning_rate)

num_epochs = 1

loss_fct = nn.BCEWithLogitsLoss()

model.train()
tokenizer = model.tokenizer

for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")
    running_loss = 0.0

    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}"):
        texts = [example.texts for example in batch]
        labels = torch.tensor([example.label for example in batch]).to(device)

        texts_a = [text_pair[0] for text_pair in texts]
        texts_b = [text_pair[1] for text_pair in texts]

        inputs = tokenizer(
            texts_a,
            texts_b,
            padding=True,
            truncation=True,
            return_tensors="pt"
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}


        model.zero_grad()
        scores = model(**inputs).logits.squeeze()


        loss = loss_fct(scores, labels.float())
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    avg_loss = running_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} finished with avg loss: {avg_loss:.4f}")

output_model_path = "./fine_tuned_cross_encoder"
model.save(output_model_path)

print(f"\nModel fine-tuned and saved at: {output_model_path}")



In [None]:
for k in [1, 5, 10]:

    trimmed_predictions_train = [doc_ids[:k] for doc_ids in reranked_top_k_results_train_with_ids]
    trimmed_predictions_dev = [doc_ids[:k] for doc_ids in reranked_top_k_results_dev_with_ids]


    mrr_score_train = compute_mrr(
        predictions=trimmed_predictions_train,
        ground_truth_dict=ground_truth_train,
        query_ids=query_ids_train
    )

    mrr_score_dev = compute_mrr(
        predictions=trimmed_predictions_dev,
        ground_truth_dict=ground_truth_dev,
        query_ids=query_ids_dev
    )

    recall_score_train = compute_recall_at_k(
        predictions=trimmed_predictions_train,
        ground_truth_dict=ground_truth_train,
        query_ids=query_ids_train
    )

    recall_score_dev = compute_recall_at_k(
        predictions=trimmed_predictions_dev,
        ground_truth_dict=ground_truth_dev,
        query_ids=query_ids_dev
    )


    print(f"MRR@{k} for Train after Re-ranking: {mrr_score_train:.4f}")
    print(f"MRR@{k} for Dev after Re-ranking: {mrr_score_dev:.4f}")


    print(f"Recall@{k} for Train after Re-ranking: {recall_score_train:.4f}")
    print(f"Recall@{k} for Dev after Re-ranking: {recall_score_dev:.4f}")


## MRR and Recall after Re-Ranking with 1 epoch

| Metric    | Train Set | Dev Set |
|-----------|-----------|---------|
| MRR@1     | 0.5359    | 0.5493  |
| Recall@1  | 0.5359    | 0.5493  |
| MRR@5     | 0.5623    | 0.5706  |
| Recall@5  | 0.5959    | 0.6043  |
| MRR@10    | 0.5623    | 0.5706  |
| Recall@10 | 0.5959    | 0.6043  |
