# 2025 COMP90042 Project
*Make sure you change the file name with your group id.*



# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those the bottom of this ipynb file*


**This file inlcudes the method to fine-tune the pre-trained Lanaguage Model**

**This method uses dual-encoder and cross-encoder**

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [12]:
from google.colab import drive
drive.mount('/content/drive')

In [2]:
import json
import os

# Set paths
data_dir = "data"
train_claims_file = os.path.join(data_dir, "train-claims.json")
dev_claims_file = os.path.join(data_dir, "dev-claims.json")
test_claims_file = os.path.join(data_dir, "test-claims-unlabelled.json")
evidence_file = os.path.join(data_dir, "evidence.json")


# Load train claims
with open(train_claims_file, 'r') as f:
    train_claims = json.load(f)
train_ids = list(train_claims.keys())
train_texts = [train_claims[claim_id]['claim_text'] for claim_id in train_ids]
claim_id_to_train_inidce = {claim_id: i for i, claim_id in enumerate(train_ids)}

print(f"Loaded {len(train_claims)} train claims.")

# Load dev claims
with open(dev_claims_file, 'r') as f:
    dev_claims = json.load(f)
dev_ids = list(dev_claims.keys())
dev_texts = [dev_claims[claim_id]['claim_text'] for claim_id in dev_ids]

print(f"Loaded {len(dev_claims)} dev claims.")

# Load test claims
with open(test_claims_file, 'r') as f:
    test_claims = json.load(f)
test_texts = [test_claims[claim_id]['claim_text'] for claim_id in test_claims.keys()]

print(f"Loaded {len(test_claims)} test claims.")

# Load evidence texts
with open(evidence_file, 'r') as f:
    evidence = json.load(f)

evidence_ids = list(evidence.keys())
evidence_texts = [evidence[claim_id] for claim_id in evidence_ids]
evidence_id_to_train_index = {claim_id: i for i, claim_id in enumerate(evidence_ids)}
print(f"Loaded {len(evidence)} evidence documents.")

Loaded 1228 train claims.
Loaded 154 dev claims.
Loaded 153 test claims.
Loaded 1208827 evidence documents.


In [3]:
model_name = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
# model_name = "sentence-transformers/all-MiniLM-L6-v2"
import torch

from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained(model_name, max_len=512)
# model = AutoModel.from_pretrained(model_name).cuda()

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

### Define loss function

In [4]:
def contrastive_loss(claim_embedding, pos_evidence_embeddings, neg_evidence_embeddings,  temperature=0.02):
    """Compute improved contrastive loss with lower temperature for better discrimination"""
    # Compute positive and negative similarities
    pos_sim = torch.exp(torch.matmul(claim_embedding, pos_evidence_embeddings.T) / temperature).sum()
    neg_sim = torch.exp(torch.matmul(claim_embedding, neg_evidence_embeddings.T) / temperature).sum()

    # Compute contrastive loss
    loss = -torch.log(pos_sim / (pos_sim + neg_sim))
    return loss

In [5]:
from tqdm import tqdm
import torch.nn.functional as F

def generate_all_embeddings(model, texts, batch_size=32):
    """Generate embeddings for all texts"""
    model.eval()
    all_embeddings = []
    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size)):
            batch_texts = texts[i:i + batch_size]
            inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt').to(model.device)
            outputs = model(**inputs)
            embeddings = outputs.last_hidden_state[:, 0, :].cpu()  # CLS token
            all_embeddings.append(embeddings)
            torch.cuda.empty_cache()
    return torch.cat(all_embeddings)


import numpy as np
def get_test_f_scores(test_texts, test_claims, test_ids, evidence_texts, evidence_indice_to_claim_id, model, batch_size=32):
    """Get F-scores for test claims"""
    model.eval()

    # Generate test claim embeddings
    test_claim_embeddings = generate_all_embeddings(model, test_texts, batch_size=batch_size)

    # Generate evidence embeddings
    evidence_embeddings = generate_all_embeddings(model, evidence_texts, batch_size=batch_size*10)

    # Compute cosine similarities
    norm_test_claim_embeddings = F.normalize(test_claim_embeddings, p=2, dim=1)
    norm_evidence_embeddings = F.normalize(evidence_embeddings, p=2, dim=1)
    similarities = torch.matmul(norm_test_claim_embeddings, norm_evidence_embeddings.T)

    # Get top-k evidence indices for each claim
    top_k = 5
    top_k_indices = torch.topk(similarities, top_k, dim=1).indices
    top_k_indices = top_k_indices.numpy()

    # Compute F-scores
    f_scores = []
    for i, claim_id in enumerate(test_ids):
        # Get the evidence indices for the claim
        true_evidence_indices = test_claims[claim_id]["evidences"]

        # Get the predicted evidence indices
        predicted_evidence_indices = top_k_indices[i]
        predicted_evidence_indices = [evidence_indice_to_claim_id[evidence_index] for evidence_index in predicted_evidence_indices]

        # Compute precision and recall
        true_positives = len(set(true_evidence_indices) & set(predicted_evidence_indices))
        precision = true_positives / len(predicted_evidence_indices) if len(predicted_evidence_indices) > 0 else 0.0
        recall = true_positives / len(true_evidence_indices) if len(true_evidence_indices) > 0 else 0.0

        # Compute F-score
        f_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
        f_scores.append(f_score)
    model.train()
    return np.mean(f_scores)

### Training part —— Alternating between TF-IDF and model-based hard negatives every two epochs

In [6]:
import random
# Training configuration
max_epochs = 15
batch_size = 64
learning_rate = 5e-5
test_interval = 100  # Test every 100 steps
log_interval = 20    # Log every 20 steps
update_frequency = 2 # Update hard negatives every 2 epochs
best_f1 = 0

# Initialize model
model = AutoModel.from_pretrained(model_name)
model.cuda()

# Define optimizer with learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=max_epochs)

# Set random seed for reproducibility 
random.seed(330)
torch.manual_seed(330)
torch.cuda.manual_seed_all(330)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Load pre-computed TF-IDF hard negatives
with open(os.path.join(data_dir, "hard_negatives.json"), 'r') as f:
    tfidf_hard_negatives = json.load(f)

# Initialize hard negatives dictionary with TF-IDF results
hard_negatives_dict = tfidf_hard_negatives.copy()

step = 0
max_score = 0.0

for epoch in range(max_epochs):
    print(f"Epoch {epoch + 1}/{max_epochs}")
    random.shuffle(train_ids)

    # Alternate between TF-IDF and model-based hard negatives
    use_tfidf = (epoch % (update_frequency * 2) == 0)  # True for epochs 0, 4, 8...
    use_model = (epoch % update_frequency == 0 and not use_tfidf)  # True for epochs 2, 6...

    if use_tfidf:
        print(f"Epoch {epoch + 1}: Using TF-IDF hard negatives")
        hard_negatives_dict = tfidf_hard_negatives.copy()

    elif use_model:
        print(f"Epoch {epoch + 1}: Updating hard negatives using current model")
        model.eval()
        with torch.no_grad():
            # Filter TF-IDF hard negatives using current model
            for claim_id in tqdm(train_claims.keys(), desc="Filtering hard negatives"):
                # Get claim embedding
                claim_text = train_claims[claim_id]['claim_text']
                inputs = tokenizer(claim_text, padding=True, truncation=True, return_tensors='pt').to(model.device)
                claim_embedding = model(**inputs).last_hidden_state[:, 0, :]
                norm_claim_embedding = F.normalize(claim_embedding, p=2, dim=1)

                # Fix KeyError - check if claim_id exists in tfidf_hard_negatives
                if claim_id not in tfidf_hard_negatives:
                    hard_negatives_dict[claim_id] = []
                    continue
                    
                # Get TF-IDF candidates
                tfidf_candidates = tfidf_hard_negatives[claim_id]
                candidate_evidence_ids = [neg['evidence_id'] for neg in tfidf_candidates]
                candidate_texts = [evidence[eid] for eid in candidate_evidence_ids]

                if not candidate_texts:  # Skip if no candidates
                    continue

                # Calculate similarities with current model
                inputs = tokenizer(
                    candidate_texts,
                    padding=True,
                    truncation=True,
                    max_length=256,
                    return_tensors='pt'
                ).to(model.device)

                evidence_embeddings = model(**inputs).last_hidden_state[:, 0, :]
                norm_evidence_embeddings = F.normalize(evidence_embeddings, p=2, dim=1)

                similarities = torch.matmul(norm_claim_embedding, norm_evidence_embeddings.T)
                similarities = similarities[0].cpu().numpy()

                # Get moderate difficulty negatives (rank 10-20)
                if len(similarities) > 20:
                    top_indices = np.argsort(similarities)[-20:-10]  # Get samples ranked 10-20
                else:
                    top_indices = np.argsort(similarities)[-5:]  # Fallback to top 5 if not enough candidates

                # Update hard negatives for this claim
                new_hard_negatives = []
                for idx in top_indices:
                    evidence_id = candidate_evidence_ids[idx]
                    if evidence_id not in train_claims[claim_id]['evidences']:
                        new_hard_negatives.append({
                            'evidence_id': evidence_id,
                            'similarity': float(similarities[idx])
                        })

                hard_negatives_dict[claim_id] = new_hard_negatives

        model.train()

    # Training loop
    for i in range(0, len(train_ids), batch_size):
        step += 1
        print(f"Step {step}")
        batch_ids = train_ids[i:i + batch_size]
        batch_claims = [train_claims[claim_id] for claim_id in batch_ids]
        batch_indices = [claim_id_to_train_inidce[claim_id] for claim_id in batch_ids]

        # Get evidence indices including current hard negatives
        evidence_indices = []
        pos_evidence_positive_indices = []
        for claim_id, claim in zip(batch_ids, batch_claims):
            # Add positive samples
            positive_indices = []
            for evidence_id in claim["evidences"][:2]:  # Use at most 2 positive samples
                evidence_idx = evidence_id_to_train_index[evidence_id]
                if evidence_idx not in evidence_indices:
                    evidence_indices.append(evidence_idx)
                positive_indices.append(len(evidence_indices) - 1)

            # Fix KeyError - check if claim_id exists in hard_negatives_dict
            if claim_id in hard_negatives_dict:
                hard_negs = hard_negatives_dict[claim_id][:3]  # Use top 3 hard negatives
                for neg in hard_negs:
                    neg_idx = evidence_id_to_train_index[neg['evidence_id']]
                    if neg_idx not in evidence_indices:
                        evidence_indices.append(neg_idx)
            
            pos_evidence_positive_indices.append(positive_indices)

        # Get claim embeddings
        claim_texts = [train_texts[i] for i in batch_indices]
        model_inputs = tokenizer(claim_texts, padding=True, truncation=True, return_tensors='pt').to(model.device)
        claim_embeddings = model(**model_inputs).last_hidden_state[:, 0, :]
        norm_claim_embeddings = F.normalize(claim_embeddings, p=2, dim=1)

        # Get evidence embeddings
        cur_evidence_indices = [evidence_texts[evidence_indice] for evidence_indice in evidence_indices]
        evidence_model_inputs = tokenizer(
            cur_evidence_indices,
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors='pt'
        ).to(model.device)

        evidence_embeddings = model(**evidence_model_inputs).last_hidden_state[:, 0, :]
        norm_evidence_embeddings = F.normalize(evidence_embeddings, p=2, dim=1)

        # Calculate loss
        loss = []
        for i, claim_embedding in enumerate(norm_claim_embeddings):
            pos_evidence_embeddings = norm_evidence_embeddings[torch.tensor(pos_evidence_positive_indices[i])]
            neg_evidence_embeddings = norm_evidence_embeddings[torch.tensor([j for j in range(len(evidence_indices))
                                                               if j not in pos_evidence_positive_indices[i]])]
            loss.append(contrastive_loss(claim_embedding, pos_evidence_embeddings, neg_evidence_embeddings))
        loss = torch.mean(torch.stack(loss))

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Clear cache
        del evidence_embeddings, norm_evidence_embeddings
        torch.cuda.empty_cache()

        if step % log_interval == 0:
            print(f"Loss: {loss.item()}")

        if step % test_interval == 0:
            # Evaluate on dev set
            f_score = get_test_f_scores(dev_texts, dev_claims, dev_ids, evidence_texts, evidence_ids, model, batch_size=batch_size)
            print(f"F-score on dev set: {f_score}")
            # Save model
            torch.save(model.state_dict(), f"model_epoch_{epoch + 1}_step_{step}.pth")
            if f_score > max_score:
                max_score = f_score
                print(f"New best F-score: {max_score}")
                torch.save(model.state_dict(), "best_model.pth")
    
    # Update learning rate
    scheduler.step()

2025-05-15 11:31:02.626031: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-15 11:31:02.645752: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747279862.671287    3814 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747279862.678983    3814 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747279862.698431    3814 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Epoch 1/15
Epoch 1: Using TF-IDF hard negatives
Step 1
Step 2
Step 3
Step 4
Step 5
Step 6
Step 7
Step 8
Step 9
Step 10
Step 11
Step 12
Step 13
Step 14
Step 15
Step 16
Step 17
Step 18
Step 19
Step 20
Loss: 1.1409674882888794
Epoch 2/15
Step 21
Step 22
Step 23
Step 24
Step 25
Step 26
Step 27
Step 28
Step 29
Step 30
Step 31
Step 32
Step 33
Step 34
Step 35
Step 36
Step 37
Step 38
Step 39
Step 40
Loss: 0.037253983318805695
Epoch 3/15
Epoch 3: Updating hard negatives using current model


Filtering hard negatives: 100%|██████████| 1228/1228 [00:35<00:00, 34.70it/s]


Step 41
Step 42
Step 43
Step 44
Step 45
Step 46
Step 47
Step 48
Step 49
Step 50
Step 51
Step 52
Step 53
Step 54
Step 55
Step 56
Step 57
Step 58
Step 59
Step 60
Loss: 0.6349079012870789
Epoch 4/15
Step 61
Step 62
Step 63
Step 64
Step 65
Step 66
Step 67
Step 68
Step 69
Step 70
Step 71
Step 72
Step 73
Step 74
Step 75
Step 76
Step 77
Step 78
Step 79
Step 80
Loss: 0.18634456396102905
Epoch 5/15
Epoch 5: Using TF-IDF hard negatives
Step 81
Step 82
Step 83
Step 84
Step 85
Step 86
Step 87
Step 88
Step 89
Step 90
Step 91
Step 92
Step 93
Step 94
Step 95
Step 96
Step 97
Step 98
Step 99
Step 100
Loss: 0.15157179534435272


100%|██████████| 3/3 [00:00<00:00, 83.90it/s]
100%|██████████| 1889/1889 [07:02<00:00,  4.47it/s]


F-score on dev set: 0.16284271284271284
New best F-score: 0.16284271284271284
Epoch 6/15
Step 101
Step 102
Step 103
Step 104
Step 105
Step 106
Step 107
Step 108
Step 109
Step 110
Step 111
Step 112
Step 113
Step 114
Step 115
Step 116
Step 117
Step 118
Step 119
Step 120
Loss: 0.24049368500709534
Epoch 7/15
Epoch 7: Updating hard negatives using current model


Filtering hard negatives: 100%|██████████| 1228/1228 [00:35<00:00, 34.40it/s]


Step 121
Step 122
Step 123
Step 124
Step 125
Step 126
Step 127
Step 128
Step 129
Step 130
Step 131
Step 132
Step 133
Step 134
Step 135
Step 136
Step 137
Step 138
Step 139
Step 140
Loss: 0.06903477758169174
Epoch 8/15
Step 141
Step 142
Step 143
Step 144
Step 145
Step 146
Step 147
Step 148
Step 149
Step 150
Step 151
Step 152
Step 153
Step 154
Step 155
Step 156
Step 157
Step 158
Step 159
Step 160
Loss: 0.2752847969532013
Epoch 9/15
Epoch 9: Using TF-IDF hard negatives
Step 161
Step 162
Step 163
Step 164
Step 165
Step 166
Step 167
Step 168
Step 169
Step 170
Step 171
Step 172
Step 173
Step 174
Step 175
Step 176
Step 177
Step 178
Step 179
Step 180
Loss: 0.09241817891597748
Epoch 10/15
Step 181
Step 182
Step 183
Step 184
Step 185
Step 186
Step 187
Step 188
Step 189
Step 190
Step 191
Step 192
Step 193
Step 194
Step 195
Step 196
Step 197
Step 198
Step 199
Step 200
Loss: 0.18410101532936096


100%|██████████| 3/3 [00:00<00:00, 73.27it/s]
100%|██████████| 1889/1889 [07:09<00:00,  4.40it/s]


F-score on dev set: 0.15777674706246136
Epoch 11/15
Epoch 11: Updating hard negatives using current model


Filtering hard negatives: 100%|██████████| 1228/1228 [00:35<00:00, 34.68it/s]


Step 201
Step 202
Step 203
Step 204
Step 205
Step 206
Step 207
Step 208
Step 209
Step 210
Step 211
Step 212
Step 213
Step 214
Step 215
Step 216
Step 217
Step 218
Step 219
Step 220
Loss: 0.04094647616147995
Epoch 12/15
Step 221
Step 222
Step 223
Step 224
Step 225
Step 226
Step 227
Step 228
Step 229
Step 230
Step 231
Step 232
Step 233
Step 234
Step 235
Step 236
Step 237
Step 238
Step 239
Step 240
Loss: 0.05835933983325958
Epoch 13/15
Epoch 13: Using TF-IDF hard negatives
Step 241
Step 242
Step 243
Step 244
Step 245
Step 246
Step 247
Step 248
Step 249
Step 250
Step 251
Step 252
Step 253
Step 254
Step 255
Step 256
Step 257
Step 258
Step 259
Step 260
Loss: 0.15600687265396118
Epoch 14/15
Step 261
Step 262
Step 263
Step 264
Step 265
Step 266
Step 267
Step 268
Step 269
Step 270
Step 271
Step 272
Step 273
Step 274
Step 275
Step 276
Step 277
Step 278
Step 279
Step 280
Loss: 0.22706279158592224
Epoch 15/15
Epoch 15: Updating hard negatives using current model


Filtering hard negatives: 100%|██████████| 1228/1228 [00:35<00:00, 34.40it/s]


Step 281
Step 282
Step 283
Step 284
Step 285
Step 286
Step 287
Step 288
Step 289
Step 290
Step 291
Step 292
Step 293
Step 294
Step 295
Step 296
Step 297
Step 298
Step 299
Step 300
Loss: 0.029318323358893394


100%|██████████| 3/3 [00:00<00:00, 84.28it/s]
100%|██████████| 1889/1889 [07:04<00:00,  4.45it/s]


F-score on dev set: 0.15389610389610392


# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

### Cross-encoder for reranking

In [7]:
model = AutoModel.from_pretrained(model_name)  # First initialize the original model architecture
model.load_state_dict(torch.load("best_model.pth"), strict=False)  # Then load the trained weights
model.cuda()
model.eval()

from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Initialize cross-encoder
cross_encoder_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
cross_encoder = AutoModelForSequenceClassification.from_pretrained(cross_encoder_name).cuda()
cross_encoder_tokenizer = AutoTokenizer.from_pretrained(cross_encoder_name)

optimizer = torch.optim.AdamW(cross_encoder.parameters(), lr=1e-5)

# Reranking function
def rerank_evidences(claim, candidate_evidence_ids, candidate_texts, model, tokenizer, top_k=10):
    """Rerank candidate evidences using a cross-encoder model"""
    model.eval()
    
    # Create input pairs: (claim, evidence)
    paired_texts = []
    for evidence_text in candidate_texts:
        paired_texts.append([claim, evidence_text])
    
    # Score each pair
    with torch.no_grad():
        inputs = tokenizer(
            paired_texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=256
        ).to(model.device)
        
        scores = model(**inputs).logits.squeeze(-1).cpu().numpy()
    
    # Rerank based on scores
    reranked_indices = np.argsort(-scores)
    reranked_evidence_ids = [candidate_evidence_ids[i] for i in reranked_indices[:top_k]]
    
    return reranked_evidence_ids

# Evaluation function to calculate F-scores
def evaluate_with_reranking(test_texts, test_claims, test_ids, evidence_texts, evidence_ids,
                           initial_model, cross_encoder, cross_encoder_tokenizer,
                           batch_size=16, initial_k=50, final_k=5):
    """Evaluate F-score of the two-stage retrieval system"""
    initial_model.eval()
    cross_encoder.eval()
    
    # Generate test claim embeddings
    test_claims_embeddings = generate_all_embeddings(initial_model, test_texts, batch_size=batch_size)
    
    # Generate evidence embeddings
    test_evidence_embeddings = generate_all_embeddings(initial_model, evidence_texts, batch_size=batch_size*10)
    
    # Normalize embeddings
    norm_test_claim_embeddings = F.normalize(test_claims_embeddings, p=2, dim=1)
    norm_evidence_embeddings = F.normalize(test_evidence_embeddings, p=2, dim=1)
    
    # First stage: compute similarities and get initial candidates
    similarities = torch.matmul(norm_test_claim_embeddings, norm_evidence_embeddings.T)
    top_k_initial = torch.topk(similarities, initial_k, dim=1).indices.numpy()
    
    # Calculate F-scores
    f_scores = []
    for i, claim_id in enumerate(test_ids):
        # Get true evidence
        true_evidence_ids = test_claims[claim_id]["evidences"]
        
        # Get claim text
        claim_text = test_claims[claim_id]["claim_text"]
        
        # Get first-stage candidates
        candidate_indices = top_k_initial[i]
        candidate_evidence_ids = [evidence_ids[idx] for idx in candidate_indices]
        candidate_texts = [evidence_texts[idx] for idx in candidate_indices]
        
        # Rerank using cross-encoder
        reranked_evidence_ids = rerank_evidences(
            claim_text, 
            candidate_evidence_ids, 
            candidate_texts, 
            cross_encoder, 
            cross_encoder_tokenizer, 
            top_k=final_k
        )
        
        # Calculate precision and recall
        true_positives = len(set(true_evidence_ids) & set(reranked_evidence_ids))
        precision = true_positives / len(reranked_evidence_ids) if len(reranked_evidence_ids) > 0 else 0.0
        recall = true_positives / len(true_evidence_ids) if len(true_evidence_ids) > 0 else 0.0
        
        # Calculate F-score
        f_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
        f_scores.append(f_score)
        
        # Print results for each sample
        # print(f"Claim {i+1}/{len(test_ids)}: F-score = {f_score:.4f}, P = {precision:.4f}, R = {recall:.4f}")
        
    return np.mean(f_scores)

#  Evaluate on dev set
print("Evaluating on dev set...")
# First evaluate baseline without reranking
print("Baseline without reranking:")
baseline_f1 = get_test_f_scores(dev_texts, dev_claims, dev_ids, evidence_texts, evidence_ids, model, batch_size=64)
print(f"Baseline F-score on dev set: {baseline_f1:.4f}")

# Then evaluate with reranking
print("\nWith reranking:")
reranked_f1 = evaluate_with_reranking(
    dev_texts, dev_claims, dev_ids, evidence_texts, evidence_ids,
    model, cross_encoder, cross_encoder_tokenizer,
    batch_size=32, initial_k=50, final_k=5)

print(f"Reranked F-score on dev set: {reranked_f1:.4f}")
print(f"Improvement: {(reranked_f1 - baseline_f1) * 100:.2f}%")




Evaluating on dev set...
Baseline without reranking:


100%|██████████| 3/3 [00:00<00:00, 66.61it/s]
100%|██████████| 1889/1889 [07:05<00:00,  4.44it/s]


Baseline F-score on dev set: 0.1628

With reranking:


100%|██████████| 5/5 [00:00<00:00, 67.29it/s]
100%|██████████| 3778/3778 [06:27<00:00,  9.74it/s]


Reranked F-score on dev set: 0.2064
Improvement: 4.35%


In [8]:
# Function to generate test predictions with reranking
def generate_test_predictions_with_reranking(test_texts, test_claims, test_ids, evidence_texts, evidence_ids,
                                           initial_model, cross_encoder, cross_encoder_tokenizer,
                                           batch_size=16, initial_k=50, final_k=5):
    """Generate test predictions with two-stage retrieval (retrieval + reranking)"""
    initial_model.eval()
    cross_encoder.eval()
    
    # Generate test claim embeddings
    test_claims_embeddings = generate_all_embeddings(initial_model, test_texts, batch_size=batch_size)
    
    # Generate evidence embeddings
    test_evidence_embeddings = generate_all_embeddings(initial_model, evidence_texts, batch_size=batch_size*10)
    
    # Normalize embeddings
    norm_test_claim_embeddings = F.normalize(test_claims_embeddings, p=2, dim=1)
    norm_evidence_embeddings = F.normalize(test_evidence_embeddings, p=2, dim=1)
    
    # First stage: compute similarities and get initial candidates
    similarities = torch.matmul(norm_test_claim_embeddings, norm_evidence_embeddings.T)
    top_k_initial = torch.topk(similarities, initial_k, dim=1).indices.numpy()
    
    # Second stage: rerank and save results
    results = test_claims.copy()
    for i, claim_id in enumerate(test_ids):
        # Get claim text
        claim_text = test_claims[claim_id]['claim_text']
        
        # Get first-stage candidates
        candidate_indices = top_k_initial[i]
        candidate_evidence_ids = [evidence_ids[idx] for idx in candidate_indices]
        candidate_texts = [evidence_texts[idx] for idx in candidate_indices]
        
        # Rerank with cross-encoder
        reranked_evidence_ids = rerank_evidences(
            claim_text, 
            candidate_evidence_ids, 
            candidate_texts, 
            cross_encoder, 
            cross_encoder_tokenizer, 
            top_k=final_k
        )
        
        # Store final results
        results[claim_id]['evidences'] = reranked_evidence_ids
    
    return results

In [9]:
# Generate test set predictions
print("\nGenerating test set predictions...")
test_claims_ids = list(test_claims.keys())
results = generate_test_predictions_with_reranking(
    test_texts, test_claims, test_claims_ids, evidence_texts, evidence_ids,
    model, cross_encoder, cross_encoder_tokenizer,
    batch_size=32, initial_k=50, final_k=5
)

output_file = os.path.join(data_dir, "test_claims_retrieved_reranked.json")
with open(output_file, 'w') as f:
    json.dump(results, f, indent=4)
print(f"Results saved to {output_file}")


Generating test set predictions...


100%|██████████| 5/5 [00:00<00:00, 88.90it/s]
100%|██████████| 3778/3778 [06:34<00:00,  9.58it/s]


Results saved to data/test_claims_retrieved_reranked.json


In [10]:
# print(results)

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*