## Sentence Bert Training

In [20]:
# Imports
import json
import pandas as pd
from datasets import Dataset
import random
from datasets import load_dataset
import torch 
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.losses import CoSENTLoss
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [21]:
# load data
path = '../data/'

with open(path+"train-claims.json", "r") as f:
    train_claims = json.load(f)

train_df = pd.DataFrame(train_claims).transpose()

with open(path+"dev-claims.json", "r") as f:
    dev_claims = json.load(f)
dev_df = pd.DataFrame(dev_claims).transpose()

with open(path+"evidence.json", "r") as f:
    evidence = json.load(f)
evidence_df = pd.DataFrame(list(evidence.items()), columns=["key", "value"])

with open(path+"hard_negatives.json", "r") as f:
    hard_negatives = json.load(f)

In [22]:
claims = []
positives = []
hard_negs = []

evidence_map = dict(evidence)
evidence_ids = list(evidence_map.keys())

# For each claim, add positives and hard negatives
for id, info in train_claims.items():
    # Anchor, positive, negative triplets
    for pos_id in info["evidences"]:
        for neg_id in hard_negatives[id]:
            claims.append(info["claim_text"])
            positives.append(evidence_map[pos_id])
            hard_negs.append(evidence_map[neg_id])

    # Hard negatives
    # for neg_id in hard_negatives[id]:
    #     claims.append(info["claim_text"])
    #     evidence_texts.append(evidence_map[neg_id])
    #     labels.append(0)
    

# for _, info in train_claims.items():
#     claim_text = info["claim_text"]
#     positive_ids = set(info["evidences"])  # ensure no duplicates

#     # === Add all golden (positive) evidences ===
#     for eid in positive_ids:
#         if eid in evidence_map:
#             claims.append(claim_text)
#             evidence_texts.append(evidence_map[eid])
#             labels.append(1)

#     # === Add at least 5 unique negatives ===
#     negatives_added = 0
#     tried_ids = set()

#     while negatives_added < 10:
#         neg_id = random.choice(evidence_ids)

#         if neg_id in positive_ids or neg_id in tried_ids:
#             continue

#         tried_ids.add(neg_id)

#         if neg_id in evidence_map:
#             claims.append(claim_text)
#             evidence_texts.append(evidence_map[neg_id])
#             labels.append(-1)
#             negatives_added += 1

# convert to HuggingFace Dataset
data = {
    "claim": claims,
    "positive": positives,
    "hard_negative": hard_negs
}

dataset = Dataset.from_dict(data)

# Confirm
# print(dataset)
# print(dataset[0])


In [23]:
model1 = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
model2 = SentenceTransformer('all-MiniLM-L6-v2')
model3 = SentenceTransformer('all-MiniLM-L12-v2')

loss1 = MultipleNegativesRankingLoss(model1)
loss2 = MultipleNegativesRankingLoss(model2)
loss3 = MultipleNegativesRankingLoss(model3)

split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_ds = split_dataset["train"]
eval_ds = split_dataset["test"]


In [26]:
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="output",
    # Optional training parameters:
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=False,  # Set to True if you have a GPU that supports BF16
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=400,
    save_strategy="steps",
    save_steps=400,
    save_total_limit=2,
    logging_steps=400,
    run_name="sentence-transformer-training",
    load_best_model_at_end=True,
    greater_is_better=True,
)

In [28]:
# Training model 1
model1.to(device)
trainer1 = SentenceTransformerTrainer(
        model=model1,
        args=args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        loss=loss1,
    )

# Train the model
trainer1.train()

# Save the model
trainer1.save_model(f"out_models/model_0")

Step,Training Loss,Validation Loss
400,0.0605,0.05124
800,0.0593,0.054207
1200,0.0553,0.057476
1600,0.0525,0.056499
2000,0.0469,0.05183
2400,0.0425,0.051336
2800,0.0424,0.046891
3200,0.0375,0.047405
3600,0.0329,0.047101
4000,0.0294,0.04471


In [34]:
# Training model 2
model2.to(device)
trainer2 = SentenceTransformerTrainer(
       model=model2,
        args=args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        loss=loss2,
    )

# Train the model
trainer2.train()

# Save the model
trainer2.save_model(f"out_models/model_1")

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss
400,1.3854,1.073813
800,0.9407,0.709425
1200,0.6652,0.465106
1600,0.4757,0.331138
2000,0.373,0.250588
2400,0.29,0.188991
2800,0.2416,0.144086
3200,0.1999,0.118779
3600,0.1527,0.109786
4000,0.1313,0.098437


In [35]:
# Training model 3
model3.to(device)
trainer3 = SentenceTransformerTrainer(
        model=model3,
        args=args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        loss=loss3,
    )

# Train the model
trainer3.train()

# Save the model
trainer3.save_model(f"out_models/model_2")

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss
400,1.2785,0.962963
800,0.8162,0.595223
1200,0.5498,0.376929
1600,0.3798,0.259661
2000,0.2925,0.18711
2400,0.2173,0.137826
2800,0.1812,0.10914
3200,0.1439,0.099014
3600,0.1143,0.085716
4000,0.0991,0.075084


### First Stage Revrieval

In [23]:
model1 = SentenceTransformer('out_models/model_0')
model2 = SentenceTransformer('out_models/model_1')
model3 = SentenceTransformer('out_models/model_2')

In [None]:
embeddings1 = model1.encode(evidence_df['value'].tolist(), show_progress_bar=True, device='cuda')
embeddings2 = model2.encode(evidence_df['value'].tolist(), show_progress_bar=True, device='cuda')
embeddings3 = model3.encode(evidence_df['value'].tolist(), show_progress_bar=True, device='cuda')

Batches:   0%|          | 0/37776 [00:00<?, ?it/s]

Batches:   0%|          | 0/37776 [00:00<?, ?it/s]

In [None]:
claim_embeddings1 = model1.encode(train_df['claim_text'], show_progress_bar=True, device='cuda')
claim_embeddings2 = model2.encode(train_df['claim_text'], show_progress_bar=True, device='cuda')
claim_embeddings3 = model3.encode(train_df['claim_text'], show_progress_bar=True, device='cuda')

  sentences_sorted = [sentences[idx] for idx in length_sorted_idx]


Batches:   0%|          | 0/39 [00:00<?, ?it/s]

Batches:   0%|          | 0/39 [00:00<?, ?it/s]

In [38]:
from sklearn.metrics.pairwise import cosine_similarity

total_embeddings = (embeddings1 + embeddings2 + embeddings3) / 3
claim_embeddings = (claim_embeddings1 + claim_embeddings2 + claim_embeddings3) / 3

similarity_matrix = cosine_similarity(claim_embeddings, total_embeddings)

In [62]:
total_embeddings = embeddings1
claim_embeddings = claim_embeddings1

similarity_matrix = cosine_similarity(claim_embeddings, total_embeddings)

In [63]:
import numpy as np

top_k = 100
top_indices = np.argsort(-similarity_matrix, axis=1)[:, :top_k]  # sort descending, get top 100 indices

train_df["top_100_evidence"] = [
    evidence_df.iloc[indices]["key"].tolist() for indices in top_indices
]

In [65]:
total_matches = 0
total_gold = 0
total_predicted = 0

for i in range(len(train_df)):
    gold_evidence = train_df["evidences"].iloc[i]
    predicted_evidence = train_df["top_100_evidence"].iloc[i]

    matches = sum(1 for ev in gold_evidence if ev in predicted_evidence)

    total_matches += matches
    total_gold += len(gold_evidence)
    total_predicted += len(predicted_evidence)  # Should be 100 for each if consistent

# Compute precision, recall, and F1-score
precision = total_matches / total_gold if total_gold > 0 else 0
recall = total_matches / total_predicted if total_predicted > 0 else 0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

print(f"model1 performance, training set:")
print(f"Total matches: {total_matches}/{total_gold}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1_score:.4f}")

model1 performance, training set:
Total matches: 4106/4122
Precision: 0.9961
Recall: 0.0334
F1-score: 0.0647


In [53]:
print(total_matches)

3803


### Cross Encoder

In [None]:
## setup dataset.
claims = []
evidence_texts = []
labels = []

evidence_map = dict(evidence)
evidence_ids = list(evidence_map.keys())

for row in train_df.itertuples():
    claim_text = row.claim_text
    positive_ids = set(row.evidences)
    negative_ids = set(row.top_100_evidence) - positive_ids

    # === Add all golden (positive) evidences ===
    for pos_id in positive_ids:
        if pos_id in evidence_map:
            claims.append(claim_text)
            evidence_texts.append(evidence_map[pos_id])
            labels.append(1)
    
    for pos_id in negative_ids:
        if pos_id in evidence_map:
            claims.append(claim_text)
            evidence_texts.append(evidence_map[pos_id])
            labels.append(-1)

# Convert to HuggingFace Dataset
data = {
    "claim": claims,
    "evidence": evidence_texts,
    "label": labels
}

dataset = Dataset.from_dict(data)

# Confirm
print(dataset)
print(dataset[0:5])

Dataset({
    features: ['claim', 'evidence', 'label'],
    num_rows: 32738
})
{'claim': ['Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.', 'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.', 'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.', 'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.', 'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.'], 'evidence': ['Higher carbon dioxide concentrations will favourably affect plant growth and demand for water.', 'At very high conc

In [None]:
import logging
import traceback

from datasets import load_dataset

from sentence_transformers.cross_encoder import (
    CrossEncoder,
    CrossEncoderModelCardData,
    CrossEncoderTrainer,
    CrossEncoderTrainingArguments,
)
from sentence_transformers.cross_encoder.evaluation import CrossEncoderNanoBEIREvaluator
from sentence_transformers.cross_encoder.losses import CachedMultipleNegativesRankingLoss
from sentence_transformers.cross_encoder.losses import MultipleNegativesRankingLoss


logging.basicConfig(format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO)
train_batch_size = 64
num_epochs = 1
num_rand_negatives = 5  # How many random negatives should be used for each question-answer pair

model = CrossEncoder("cross-encoder/stsb-roberta-base")

loss = MultipleNegativesRankingLoss(model)

split_dataset = dataset.train_test_split(test_size=0.1, seed=42069)
train_ds = split_dataset["train"]
eval_ds = split_dataset["test"]

# evaluator = CrossEncoderNanoBEIREvaluator(
#     dataset_names=["msmarco", "nfcorpus", "nq"],
#     batch_size=train_batch_size,
# )
#evaluator(model)

# Set up training arguments
args = CrossEncoderTrainingArguments(
    # Required parameter:
    output_dir=f"out_models/cross_encoder",
    # Optional training parameters:
    num_train_epochs=num_epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=train_batch_size,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=False,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=True,  # Set to True if you have a GPU that supports BF16
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=50,
    logging_first_step=True,
    run_name="run_cross",  # Will be used in W&B if `wandb` is installed
    seed=12,
)

trainer = CrossEncoderTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    loss=loss,
    #evaluator=evaluator,
)

trainer.train()

#evaluator(model)

final_output_dir = f"models/run_cross/final"
model.save_pretrained(final_output_dir)



2025-05-16 08:39:45 - Use pytorch device: cuda:0
2025-05-16 08:40:18 - NanoBEIR Evaluation of the model on ['msmarco', 'nfcorpus', 'nq'] dataset:
2025-05-16 08:40:18 - Evaluating NanoMSMARCO_R100
2025-05-16 08:40:18 - CrossEncoderRerankingEvaluator: Evaluating the model on the NanoMSMARCO_R100 dataset:
2025-05-16 08:40:35 - Queries: 50	Positives: Min 1.0, Mean 1.0, Max 1.0	Negatives: Min 99.0, Mean 99.0, Max 99.0
2025-05-16 08:40:35 -          Base  -> Reranked
2025-05-16 08:40:35 - MAP:     48.96 -> 9.21
2025-05-16 08:40:35 - MRR@10:  47.75 -> 6.16
2025-05-16 08:40:35 - NDCG@10: 54.04 -> 10.77
2025-05-16 08:40:35 - 
2025-05-16 08:40:35 - Evaluating NanoNFCorpus_R100
2025-05-16 08:40:35 - CrossEncoderRerankingEvaluator: Evaluating the model on the NanoNFCorpus_R100 dataset:
2025-05-16 08:42:04 - Queries: 50	Positives: Min 1.0, Mean 50.4, Max 463.0	Negatives: Min 54.0, Mean 92.8, Max 100.0
2025-05-16 08:42:04 -          Base  -> Reranked
2025-05-16 08:42:04 - MAP:     26.10 -> 28.06
202

Step,Training Loss,Validation Loss
