# 2025 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme (EDITED BY JOHN and Guwei)

*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program in an Object Oriented Programming style, please put that code at the bottom of this ipynb file.*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [None]:
import faiss
from sentence_transformers import CrossEncoder
from sentence_transformers import InputExample, SentenceTransformer, losses
from torch.utils.data import DataLoader
import random
import torch
import pandas as pd
import json
import re

def retrieve(evi_ebds, claim_ebds, evi_df, claim_df, retrival_top_k, rerank_top_k,threshold_activated,score_threshold, cross_encoder,dev):
    """Retrieve evidence for every claim: FAISS search, then cross-encoder re-ranking.

    Args:
        evi_ebds: 2-D array of evidence embeddings, one row per row of
            ``evi_df``. It is L2-normalised in place.
        claim_ebds: 2-D array of claim embeddings, one row per row of
            ``claim_df``. Each row is L2-normalised in place.
        evi_df: DataFrame with ``ID`` and ``value`` columns. FAISS row numbers
            are used as index labels, so it is assumed to carry the default
            0..n-1 index.
        claim_df: DataFrame with ``ID`` and ``claim_text`` columns, plus an
            ``evidences`` column (gold evidence IDs) when ``dev`` is true.
        retrival_top_k: number of candidates fetched from FAISS per claim.
        rerank_top_k: maximum number of re-ranked candidates kept per claim.
        threshold_activated: when true, candidates after the best one are kept
            only if their cross-encoder score reaches ``score_threshold``.
        score_threshold: minimum cross-encoder score (see above).
        cross_encoder: object whose ``predict(pairs)`` returns one score per
            ``(claim_text, evidence_text)`` pair.
        dev: when true, print per-claim diagnostics and, at the end, the mean
            recall of the FAISS candidates against ``claim_df['evidences']``.

    Returns:
        DataFrame with columns ``ID``, ``evidences`` (kept evidence IDs),
        ``claim_label`` (always the placeholder ``'SUPPORTS'``) and
        ``claim_text``.
    """
    embedding_dim = evi_ebds.shape[1]
    index = faiss.IndexFlatL2(embedding_dim)
    faiss.normalize_L2(evi_ebds)
    index.add(evi_ebds)

    retrieved_labels = []
    retrieved_evidences = []
    claim_texts = []
    recall_sum = 0.0
    evaluated = 0  # claims that actually contributed to the recall figure
    total = len(claim_ebds)

    for i, claim_embedding in enumerate(claim_ebds):
        query = claim_embedding.reshape(1, -1)
        faiss.normalize_L2(query)
        _, I = index.search(query, retrival_top_k)
        # FAISS pads the result with -1 when the index holds fewer than k
        # vectors; those are not valid evidence rows.
        candidate_ids = [a for a in I[0] if a >= 0]

        text = claim_df.iloc[i]['claim_text']
        pairs = [(text, evi_df['value'][a]) for a in candidate_ids]
        scores = cross_encoder.predict(pairs) if pairs else []
        reranked = sorted(zip(candidate_ids, scores), key=lambda x: x[1], reverse=True)

        # The best candidate is always kept so that no claim ends up without
        # evidence; the rest are subject to the optional score threshold.
        selected = []
        for rank, (idx, score) in enumerate(reranked[:rerank_top_k]):
            if rank == 0 or not threshold_activated or score >= score_threshold:
                selected.append((evi_df['ID'][idx], score))

        retrieved_evidences.append([evi_id for evi_id, _ in selected])
        retrieved_labels.append('SUPPORTS')
        claim_texts.append(text)

        if dev:
            ground_truth = claim_df.iloc[i]['evidences']
            print(f"Claim: {text}")
            if selected:
                print("evidence relevant")
                for eid, score in selected:
                    print(f"  {eid}, Score: {score:.4f} ")
            print(f"Ground truth: {ground_truth}\n")

            # Recall is measured on the FAISS candidates (before re-ranking).
            # Claims without gold evidence are skipped instead of dividing by zero.
            if len(ground_truth) > 0:
                candidate_evidence_ids = {evi_df['ID'][a] for a in candidate_ids}
                hits = sum(1 for g in ground_truth if g in candidate_evidence_ids)
                recall_sum += hits / len(ground_truth)
                evaluated += 1

        print('Progress ', round(i * 100 / total, 3), '%')

    # Only report recall when it was computed; without gold labels the figure
    # would be a meaningless 0.0, and with no claims it would divide by zero.
    if dev and evaluated > 0:
        print("R: ", recall_sum / evaluated)

    retrieval = pd.DataFrame()
    retrieval['ID'] = claim_df['ID']
    retrieval['evidences'] = retrieved_evidences
    retrieval['claim_label'] = retrieved_labels
    retrieval['claim_text'] = claim_texts
    return retrieval

def mine_hard_negatives(retrieved_evidences, ground_truth_evidences):
    """Collect, per claim, the retrieved evidence that is not gold evidence.

    Both arguments are positionally indexed with ``.iloc`` and are read in
    lockstep: entry ``i`` of each holds the evidence IDs for claim ``i``.

    Returns:
        A list with one list per claim, containing the first run of digits of
        every retrieved-but-not-gold evidence ID, converted to ``int``.
    """
    hard_negatives = []
    for row in range(len(retrieved_evidences)):
        candidates = retrieved_evidences.iloc[row]
        gold = ground_truth_evidences.iloc[row]
        hard_negatives.append([
            int(re.findall(r'\d+', evidence_id)[0])
            for evidence_id in candidates
            if evidence_id not in gold
        ])
    return hard_negatives

def load_train_data(train_dataframe, evidence_dateframe, random_negatives_amount_per_claim):
    """Build labelled (claim, evidence) pairs for training.

    For every claim this emits, in order:
      * one example with label 1.0 per entry of ``evidences_numeric_index``,
      * one example with label 0.0 per entry of ``negative_evidences``,
      * ``random_negatives_amount_per_claim`` examples with label 0.0 whose
        evidence is drawn at random from the evidence that is not a positive
        for that claim.

    Args:
        train_dataframe: DataFrame with ``claim_text``,
            ``evidences_numeric_index`` and ``negative_evidences`` columns.
        evidence_dateframe: DataFrame with a ``value`` column. Evidence
            numbers are used as row positions (``.iloc``).
        random_negatives_amount_per_claim: number of random negatives to add
            per claim.

    Returns:
        A list of ``InputExample`` objects.
    """
    train_data = []
    draw_random = random_negatives_amount_per_claim > 0
    # Built once per call: rebuilding this set for every sampled negative
    # costs O(len(evidence)) each time and dominated the running time.
    all_evidence_ids = set(evidence_dateframe.index) if draw_random else set()

    for row_pos in range(len(train_dataframe)):
        row = train_dataframe.iloc[row_pos]
        claim_text = row['claim_text']
        positive_evidence_ids = row['evidences_numeric_index']
        negative_evidence_ids = row['negative_evidences']

        for evid_id in positive_evidence_ids:
            evidence_text = evidence_dateframe.iloc[evid_id]['value']
            train_data.append(InputExample(texts=[claim_text, evidence_text], label=1.0))

        for ngevid_id in negative_evidence_ids:
            evidence_text = evidence_dateframe.iloc[ngevid_id]['value']
            train_data.append(InputExample(texts=[claim_text, evidence_text], label=0.0))

        if draw_random:
            # The candidate pool depends only on the claim, so build it once
            # per claim rather than once per draw.
            candidates = list(all_evidence_ids - set(positive_evidence_ids))
            for _ in range(random_negatives_amount_per_claim):
                neg_id = random.choice(candidates)
                neg_text = evidence_dateframe.iloc[neg_id]['value']
                train_data.append(InputExample(texts=[claim_text, neg_text], label=0.0))
    return train_data

def load_positive_train_data(train_dataframe, evidence_dateframe):
    """Build one positive (label 1.0) example per (claim, gold evidence) pair.

    Args:
        train_dataframe: DataFrame with ``claim_text`` and
            ``evidences_numeric_index`` columns.
        evidence_dateframe: DataFrame with a ``value`` column. Evidence
            numbers are used as row positions (``.iloc``).

    Returns:
        A list of ``InputExample`` objects, in claim order.
    """
    examples = []
    for row_pos in range(len(train_dataframe)):
        record = train_dataframe.iloc[row_pos]
        claim = record['claim_text']
        examples.extend(
            InputExample(texts=[claim, evidence_dateframe.iloc[evidence_pos]['value']], label=1.0)
            for evidence_pos in record['evidences_numeric_index']
        )
    return examples

def save_retrieval(retrieval, path):
    """Write a retrieval DataFrame to ``path`` as a single JSON object.

    The object is keyed by the ``ID`` column. Each value holds the row's
    ``evidences`` and ``claim_label``; ``claim_text`` is always written as an
    empty string, whatever the DataFrame contains.
    """
    payload = {}
    for row_pos in range(len(retrieval)):
        record = retrieval.iloc[row_pos]
        payload[record['ID']] = {
            'evidences': record['evidences'],
            'claim_label': record['claim_label'],
            'claim_text': '',
        }

    with open(path, 'w') as handle:
        handle.write(json.dumps(payload))

Read the evidence dataset.

In [None]:
# Load the evidence corpus: a JSON object whose keys become the "ID" column
# and whose values become the "value" column.
# The encoding is passed explicitly so that reading does not depend on the
# platform's default locale encoding (assumes the file is UTF-8, the standard
# JSON encoding).
with open('evidence.json', 'r', encoding='utf-8') as f:
    evidence = json.load(f)

# One row per entry, in file order; the DataFrame keeps the default 0..n-1 index.
flat_list = [{"ID": key, "value": value} for key, value in evidence.items()]

evidence_df = pd.DataFrame(flat_list)

Read the training and development datasets.

In [None]:


def _load_labelled_claims(path):
    """Read a labelled claims JSON file into a DataFrame, one row per claim.

    The file is a JSON object keyed by claim ID; each value must provide
    ``claim_text``, ``claim_label`` and ``evidences``.

    Returns:
        DataFrame with columns ``ID``, ``claim_text``, ``claim_label``,
        ``evidences`` and ``evidences_numeric_index``. The last one holds, for
        each evidence ID, its first run of digits converted to ``int``.
    """
    # Explicit encoding: the platform default is locale-dependent (assumes the
    # file is UTF-8, the standard JSON encoding).
    with open(path, 'r', encoding='utf-8') as f:
        claims = json.load(f)

    rows = []
    for key, claim in claims.items():
        rows.append({
            "ID": key,
            "claim_text": claim['claim_text'],
            "claim_label": claim['claim_label'],
            "evidences": claim['evidences'],
            "evidences_numeric_index": [int(re.findall(r'\d+', e)[0]) for e in claim['evidences']],
        })
    return pd.DataFrame(rows)


train_df = _load_labelled_claims('train-claims.json')
dev_df = _load_labelled_claims('dev-claims.json')



# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

Fine-tune the bi-encoder, then calculate sentence embeddings for the evidence, development and training datasets.

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer('all-MiniLM-L6-v2', device=device) 

# Fine-tune the bi-encoder on (claim, gold evidence) pairs only: every example
# built by load_positive_train_data carries label 1.0.
# NOTE: `losses` here is the module imported from `sentence_transformers` at
# the top of the notebook. A later cell re-imports `losses` from
# `sentence_transformers.cross_encoder`, so re-running this cell after that
# one would resolve `losses` to the cross-encoder module instead.
train_data = load_positive_train_data(train_df, evidence_df)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=16)
train_loss = losses.MultipleNegativesRankingLoss(model=model)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=10,
    output_path='./output-bi-encoder',
    weight_decay=0.01
)

# Embed the evidence corpus and the claim sets with the fine-tuned model.
# evidence_embeddings and dev_claims_embeddings are later passed to retrieve();
# train_claims_embeddings is only passed to load_positive_train_ds.
evidence_embeddings = model.encode(evidence_df['value'], batch_size=64, show_progress_bar=True)
dev_claims_embeddings = model.encode(dev_df['claim_text'], batch_size=64, show_progress_bar=True)
train_claims_embeddings = model.encode(train_df['claim_text'], batch_size=64, show_progress_bar=True)


Fine-tune the cross-encoder for re-ranking.

In [None]:
from datasets import Dataset
def load_positive_train_ds(train_dataframe, evidence_dateframe, t_ebds, e_ebds):
    """Build a ``Dataset`` of positive (claim, evidence) text pairs.

    Args:
        train_dataframe: DataFrame with ``claim_text`` and
            ``evidences_numeric_index`` columns.
        evidence_dateframe: DataFrame with a ``value`` column. Evidence
            numbers are used as row positions (``.iloc``).
        t_ebds: claim embeddings; accepted but not used.
        e_ebds: evidence embeddings; accepted but not used.

    Returns:
        A ``Dataset`` with a ``query`` column (claim text) and an ``answer``
        column (evidence text), one row per gold evidence of each claim.
    """
    queries, answers = [], []
    for row_pos in range(len(train_dataframe)):
        record = train_dataframe.iloc[row_pos]
        claim = record['claim_text']
        for evidence_pos in record['evidences_numeric_index']:
            evidence_text = evidence_dateframe.iloc[evidence_pos]['value']
            queries.append(claim)
            answers.append(evidence_text)

    return Dataset.from_dict({"query": queries, "answer": answers})


# Positive (claim, evidence) pairs used to train the cross-encoder. The two
# embedding arguments are accepted by load_positive_train_ds but not used by it.
train_dataset = load_positive_train_ds(train_df, evidence_df, train_claims_embeddings, evidence_embeddings)
print(len(train_dataset))  # number of (claim, evidence) pairs

In [None]:
from sentence_transformers.cross_encoder.evaluation import CrossEncoderRerankingEvaluator
from sentence_transformers.cross_encoder import CrossEncoderTrainingArguments, CrossEncoderTrainer, losses
from sentence_transformers.util import mine_hard_negatives

# Build the re-ranking evaluation set from the development claims.
# NOTE: `mine_hard_negatives` below is the function imported from
# `sentence_transformers.util` in this cell; it replaces the notebook's own
# `mine_hard_negatives` helper defined in the first code cell.
eval_dataset = load_positive_train_ds(dev_df, evidence_df,dev_claims_embeddings, evidence_embeddings)
# A separate embedding model, run on the CPU, is used only for mining negatives.
embedding_model = SentenceTransformer("sentence-transformers/static-retrieval-mrl-en-v1", device="cpu")
# NOTE(review): the corpus passed below is the set of positive evidence texts
# of the *training* pairs, not the whole evidence collection and not the dev
# positives. Confirm this candidate pool is the intended one.
hard_eval_dataset = mine_hard_negatives(
    eval_dataset,
    embedding_model,
    corpus = train_dataset["answer"],  # Candidate pool: the "answer" texts of the training pairs
    num_negatives=50,  # How many negatives per question-answer pair
    batch_size=4096,  # Use a batch size of 4096 for the embedding model
    output_format="n-tuple",  # The output format is (query, positive, negative1, negative2, ...) for the evaluator
    include_positives=True,  # Key: Include the positive answer in the list of negatives
    use_faiss=True,  # Using FAISS is recommended to keep memory usage low (pip install faiss-gpu or pip install faiss-cpu)
)

# One evaluator sample per mined row: the claim, its single positive, and
# every column after the first two as the documents to re-rank.
# NOTE(review): the name "gooaq-dev" does not describe the claim/evidence data
# used here; presumably left over from a library example, confirm before
# relying on metric names derived from it.
reranking_evaluator = CrossEncoderRerankingEvaluator(
    samples=[
        {
            "query": sample["query"],
            "positive": [sample["answer"]],
            "documents": [sample[column_name] for column_name in hard_eval_dataset.column_names[2:]],
        }
        for sample in hard_eval_dataset
    ],
    batch_size=32,
    name="gooaq-dev",
)

In [None]:



# Start from a pretrained re-ranker.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# Evaluate the model once before any fine-tuning.
reranking_evaluator(cross_encoder)


# The run name is derived from the model name and is used for both the output
# directory and the tracking run.
# NOTE(review): the "gooaq-cmnrl" suffix does not describe this data set or
# the loss configured below; presumably copied from an example, confirm.
model_name = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
short_model_name = model_name if "/" not in model_name else model_name.split("/")[-1]
run_name = f"reranker-{short_model_name}-gooaq-cmnrl"
args = CrossEncoderTrainingArguments(
    # Required parameter:
    output_dir=f"models/{run_name}",
    # Optional training parameters:
    num_train_epochs= 40,
    per_device_train_batch_size= 16,
    per_device_eval_batch_size= 16,
    learning_rate=1e-6,
    warmup_ratio=0.1,
    fp16=False,  # Set to False if you get an error that your GPU can't run on FP16
    # NOTE(review): bf16 is hard-coded to True; confirm the training hardware supports it.
    bf16=True,  # Set to True if you have a GPU that supports BF16
    # Optional tracking/debugging parameters:
    eval_strategy="epoch",
    save_strategy="epoch",

    save_total_limit=2,
    run_name=run_name,  # Will be used in W&B if `wandb` is installed
    seed=12,
    # Model selection: lowest eval_loss wins (greater_is_better=False).
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    weight_decay = 0.01,
)
# `losses` here is the module imported from `sentence_transformers.cross_encoder`
# in the previous cell, not the one imported at the top of the notebook.
loss = losses.MultipleNegativesRankingLoss(model=cross_encoder)

# Train on the positive training pairs; the dev pairs provide the evaluation
# set and the re-ranking evaluator built in the previous cell is attached.
# Presumably this fine-tunes `cross_encoder` in place, since the same object
# is passed to retrieve() afterwards. TODO confirm.
trainer = CrossEncoderTrainer(
    model=cross_encoder,
    args=args,
    train_dataset=train_dataset,
    loss=loss,
    evaluator=reranking_evaluator,
    eval_dataset=eval_dataset,
)
trainer.train()

# Earlier `cross_encoder.fit` variant, currently disabled:
# cross_encoder.fit(

#     train_dataloader=train_dataloader,
#     # loss_fct=train_loss,
#     epochs=20,
#     output_path='./fine-tuned-cross-encoder',
#     weight_decay=0.01
# )

Run the two-stage retrieval (FAISS search followed by cross-encoder re-ranking) on the development set for testing.

In [None]:
# train_retrieval = retrieve(evidence_embeddings, train_claims_embeddings, evidence_df , train_df, 150, 4, True, 2, cross_encoder, True)
# save_retrieval(train_retrieval, 'train_retrieval.json')

In [None]:

# Development run: fetch 150 FAISS candidates per claim, keep at most 5 after
# re-ranking, drop lower-ranked candidates scoring below 0, and print the
# per-claim diagnostics (dev=True).
dev_retrieval = retrieve(
    evi_ebds=evidence_embeddings,
    claim_ebds=dev_claims_embeddings,
    evi_df=evidence_df,
    claim_df=dev_df,
    retrival_top_k=150,
    rerank_top_k=5,
    threshold_activated=True,
    score_threshold=0,
    cross_encoder=cross_encoder,
    dev=True,
)

save_retrieval(retrieval=dev_retrieval, path='dev_retrieval.json')


# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [None]:
# Load the unlabelled test claims: a JSON object keyed by claim ID.
# The encoding is passed explicitly so that reading does not depend on the
# platform's default locale encoding (assumes the file is UTF-8, the standard
# JSON encoding).
with open('test-claims-unlabelled.json', 'r', encoding='utf-8') as f:
    test_df = json.load(f)
flat_list = [{"ID": key, "claim_text": claim['claim_text']} for key, claim in test_df.items()]
tst_df = pd.DataFrame(flat_list)

# Embed the test claims with the bi-encoder, then retrieve and re-rank.
tst_claims_embeddings = model.encode(tst_df['claim_text'], batch_size=64, show_progress_bar=True)
# No gold evidence exists for the test set, so diagnostics are off (dev=False).
# The score threshold is disabled: the top 5 re-ranked candidates are kept.
retrieval = retrieve(
    evi_ebds=evidence_embeddings,
    claim_ebds=tst_claims_embeddings,
    evi_df=evidence_df,
    claim_df=tst_df,
    retrival_top_k=150,
    rerank_top_k=5,
    threshold_activated=False,
    score_threshold=0,
    cross_encoder=cross_encoder,
    dev=False,
)
save_retrieval(retrieval, 'test-output.json')

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*