### **First Stage Sentence Retrieval using SBERT**

In [1]:
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


Matplotlib created a temporary cache directory at C:\Users\BILLZH~1\AppData\Local\Temp\matplotlib-tolv_6rb because the default path (C:\Users\Bill Zhu\.matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.





In [2]:
import json

# Load datasets
path = 'data/'

# JSON interchange files are UTF-8; pass the encoding explicitly so decoding
# does not depend on the platform's locale default (e.g. a Windows code page).
with open(path+"train-claims.json", "r", encoding="utf-8") as f:
    train_claims = json.load(f)

# DataFrame(dict) puts the top-level JSON keys on the columns; transposing
# turns each top-level key into one row instead.
train_df = pd.DataFrame(train_claims).transpose()

with open(path+"dev-claims.json", "r", encoding="utf-8") as f:
    dev_claims = json.load(f)
dev_df = pd.DataFrame(dev_claims).transpose()

with open(path+"evidence.json", "r", encoding="utf-8") as f:
    evidence = json.load(f)
# One row per (key, value) item of the evidence mapping.
evidence_df = pd.DataFrame(list(evidence.items()), columns=["key", "value"])

In [3]:
# Load both sentence-embedding models (model1 first, then model2)
model1, model2 = (
    SentenceTransformer(model_name)
    for model_name in ('sbert_model', 'sbert_model2')
)


In [4]:
# Embed every evidence value with each of the two models.
# The text list is materialised once and shared by both encode calls.
evidence_texts = evidence_df['value'].tolist()
embeddings1 = model1.encode(evidence_texts, show_progress_bar=True, device='cuda')
embeddings2 = model2.encode(evidence_texts, show_progress_bar=True, device='cuda')

Batches:   0%|          | 0/37776 [00:00<?, ?it/s]

Batches:   0%|          | 0/37776 [00:00<?, ?it/s]

In [5]:
# Embed the training claim texts with each of the two models.
# The text list is materialised once and shared by both encode calls.
claim_texts = train_df['claim_text'].tolist()
claim_embeddings1 = model1.encode(claim_texts, show_progress_bar=True, device='cuda')
claim_embeddings2 = model2.encode(claim_texts, show_progress_bar=True, device='cuda')

Batches:   0%|          | 0/39 [00:00<?, ?it/s]

Batches:   0%|          | 0/39 [00:00<?, ?it/s]

In [6]:
# Ensemble the two models by taking the element-wise mean of their embeddings.
# Evidence and claims are combined the same way so they stay comparable.
evidence_combined = 0.5 * (embeddings1 + embeddings2)
claim_combined = 0.5 * (claim_embeddings1 + claim_embeddings2)


In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity: one row per claim, one column per evidence entry.
similarity_matrix = cosine_similarity(X=claim_combined, Y=evidence_combined)

In [8]:
import numpy as np

# Get the top 100 evidence candidates for each claim
top_k = 100

# Never request more columns than the similarity matrix actually has.
n_keep = min(top_k, similarity_matrix.shape[1])
top_indices = np.empty((similarity_matrix.shape[0], n_keep), dtype=np.intp)

if n_keep > 0:
    # Work one claim (row) at a time: a full argsort of the negated matrix
    # allocates two extra arrays as large as the matrix itself, whereas only
    # `n_keep` columns per row are kept.
    for row_idx, row_scores in enumerate(similarity_matrix):
        # argpartition picks the n_keep largest scores in linear time (unordered) ...
        candidates = np.argpartition(row_scores, -n_keep)[-n_keep:]
        # ... then only those candidates are sorted, best score first.
        top_indices[row_idx] = candidates[np.argsort(-row_scores[candidates])]

In [9]:
# Add a column holding, for each claim, the *keys* of its top-ranked evidence
# entries (positions in `top_indices` are row positions in `evidence_df`).
evidence_keys = evidence_df["key"].to_numpy()
train_df["top_100_evidence"] = [
    evidence_keys[row_positions].tolist() for row_positions in top_indices
]

In [10]:
# Count how many gold evidence keys appear among each claim's retrieved keys.
total_matches = 0  # gold keys found in the retrieved lists, over all claims
n = 0              # total number of gold keys, over all claims

for gold_keys, retrieved_keys in zip(train_df["evidences"], train_df["top_100_evidence"]):
    hits = sum(1 for gold_key in gold_keys if gold_key in retrieved_keys)
    print(f"Found {hits} matches for gold standard {len(gold_keys)}")
    n += len(gold_keys)
    total_matches += hits

print(f"Total matches: {total_matches}/{n}")

Found 2 matches for gold standard 3
Found 0 matches for gold standard 2
Found 1 matches for gold standard 2
Found 3 matches for gold standard 5
Found 3 matches for gold standard 5
Found 0 matches for gold standard 5
Found 2 matches for gold standard 2
Found 0 matches for gold standard 3
Found 5 matches for gold standard 5
Found 1 matches for gold standard 1
Found 1 matches for gold standard 1
Found 5 matches for gold standard 5
Found 3 matches for gold standard 5
Found 5 matches for gold standard 5
Found 1 matches for gold standard 1
Found 0 matches for gold standard 1
Found 3 matches for gold standard 3
Found 4 matches for gold standard 5
Found 0 matches for gold standard 1
Found 4 matches for gold standard 4
Found 2 matches for gold standard 2
Found 2 matches for gold standard 2
Found 3 matches for gold standard 4
Found 3 matches for gold standard 3
Found 0 matches for gold standard 4
Found 2 matches for gold standard 2
Found 0 matches for gold standard 1
Found 5 matches for gold sta

In [None]:
from sentence_transformers import CrossEncoder, InputExample
from sentence_transformers.cross_encoder.evaluation import CrossEncoderClassificationEvaluator
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

# Step 1: Prepare InputExamples
# `top_100_evidence` and `evidences` both hold evidence *keys*. The label is
# decided on the key, but the cross-encoder has to read the passage itself,
# so the key is resolved to its text through the `evidence` mapping.
train_examples = []
for claim, top_evidence_list, gold_list in zip(train_df['claim_text'], train_df['top_100_evidence'], train_df['evidences']):
    for ev in top_evidence_list:
        label = 1 if ev in gold_list else 0
        train_examples.append(InputExample(texts=[claim, evidence[ev]], label=label))

# Step 2: Train/Dev Split
# NOTE(review): the split is over individual (claim, evidence) pairs, so pairs
# from the same claim can land on both sides. `dev_data` is currently unused
# because the evaluator below is disabled.
train_data, dev_data = train_test_split(train_examples, test_size=0.1, random_state=42)

# Step 3: CrossEncoder setup
model = CrossEncoder("cross-encoder/stsb-roberta-base")

#evaluator = CrossEncoderClassificationEvaluator.from_input_examples(dev_data, name="dev-eval")


# Step 4: Fine-tune
model.fit(
    train_dataloader=DataLoader(train_data, shuffle=True, batch_size=16),
    epochs=2,
    warmup_steps=100,
    output_path="cross-encoder-finetuned"
)

# save the model
model.save("cross-encoder-finetuned")

In [None]:
# from datasets import Dataset
# from sklearn.model_selection import train_test_split

# # Prepare raw data as a list of dictionaries
# pairs = []
# for claim, top_evidence_list, gold_list in zip(train_df['claim_text'], train_df['top_100_evidence'], train_df['evidences']):
#     for ev in top_evidence_list:
#         label = 1 if ev in gold_list else 0
#         pairs.append({'text_pair': [claim, ev], 'label': label})

In [None]:
# train_data, dev_data = train_test_split(pairs, test_size=0.1, random_state=42)

# train_dataset = Dataset.from_list(train_data)
# dev_dataset = Dataset.from_list(dev_data)

In [None]:
# ### loss function
# from datasets import load_dataset
# from sentence_transformers import CrossEncoder
# from sentence_transformers.cross_encoder.losses import MultipleNegativesRankingLoss

# # Load a model to train/finetune
# model = CrossEncoder("cross-encoder/stsb-roberta-base")

# # Initialize the MultipleNegativesRankingLoss
# # This loss requires pairs of related texts or triplets
# loss = MultipleNegativesRankingLoss(model)


In [None]:
# from sentence_transformers.cross_encoder import CrossEncoderTrainingArguments

# args = CrossEncoderTrainingArguments(
#     # Required parameter:
#     output_dir="models/xlm-roberta-base",
#     # Optional training parameters:
#     num_train_epochs=1,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     learning_rate=2e-5,
#     warmup_ratio=0.1,
#     fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
#     bf16=False,  # Set to True if you have a GPU that supports BF16
#     # Optional tracking/debugging parameters:
#     eval_strategy="steps",
#     eval_steps=100,
#     save_strategy="steps",
#     save_steps=100,
#     save_total_limit=2,
#     logging_steps=100,
#     run_name="xlm-roberta-base",  # Will be used in W&B if `wandb` is installed
# )

In [None]:
# from sentence_transformers import CrossEncoder
# from datasets import load_dataset
# from sentence_transformers.cross_encoder import (
#     CrossEncoder,
#     CrossEncoderModelCardData,
#     CrossEncoderTrainer,
#     CrossEncoderTrainingArguments,
# )
# from sentence_transformers.losses import MultipleNegativesRankingLoss
# from sentence_transformers.training_args import BatchSamplers
# from sentence_transformers.evaluation import TripletEvaluator
# from sentence_transformers.cross_encoder.evaluation import CrossEncoderNanoBEIREvaluator
# from sentence_transformers.cross_encoder.losses import CachedMultipleNegativesRankingLoss

# train_batch_size = 64

# evaluator = CrossEncoderNanoBEIREvaluator(
#     dataset_names=["msmarco", "nfcorpus", "nq"],
#     batch_size=train_batch_size,
# )

# trainer = CrossEncoderTrainer(
#     model=model,
#     args=args,
#     train_dataset=train_dataset,
#     eval_dataset=dev_dataset,
#     loss=loss,
#     evaluator=evaluator,
# )
# trainer.train()

                                                                        

IndexError: list index out of range