### **First Stage Sentence Retrieval using SBERT**

In [1]:
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


Matplotlib created a temporary cache directory at C:\Users\BILLZH~1\AppData\Local\Temp\matplotlib-c6tyauwp because the default path (C:\Users\Bill Zhu\.matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.





In [2]:
import json

# Load datasets
path = 'data/'


def _load_json(filename):
    """Parse the JSON file at ``path + filename`` and return the decoded object.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's locale encoding (which is not UTF-8 by default on Windows).
    """
    with open(path + filename, "r", encoding="utf-8") as f:
        return json.load(f)


# DataFrame(dict-of-dicts) puts the top-level JSON keys on the columns, so the
# frames are transposed to get one row per top-level key.
train_claims = _load_json("train-claims.json")
train_df = pd.DataFrame(train_claims).transpose()

dev_claims = _load_json("dev-claims.json")
dev_df = pd.DataFrame(dev_claims).transpose()

# The evidence mapping is flattened into two columns: "key" holds the JSON key
# and "value" holds whatever that key maps to.
evidence = _load_json("evidence.json")
evidence_df = pd.DataFrame(list(evidence.items()), columns=["key", "value"])

In [3]:
# Load the SBERT bi-encoder from the local 'sbert_model' directory and place it
# on the GPU when one is visible to torch, otherwise on the CPU.
import torch

device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

model = SentenceTransformer('sbert_model')
# Left as the cell's last expression so the notebook displays the module layout.
model.to(device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [4]:
# Embed every evidence passage (the "value" column of evidence_df).
# Reuse the `device` picked when the model was loaded instead of hard-coding
# 'cuda', so this cell also runs on machines without a GPU.
embeddings = model.encode(evidence_df['value'].tolist(), show_progress_bar=True, device=str(device))

Batches:   0%|          | 0/37776 [00:00<?, ?it/s]

In [5]:
# Embed the text of every training claim with the same model.
# Reuse the `device` picked when the model was loaded instead of hard-coding
# 'cuda', so this cell also runs on machines without a GPU.
claim_embeddings = model.encode(train_df['claim_text'].tolist(), show_progress_bar=True, device=str(device))

Batches:   0%|          | 0/39 [00:00<?, ?it/s]

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity: one row per claim embedding, one column per
# evidence embedding.
similarity_matrix = cosine_similarity(X=claim_embeddings, Y=embeddings)

In [7]:
import numpy as np

# Number of highest-scoring evidence passages to keep for each claim.
top_k = 100


def _top_k_indices(scores, k):
    """Return, for each row of ``scores``, the column indices of its ``k`` largest values.

    Indices within a row are ordered from highest to lowest score. Rows are
    processed one at a time with ``np.argpartition`` so only the ``k`` selected
    entries are fully sorted and no matrix-sized temporary (a negated copy or a
    full index matrix) is allocated.

    Args:
        scores: 2-D array of shape (n_rows, n_cols).
        k: number of columns to keep per row; clipped to ``n_cols``.

    Returns:
        Integer array of shape (n_rows, min(k, n_cols)).

    Raises:
        ValueError: if ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    scores = np.asarray(scores)
    n_rows, n_cols = scores.shape
    k = min(k, n_cols)
    top = np.empty((n_rows, k), dtype=np.intp)
    if k == 0:
        return top
    for row_idx, row in enumerate(scores):
        if k < n_cols:
            # Unordered positions of the k largest entries of this row.
            candidates = np.argpartition(row, -k)[-k:]
        else:
            candidates = np.arange(n_cols)
        # Order just those k candidates by descending score.
        top[row_idx] = candidates[np.argsort(-row[candidates])]
    return top


# Get the top_k evidence indices for each claim, best match first.
top_indices = _top_k_indices(similarity_matrix, top_k)

In [8]:
# Translate each claim's row of positional indices into the matching evidence
# keys and store the resulting list per claim in a new column.
key_column = evidence_df["key"]
train_df["top_100_evidence"] = [
    key_column.iloc[row_indices].tolist() for row_indices in top_indices
]

In [9]:
# Count how many gold evidence ids of each claim appear among its retrieved
# candidates, and accumulate the totals over all training claims.
total_matches = 0  # gold ids found in the candidate lists
n = 0              # gold ids overall

for evidence_list, top_100_list in zip(train_df["evidences"], train_df["top_100_evidence"]):
    # One hit per gold id that is present in this claim's candidate list.
    k = sum(1 for ev in evidence_list if ev in top_100_list)
    print(f"Found {k} matches for gold standard {len(evidence_list)}")
    n += len(evidence_list)
    total_matches += k

print(f"Total matches: {total_matches}/{n}")

Found 2 matches for gold standard 3
Found 0 matches for gold standard 2
Found 1 matches for gold standard 2
Found 2 matches for gold standard 5
Found 3 matches for gold standard 5
Found 2 matches for gold standard 5
Found 2 matches for gold standard 2
Found 0 matches for gold standard 3
Found 5 matches for gold standard 5
Found 1 matches for gold standard 1
Found 1 matches for gold standard 1
Found 5 matches for gold standard 5
Found 1 matches for gold standard 5
Found 4 matches for gold standard 5
Found 1 matches for gold standard 1
Found 0 matches for gold standard 1
Found 3 matches for gold standard 3
Found 4 matches for gold standard 5
Found 0 matches for gold standard 1
Found 4 matches for gold standard 4
Found 2 matches for gold standard 2
Found 2 matches for gold standard 2
Found 3 matches for gold standard 4
Found 3 matches for gold standard 3
Found 0 matches for gold standard 4
Found 2 matches for gold standard 2
Found 0 matches for gold standard 1
Found 5 matches for gold sta

In [10]:
### cross encoder
from datasets import Dataset

# Parallel lists: entry i of each list describes one (claim, evidence, label) row.
claims = []
evidence_texts = []
labels = []

evidence_map = dict(evidence)
evidence_ids = list(evidence_map.keys())

for row in train_df.itertuples():
    claim_text = row.claim_text
    positive_ids = set(row.evidences)

    # Iterating over a set of strings yields a different order on every kernel
    # start (string hashing is randomized), which would reshuffle the dataset
    # and make the seeded train/test split downstream irreproducible.
    # dict.fromkeys de-duplicates while keeping first-seen order instead.
    ordered_positive_ids = list(dict.fromkeys(row.evidences))
    negative_ids = [
        eid for eid in dict.fromkeys(row.top_100_evidence) if eid not in positive_ids
    ]

    # === Add all golden (positive) evidences ===
    for eid in ordered_positive_ids:
        if eid in evidence_map:
            claims.append(claim_text)
            evidence_texts.append(evidence_map[eid])
            labels.append(1)

    # === Add retrieved-but-not-gold evidences as negatives ===
    # NOTE(review): negatives are labelled -1. Confirm this matches what the
    # loss used for training expects; binary losses usually want 0/1 targets,
    # and ranking losses may ignore a label column entirely.
    for eid in negative_ids:
        if eid in evidence_map:
            claims.append(claim_text)
            evidence_texts.append(evidence_map[eid])
            labels.append(-1)

# Convert to HuggingFace Dataset
data = {
    "claim": claims,
    "evidence": evidence_texts,
    "label": labels
}

dataset = Dataset.from_dict(data)

# Confirm
print(dataset)
print(dataset[0:5])

Dataset({
    features: ['claim', 'evidence', 'label'],
    num_rows: 124177
})
{'claim': ['Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.', 'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.', 'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.', 'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.', 'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.'], 'evidence': ['Plants can grow as much as 50 percent faster in concentrations of 1,000 ppm CO 2 when compared with ambient condit

In [13]:
from sentence_transformers import CrossEncoder
from sentence_transformers.cross_encoder.losses import MultipleNegativesRankingLoss

# Pretrained cross-encoder checkpoint used as the starting point for fine-tuning.
CROSS_ENCODER_CHECKPOINT = "cross-encoder/stsb-roberta-base"

# NOTE(review): this rebinds `model`, which earlier in the notebook refers to
# the SBERT bi-encoder; re-running the encoding cells after this one would use
# the cross-encoder instead.
model = CrossEncoder(CROSS_ENCODER_CHECKPOINT)

# NOTE(review): the cross-encoder MultipleNegativesRankingLoss is documented
# for (anchor, positive[, negative, ...]) inputs without labels, whereas the
# dataset built in this notebook has (claim, evidence, label) rows with -1
# negatives. Confirm the negatives are not being trained as positives.
loss = MultipleNegativesRankingLoss(model)

In [None]:
import logging
import traceback

from datasets import load_dataset

from sentence_transformers.cross_encoder import (
    CrossEncoder,
    CrossEncoderModelCardData,
    CrossEncoderTrainer,
    CrossEncoderTrainingArguments,
)

import torch

# Hold out 10% of the (claim, evidence, label) rows for evaluation during training.
split_dataset = dataset.train_test_split(test_size=0.1, seed=42069)
train_ds = split_dataset["train"]
eval_ds = split_dataset["test"]

# Enable BF16 mixed precision only when the visible GPU supports it, so the
# cell does not fail on hardware without BF16 (or without a GPU at all).
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

args = CrossEncoderTrainingArguments(
    # Required parameter:
    output_dir="output/",
    # Optional training parameters:
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=False,  # FP16 mixed precision stays off
    bf16=use_bf16,  # BF16 mixed precision, auto-detected above
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=50,
    logging_first_step=True,
    run_name='cross',  # Will be used in W&B if `wandb` is installed
    seed=12,
)

trainer = CrossEncoderTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    loss=loss,
)
trainer.train()

# Persist the fine-tuned cross-encoder for the later re-ranking stage.
trainer.save_model("cross_encoder_model")