In [1]:
import json
import pandas as pd

path = 'data/'

with open(path+"train-claims.json", "r") as f:
    train_claims = json.load(f)

train_df = pd.DataFrame(train_claims).transpose()

with open(path+"dev-claims.json", "r") as f:
    dev_claims = json.load(f)
dev_df = pd.DataFrame(dev_claims).transpose()

with open(path+"evidence.json", "r") as f:
    evidence = json.load(f)
evidence_df = pd.DataFrame(list(evidence.items()), columns=["key", "value"])

In [2]:
from datasets import Dataset
import random

claims = []
evidence_texts = []
labels = []

evidence_map = dict(evidence)
evidence_ids = list(evidence_map.keys())

for _, info in train_claims.items():
    claim_text = info["claim_text"]
    positive_ids = set(info["evidences"])  # ensure no duplicates

    # === Add all golden (positive) evidences ===
    for eid in positive_ids:
        if eid in evidence_map:
            claims.append(claim_text)
            evidence_texts.append(evidence_map[eid])
            labels.append(1)

    # === Add at least 5 unique negatives ===
    negatives_added = 0
    tried_ids = set()

    while negatives_added < 10:
        neg_id = random.choice(evidence_ids)

        if neg_id in positive_ids or neg_id in tried_ids:
            continue

        tried_ids.add(neg_id)

        if neg_id in evidence_map:
            claims.append(claim_text)
            evidence_texts.append(evidence_map[neg_id])
            labels.append(-1)
            negatives_added += 1

# onvert to HuggingFace Dataset
data = {
    "claim": claims,
    "evidence": evidence_texts,
    "label": labels
}

dataset = Dataset.from_dict(data)

# Confirm
print(dataset)
print(dataset[0])


Dataset({
    features: ['claim', 'evidence', 'label'],
    num_rows: 16402
})
{'claim': 'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.', 'evidence': 'At very high concentrations (100 times atmospheric concentration, or greater), carbon dioxide can be toxic to animal life, so raising the concentration to 10,000 ppm (1%) or higher for several hours will eliminate pests such as whiteflies and spider mites in a greenhouse.', 'label': 1}


In [3]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.losses import MultipleNegativesRankingLoss 

model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

loss = MultipleNegativesRankingLoss(model)

train_dataset = dataset

Matplotlib created a temporary cache directory at C:\Users\BILLZH~1\AppData\Local\Temp\matplotlib-78de2okr because the default path (C:\Users\Bill Zhu\.matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.





In [4]:
### Trainer
from datasets import load_dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator

# Load the dataset
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_ds = split_dataset["train"]
eval_ds = split_dataset["test"]

# set arguments
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="output",
    # Optional training parameters:
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=False,  # Set to True if you have a GPU that supports BF16
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=100,
    run_name="sentence-transformer-training",
)

# set trainer
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    loss=loss,
)

# train the model
trainer.train()

# Save the model
trainer.save_model("sbert_model2")


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss
100,4.3762,3.163618
200,3.0097,2.815341
300,2.7236,2.4919
400,2.5135,2.406466
500,2.4476,2.376669
600,2.3958,2.37108
700,2.399,2.326584
800,2.3507,2.328802
900,2.3693,2.309977
1000,2.2346,2.303936
