In [13]:
import json
import pandas as pd

# Directory prefix (with trailing slash) that holds the JSON inputs.
path = 'data/'


def _load_json(filename):
    """Parse and return the JSON document stored at ``path + filename``.

    The file is opened explicitly as UTF-8 so decoding does not depend on the
    platform's locale default encoding.
    """
    with open(path + filename, "r", encoding="utf-8") as f:
        return json.load(f)


# Each claims file is loaded as a dict of records; after the transpose the
# DataFrame has one row per top-level key.
train_claims = _load_json("train-claims.json")
train_df = pd.DataFrame(train_claims).transpose()

dev_claims = _load_json("dev-claims.json")
dev_df = pd.DataFrame(dev_claims).transpose()

# The evidence file is flattened to a two-column frame of (key, value) items.
evidence = _load_json("evidence.json")
evidence_df = pd.DataFrame(list(evidence.items()), columns=["key", "value"])

In [14]:
from datasets import Dataset
import random

# Fixed seed so the sampled negatives are identical on every Restart & Run All.
NEGATIVE_SAMPLING_SEED = 42


def build_claim_evidence_pairs(claim_records, evidence_lookup, rng):
    """Build parallel (claim, evidence, label) lists from claim records.

    For every record, each listed evidence id that exists in
    ``evidence_lookup`` yields a pair labelled 1, and for every listed
    evidence id one randomly drawn evidence that is *not* listed for that
    claim yields a pair labelled -1.

    Args:
        claim_records: dict whose values have "claim_text" and "evidences".
        evidence_lookup: dict mapping evidence id to evidence text.
        rng: ``random.Random`` instance used to draw the negatives.

    Returns:
        Tuple ``(claims, evidence_texts, labels)`` of equal-length lists.
    """
    candidate_ids = list(evidence_lookup.keys())
    pair_claims, pair_evidence, pair_labels = [], [], []

    for info in claim_records.values():
        claim_text = info["claim_text"]
        positive_ids = info["evidences"]
        # Set gives O(1) membership tests inside the rejection-sampling loop.
        positive_set = set(positive_ids)

        # Positive (matched) examples; ids missing from the lookup are skipped.
        for eid in positive_ids:
            if eid in evidence_lookup:
                pair_claims.append(claim_text)
                pair_evidence.append(evidence_lookup[eid])
                pair_labels.append(1)

        # Without a single non-positive candidate the sampling loop below
        # could never terminate, so skip negatives for this claim. The cheap
        # length comparison avoids scanning all candidates in the usual case.
        if len(positive_set) >= len(candidate_ids) and positive_set.issuperset(candidate_ids):
            continue

        # Negative (mismatched) examples: one per listed evidence id.
        for _ in positive_ids:
            while True:
                neg_id = rng.choice(candidate_ids)
                if neg_id not in positive_set:
                    pair_claims.append(claim_text)
                    pair_evidence.append(evidence_lookup[neg_id])
                    pair_labels.append(-1)
                    break

    return pair_claims, pair_evidence, pair_labels


evidence_map = dict(evidence)
evidence_ids = list(evidence_map.keys())

# A private Random instance keeps the sampling reproducible without touching
# the global `random` state.
claims, evidence_texts, labels = build_claim_evidence_pairs(
    train_claims, evidence_map, random.Random(NEGATIVE_SAMPLING_SEED)
)

# Convert to HuggingFace Dataset
data = {
    "claim": claims,
    "evidence": evidence_texts,
    "label": labels
}

dataset = Dataset.from_dict(data)

# Confirm structure
print(dataset)
print(dataset[0])


Dataset({
    features: ['claim', 'evidence', 'label'],
    num_rows: 8244
})
{'claim': 'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.', 'evidence': 'At very high concentrations (100 times atmospheric concentration, or greater), carbon dioxide can be toxic to animal life, so raising the concentration to 10,000 ppm (1%) or higher for several hours will eliminate pests such as whiteflies and spider mites in a greenhouse.', 'label': 1}


In [None]:
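# ## Evaluation dataset built from dev_claims (currently disabled: the trainer cell evaluates on a held-out split of `dataset` instead)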

# claims = []
# evidence_texts = []
# labels = []
# evidence_map = dict(evidence)
# evidence_ids = list(evidence_map.keys())

# for _, info in dev_claims.items():
#     claim_text = info["claim_text"]
#     positive_ids = info["evidences"]

#     # Add positive (matched) examples
#     for eid in positive_ids:
#         if eid in evidence_map:
#             claims.append(claim_text)
#             evidence_texts.append(evidence_map[eid])
#             labels.append(1)

#     # Add negative (mismatched) examples
#     for _ in positive_ids:
#         while True:
#             neg_id = random.choice(evidence_ids)
#             if neg_id not in positive_ids:
#                 claims.append(claim_text)
#                 evidence_texts.append(evidence_map[neg_id])
#                 labels.append(-1)
#                 break

# # Convert to HuggingFace Dataset
# data = {
#     "claim": claims,
#     "evidence": evidence_texts,
#     "label": labels
# }

# eval_dataset = Dataset.from_dict(data)

# # 🔍 Confirm structure
# print(eval_dataset)
# print(eval_dataset[0])


Dataset({
    features: ['claim', 'evidence', 'label'],
    num_rows: 982
})
{'claim': '[South Australia] has the most expensive electricity in the world.', 'evidence': '[citation needed] South Australia has the highest retail price for electricity in the country.', 'label': 1}


In [16]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.losses import CoSENTLoss

# Training pairs come straight from the dataset built earlier in the notebook.
train_dataset = dataset

# Pretrained checkpoint that will be fine-tuned.
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

# CoSENT loss bound to the model above.
loss = CoSENTLoss(model=model)

In [17]:
### TODO: add a dedicated evaluation dataset

In [None]:
### Trainer
import torch
from datasets import load_dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.evaluation import TripletEvaluator
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers

# Hold out 10% of the (claim, evidence, label) rows for evaluation.
# NOTE(review): the split is row-level, so pairs that share the same claim
# text can end up on both sides of it; the validation loss may therefore be
# optimistic. Consider splitting by claim instead.
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_ds = split_dataset["train"]
eval_ds = split_dataset["test"]

# Enable fp16 mixed precision only when a CUDA device is present, so the cell
# also runs on machines without a GPU instead of erroring out.
use_fp16 = torch.cuda.is_available()

# Training configuration.
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="output",
    # Optional training parameters:
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=use_fp16,
    bf16=False,  # Set to True if you have a GPU that supports BF16
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=2,
    logging_steps=100,
    run_name="sentence-transformer-training",
)

# `model` and `loss` are created in an earlier cell of this notebook.
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    loss=loss,
)

# Fine-tune, then write the resulting model to the "sbert_model" directory.
trainer.train()
trainer.save_model("sbert_model")


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss
1000,0.0815,0.17627
