In [None]:
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load dataset
train_df = pd.read_csv("./data/train_clean.csv")
test_df = pd.read_csv("./data/test_clean.csv")

# Chuẩn hóa cột
train_df = train_df.rename(columns={"Statement": "claim", "Context": "context", "labels": "label", "Evidence": "evidence"})
test_df = test_df.rename(columns={"Statement": "claim", "Context": "context", "labels": "label", "Evidence": "evidence"})

# Hàm tách câu
def sentence_tokenize(text):
    return [s.strip() for s in re.split(r'(?<=[.!?])\s+', str(text).strip()) if s]

# Mô hình cho TSS
model_st = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Hàm chọn top-k câu gần nhất
def select_top_k_sentences(claim, context, k=2):
    sentences = sentence_tokenize(context)
    if not sentences:
        return "", []
    embeddings = model_st.encode([claim] + sentences)
    sims = cosine_similarity([embeddings[0]], embeddings[1:])[0]
    top_k_indices = sims.argsort()[-k:][::-1]
    selected = [sentences[i] for i in top_k_indices]
    return " ".join(selected), top_k_indices.tolist()

# Áp dụng chọn rationale cho cả tập train/test
def apply_tss(df, k=2):
    rationale_texts, rationale_idxs = [], []
    for _, row in df.iterrows():
        rationale, idxs = select_top_k_sentences(row['claim'], row['context'], k)
        rationale_texts.append(rationale)
        rationale_idxs.append(idxs)
    df["tss_rationale"] = rationale_texts
    df["tss_rationale_idxs"] = rationale_idxs
    return df

# Áp dụng TSS
train_df = apply_tss(train_df, k=2)
test_df = apply_tss(test_df, k=2)


In [None]:
from datasets import Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
import numpy as np

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def encode_batch(example):
    return tokenizer(example["claim"] + " [SEP] " + example["tss_rationale"],
                     padding="max_length", truncation=True, max_length=128)

# Chuẩn bị dữ liệu
train_hf = Dataset.from_pandas(train_df[["claim", "tss_rationale", "label"]])
test_hf = Dataset.from_pandas(test_df[["claim", "tss_rationale", "label"]])

train_hf = train_hf.map(encode_batch)
test_hf = test_hf.map(encode_batch)

# Mô hình
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

# Tham số huấn luyện
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    learning_rate=2e-5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": (preds == labels).mean()}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_hf,
    eval_dataset=test_hf,
    compute_metrics=compute_metrics
)

trainer.train()


Map:   0%|          | 0/3409 [00:00<?, ? examples/s]

Map:   0%|          | 0/976 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.692335,0.520492
2,0.696700,0.692484,0.520492
3,0.696500,0.692384,0.520492
4,0.695200,0.692306,0.520492
5,0.695800,0.693019,0.520492
6,0.694300,0.690001,0.545082
7,0.694300,0.689583,0.523566
8,0.695700,0.688855,0.544057
9,0.693300,0.687965,0.548156
10,0.686400,0.689366,0.543033


TrainOutput(global_step=4270, training_loss=0.6938338518701057, metrics={'train_runtime': 649.6777, 'train_samples_per_second': 52.472, 'train_steps_per_second': 6.572, 'total_flos': 2242363969305600.0, 'train_loss': 0.6938338518701057, 'epoch': 10.0})

In [3]:
from sklearn.metrics import classification_report

predictions = trainer.predict(test_hf)
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=-1)

print(classification_report(y_true, y_pred, target_names=["SUPPORTED", "REFUTED"]))


              precision    recall  f1-score   support

   SUPPORTED       0.57      0.53      0.55       508
     REFUTED       0.53      0.57      0.55       468

    accuracy                           0.55       976
   macro avg       0.55      0.55      0.55       976
weighted avg       0.55      0.55      0.55       976



In [4]:
from sklearn.metrics import precision_score, recall_score, f1_score

def match_sentence_index(evidence, context):
    sents = sentence_tokenize(context)
    matches = []
    for i, s in enumerate(sents):
        if evidence.strip() in s or s in evidence.strip():
            matches.append(i)
    return matches

rationale_prec, rationale_rec, rationale_f1 = [], [], []

for i, row in test_df.iterrows():
    gold_idxs = match_sentence_index(row["evidence"], row["context"])
    pred_idxs = row["tss_rationale_idxs"]
    sents = sentence_tokenize(row["context"])

    if not sents or not gold_idxs:
        continue

    y_true = [1 if i in gold_idxs else 0 for i in range(len(sents))]
    y_pred = [1 if i in pred_idxs else 0 for i in range(len(sents))]

    p = precision_score(y_true, y_pred, zero_division=0)
    r = recall_score(y_true, y_pred, zero_division=0)
    f = f1_score(y_true, y_pred, zero_division=0)

    rationale_prec.append(p)
    rationale_rec.append(r)
    rationale_f1.append(f)

print(f"Rationale (TSS) Quality:")
print(f"Precision: {np.mean(rationale_prec):.3f}")
print(f"Recall:    {np.mean(rationale_rec):.3f}")
print(f"F1-score:  {np.mean(rationale_f1):.3f}")


Rationale (TSS) Quality:
Precision: 0.450
Recall:    0.764
F1-score:  0.550
