In [1]:
import pandas as pd
import re
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import classification_report

# -----------------------------
# 1. Tách câu
# -----------------------------
def sentence_tokenize(text):
    return [s.strip() for s in re.split(r'(?<=[.!?])\s+', str(text)) if s]

# -----------------------------
# 2. Load dữ liệu
# -----------------------------
def load_data(path):
    df = pd.read_csv(path)
    data = []
    for _, row in df.iterrows():
        claim = row["Statement"]
        context_sents = sentence_tokenize(row["Context"])
        label = row["labels"]
        evidence = row["Evidence"]
        data.append({
            'claim': claim,
            'sentences': context_sents,
            'label': label,
            'evidence': evidence
        })
    return data

# Load test set
test_data = load_data("./data/test_clean.csv")

# -----------------------------
# 3. Load sentence-BERT
# -----------------------------
sbert = SentenceTransformer("all-MiniLM-L6-v2")  

# -----------------------------
# 4. Chọn Top-k rationale
# -----------------------------
def extract_top_k_rationale(claim, sentences, k=2):
    emb_claim = sbert.encode(claim, convert_to_tensor=True)
    emb_sents = sbert.encode(sentences, convert_to_tensor=True)
    scores = util.cos_sim(emb_claim, emb_sents)[0]
    top_indices = scores.topk(k=k).indices.tolist()
    return [sentences[i] for i in top_indices]


In [2]:
def predict_label_from_topk(rationale_sents):
    neg_words = ["không", "chưa", "sai", "phủ nhận", "bác bỏ"]
    for sent in rationale_sents:
        sent_lower = sent.lower()
        if any(word in sent_lower for word in neg_words):
            return 1  # REFUTED
    return 0  # SUPPORTED


In [3]:
y_true, y_pred = [], []

for sample in test_data:
    topk_sents = extract_top_k_rationale(sample["claim"], sample["sentences"], k=2)
    pred = predict_label_from_topk(topk_sents)
    y_pred.append(pred)
    y_true.append(sample["label"])

print("Claim Classification:")
print(classification_report(y_true, y_pred, target_names=["SUPPORTED", "REFUTED"]))


Claim Classification:
              precision    recall  f1-score   support

   SUPPORTED       0.53      0.73      0.61       508
     REFUTED       0.50      0.29      0.37       468

    accuracy                           0.52       976
   macro avg       0.52      0.51      0.49       976
weighted avg       0.52      0.52      0.50       976



In [4]:
def evaluate_rationale_quality(data, k=2):
    matched, total = 0, 0
    for sample in data:
        top_k_sents = extract_top_k_rationale(sample["claim"], sample["sentences"], k=k)
        evidence_text = str(sample["evidence"]).strip()
        if not evidence_text:
            continue
        total += 1
        rationale_text = " ".join(top_k_sents)
        if evidence_text in rationale_text or rationale_text in evidence_text:
            matched += 1
    if total == 0:
        print("Không có evidence hợp lệ.")
    else:
        print(f"Rationale Overlap (Top-{k}): {matched}/{total} = {matched/total:.2f}")


In [5]:
evaluate_rationale_quality(test_data)

Rationale Overlap (Top-2): 350/976 = 0.36


In [8]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_rationale_precision_recall(data, k=2):
    y_true, y_pred = [], []

    for sample in data:
        top_k_sents = extract_top_k_rationale(sample["claim"], sample["sentences"], k=k)
        evidence_text = str(sample["evidence"]).strip()

        if not evidence_text:
            continue

        # Ground truth: 1 if evidence exists, 0 otherwise
        y_true.append(1 if evidence_text else 0)

        # Prediction: 1 if rationale overlaps with evidence, 0 otherwise
        rationale_text = " ".join(top_k_sents)
        y_pred.append(1 if evidence_text in rationale_text or rationale_text in evidence_text else 0)

    # Compute metrics
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-score: {f1:.2f}")




In [9]:
# Run evaluation
evaluate_rationale_precision_recall(test_data)

Precision: 1.00
Recall: 0.36
F1-score: 0.53
