In [None]:
import re
import unicodedata

import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import classification_report

# -----------------------------
# 1. Tách câu
# -----------------------------
def sentence_tokenize(text):
    """Split ``text`` into sentences at whitespace that follows '.', '!' or '?'.

    The input is coerced with ``str()`` first, so non-string values are
    tokenized by their string form (a float NaN becomes the single sentence
    "nan"). Every fragment is stripped, and fragments that are empty after
    stripping are dropped, so blank or whitespace-only text yields ``[]``.

    Args:
        text: The passage to split; any object accepted by ``str()``.

    Returns:
        list[str]: Non-empty, stripped sentences in their original order.
    """
    fragments = (piece.strip() for piece in re.split(r'(?<=[.!?])\s+', str(text)))
    # Filter AFTER stripping: testing the raw fragment would let a
    # whitespace-only one through as an empty "sentence".
    return [fragment for fragment in fragments if fragment]

# -----------------------------
# 2. Load dữ liệu
# -----------------------------
def load_data(path):
    """Read a claim-verification CSV and turn each row into a sample dict.

    The CSV must provide the columns "Statement", "Context", "labels" and
    "Evidence".

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        list[dict]: One dict per row with the keys
            'claim'     - the "Statement" cell,
            'sentences' - the "Context" cell split by ``sentence_tokenize``,
            'label'     - the "labels" cell,
            'evidence'  - the "Evidence" cell.
    """
    frame = pd.read_csv(path)
    return [
        {
            'claim': record["Statement"],
            'sentences': sentence_tokenize(record["Context"]),
            'label': record["labels"],
            'evidence': record["Evidence"],
        }
        for _, record in frame.iterrows()
    ]

# Parse the test split once; the evaluation cells below iterate over `test_data`.
test_data = load_data(path="./data/test_clean.csv")

# -----------------------------
# 3. Load sentence-BERT
# -----------------------------
# Shared encoder used by extract_top1_rationale for both claims and sentences.
# NOTE(review): "all-MiniLM-L6-v2" appears to be an English-focused checkpoint,
# while the negation cues used later in this notebook are Vietnamese - confirm
# whether a multilingual checkpoint was intended.
sbert = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

# -----------------------------
# 4. Chọn Top-1 rationale
# -----------------------------
def extract_top1_rationale(claim, sentences):
    """Pick the sentence whose SBERT embedding is most cosine-similar to the claim.

    Args:
        claim: Claim text to embed.
        sentences: Non-empty list of candidate sentences.

    Returns:
        tuple[str, int]: The best-scoring sentence and its index in
        ``sentences``.

    Raises:
        ValueError: If ``sentences`` is empty, since there is nothing to rank.
    """
    # Fail early with a clear message instead of an opaque error from the
    # encoder / similarity call further down.
    if len(sentences) == 0:
        raise ValueError("sentences must contain at least one candidate sentence")

    claim_vec = sbert.encode(claim, convert_to_tensor=True)
    sentence_vecs = sbert.encode(sentences, convert_to_tensor=True)
    # One claim against all sentences: keep the single row of scores.
    similarities = util.cos_sim(claim_vec, sentence_vecs)[0]
    top_idx = similarities.argmax().item()
    return sentences[top_idx], top_idx


In [2]:
def predict_label_from_rationale(rationale_sent):
    """Heuristically label a claim from negation cues in its rationale sentence.

    The sentence is NFC-normalized and lower-cased, then searched for any of a
    fixed set of negation cues as plain substrings (no word-boundary check).

    Args:
        rationale_sent: The rationale sentence (str).

    Returns:
        int: 1 (REFUTED) if any cue occurs in the sentence, else 0 (SUPPORTED).
    """
    # Accented text may arrive with decomposed diacritics (NFD). Compose both
    # the sentence and the cues so that e.g. "không" compares equal regardless
    # of which Unicode form the data or this source file happens to use.
    normalized = unicodedata.normalize("NFC", rationale_sent).lower()
    neg_words = ("không", "chưa", "sai", "phủ nhận", "bác bỏ")
    if any(unicodedata.normalize("NFC", cue) in normalized for cue in neg_words):
        return 1  # REFUTED
    return 0  # SUPPORTED


In [None]:
# Gold labels come straight from the loaded samples.
y_true = [sample["label"] for sample in test_data]

# Predict each label from the single most claim-similar context sentence.
y_pred = []
for sample in test_data:
    rationale_sent, _ = extract_top1_rationale(sample["claim"], sample["sentences"])
    pred = predict_label_from_rationale(rationale_sent)
    y_pred.append(pred)

print("Claim Classification:")
print(
    classification_report(
        y_true,
        y_pred,
        target_names=["SUPPORTED", "REFUTED"],
    )
)


Claim Classification (Fully Unsupervised):
              precision    recall  f1-score   support

   SUPPORTED       0.53      0.84      0.65       508
     REFUTED       0.50      0.17      0.26       468

    accuracy                           0.52       976
   macro avg       0.51      0.51      0.45       976
weighted avg       0.51      0.52      0.46       976



In [4]:
def evaluate_rationale_quality(data):
    """Report how often the top-1 rationale overlaps the annotated evidence.

    A sample counts as matched when the evidence text contains the selected
    rationale sentence or the rationale contains the evidence text. Samples
    whose evidence is missing (None/NaN) or blank are skipped. The ratio is
    printed; nothing is returned.

    Note: a later cell in this notebook defines another function with this
    same name, which replaces this one once that cell has run.

    Args:
        data: Iterable of sample dicts with "claim", "sentences" and
            "evidence" keys.
    """
    matched, total = 0, 0
    for sample in data:
        raw_evidence = sample["evidence"]
        # pandas yields NaN for empty CSV cells; str(NaN) is "nan", which
        # would slip past a plain emptiness check and be scored as evidence.
        evidence_text = "" if pd.isna(raw_evidence) else str(raw_evidence).strip()
        if not evidence_text:
            continue  # skip before paying for the embedding call

        rationale_sent, _ = extract_top1_rationale(sample["claim"], sample["sentences"])
        total += 1
        # An empty rationale is a substring of everything; never count it as a hit.
        if rationale_sent and (
            evidence_text in rationale_sent or rationale_sent in evidence_text
        ):
            matched += 1

    if total == 0:
        print("Không có evidence hợp lệ.")
    else:
        print(f"Rationale Overlap: {matched}/{total} = {matched/total:.2f}")


In [5]:
evaluate_rationale_quality(test_data)

Rationale Overlap: 470/976 = 0.48


In [6]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_rationale_quality(data):
    """Print sentence-level precision/recall/F1 of the top-1 rationale selector.

    For every sample, each context sentence gets a gold label (1 if it
    contains the evidence text as a substring, else 0) and a predicted label
    (1 only for the sentence chosen by ``extract_top1_rationale``). Labels
    from all samples are pooled before the scores are computed. Samples whose
    evidence is missing (None/NaN) or blank are skipped. Nothing is returned.

    Note: this shares its name with the substring-overlap evaluator defined
    in an earlier cell and replaces it once this cell has run.

    Args:
        data: Iterable of sample dicts with "claim" and "sentences" keys and
            an optional "evidence" key.
    """
    all_preds = []
    all_labels = []

    for sample in data:
        context_sentences = sample["sentences"]
        raw_evidence = sample.get("evidence", "")
        # pandas yields NaN for empty CSV cells; str(NaN) is "nan", which
        # would slip past a plain emptiness check and be scored as evidence.
        evidence_text = "" if pd.isna(raw_evidence) else str(raw_evidence).strip()
        if not evidence_text:
            continue

        # Gold: mark every sentence that contains the evidence span.
        gold_labels = [1 if evidence_text in sent else 0 for sent in context_sentences]

        # Prediction: only the top-1 most similar sentence is marked (unsupervised).
        rationale_sent, pred_idx = extract_top1_rationale(sample["claim"], context_sentences)
        pred_labels = [1 if i == pred_idx else 0 for i in range(len(context_sentences))]

        # Pool sentence-level labels across samples.
        all_preds.extend(pred_labels)
        all_labels.extend(gold_labels)

    if not all_preds:
        print("No valid rationale to evaluate.")
        return

    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    print("Rationale Extraction Quality (Unsupervised Top-1):")
    print(f"Precision: {precision:.3f}")
    print(f"Recall:    {recall:.3f}")
    print(f"F1-score:  {f1:.3f}")


In [7]:
evaluate_rationale_quality(test_data)

Rationale Extraction Quality (Unsupervised Top-1):
Precision: 0.303
Recall:    0.827
F1-score:  0.444
