In [None]:
# =====================================================
# FAKE NEWS DETECTION –  DISTILBERT + LIAR
# =====================================================

# 1. CÀI ĐẶT & IMPORT
!pip install -q transformers datasets torch scikit-learn pandas numpy psutil accelerate

import os, re, time, shutil, psutil, warnings
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer, TrainingArguments, EarlyStoppingCallback, set_seed,
    DataCollatorWithPadding
)
from google.colab import drive
from tqdm.auto import tqdm

warnings.filterwarnings("ignore")

# 2. GPU check & VRAM info (safe on CPU-only runtimes)
# Start from the CPU defaults and override them only when CUDA is present.
device_name, vram_gb = "CPU", 0.0
if torch.cuda.is_available():
    device_name = torch.cuda.get_device_name(0)
    # total_memory is divided by 1e9 to report decimal gigabytes.
    vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
print(f"Device: {device_name} | CUDA: {torch.cuda.is_available()} | VRAM: {vram_gb:.1f} GB")

# 3. Mount Google Drive & check free space
# Mount Drive at /content/drive without forcing a remount (force_remount=False).
drive.mount('/content/drive', force_remount=False)

def check_drive_space(path="/content/drive/MyDrive"):
    """Print used/total space (decimal GB) for *path* and warn above 90% usage.

    Best-effort diagnostic: any failure while querying the disk is reported
    with a printed message instead of being raised. Returns ``None``.
    """
    try:
        stats = psutil.disk_usage(path)
        used_gb, total_gb = stats.used / 1e9, stats.total / 1e9
        # Guard against a zero-sized filesystem before dividing.
        if total_gb > 0:
            pct = used_gb / total_gb * 100
        else:
            pct = 0
        print(f"Drive: {used_gb:.1f}GB / {total_gb:.1f}GB ({pct:.1f}%)")
        nearly_full = pct > 90
        if nearly_full:
            print("CẢNH BÁO: Dung lượng Drive gần đầy!")
    except Exception as err:
        print("Không thể kiểm tra dung lượng Drive:", err)

check_drive_space()

# 4. Output directory + checkpoint management
# Training checkpoints and the final model are written under this Drive folder.
# NOTE(review): the folder name mentions WELFake while this notebook trains on LIAR;
# any checkpoint already present here is picked up for resuming — confirm the
# folder is not shared with a different experiment.
OUTPUT_DIR = "/content/drive/MyDrive/WELFake_DistilBERT_Pro"
os.makedirs(OUTPUT_DIR, exist_ok=True)

def _numbered_checkpoints(output_dir):
    """Return ``(step, name)`` pairs for ``checkpoint-<step>`` directories, oldest first.

    Only directories whose suffix is a plain integer are considered. Other
    entries that merely start with ``checkpoint-`` (plain files, hand-made
    backups, ...) are ignored, so they are never deleted and never returned
    as the checkpoint to resume from.
    """
    found = []
    for name in os.listdir(output_dir):
        match = re.fullmatch(r"checkpoint-(\d+)", name)
        if match and os.path.isdir(os.path.join(output_dir, name)):
            found.append((int(match.group(1)), name))
    return sorted(found)


def manage_checkpoints(output_dir, keep_latest=2):
    """Keep the `keep_latest` highest-step checkpoints in `output_dir`; delete the rest.

    Args:
        output_dir: Directory containing ``checkpoint-<step>`` sub-directories.
        keep_latest: Number of newest checkpoints to keep. ``0`` removes all
            numbered checkpoints; negative values are treated as ``0``.

    Raises:
        OSError: If `output_dir` cannot be listed.
    """
    keep_latest = max(int(keep_latest), 0)
    checkpoints = _numbered_checkpoints(output_dir)
    # Slice by an explicit count: `[:-keep_latest]` is empty when keep_latest == 0,
    # which would silently delete nothing.
    stale_count = max(len(checkpoints) - keep_latest, 0)
    for _, old_ckpt in checkpoints[:stale_count]:
        ckpt_path = os.path.join(output_dir, old_ckpt)
        shutil.rmtree(ckpt_path, ignore_errors=True)
        print(f"Đã xóa checkpoint cũ: {old_ckpt}")


def get_last_checkpoint(output_dir):
    """Return the path of the highest-step checkpoint in `output_dir`, or ``None``.

    As a side effect, older checkpoints are pruned so that at most two remain.

    Raises:
        OSError: If `output_dir` cannot be listed.
    """
    checkpoints = _numbered_checkpoints(output_dir)
    if not checkpoints:
        return None
    # Tidy up older checkpoints now; the newest one is always among those kept.
    manage_checkpoints(output_dir, keep_latest=2)
    return os.path.join(output_dir, checkpoints[-1][1])

# Resolve the checkpoint to resume from; None (no checkpoint found) means training starts from scratch.
last_checkpoint = get_last_checkpoint(OUTPUT_DIR)
print("Checkpoint gần nhất:", last_checkpoint or "Không có → Train từ đầu")

# =====================================================
# 5. TẢI DATASET LIAR & CHUYỂN ĐỔI NHÃN (6 -> 2)
# =====================================================
print("\nĐang tải dataset LIAR (chengxuphd/liar2)...")

# Try the requested dataset first; on any failure fall back to the original "liar" dataset.
try:
    dataset = load_dataset("chengxuphd/liar2")
except Exception as load_error:
    print(f"Lỗi tải 'chengxuphd/liar2': {load_error}")
    print("Đang chuyển sang tải bản gốc 'liar'...")
    dataset = load_dataset("liar")

# The dataset ships with train/validation/test splits. Each split is turned
# into a pandas frame, then all three are concatenated so label mapping and
# text processing are applied uniformly before the data is split again.
df_train, df_val, df_test = (
    pd.DataFrame(dataset[split_name])
    for split_name in ('train', 'validation', 'test')
)

df = pd.concat([df_train, df_val, df_test], ignore_index=True)
print(f"Tổng số mẫu ban đầu: {len(df)}")

# -----------------------------------------------------
# A. XỬ LÝ NHÃN (6 class -> 2 class)
# -----------------------------------------------------
# LIAR label ids differ between the two datasets this notebook can load:
#   chengxuphd/liar2 (plain integer column, ordinal — per its dataset card):
#       0: pants-fire, 1: false, 2: barely-true, 3: half-true, 4: mostly-true, 5: true
#   original "liar" (ClassLabel feature):
#       0: false, 1: half-true, 2: mostly-true, 3: true, 4: barely-true, 5: pants-fire
# Using one hard-coded id set for both silently mislabels one of them, so the
# ids that count as "Real" are resolved from the dataset that was actually loaded.
REAL_LABEL_NAMES = ("half-true", "mostly-true", "true")
LIAR2_REAL_IDS = (3, 4, 5)
LEGACY_LIAR_REAL_IDS = (1, 2, 3)


def map_liar_labels(label, real_ids=LEGACY_LIAR_REAL_IDS):
    """Collapse a 6-way LIAR label id into a binary label.

    Args:
        label: Original label id.
        real_ids: Ids treated as Real (half-true, mostly-true, true). The
            default is the id set of the original "liar" dataset, which keeps
            one-argument calls behaving as before.

    Returns:
        1 (Real) when `label` is in `real_ids`; otherwise 0 (Fake). Unknown
        or missing labels also fall back to 0.
    """
    return 1 if label in real_ids else 0


def _real_label_ids(hf_dataset):
    """Return the label ids meaning Real for the dataset that was loaded."""
    label_feature = hf_dataset['train'].features.get('label')
    # A ClassLabel feature exposes `.names`; a plain integer column does not.
    class_names = getattr(label_feature, 'names', None)
    if class_names:
        ids = tuple(i for i, name in enumerate(class_names) if name in REAL_LABEL_NAMES)
        # If the names are spelled differently than expected, keep the legacy ids.
        return ids or LEGACY_LIAR_REAL_IDS
    return LIAR2_REAL_IDS


print("Đang chuyển đổi 6 nhãn sang 2 nhãn (Binary)...")
real_label_ids = _real_label_ids(dataset)
df['label'] = df['label'].apply(map_liar_labels, args=(real_label_ids,))

# Check the resulting class distribution
label_counts = df['label'].value_counts(normalize=True)
print(f"Phân bố mới: Fake(0)={label_counts.get(0,0):.1%}, Real(1)={label_counts.get(1,0):.1%}")

# -----------------------------------------------------
# B. TẠO INPUT TEXT (QUAN TRỌNG VỚI LIAR)
# -----------------------------------------------------
# Với LIAR, "statement" rất ngắn. Cần ghép thêm context, speaker, party...
# Cấu trúc: [Statement] [SEP] [Context/Subject] - Speaker (Party)

def create_liar_content(row):
    """Build the model input text for one LIAR row.

    The result has the form
    ``"<statement> [SEP] <subject> | <speaker> (<party_affiliation>) | <context>"``.
    Columns that are absent, ``None`` or NaN contribute an empty string, so the
    literal words "nan" / "None" never leak into the text fed to the model.

    Args:
        row: Mapping-like object with a ``.get`` method (a ``dict`` or a
            pandas ``Series``).

    Returns:
        The concatenated text.
    """
    def field(name):
        # Column names vary between dataset versions, hence `.get` with a default.
        value = row.get(name, '')
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return ''
        return str(value)

    stmt = field('statement')
    # e.g. "Obama is alien [SEP] health-care | barack-obama (democrat) | news release"
    extra_info = (
        f"{field('subject')} | {field('speaker')} "
        f"({field('party_affiliation')}) | {field('context')}"
    )
    return stmt + " [SEP] " + extra_info

# Build the 'content' column row by row (axis=1) from the statement and its metadata.
print("Đang tạo nội dung đầu vào (Feature Engineering)...")
df['content'] = df.apply(create_liar_content, axis=1)

# -----------------------------------------------------
# C. LÀM SẠCH NHẸ
# -----------------------------------------------------
def clean_text(text):
    """Lightly normalize `text` while keeping ``[SEP]`` separators intact.

    Each segment between ``[SEP]`` markers is lowercased, stripped of URLs and
    HTML-like tags, reduced to letters, digits, whitespace and ``. , - ( )``,
    and has its whitespace collapsed. The segments are then re-joined with a
    literal ``" [SEP] "``. Cleaning the whole string at once would lowercase
    the marker and strip its brackets, leaving the plain word "sep".

    Args:
        text: Raw text; non-string input yields an empty string.

    Returns:
        The cleaned text.
    """
    if not isinstance(text, str):
        return ""

    def clean_segment(segment):
        t = segment.lower()
        t = re.sub(r'https?://\S+|www\.\S+', ' ', t)
        t = re.sub(r'<.*?>', ' ', t)
        # Punctuation that separates context fields is kept on purpose;
        # only unusual characters are removed.
        t = re.sub(r'[^a-zA-Z0-9\s\.\,\-\(\)]', ' ', t)
        return re.sub(r'\s+', ' ', t).strip()

    segments = [clean_segment(part) for part in text.split("[SEP]")]
    return " [SEP] ".join(segments).strip()

df['content'] = df['content'].apply(clean_text)
# Drop rows whose cleaned text is 10 characters or shorter.
long_enough = df['content'].str.len() > 10
df = df[long_enough]

# Recompute balanced class weights on the filtered data
classes = np.array([0, 1])
class_weights = compute_class_weight(
    'balanced',
    classes=classes,
    y=df['label'],
)
class_weight_dict = {label_id: float(class_weights[label_id]) for label_id in (0, 1)}
print("Class weights:", class_weight_dict)

# =====================================================
# 6. SPLIT DATA (Sử dụng lại logic phân chia chuẩn)
# =====================================================
# Vì ta đã gộp để xử lý, giờ chia lại theo tỷ lệ chuẩn hoặc stratified split
labels = df['label'].values

# 80% train, then the remaining 20% is halved into validation and test;
# both splits are stratified on the binary label with a fixed seed.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df['label'],
)

print(f"Train: {len(train_df):,} | Val: {len(val_df):,} | Test: {len(test_df):,}")


def _as_hf_dataset(frame):
    """Keep only the model columns and reset the pandas index before conversion."""
    return Dataset.from_pandas(frame[['content', 'label']].reset_index(drop=True))


train_dataset = _as_hf_dataset(train_df)
val_dataset = _as_hf_dataset(val_df)
test_dataset = _as_hf_dataset(test_df)

dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset,
})

# --- CÁC PHẦN SAU (TOKENIZER, MODEL, TRAINER) GIỮ NGUYÊN ---

# 7. Tokenizer + DataCollator (dynamic padding)
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


def tokenize_fn(batch):
    """Tokenize a batch of 'content' strings, truncated to 384 tokens, without padding.

    Padding is intentionally left to DataCollatorWithPadding so each batch is
    padded on its own rather than globally.
    """
    texts = batch["content"]
    return tokenizer(texts, padding=False, truncation=True, max_length=384)


print("Tokenizing dataset...")
# Tokenize all splits, drop the raw text column, and rename 'label' to 'labels'.
tokenized = dataset_dict.map(
    tokenize_fn,
    batched=True,
    batch_size=1000,
    remove_columns=['content'],
).rename_column("label", "labels")

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 8. Model
# Binary classifier on top of distilbert-base-uncased (num_labels=2).
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# set id<->label mapping (optional, helpful for inference)
model.config.id2label = {0:"Fake", 1:"Real"}
model.config.label2id = {"Fake":0, "Real":1}

# 9. TrainingArguments
training_args = TrainingArguments(
    # Checkpoints are written to the Drive output folder.
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    # 8 samples per step x 2 accumulation steps = 16 samples per optimizer update (per device).
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    warmup_ratio=0.1,
    weight_decay=0.01,
    learning_rate=3e-5,
    lr_scheduler_type="linear",

    # Evaluate and save once per epoch; both strategies are set to the same value.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,
    save_total_limit=2,

    # Model selection uses the "f1" key returned by compute_metrics (higher is better).
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,

    # Mixed precision is enabled only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=2,
    report_to="none",
    disable_tqdm=False,
)

# 10. Metrics (safe AUC)
def compute_metrics(eval_pred):
    """Compute accuracy, weighted precision/recall/F1 and ROC-AUC for an evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        Dict with float values under "accuracy", "precision", "recall" and
        "f1", plus "auc", which is ``None`` when the AUC could not be computed.
    """
    scores, targets = eval_pred
    predicted = np.argmax(scores, axis=1)

    weighted_prf = precision_recall_fscore_support(
        targets, predicted, average="weighted", zero_division=0
    )
    accuracy = accuracy_score(targets, predicted)

    # AUC is best-effort: any failure while computing it yields None.
    try:
        positive_prob = torch.softmax(torch.tensor(scores), dim=1).numpy()[:, 1]
        auc = roc_auc_score(targets, positive_prob)
    except Exception:
        auc = None

    metric_names = ("accuracy", "precision", "recall", "f1")
    metric_values = (accuracy, weighted_prf[0], weighted_prf[1], weighted_prf[2])
    metrics = {name: float(value) for name, value in zip(metric_names, metric_values)}
    metrics["auc"] = auc
    return metrics

# 11. Weighted Trainer (fix API + dtype)
class WeightedTrainer(Trainer):
    """Trainer that replaces the model's loss with class-weighted cross-entropy.

    The weights are read from the module-level ``class_weights`` array
    (index 0 = Fake, index 1 = Real).
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted cross-entropy loss for one batch.

        Args:
            model: The model to run (possibly wrapped by the Trainer).
            inputs: Batch dict; its "labels" entry is popped so the model does
                not compute its own unweighted loss.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted for API compatibility; unused.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Place the weights on the logits' device rather than `model.device`:
        # a wrapped model (e.g. nn.DataParallel on multi-GPU) has no `.device`.
        weight_tensor = torch.tensor(
            [class_weights[0], class_weights[1]],
            dtype=torch.float32,
            device=logits.device,
        )
        loss_fct = torch.nn.CrossEntropyLoss(weight=weight_tensor)
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

# 12. Trainer init
# Training uses the "train" split; evaluation during training uses "validation".
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    # NOTE(review): newer transformers releases may expect `processing_class=`
    # instead of `tokenizer=` — confirm against the installed version.
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # NOTE(review): patience is 2 evaluations while training runs 3 epochs with
    # one evaluation per epoch — confirm early stopping can actually trigger.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# 13. Train (resume safe)
print("\n" + "="*60)
print("BẮT ĐẦU HUẤN LUYỆN – PRO FINAL")
print("="*60)

# last_checkpoint is None when no checkpoint was found, i.e. training starts from scratch.
trainer.train(resume_from_checkpoint=last_checkpoint)

# after training, ensure we keep just the latest checkpoints (double safeguard)
manage_checkpoints(OUTPUT_DIR, keep_latest=2)

# 14. Evaluate on test
# Runs evaluation on the held-out "test" split and prints every returned metric.
print("\nĐÁNH GIÁ TRÊN TẬP TEST")
results = trainer.evaluate(tokenized["test"])
for k, v in results.items():
    print(f"{k}: {v}")

# 15. Save best / final model
# Model and tokenizer go into the same folder.
final_model_path = os.path.join(OUTPUT_DIR, "final_best_model")
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)
print(f"\nĐÃ LƯU MÔ HÌNH TỐT NHẤT TẠI:\n{final_model_path}")

# optional: remove leftover checkpoint directories and the "runs" directory
for f in os.listdir(OUTPUT_DIR):
    is_leftover = f == "runs" or f.startswith("checkpoint-")
    if not is_leftover:
        continue
    path = os.path.join(OUTPUT_DIR, f)
    # Only directories are removed; files with a matching name are left alone.
    if os.path.isdir(path):
        shutil.rmtree(path, ignore_errors=True)
print("Dọn dẹp hoàn tất. Chỉ giữ final model!")


Device: Tesla T4 | CUDA: True | VRAM: 15.8 GB
Mounted at /content/drive
Drive: 0.5GB / 16.1GB (3.2%)
Checkpoint gần nhất: Không có → Train từ đầu

Đang tải dataset LIAR (chengxuphd/liar2)...


README.md: 0.00B [00:00, ?B/s]

train.csv:   0%|          | 0.00/19.0M [00:00<?, ?B/s]

valid.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/18369 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2297 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2296 [00:00<?, ? examples/s]

Tổng số mẫu ban đầu: 22962
Đang chuyển đổi 6 nhãn sang 2 nhãn (Binary)...
Phân bố mới: Fake(0)=39.4%, Real(1)=60.6%
Đang tạo nội dung đầu vào (Feature Engineering)...
Class weights: {0: 1.2693200663349917, 1: 0.8249622763526622}
Train: 18,369 | Val: 2,296 | Test: 2,297


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Tokenizing dataset...


Map:   0%|          | 0/18369 [00:00<?, ? examples/s]

Map:   0%|          | 0/2296 [00:00<?, ? examples/s]

Map:   0%|          | 0/2297 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



BẮT ĐẦU HUẤN LUYỆN – PRO FINAL


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.6559,0.656872,0.609321,0.640712,0.609321,0.61336,0.662441
2,0.5986,0.65344,0.647648,0.644636,0.647648,0.645871,0.676019
3,0.4979,0.747611,0.631533,0.63742,0.631533,0.633795,0.657101



ĐÁNH GIÁ TRÊN TẬP TEST


eval_loss: 0.669339120388031
eval_accuracy: 0.6356116673922507
eval_precision: 0.6297410974275851
eval_recall: 0.6356116673922507
eval_f1: 0.6316327470208636
eval_auc: 0.6548668000254018
eval_runtime: 1.7995
eval_samples_per_second: 1276.49
eval_steps_per_second: 80.024
epoch: 3.0

ĐÃ LƯU MÔ HÌNH TỐT NHẤT TẠI:
/content/drive/MyDrive/WELFake_DistilBERT_Pro/final_best_model
Dọn dẹp hoàn tất. Chỉ giữ final model!
