In [None]:
# =====================================================
# FAKE NEWS DETECTION – ALBERT + LoRA x LIAR Dataset
# =====================================================

# 1. CÀI ĐẶT (INSTALL DEPENDENCIES)
!pip install -q transformers datasets peft accelerate bitsandbytes scikit-learn pandas numpy psutil sentencepiece

import os, re, shutil, psutil, warnings
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, EarlyStoppingCallback,
    DataCollatorWithPadding
)
from peft import LoraConfig, get_peft_model, TaskType
from google.colab import drive

# Silence all Python warnings for the remainder of the session.
warnings.filterwarnings("ignore")

# 2. GPU CHECK
# Report the accelerator name and its total memory, or warn when only the
# CPU is available.
if not torch.cuda.is_available():
    print("‚ö†Ô∏è C·∫¢NH B√ÅO: ƒêang ch·∫°y tr√™n CPU! H√£y chuy·ªÉn sang T4 GPU.")
else:
    device_name = torch.cuda.get_device_name(0)
    print(f"‚úÖ Device: {device_name} | VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# 3. MOUNT DRIVE
# Checkpoints and the final adapter are written under OUTPUT_DIR, which lives
# on the mounted Google Drive; the directory is created if it is missing.
OUTPUT_DIR = "/content/drive/MyDrive/LIAR_ALBERT_LoRA"
drive.mount('/content/drive', force_remount=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 4. LOAD THE LIAR DATASET
# The two possible sources number the six truthfulness ratings differently,
# so record which one was actually loaded; the binary mapping below depends
# on it.
#   chengxuphd/liar2: 0 pants-fire, 1 false, 2 barely-true,
#                     3 half-true,  4 mostly-true, 5 true
#   liar (original):  0 false, 1 half-true, 2 mostly-true,
#                     3 true,  4 barely-true, 5 pants-fire
print("\n‚è≥ ƒêang t·∫£i dataset LIAR...")
try:
    dataset = load_dataset("chengxuphd/liar2")
except Exception as e:
    print(f"‚ö†Ô∏è L·ªói t·∫£i b·∫£n 'liar2', chuy·ªÉn sang b·∫£n g·ªëc 'liar'...")
    # Surface the cause of the fallback instead of discarding it.
    print(f"   ({type(e).__name__}: {e})")
    dataset = load_dataset("liar")
    LABEL_SCHEME = "liar"
else:
    LABEL_SCHEME = "liar2"

# Merge train/validation/test so every row gets the same preprocessing;
# the data is re-split later.
df_train = pd.DataFrame(dataset['train'])
df_val = pd.DataFrame(dataset['validation'])
df_test = pd.DataFrame(dataset['test'])
df = pd.concat([df_train, df_val, df_test], ignore_index=True)

print(f"T·ªïng s·ªë m·∫´u ban ƒë·∫ßu: {len(df)}")

# 5. LIAR-SPECIFIC DATA PROCESSING

# A. Collapse the six ratings into two classes (6 -> 2)
# Rating ids that count as "Real" (half-true, mostly-true, true) under each
# dataset's own encoding; every other id counts as "Fake".
REAL_LABEL_IDS = {
    "liar": frozenset({1, 2, 3}),
    "liar2": frozenset({3, 4, 5}),
}


def map_liar_labels(label, scheme="liar"):
    """Collapse a six-way LIAR rating id into a binary label.

    Args:
        label: Integer rating id taken from the dataset's 'label' column.
        scheme: Which id encoding ``label`` uses: "liar" (the default, and
            the encoding this function originally hard-coded) or "liar2".

    Returns:
        1 (Real) for half-true / mostly-true / true, otherwise 0 (Fake).
        Ids outside the known range also map to 0, as they did before.

    Raises:
        KeyError: If ``scheme`` is not a key of ``REAL_LABEL_IDS``.
    """
    return 1 if label in REAL_LABEL_IDS[scheme] else 0


# Use the encoding of the dataset that was actually loaded above.
df['label'] = df['label'].apply(lambda rating: map_liar_labels(rating, LABEL_SCHEME))

# B. Feature engineering: append speaker/context metadata to the statement
def create_liar_content(row):
    """Build the model input text for a single LIAR record.

    Args:
        row: Mapping-like record exposing ``.get`` (e.g. a DataFrame row).
            The fields 'statement', 'subject', 'speaker',
            'party_affiliation' and 'context' are read; any may be absent.

    Returns:
        A string shaped as
        ``"<statement> [SEP] <subject> | <speaker> (<party>) | <context>"``.
        Fields that are absent, ``None`` or NaN contribute an empty string
        rather than the literal text "None" / "nan".
    """
    def field(name):
        value = row.get(name, '')
        # NaN is the only float that compares unequal to itself.
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return str(value)

    stmt = field('statement')
    subject = field('subject')
    speaker = field('speaker')
    party = field('party_affiliation')
    context = field('context')

    # Layout: [Statement] [SEP] [Subject] | [Speaker] ([Party]) | [Context]
    return f"{stmt} [SEP] {subject} | {speaker} ({party}) | {context}"

# Announce the step, then build one combined input string per record and
# store it in the 'content' column.
print("üõ†Ô∏è ƒêang t·∫°o n·ªôi dung ƒë·∫ßu v√†o (Statement + Context)...")
combined_text = df.apply(create_liar_content, axis=1)
df['content'] = combined_text

# C. Text cleaning
def clean_text(text):
    """Normalise raw text while keeping the ``[SEP]`` field separator intact.

    The text is lower-cased, URLs and HTML-like tags are removed, every
    character other than a-z, 0-9, whitespace and ``. , - ( )`` is replaced
    by a space, and runs of whitespace are collapsed. A ``[SEP]`` marker
    (any letter case) is treated as a separator: the pieces around it are
    cleaned individually and re-joined with ``" [SEP] "``, so the marker is
    not reduced to the plain word "sep" by the character filter.

    Args:
        text: Value to clean. Anything that is not a ``str`` yields "".

    Returns:
        The cleaned string. For input without a ``[SEP]`` marker the result
        is the same as a single cleaning pass over the whole text.
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    lowered = re.sub(r'https?://\S+', ' ', lowered)
    lowered = re.sub(r'<.*?>', ' ', lowered)

    segments = []
    for segment in lowered.split('[sep]'):
        # Keep only the punctuation that matters for the statement text.
        segment = re.sub(r'[^a-z0-9\s\.\,\-\(\)]', ' ', segment)
        segments.append(re.sub(r'\s+', ' ', segment).strip())
    return ' [SEP] '.join(segments).strip()

# Clean every combined input string and drop rows whose cleaned text is
# 10 characters or shorter.
df['content'] = df['content'].map(clean_text)
df = df.loc[df['content'].str.len() > 10]

# Class distribution & class weights ('balanced' weighting over all rows)
classes = np.unique(df['label'])
class_weights = compute_class_weight('balanced', classes=classes, y=df['label'])
class_weight_dict = dict(zip(classes, map(float, class_weights)))
print("Class weights:", class_weight_dict)

# 6. SPLIT DATA
# Stratified 80/10/10 split: hold out 20% first, then halve the held-out
# part into validation and test.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df['label'],
)

# Only the text and the binary label are carried into the HF datasets.
dataset_dict = DatasetDict({
    split_name: Dataset.from_pandas(frame[['content', 'label']].reset_index(drop=True))
    for split_name, frame in (
        ("train", train_df),
        ("validation", val_df),
        ("test", test_df),
    )
})

# 7. TOKENIZER (ALBERT)
MODEL_NAME = "albert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Padding is left to this collator rather than done at tokenization time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def tokenize_fn(batch):
    """Tokenize the 'content' entries of a batch.

    Sequences are truncated to 512 tokens and not padded here. The long
    limit is used because the input concatenates the statement with its
    context fields.
    """
    texts = batch["content"]
    encoded = tokenizer(texts, truncation=True, max_length=512, padding=False)
    return encoded


print("‚öôÔ∏è Tokenizing...")
tokenized = dataset_dict.map(tokenize_fn, batched=True, remove_columns=['content'])

# 8. LOAD MODEL & CONFIGURE LoRA
# LoRA setup for ALBERT. ALBERT shares parameters across layers; the
# attention projections targeted here are named query, key and value.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    target_modules=["query", "key", "value"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

print(f"\nüöÄ T·∫£i model {MODEL_NAME} v√† √°p d·ª•ng LoRA...")
base_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
)
# Human-readable names for the two classes, stored on the model config.
base_model.config.id2label = {0: "Fake", 1: "Real"}
base_model.config.label2id = {"Fake": 0, "Real": 1}

# Wrap the base model with the LoRA adapter and report the trainable share.
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

# 9. CUSTOM TRAINER
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by ``class_weight_dict``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model to run; may be a wrapped instance.
            inputs: Batch dict. Its "labels" entry is removed before the
                forward pass and used as the loss target.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted for signature compatibility; unused.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Order the weights by class id explicitly instead of relying on the
        # dict's insertion order, and place them on the logits' device:
        # wrapped models (e.g. DataParallel) do not expose ``.device``.
        ordered_weights = [class_weight_dict[cls] for cls in sorted(class_weight_dict)]
        w = torch.tensor(ordered_weights, dtype=torch.float32, device=logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=w)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# 10. TRAINING ARGUMENTS
training_args = TrainingArguments(
    # Checkpointing: one checkpoint per epoch under OUTPUT_DIR, keep two.
    output_dir=OUTPUT_DIR,
    save_strategy="epoch",
    save_total_limit=2,
    # Evaluation and model selection: evaluate every epoch and reload the
    # checkpoint with the best "f1" when training ends.
    eval_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=1e-3,
    weight_decay=0.01,
    # Batching.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    # Mixed precision only when a CUDA device is present.
    fp16=torch.cuda.is_available(),
    # No external experiment tracker.
    report_to="none",
)

def compute_metrics(eval_pred):
    """Turn evaluation logits and labels into summary metrics.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of the logits along axis 1.

    Returns:
        Dict with "accuracy" plus weighted-average "f1", "precision" and
        "recall".
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    prf = precision_recall_fscore_support(gold, predicted, average="weighted")
    weighted_precision, weighted_recall, weighted_f1 = prf[0], prf[1], prf[2]
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": weighted_f1,
        "precision": weighted_precision,
        "recall": weighted_recall,
    }

# NOTE(review): a patience of 3 needs more non-improving evaluations than a
# 3-epoch run with per-epoch evaluation can produce, so this callback
# presumably never stops training early with the current arguments — confirm.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# 11. TRAIN & EVALUATE
print("\nüî• B·∫ÆT ƒê·∫¶U HU·∫§N LUY·ªÜN (ALBERT + LoRA)...")
trainer.train()

# Score the held-out test split with the trained model.
print("\nüéØ K·∫æT QU·∫¢ TR√äN TEST SET:")
test_metrics = trainer.evaluate(tokenized["test"])
print(test_metrics)

# Save the model via the trainer, and the tokenizer, into the same folder.
final_path = os.path.join(OUTPUT_DIR, "final_albert_lora_liar")
trainer.save_model(final_path)
tokenizer.save_pretrained(final_path)
print(f"\n‚úÖ ƒê√£ l∆∞u LoRA Adapter t·∫°i: {final_path}")

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.4/59.4 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25h‚úÖ Device: Tesla T4 | VRAM: 15.8 GB
Mounted at /content/drive

‚è≥ ƒêang t·∫£i dataset LIAR...


README.md: 0.00B [00:00, ?B/s]

train.csv:   0%|          | 0.00/19.0M [00:00<?, ?B/s]

valid.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/18369 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2297 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2296 [00:00<?, ? examples/s]

T·ªïng s·ªë m·∫´u ban ƒë·∫ßu: 22962
üõ†Ô∏è ƒêang t·∫°o n·ªôi dung ƒë·∫ßu v√†o (Statement + Context)...
Class weights: {np.int64(0): 1.2693200663349917, np.int64(1): 0.8249622763526622}


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

‚öôÔ∏è Tokenizing...


Map:   0%|          | 0/18369 [00:00<?, ? examples/s]

Map:   0%|          | 0/2296 [00:00<?, ? examples/s]

Map:   0%|          | 0/2297 [00:00<?, ? examples/s]


üöÄ T·∫£i model albert-base-v2 v√† √°p d·ª•ng LoRA...


model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 75,266 || all params: 11,760,388 || trainable%: 0.6400

üî• B·∫ÆT ƒê·∫¶U HU·∫§N LUY·ªÜN (ALBERT + LoRA)...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7158,0.6933,0.393728,0.222456,0.155022,0.393728
2,0.7041,0.696667,0.606272,0.457663,0.367565,0.606272
3,0.7004,0.69404,0.606272,0.457663,0.367565,0.606272
4,0.6967,0.693219,0.606272,0.457663,0.367565,0.606272
5,0.6942,0.693327,0.606272,0.457663,0.367565,0.606272



üéØ K·∫æT QU·∫¢ TR√äN TEST SET:


{'eval_loss': 0.6967869400978088, 'eval_accuracy': 0.6060078363082281, 'eval_f1': 0.4573396086424796, 'eval_precision': 0.3672454976669802, 'eval_recall': 0.6060078363082281, 'eval_runtime': 4.5702, 'eval_samples_per_second': 502.598, 'eval_steps_per_second': 15.754, 'epoch': 5.0}

‚úÖ ƒê√£ l∆∞u LoRA Adapter t·∫°i: /content/drive/MyDrive/LIAR_ALBERT_LoRA/final_albert_lora_liar
