In [None]:
# =====================================================
# FAKE NEWS DETECTION – ALBERT + PEFT (LoRA) + WELFAKE
# =====================================================

!pip install -q transformers datasets peft accelerate bitsandbytes scikit-learn pandas numpy psutil

import os, re, shutil, psutil, warnings
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, EarlyStoppingCallback,
    DataCollatorWithPadding
)
from peft import LoraConfig, get_peft_model, TaskType
from google.colab import drive
from tqdm.auto import tqdm

# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# 1. GPU INFO
# Report CUDA availability; when a device exists, print its name and total memory in GB.
print("CUDA:", torch.cuda.is_available())
if torch.cuda.is_available():
    _gpu_index = 0
    _total_bytes = torch.cuda.get_device_properties(_gpu_index).total_memory
    print("Device:", torch.cuda.get_device_name(_gpu_index))
    print("VRAM:", _total_bytes/1e9, "GB")

# 2. Mount Drive
# Checkpoints and the final model are written under the mounted Drive folder.
drive.mount("/content/drive")

OUTPUT_DIR = "/content/drive/MyDrive/WELFake_ALBERT_LoRA"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 3. LOAD DATA
# Load the WELFake dataset and turn its "train" split into a DataFrame.
dataset = load_dataset("davanstrien/WELFake")
df = pd.DataFrame(dataset["train"])

# One "content" string per article: title and body joined by " [SEP] ",
# with missing titles/bodies replaced by empty strings.
_titles = df["title"].fillna("")
_bodies = df["text"].fillna("")
df["content"] = _titles + " [SEP] " + _bodies

# Marker placed between title and body when the "content" column is built.
_SEP_MARKER = "[SEP]"
_URL_RE = re.compile(r'https?://\S+')
_NON_ALNUM_RE = re.compile(r'[^a-z0-9\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def _clean_segment(segment):
    """Lower-case one text segment, drop URLs and punctuation, collapse whitespace."""
    segment = segment.lower()
    segment = _URL_RE.sub(' ', segment)
    segment = _NON_ALNUM_RE.sub(' ', segment)
    return _WHITESPACE_RE.sub(' ', segment).strip()


def clean_text(t):
    """Normalise article text while keeping the ``[SEP]`` marker intact.

    Each segment around a literal ``[SEP]`` is lower-cased, stripped of
    http(s) URLs and of every character outside ``a-z0-9`` and whitespace,
    and has its whitespace runs collapsed to single spaces.

    The marker is handled separately because the character filter would
    otherwise reduce it to the ordinary word ``sep``, leaving nothing that
    separates title from body.

    Args:
        t: Raw text, optionally containing ``[SEP]`` between segments.

    Returns:
        The cleaned text, with segments re-joined by ``" [SEP] "`` and the
        result stripped of leading/trailing whitespace.
    """
    # Split first so a URL glued to the marker cannot swallow it via \S+.
    cleaned_segments = (_clean_segment(part) for part in t.split(_SEP_MARKER))
    return f" {_SEP_MARKER} ".join(cleaned_segments).strip()

# Normalise every article, then keep only rows whose cleaned content is longer
# than 20 characters and drop rows whose cleaned content repeats.
df["content"] = df["content"].map(clean_text)
_long_enough = df["content"].str.len() > 20
df = df[_long_enough].drop_duplicates(subset=["content"])
print("Clean samples:", len(df))

# 4. SPLIT
# Stratified on the label: 75% train, then the remaining 25% is halved into
# validation and test. Fixed seeds keep the split reproducible.
train_df, temp_df = train_test_split(df, test_size=0.25, random_state=42, stratify=df["label"])
val_df, test_df   = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df["label"])

# After filtering and de-duplication the DataFrame index is no longer a plain
# range, so from_pandas would otherwise carry it along as an extra
# "__index_level_0__" column. preserve_index=False keeps only the two columns
# the model pipeline actually uses.
_MODEL_COLUMNS = ["content", "label"]
train_ds = Dataset.from_pandas(train_df[_MODEL_COLUMNS], preserve_index=False)
val_ds   = Dataset.from_pandas(val_df[_MODEL_COLUMNS], preserve_index=False)
test_ds  = Dataset.from_pandas(test_df[_MODEL_COLUMNS], preserve_index=False)

dataset_dict = DatasetDict({
    "train": train_ds,
    "validation": val_ds,
    "test": test_ds
})

# 5. TOKENIZER
MODEL_NAME = "albert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_fn(batch):
    """Tokenize a batch of examples from its ``"content"`` field.

    Sequences are truncated to 384 tokens and left unpadded; padding is
    applied per batch by the data collator created below.
    """
    texts = batch["content"]
    return tokenizer(texts, padding=False, truncation=True, max_length=384)


# Tokenize all splits, drop the raw text, and rename the target column to "labels".
tokenized = dataset_dict.map(
    tokenize_fn,
    batched=True,
    remove_columns=["content"],
).rename_column("label", "labels")
collator = DataCollatorWithPadding(tokenizer)

# 6. LOAD ALBERT
# Pretrained encoder with a two-class sequence-classification head.
base_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# 7. LoRA CONFIG — ALBERT SPECIFIC
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    # ALBERT names its attention projections query/key/value.
    target_modules=["query", "key", "value"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapters and report how many parameters will train.
print("Injecting LoRA...")
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

# 8. CLASS WEIGHTS
# "balanced" weights are inversely proportional to class frequency. They are
# estimated from the training split only, so label statistics of the
# validation and test rows do not influence the training loss.
classes = np.array([0,1])
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=train_df["label"]
)
# Float32 tensor so it can be handed to the loss as its per-class weight.
class_weights = torch.tensor(class_weights, dtype=torch.float32)

# 9. METRICS
def compute_metrics(eval_pred):
    """Compute accuracy, weighted precision/recall/F1 and ROC AUC for one evaluation.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is a 2-D array of
            per-class scores, or a tuple whose first element is that array.

    Returns:
        A dict with the keys ``accuracy``, ``precision``, ``recall``, ``f1``
        and ``auc``. ``auc`` is ``nan`` when ``labels`` contains fewer than
        two distinct classes, because ROC AUC is undefined in that case.
    """
    logits, labels = eval_pred
    # Some models return extra outputs alongside the logits; keep the logits only.
    if isinstance(logits, tuple):
        logits = logits[0]

    preds = np.argmax(logits, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)

    # Probability of class 1, used as the ranking score for ROC AUC.
    probs = torch.softmax(torch.tensor(logits), dim=1)[:,1].numpy()
    if np.unique(labels).size < 2:
        # Do not let a single-class evaluation set abort the whole run.
        auc = float("nan")
    else:
        auc = roc_auc_score(labels, probs)
    return {"accuracy":acc, "precision":precision, "recall":recall, "f1":f1, "auc":auc}

# 10. Weighted Trainer
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model to run; may be a wrapper around the actual module.
            inputs: Batch dict. Its ``"labels"`` entry is removed before the
                forward pass so the model does not compute its own loss.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted for signature compatibility; unused.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Take the device from the logits rather than from `model`: a wrapped
        # model (e.g. nn.DataParallel) does not expose a `.device` attribute.
        weight = class_weights.to(logits.device)
        loss = torch.nn.functional.cross_entropy(logits, labels, weight=weight)
        return (loss, outputs) if return_outputs else loss

# 11. TRAINING ARGS
args = TrainingArguments(
    # Checkpointing: one checkpoint per epoch, at most two kept on disk.
    output_dir=OUTPUT_DIR,
    save_strategy="epoch",
    save_total_limit=2,
    # Schedule: 3 epochs; 8 samples x 2 accumulation steps per device per update.
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=16,
    # Optimisation.
    learning_rate=3e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    # Evaluate every epoch and reload the checkpoint with the highest F1 at the end.
    eval_strategy="epoch",
    metric_for_best_model="f1",
    greater_is_better=True,
    load_best_model_at_end=True,
    # Mixed precision only when a CUDA device is present.
    fp16=torch.cuda.is_available(),
    # No external experiment tracker.
    report_to="none",
)

# Assemble the trainer: weighted loss, per-epoch validation, dynamic padding,
# and early stopping after 2 evaluations without improvement in the tracked metric.
trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    # `processing_class` supersedes the deprecated `tokenizer` argument of Trainer.
    processing_class=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# 12. TRAIN
# Banner, then the full training run (evaluation and checkpointing happen
# inside according to the training arguments).
print("\n==============================")
# The banner emoji had been stored as mis-decoded bytes; restored to the rocket.
print("🚀 TRAINING ALBERT + LoRA")
print("==============================\n")
trainer.train()

# 13. TEST
# Evaluate on the held-out test split and print every returned metric.
print("\nTEST RESULTS")
results = trainer.evaluate(tokenized["test"])
for metric_name, metric_value in results.items():
    print(f"{metric_name}: {metric_value}")

# 14. SAVE
# Write the model and the tokenizer side by side into one folder under OUTPUT_DIR.
# NOTE(review): for a PEFT-wrapped model this presumably stores the adapter
# weights rather than a full checkpoint — confirm before relying on it.
SAVE_DIR = f"{OUTPUT_DIR}/final_albert_lora"
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(SAVE_DIR)
print("Saved to:", SAVE_DIR)


[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.4/59.4 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25hCUDA: True
Device: Tesla T4
VRAM: 15.828320256 GB
Mounted at /content/drive


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-290868f0a36350(‚Ä¶):   0%|          | 0.00/152M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/72134 [00:00<?, ? examples/s]

Clean samples: 63323


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Map:   0%|          | 0/47492 [00:00<?, ? examples/s]

Map:   0%|          | 0/7915 [00:00<?, ? examples/s]

Map:   0%|          | 0/7916 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Injecting LoRA...
trainable params: 38,402 || all params: 11,723,524 || trainable%: 0.3276

🚀 TRAINING ALBERT + LoRA



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.1901,0.154818,0.941377,0.943632,0.941377,0.941488,0.985495
2,0.1386,0.145783,0.952748,0.953203,0.952748,0.952798,0.989742


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.1901,0.154818,0.941377,0.943632,0.941377,0.941488,0.985495
2,0.1386,0.145783,0.952748,0.953203,0.952748,0.952798,0.989742
3,0.1383,0.1411,0.954517,0.955289,0.954517,0.954577,0.990727



TEST RESULTS


eval_loss: 0.13130950927734375
eval_accuracy: 0.9560384032339565
eval_precision: 0.9564451430128789
eval_recall: 0.9560384032339565
eval_f1: 0.9560824604545967
eval_auc: 0.99113124556942
eval_runtime: 84.3
eval_samples_per_second: 93.903
eval_steps_per_second: 5.872
epoch: 3.0
Saved to: /content/drive/MyDrive/WELFake_ALBERT_LoRA/final_albert_lora
