In [None]:
# =====================================================
# FAKE NEWS DETECTION – MiniLM + PEFT (LoRA) Z+ WELFAKE
# Colab-ready (uses nreimers/MiniLM-L6-H384-uncased)
# Includes PAPER_PATH for your uploaded PDF: /mnt/data/SLM_fake_news.pdf
# =====================================================

# 0. (Optional) restart runtime if coming from previous installs to avoid conflicts

# 1. INSTALL
!pip install -q transformers datasets peft accelerate bitsandbytes scikit-learn pandas numpy psutil

# 2. IMPORTS
import os, re, shutil, psutil, warnings
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, EarlyStoppingCallback,
    DataCollatorWithPadding
)
from peft import LoraConfig, get_peft_model, TaskType
from google.colab import drive
from tqdm.auto import tqdm

# Silence every Python warning for the rest of the notebook run.
warnings.filterwarnings("ignore")

# 3. OPTIONAL: Path to uploaded paper (you uploaded SLM_fake_news.pdf)
PAPER_PATH = "/mnt/data/SLM_fake_news.pdf"

# 4. GPU INFO
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device:", torch.cuda.get_device_name(0))
    print("VRAM (GB):", torch.cuda.get_device_properties(0).total_memory/1e9)

# 5. MOUNT GOOGLE DRIVE (optional, for saving)
# The mount is only a convenience for persisting checkpoints. When it fails
# (drive.mount raises ValueError("mount failed")), keep going and store the
# outputs on the local runtime disk instead of aborting the whole notebook.
try:
    drive.mount('/content/drive', force_remount=False)
    DRIVE_MOUNTED = True
except ValueError as mount_err:
    DRIVE_MOUNTED = False
    print("Google Drive mount failed, saving locally instead:", mount_err)

# 6. OUTPUT DIR + CHECKPOINT management
# Do not create the Drive path when Drive is not mounted: that would leave a
# plain local directory under the mount point.
OUTPUT_DIR = (
    "/content/drive/MyDrive/WELFake_MiniLM_LoRA"
    if DRIVE_MOUNTED
    else "/content/WELFake_MiniLM_LoRA"
)
os.makedirs(OUTPUT_DIR, exist_ok=True)

def manage_checkpoints(output_dir, keep_latest=2):
    """Delete all but the newest ``keep_latest`` ``checkpoint-*`` entries.

    Entries are ordered by the integer suffix of their name
    (``checkpoint-500`` -> 500). A name whose suffix is not an integer is
    ordered by its modification time instead.

    Args:
        output_dir: Directory that holds the ``checkpoint-*`` entries. A
            missing directory is treated as "nothing to clean up".
        keep_latest: How many of the newest checkpoints to keep. ``0`` (or a
            negative value) removes every checkpoint.
    """
    if not os.path.isdir(output_dir):
        return

    def sort_key(name):
        try:
            return int(name.rsplit("-", 1)[-1])
        except ValueError:
            return os.path.getmtime(os.path.join(output_dir, name))

    ckpts = sorted(
        (c for c in os.listdir(output_dir) if c.startswith("checkpoint-")),
        key=sort_key,
    )
    # Slice by an explicit count: ``ckpts[:-keep_latest]`` would be empty for
    # keep_latest == 0 and therefore silently keep everything.
    n_remove = max(len(ckpts) - max(keep_latest, 0), 0)
    for ck in ckpts[:n_remove]:
        shutil.rmtree(os.path.join(output_dir, ck), ignore_errors=True)
        print("Removed old checkpoint:", ck)

def get_last_checkpoint(output_dir):
    """Return the path of the newest ``checkpoint-*`` entry, or ``None``.

    "Newest" means the largest integer suffix in the entry name; a name
    whose suffix is not an integer is ranked by its modification time.

    Side effect: when at least one checkpoint exists, older ones are pruned
    through ``manage_checkpoints(output_dir, keep_latest=2)`` before
    returning.

    Args:
        output_dir: Directory that holds the ``checkpoint-*`` entries.

    Returns:
        The joined path of the newest checkpoint, or ``None`` when the
        directory is missing or contains no checkpoint.
    """
    if not os.path.isdir(output_dir):
        return None
    ckpts = [c for c in os.listdir(output_dir) if c.startswith("checkpoint-")]
    if not ckpts:
        return None

    def sort_key(name):
        try:
            return int(name.rsplit("-", 1)[-1])
        except ValueError:
            return os.path.getmtime(os.path.join(output_dir, name))

    # Only the single newest entry is needed, so a full sort is unnecessary.
    newest = max(ckpts, key=sort_key)
    manage_checkpoints(output_dir, keep_latest=2)
    return os.path.join(output_dir, newest)

# Look for an earlier run in OUTPUT_DIR; None means there is nothing to resume.
last_checkpoint = get_last_checkpoint(OUTPUT_DIR)
print("Last checkpoint:", last_checkpoint or "None -> training from scratch")

# 7. LOAD & CLEAN DATA (WELFake)
print("\nLoading WELFake...")
dataset = load_dataset("davanstrien/WELFake")
# Only the "train" split of the loaded dataset is used; it is copied into pandas.
df = pd.DataFrame(dataset["train"])
print("Raw samples:", len(df))

# Build one text field per row: title and body joined by a literal " [SEP] "
# marker, with missing values replaced by empty strings.
# NOTE(review): the '' default of df.get() has no .fillna(), so this line
# assumes both 'title' and 'text' columns exist — confirm against the dataset.
df['content'] = df.get('title','').fillna('') + " [SEP] " + df.get('text','').fillna('')

# Patterns are compiled once because clean_text runs on every row.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_TAG_RE = re.compile(r'<.*?>')
_NON_ALNUM_RE = re.compile(r'[^a-z0-9\s]')
_SPACE_RE = re.compile(r'\s+')


def _normalize_segment(segment):
    """Lowercase one text segment and strip URLs, tags and punctuation."""
    t = segment.lower()
    t = _URL_RE.sub(' ', t)
    t = _TAG_RE.sub(' ', t)
    t = _NON_ALNUM_RE.sub(' ', t)
    return _SPACE_RE.sub(' ', t).strip()


def clean_text(text, sep_marker="[SEP]"):
    """Normalize raw article text for tokenization.

    The text is lowercased; URLs, ``<...>`` tags and every character other
    than ``a-z``, ``0-9`` and whitespace are replaced by spaces; runs of
    whitespace are collapsed and the result is stripped.

    Occurrences of ``sep_marker`` are preserved verbatim. Without this, the
    lowercasing and punctuation stripping would reduce a " [SEP] " marker
    placed between title and body to the ordinary word "sep".

    Args:
        text: Raw text. Any non-string value yields ``""``.
        sep_marker: Marker to keep intact between cleaned segments. Pass an
            empty string or ``None`` to clean the text as a single segment.

    Returns:
        The cleaned text, with segments joined by ``" <sep_marker> "``.
    """
    if not isinstance(text, str):
        return ""
    if not sep_marker or sep_marker not in text:
        return _normalize_segment(text)
    segments = (_normalize_segment(part) for part in text.split(sep_marker))
    # strip() drops the padding left when the first or last segment is empty.
    return f" {sep_marker} ".join(segments).strip()

print("Cleaning text...")
# Normalize every combined title/body string in place.
df['content'] = df['content'].apply(clean_text)
# Keep rows whose cleaned text is longer than 20 characters, drop rows with
# identical cleaned text, and renumber the index from 0.
df = df[df['content'].str.len() > 20].drop_duplicates(subset=['content']).reset_index(drop=True)
print("After cleaning & dedup:", len(df))

# 8. CLASS WEIGHTS
# "balanced" weights for labels 0 and 1, stored as a float32 tensor that the
# weighted loss reads later.
# NOTE(review): the weights are computed from all of df, i.e. before the
# train/val/test split below — confirm this is intended rather than using
# the training labels only.
classes = np.array([0,1])
class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=df["label"])
class_weights = torch.tensor(class_weights, dtype=torch.float32)
print("Class weights:", class_weights.numpy())

# 9. STRATIFIED SPLIT
# 75% train; the remaining 25% is halved into validation and test
# (12.5% each). Both splits are stratified on the label with a fixed seed.
labels = df['label'].values
train_df, temp_df = train_test_split(df, test_size=0.25, random_state=42, stratify=labels)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['label'])
print(f"Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")

# Keep only the text and label columns and wrap each split as a Dataset.
train_ds = Dataset.from_pandas(train_df[['content','label']].reset_index(drop=True))
val_ds   = Dataset.from_pandas(val_df[['content','label']].reset_index(drop=True))
test_ds  = Dataset.from_pandas(test_df[['content','label']].reset_index(drop=True))
dataset_dict = DatasetDict({"train": train_ds, "validation": val_ds, "test": test_ds})

# 10. TOKENIZER (MiniLM)
# MODEL_NAME is reused below to load the base model, so tokenizer and model
# come from the same checkpoint.
MODEL_NAME = "nreimers/MiniLM-L6-H384-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_fn(batch):
    """Tokenize the ``content`` field of a batch with the global tokenizer.

    Sequences are truncated to at most 384 tokens and left unpadded here.
    """
    texts = batch["content"]
    return tokenizer(texts, truncation=True, max_length=384, padding=False)

print("Tokenizing...")
# Tokenize all three splits in batches of 1000 rows and drop the raw text column.
tokenized = dataset_dict.map(tokenize_fn, batched=True, batch_size=1000, remove_columns=['content'])
# The custom loss reads the targets from the "labels" key of each batch.
tokenized = tokenized.rename_column("label", "labels")
# tokenize_fn leaves sequences unpadded (padding=False); this collator is
# handed to the Trainer to assemble the batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 11. LOAD BASE MODEL
# Two output labels, matching classes [0, 1] used for the class weights.
base_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# 12. LO-RA CONFIG (PEFT)
# target_modules includes common naming variations for attention projection layers;
# adjust if you know exact module names in your checkpoint.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,                 # LoRA rank (you can try 4,8,16)
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    # include several possible projection names — presumably only the ones
    # present in the model are matched (verify against the PEFT version in use)
    target_modules=["q_lin","k_lin","v_lin","query","key","value","q_proj","k_proj","v_proj"]
)

print("Applying LoRA...")
# Wrap the base model with the LoRA configuration and report the trainable
# parameter count.
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

# 13. METRICS
def compute_metrics(eval_pred):
    """Compute evaluation metrics from a ``(logits, labels)`` pair.

    Returns a dict with ``accuracy``, weighted ``precision`` / ``recall`` /
    ``f1`` (as floats) and ``auc``. The AUC is computed from the softmax
    score of column 1 and is ``None`` whenever that computation raises.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)

    prf = precision_recall_fscore_support(
        labels, predicted, average="weighted", zero_division=0
    )
    metrics = {
        "accuracy": float(accuracy_score(labels, predicted)),
        "precision": float(prf[0]),
        "recall": float(prf[1]),
        "f1": float(prf[2]),
    }

    try:
        class_probs = torch.softmax(torch.tensor(logits), dim=1).numpy()
        metrics["auc"] = roc_auc_score(labels, class_probs[:, 1])
    except Exception:
        # Best effort: AUC is reported as None when it cannot be computed.
        metrics["auc"] = None
    return metrics

# 14. WEIGHTED TRAINER (signature fixed)
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the global ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model to run; called as ``model(**inputs)`` and expected
                to return an object with a ``logits`` attribute.
            inputs: Batch mapping. Its ``"labels"`` entry is popped (the
                mapping is mutated) so the model does not compute its own
                loss.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                the loss alone.
            num_items_in_batch: Accepted for signature compatibility with
                the Trainer; not used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Take the device from the logits rather than ``model.device``: a
        # wrapped model (e.g. torch.nn.DataParallel) has no ``device``
        # attribute, while the logits always live where the loss is computed.
        weights = class_weights.to(logits.device)
        loss = torch.nn.functional.cross_entropy(logits, labels, weight=weights)
        return (loss, outputs) if return_outputs else loss

# 15. TRAINING ARGS
# Evaluation and checkpointing both happen once per epoch. The best
# checkpoint, selected by the highest "f1" metric, is reloaded at the end.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    learning_rate=3e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=200,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",   # presumably the "f1" key returned by compute_metrics
    greater_is_better=True,
    fp16=torch.cuda.is_available(),   # mixed precision only when a CUDA device is present
    dataloader_num_workers=2,
    report_to="none",
)

# 16. INIT TRAINER
# WeightedTrainer applies the class-weighted loss. Training uses the train
# split and evaluates on the validation split, with an early-stopping
# callback configured with a patience of 2.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# 17. TRAIN (resume if checkpoint found)
print("\n" + "="*60)
print("üöÄ START TRAINING MiniLM + LoRA")
print("="*60 + "\n")

# last_checkpoint is None when no earlier checkpoint was found in OUTPUT_DIR.
trainer.train(resume_from_checkpoint=last_checkpoint)

# after training, cleanup old checkpoints
manage_checkpoints(OUTPUT_DIR, keep_latest=2)

# 18. EVALUATE ON TEST
# The held-out test split is evaluated only here, after training has finished.
print("\nEVALUATING ON TEST SET")
results = trainer.evaluate(tokenized["test"])
for k,v in results.items():
    print(f"{k}: {v}")

# 19. SAVE (PEFT model + tokenizer)
# Model and tokenizer go into the same sub-folder of OUTPUT_DIR.
final_model_dir = os.path.join(OUTPUT_DIR, "final_minilm_lora")
os.makedirs(final_model_dir, exist_ok=True)
model.save_pretrained(final_model_dir)   # saves adapter weights + config
tokenizer.save_pretrained(final_model_dir)
print("Saved MiniLM+LoRA to:", final_model_dir)

# 20. OPTIONAL: Save a copy of your paper next to the model for record
# Best effort: a missing paper is skipped silently, and a failed copy is
# reported but does not stop the notebook.
try:
    if os.path.exists(PAPER_PATH):
        shutil.copy(PAPER_PATH, os.path.join(final_model_dir, os.path.basename(PAPER_PATH)))
        print("Copied paper to model folder:", os.path.join(final_model_dir, os.path.basename(PAPER_PATH)))
except Exception as e:
    print("Could not copy paper file:", e)

# Done
print("\nALL DONE ‚Äî MiniLM + LoRA training finished.")


[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.4/59.4 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hCUDA available: False


ValueError: mount failed