In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import random
from collections import defaultdict
from pathlib import Path
from types import SimpleNamespace

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer, get_linear_schedule_with_warmup

In [None]:
# =========================================================
# ✅ 1. Data Augmentation: Word Dropout
# =========================================================

def word_dropout(text, p=0.1):
    """Randomly remove whitespace-separated words from ``text``.

    Each word is kept when a fresh ``random.random()`` draw is strictly
    greater than ``p``. Texts of five words or fewer are returned untouched,
    and so is any text for which every word would have been dropped.

    Args:
        text: Input string; it is tokenised with ``str.split()``.
        p: Threshold compared against each random draw (default 0.1).

    Returns:
        The surviving words joined by single spaces, or the original ``text``
        (whitespace intact) when no dropout is applied.
    """
    tokens = text.split()
    if len(tokens) > 5:
        survivors = []
        for token in tokens:
            # One draw per word, in order, so seeding `random` reproduces the result.
            if random.random() > p:
                survivors.append(token)
        if survivors:
            return " ".join(survivors)
    return text

In [None]:
# =========================================================
# ✅ 2. Dataset Generators (with augmentation for training)
# =========================================================

# Kaggle input locations for the "fake-or-real-the-impostor-hunt" data.
# train_dir / test_dir hold one `article_XXXX` sub-folder per article, each read as
# `file_1.txt` and `file_2.txt`; train_csv is loaded with pandas for its `id` and
# `real_text_id` columns.
train_dir = "/kaggle/input/fake-or-real-the-impostor-hunt/data/train"
test_dir  = "/kaggle/input/fake-or-real-the-impostor-hunt/data/test"
train_csv = "/kaggle/input/fake-or-real-the-impostor-hunt/data/train.csv"

def train_generator(augment=True):
    """Yield one labelled sample per text file of every training article.

    For each row of ``train_csv`` the folder ``article_<id:04d>`` under
    ``train_dir`` is opened and ``file_1.txt`` / ``file_2.txt`` are read as
    UTF-8. A text gets ``label`` 1 when its number equals the row's
    ``real_text_id`` and 0 otherwise.

    Args:
        augment: When true, each original sample is followed, with probability
            0.5, by a copy whose text went through ``word_dropout``.

    Yields:
        Dicts with the keys ``id``, ``text``, ``text_id`` and ``label``.
    """
    labels_df = pd.read_csv(train_csv)
    data_root = Path(train_dir)
    for _, row in labels_df.iterrows():
        article_dir = data_root / f"article_{row['id']:04d}"
        for text_id in (1, 2):
            text = (article_dir / f"file_{text_id}.txt").read_text(encoding="utf-8")
            label = int(text_id == row["real_text_id"])

            # The untouched sample always comes first.
            yield {"id": row["id"], "text": text, "text_id": text_id, "label": label}

            if not augment:
                continue
            # Coin flip decides whether a word-dropout variant follows.
            if random.random() < 0.5:
                yield {"id": row["id"], "text": word_dropout(text), "text_id": text_id, "label": label}

def test_generator():
    """Yield one unlabelled sample per text file of every test article.

    Sub-directories of ``test_dir`` are visited in sorted order; the article id
    is the integer after the first underscore of the folder name. Both
    ``file_1.txt`` and ``file_2.txt`` are read as UTF-8.

    Yields:
        Dicts with the keys ``id``, ``text`` and ``text_id``.
    """
    article_dirs = [entry for entry in Path(test_dir).iterdir() if entry.is_dir()]
    article_dirs.sort()
    for article_dir in article_dirs:
        article_id = int(article_dir.name.split("_")[1])
        for text_id in (1, 2):
            text = (article_dir / f"file_{text_id}.txt").read_text(encoding="utf-8")
            yield {"id": article_id, "text": text, "text_id": text_id}

# Create datasets
# The labelled texts are materialised WITHOUT augmentation so that the train/val split
# below is made on clean, unique texts. Augmenting before splitting would put
# word-dropout copies of training texts into the validation set and inflate the
# validation metrics that drive early stopping.
train_dataset = Dataset.from_generator(train_generator, gen_kwargs={"augment": False})
test_dataset  = Dataset.from_generator(test_generator)

raw_datasets = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

# Split into train/val by ARTICLE id, so both texts of an article stay on the same side.
# Every article contributes its two texts, so the label mix is the same in both
# splits without explicit stratification (assumes real_text_id is always 1 or 2).
row_article_ids = list(raw_datasets['train']['id'])
train_articles, val_articles = train_test_split(
    sorted(set(row_article_ids)),
    test_size=0.2,
    random_state=42
)
train_articles, val_articles = set(train_articles), set(val_articles)
train_idx = [i for i, article in enumerate(row_article_ids) if article in train_articles]
val_idx   = [i for i, article in enumerate(row_article_ids) if article in val_articles]
val_split = raw_datasets['train'].select(val_idx)

# Augment the training rows only: each original row is followed, with probability 0.5,
# by a word-dropout copy. Seeded so a fresh "Run All" rebuilds the same training set.
random.seed(42)
train_rows = []
for row in raw_datasets['train'].select(train_idx):
    train_rows.append(row)
    if random.random() < 0.5:
        train_rows.append({**row, "text": word_dropout(row["text"])})
train_split = Dataset.from_list(train_rows)

In [None]:
class SiameseSelfAttentionNetwork(nn.Module):
    """Pretrained encoder + attention block + MLP head that classifies raw texts.

    Each text is tokenised inside the module, encoded by the Hugging Face
    backbone, mean-pooled over its non-padding tokens, passed through a
    multi-head attention layer and an MLP, and finally projected to
    ``num_labels`` logits. Every text in a batch is scored independently.

    NOTE(review): the attention layer receives a sequence of length 1 (the
    pooled vector), so its softmax is over a single key and the layer reduces
    to its value/output projections.

    Args:
        model_name: Checkpoint name passed to ``AutoModel`` / ``AutoTokenizer``.
        num_labels: Size of the output logit vector (default 2).
        dropout: Dropout probability used in the interaction head (default 0.3).
    """

    def __init__(self, model_name, num_labels=2, dropout=0.3):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        self.backbone.gradient_checkpointing_enable()
        hidden_size = self.backbone.config.hidden_size

        # Attention Layer
        self.cross_attn = nn.MultiheadAttention(
            hidden_size,
            num_heads=self.backbone.config.num_attention_heads,
            batch_first=True,
        )

        # Interaction Head
        self.interaction_head = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(hidden_size // 2, hidden_size // 4), nn.ReLU(), nn.Dropout(dropout),
        )

        # Classifier
        self.classifier = nn.Linear(hidden_size // 4, num_labels)

        # Tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    @property
    def device(self):
        """Device of the module's parameters, so it stays correct after ``.to(...)``."""
        return next(self.parameters()).device

    def extract_mean_pooling(self, texts):
        """Encode ``texts`` and average the hidden states of the non-padding tokens.

        Args:
            texts: A string or list of strings accepted by the tokenizer; inputs
                are padded to the longest item and truncated to 512 tokens.

        Returns:
            Tensor with one pooled vector per text.
        """
        encoded = self.tokenizer(
            texts, return_tensors="pt", padding=True, truncation=True, max_length=512
        ).to(self.device)
        last_hidden = self.backbone(**encoded).last_hidden_state
        mask = encoded['attention_mask'].unsqueeze(-1).to(last_hidden.dtype)
        summed = (last_hidden * mask).sum(1)
        # Clamp so a row whose mask is entirely zero yields zeros instead of NaN.
        token_counts = mask.sum(1).clamp(min=1.0)
        return summed / token_counts

    def forward(self, texts, labels=None):
        """Score a batch of texts.

        Args:
            texts: Batch of raw strings.
            labels: Optional tensor of class indices; when given, a
                cross-entropy loss with label smoothing 0.1 is computed.

        Returns:
            Object with attributes ``loss`` (``None`` without labels) and ``logits``.
        """
        pooled = self.extract_mean_pooling(texts)
        tokens = pooled.unsqueeze(1)  # one-token "sequence" per text
        attn_out, _ = self.cross_attn(tokens, tokens, tokens)
        features = self.interaction_head(attn_out.squeeze(1))
        logits = self.classifier(features)

        loss = None
        if labels is not None:
            loss = F.cross_entropy(logits, labels.to(logits.device), label_smoothing=0.1)

        return SimpleNamespace(loss=loss, logits=logits)

In [None]:
def train_fn(model, dataloader, optimizer, scheduler):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Module called as ``model(texts, labels)`` whose result exposes ``.loss``.
        dataloader: Sized iterable of batches with the keys ``'text'`` and ``'label'``.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per batch, right after the optimizer.

    Returns:
        Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    # Take the device from the model itself instead of a notebook-level global, so the
    # function works regardless of which cell defined `device` (or whether one did).
    first_param = next(model.parameters(), None)
    total_loss = 0.0
    for batch in tqdm(dataloader, desc="Training"):
        texts, labels = batch['text'], batch['label']
        if first_param is not None:
            labels = labels.to(first_param.device)
        optimizer.zero_grad()
        out = model(texts, labels)
        out.loss.backward()
        # Cap the global gradient norm at 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        total_loss += out.loss.item()
    return total_loss / len(dataloader)

In [None]:
def validate_fn(model, dataloader):
    """Evaluate ``model`` without gradients and return ``(mean_loss, accuracy)``.

    Args:
        model: Module called as ``model(texts, labels)`` whose result exposes
            ``.loss`` and ``.logits``.
        dataloader: Sized iterable of batches with the keys ``'text'`` and ``'label'``.

    Returns:
        Tuple of the per-batch loss sum divided by ``len(dataloader)`` and the
        fraction of samples whose arg-max logit equals the label.
    """
    model.eval()
    # Take the device from the model itself instead of a notebook-level global, so the
    # function works regardless of which cell defined `device` (or whether one did).
    first_param = next(model.parameters(), None)
    total_loss, preds, labels_all = 0, [], []
    with torch.no_grad():
        for batch in dataloader:
            texts, labels = batch['text'], batch['label']
            if first_param is not None:
                labels = labels.to(first_param.device)
            out = model(texts, labels)
            total_loss += out.loss.item()
            preds.extend(torch.argmax(out.logits, 1).tolist())
            labels_all.extend(labels.tolist())
    acc = (np.array(preds) == np.array(labels_all)).mean()
    return total_loss / len(dataloader), acc

In [None]:
# =========================================================
# ✅ 5. Training Loop with Early Stopping + Gradual Unfreezing
# =========================================================
SEED = 42                           # seeds head initialisation and DataLoader shuffling
NUM_EPOCHS = 20                     # upper bound; early stopping may end training sooner
UNFREEZE_EPOCH = 3                  # 0-based epoch at which the backbone becomes trainable
BEST_MODEL_PATH = "best_model.pth"  # checkpoint with the lowest validation loss

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model_name = "distilbert-base-uncased"
model_name = "roberta-base"
model = SiameseSelfAttentionNetwork(model_name).to(device)

# Freeze backbone initially
for p in model.backbone.parameters():
    p.requires_grad = False

# Separate LRs: smaller for backbone.
# The frozen backbone parameters are registered from the start: AdamW skips parameters
# whose .grad is None, so they are untouched until unfrozen and the optimizer never
# has to be rebuilt.
param_groups = [
    {"params": list(model.backbone.parameters()), "lr": 1e-5},
    {"params": [p for n, p in model.named_parameters() if "backbone" not in n], "lr": 3e-5}
]
optimizer = AdamW(param_groups, weight_decay=0.01)

train_loader = DataLoader(train_split, batch_size=4, shuffle=True)
val_loader   = DataLoader(val_split, batch_size=4, shuffle=False)

total_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, 0, total_steps)

best_val_loss = float('inf')
best_metrics = {}  # <-- store train/val metrics
patience, counter = 3, 0

for epoch in range(NUM_EPOCHS):
    if epoch == UNFREEZE_EPOCH:  # Unfreeze backbone after the warm-up epochs
        # Only flip requires_grad. Re-creating the optimizer here would discard the
        # head's Adam moment estimates and leave `scheduler` attached to the old
        # optimizer object.
        for p in model.backbone.parameters():
            p.requires_grad = True

    train_loss = train_fn(model, train_loader, optimizer, scheduler)
    val_loss, val_acc = validate_fn(model, val_loader)
    print(f"Epoch {epoch+1}: Train {train_loss:.4f} | Val {val_loss:.4f} | Acc {val_acc:.4f}")

    # ✅ Save metrics when saving best model
    if val_loss < best_val_loss - 1e-4:
        best_val_loss = val_loss
        best_metrics = {
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "val_loss": val_loss,
            "val_acc": val_acc
        }
        torch.save(model.state_dict(), BEST_MODEL_PATH)
        counter = 0
    else:
        counter += 1
        if counter >= patience:
            print("‚èπÔ∏è Early stopping triggered")
            break

if best_metrics:
    # Restore the best checkpoint so later cells predict with it rather than with the
    # weights of the last (non-improving) epoch.
    model.load_state_dict(torch.load(BEST_MODEL_PATH, map_location=device))

    print("\nüìå Best Model Metrics:")
    print(f"Epoch {best_metrics['epoch']}: "
          f"Train Loss={best_metrics['train_loss']:.4f}, "
          f"Val Loss={best_metrics['val_loss']:.4f}, "
          f"Val Accuracy={best_metrics['val_acc']:.4f}")
else:
    # Reached only if the validation loss never improved on +inf (e.g. it was NaN).
    print("No checkpoint was saved: validation loss never improved.")


In [None]:
# =========================================================
# ✅ 6. Prediction on Test Set
# =========================================================
def predict_fn(model, dataloader):
    """Pick, for every article, the text id with the highest class-1 probability.

    Args:
        model: Module called as ``model(texts)`` whose result exposes ``.logits``.
        dataloader: Iterable of batches with the keys ``"text"``, ``"id"`` and
            ``"text_id"``.

    Returns:
        Dict mapping each article id to the ``text_id`` whose softmax
        probability for class 1 is largest.
    """
    model.eval()
    scores_by_article = defaultdict(dict)
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Predicting"):
            logits = model(batch["text"]).logits
            real_probs = F.softmax(logits, 1)[:, 1]
            for article_id, text_id, prob in zip(batch["id"], batch["text_id"], real_probs):
                scores_by_article[int(article_id)][int(text_id)] = prob.item()

    preds = {}
    for article_id, scores in scores_by_article.items():
        # On a tie, max() keeps the first text id that was inserted.
        best_text_id, _ = max(scores.items(), key=lambda pair: pair[1])
        preds[article_id] = best_text_id
    return preds

# Score the test pairs in their stored order (no shuffling).
test_loader = DataLoader(raw_datasets["test"], batch_size=4)
preds = predict_fn(model, test_loader)

# Save submission: one row per article with the text id chosen by predict_fn.
submission = pd.DataFrame(list(preds.items()), columns=["id", "real_text_id"])
submission.to_csv("submission.csv", index=False)
print("‚úÖ Saved submission.csv")