# Transfer Learning for AI Text Detection

Fine-tunes a pretrained AI-text detection model (`fakespot-ai/roberta-base-ai-text-detection-v1`) on the Mercor AI Detection dataset.

## Approach
- Uses pretrained AI text detection model specifically trained for this task
- Custom ANN classifier head for improved performance
- Combines topic + answer in input format
- Upsamples minority class for balanced training
- Comprehensive evaluation with ROC curves, PR curves, and confusion matrices


In [6]:
# Imports
import os
import random
import math
import warnings
import time
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, 
    roc_curve, auc, precision_recall_curve
)

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    logging as transformers_logging
)
from datasets import Dataset

# Plot appearance shared by every figure in the notebook
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Run configuration
SEED = 42
MODEL_NAME = "fakespot-ai/roberta-base-ai-text-detection-v1"
MAX_LEN = 512
NUM_EPOCHS = 10
LEARNING_RATE = 2e-5
GRADIENT_ACCUMULATION_STEPS = 1
PER_DEVICE_TRAIN_BATCH = 16  # Reduced for local training
PER_DEVICE_EVAL_BATCH = 16
FP16 = False  # Set to True if using CUDA

# Output locations
OUTPUT_DIR = "./finetuned_roberta_ann_topic_answer"
PLOT_DIR = "./plots"
for directory in (PLOT_DIR, OUTPUT_DIR):
    os.makedirs(directory, exist_ok=True)

# Seed every random number generator the notebook relies on
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Keep cell output quiet: library errors only, no UserWarning noise
transformers_logging.set_verbosity_error()
warnings.filterwarnings("ignore", category=UserWarning)

# Environment report
print("✓ Imports successful")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if hasattr(torch.backends, 'mps'):
    print(f"MPS available: {torch.backends.mps.is_available()}")


RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_tf_utils because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

## Helper Functions

In [None]:
# Helper functions
def clean_text_field(s):
    """Return *s* as a single-line string with whitespace runs collapsed.

    Values flagged by ``pd.isna`` become the empty string; anything else is
    converted with ``str`` before cleaning.
    """
    if pd.isna(s):
        return ""
    # A bare split() treats \r, \n and runs of blanks as one delimiter and
    # ignores leading/trailing whitespace, so no pre-cleaning is required.
    tokens = str(s).split()
    return " ".join(tokens)


def make_input_string(topic, answer):
    """Build the model input text from a topic and an answer.

    Both fields are normalised with ``clean_text_field`` and laid out as a
    ``TOPIC:`` line, a blank line, then an ``ANSWER:`` line.
    """
    cleaned_topic, cleaned_answer = (
        clean_text_field(field) for field in (topic, answer)
    )
    return f"TOPIC: {cleaned_topic}\n\nANSWER: {cleaned_answer}"


print("✓ Helper functions defined")


## Load and Prepare Data

In [None]:
# Load data
df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

banner = "=" * 60
print(banner)
print("DATASET OVERVIEW")
print(banner)
print(f"Train shape: {df.shape}")
print(f"Test shape: {test_df.shape}")

# Fail early when the expected schema is not present
required = {"topic", "answer", "is_cheating"}
has_required_columns = required.issubset(df.columns)
if not has_required_columns:
    raise ValueError(f"train.csv must contain columns: {required}")

if "id" not in test_df.columns:
    raise ValueError("test.csv must contain column 'id'")


def _combined_text(row):
    """Format one row's topic and answer; a missing field is treated as empty."""
    return make_input_string(row.get("topic", ""), row.get("answer", ""))


# Model input is the formatted TOPIC/ANSWER string; the target is is_cheating as int
print("\nCreating combined text field (TOPIC + ANSWER)...")
df["text"] = df.apply(_combined_text, axis=1)
df["label"] = df["is_cheating"].astype(int)
test_df["text"] = test_df.apply(_combined_text, axis=1)

label_counts = df["label"].value_counts().to_dict()
label_percentages = df["label"].value_counts(normalize=True) * 100
print(f"\nOriginal class distribution:")
print(label_counts)
print(f"Class percentages:")
print(label_percentages)


## Upsample Minority Class

In [None]:
# Upsample minority class = 0
# NOTE(review): this runs before the train/validation split, so the split has
# to keep all copies of a row in the same fold to avoid leakage.
label_counts = df["label"].value_counts()
print("Original label counts:", label_counts.to_dict())

target_label = 0  # Upsample minority class 0
majority_count = label_counts.max()
minority_df = df[df["label"] == target_label]
minor_count = len(minority_df)

if minor_count == 0:
    raise ValueError(f"No samples with label {target_label}, cannot upsample.")

if minor_count >= majority_count:
    print("\nNo upsampling needed (minority already >= majority).")
else:
    # Draw with replacement until the target class matches the largest class,
    # then shuffle so the added rows are not clustered at the end.
    needed = majority_count - minor_count
    extra = minority_df.sample(n=needed, replace=True, random_state=SEED)
    combined = pd.concat([df, extra], ignore_index=True)
    df = combined.sample(frac=1.0, random_state=SEED).reset_index(drop=True)
    print(f"\n✓ Upsampled label={target_label}: +{needed} samples")
    print(f"New counts: {df['label'].value_counts().to_dict()}")


## Train/Validation Split


In [None]:
# Train/Val split (stratified, duplicate-safe)
# df may contain repeated texts (for example rows re-sampled with replacement),
# so the split is drawn over unique texts: every copy of a text lands in
# exactly one fold and validation never scores a row the model was trained on.
pairs = df[["text", "label"]]
unique_pairs = pairs.drop_duplicates(subset="text")

train_unique, val_df = train_test_split(
    unique_pairs,
    test_size=0.20,
    random_state=SEED,
    stratify=unique_pairs["label"],
)

# Training keeps every copy of its texts (so any upsampling is preserved);
# validation keeps exactly one row per text.
train_texts = set(train_unique["text"])
train_df = pairs[pairs["text"].isin(train_texts)]

train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

print(f"Train size: {len(train_df)}, Val size: {len(val_df)}")
print("Post-split label counts (train):", train_df["label"].value_counts().to_dict())
print("Post-split label counts (val):", val_df["label"].value_counts().to_dict())

# Save CSVs for reproducibility (optional)
train_df.to_csv("train_for_finetune.csv", index=False)
val_df.to_csv("val_for_finetune.csv", index=False)
print("\n✓ Saved train_for_finetune.csv and val_for_finetune.csv")

# Convert to HuggingFace Datasets; the test id travels along as "orig_id"
train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)
test_ds = Dataset.from_pandas(test_df[["id", "text"]].rename(columns={"id": "orig_id"}))

print(f"\n✓ Created HuggingFace datasets")
print(f"  Train: {len(train_ds)} samples")
print(f"  Val: {len(val_ds)} samples")
print(f"  Test: {len(test_ds)} samples")


## Load Model and Tokenizer


In [None]:
# Load tokenizer and model
for message in (
    f"Loading model: {MODEL_NAME}",
    "This may take a few minutes on first run...",
):
    print(message)

# Fast tokenizer plus a config pinned to two output labels
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
config = AutoConfig.from_pretrained(MODEL_NAME)
config.num_labels = 2

# Sequence-classification model built from the same checkpoint and config
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)

for message in (
    f"✓ Model loaded: {MODEL_NAME}",
    f"  Hidden size: {model.config.hidden_size}",
    f"  Number of labels: {config.num_labels}",
):
    print(message)


## Custom ANN Classifier Head

In [None]:
# Custom ANN Head for classification
# Hidden width read from the loaded model's config; used further down to size
# the input layer of the replacement classifier head.
hidden_size = model.config.hidden_size

class ANNHead(nn.Module):
    """Feed-forward classification head with two hidden layers.

    Layout: Linear(hidden_size -> inner_dim) -> ReLU -> Dropout ->
    Linear(inner_dim -> max(128, inner_dim // 2)) -> ReLU -> Dropout ->
    Linear(-> num_labels). Weights use Xavier-uniform initialisation and
    biases start at zero.

    Args:
        hidden_size: Size of the last dimension of the incoming features.
        inner_dim: Width of the first hidden layer.
        dropout_prob: Dropout probability applied after each hidden layer.
        num_labels: Number of output logits (defaults to 2).
    """

    def __init__(self, hidden_size, inner_dim=768, dropout_prob=0.25, num_labels=2):
        super().__init__()
        # Second hidden layer is half as wide as the first, but never below 128.
        mid_dim = max(128, inner_dim // 2)

        # Attribute names are kept stable so saved state dicts stay loadable.
        self.fc1 = nn.Linear(hidden_size, inner_dim)
        self.act = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout_prob)
        self.fc2 = nn.Linear(inner_dim, mid_dim)
        self.act2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout_prob)
        self.out = nn.Linear(mid_dim, num_labels)

        # Initialize weights
        for layer in (self.fc1, self.fc2, self.out):
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, features):
        """Return logits of shape (batch, num_labels).

        Accepts either:
         - features shape (batch, hidden) -> pooled representations
         - features shape (batch, seq_len, hidden) -> sequence output, from
           which the first position (the CLS token, Roberta-style) is used
        """
        if features.dim() == 3:
            features = features[:, 0, :]

        x = self.dropout1(self.act(self.fc1(features)))
        x = self.dropout2(self.act2(self.fc2(x)))
        return self.out(x)

# Replace classifier head with custom ANN
# The first hidden layer is twice the encoder width, capped at 1024 units.
inner_dim = min(1024, hidden_size * 2)
# The head goes on `model.classifier` whether or not that attribute already
# exists, so no branching is needed.
model.classifier = ANNHead(hidden_size, inner_dim=inner_dim, dropout_prob=0.25)

print(f"✓ Replaced classifier with custom ANN head")
print(f"  Hidden size: {hidden_size}")
print(f"  Inner dimension: {inner_dim}")

# Disable cache for training (best effort: a failure is reported, not fatal)
try:
    model.config.use_cache = False
    inner_config = getattr(getattr(model, "model", None), "config", None)
    if inner_config is not None:
        inner_config.use_cache = False
except AttributeError as err:
    print(f"Could not disable use_cache: {err}")

# Device preference: CUDA, then Apple MPS, then CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
model.to(device)
print(f"✓ Model moved to device: {device}")


## Tokenize Datasets

In [None]:
# Tokenize datasets
def tokenize_batch(batch):
    """Tokenize a batch of ``text`` values, truncating to ``MAX_LEN`` tokens.

    No padding is applied here. ``data_collator`` (created below) pads each
    batch to its own longest sequence, which avoids running every example at
    the full ``MAX_LEN`` width.
    """
    return tokenizer(batch["text"], truncation=True, max_length=MAX_LEN)


print("Tokenizing train dataset...")
train_ds = train_ds.map(tokenize_batch, batched=True, remove_columns=["text"])

print("Tokenizing val dataset...")
val_ds = val_ds.map(tokenize_batch, batched=True, remove_columns=["text"])

# The test split goes through the same function so all three splits are
# tokenized identically.
print("Tokenizing test dataset...")
test_ds = test_ds.map(tokenize_batch, batched=True, remove_columns=["text"])

# Rename label column for Trainer
train_ds = train_ds.rename_column("label", "labels")
val_ds = val_ds.rename_column("label", "labels")

# Set format for PyTorch
for split in (train_ds, val_ds, test_ds):
    split.set_format(type="torch")

# Pads each batch to the length of its longest member at collation time
data_collator = DataCollatorWithPadding(tokenizer)

print("✓ Datasets tokenized and formatted")


## Define Metrics

In [None]:
# Compute metrics function
def compute_metrics(eval_pred):
    """Compute ROC-AUC and accuracy from an evaluation prediction.

    Args:
        eval_pred: Unpackable pair ``(logits, labels)``. ``logits`` is a
            ``(n_samples, n_classes)`` array, or a tuple whose first element
            is that array; ``labels`` holds the integer class ids.

    Returns:
        Dict with ``"roc_auc"`` (AUC of the class-1 softmax probability) and
        ``"accuracy"`` (fraction of argmax predictions equal to the labels).
        ``roc_auc`` falls back to 0.5 when it is undefined, so that
        best-model selection still receives a comparable number.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Some models return extra outputs alongside the logits.
        logits = logits[0]
    logits = np.asarray(logits)
    labels = np.asarray(labels)

    probs = torch.softmax(torch.from_numpy(logits), dim=1).numpy()[:, 1]
    try:
        auc_score = float(roc_auc_score(labels, probs))
    except ValueError as err:
        # Raised when the AUC is undefined, e.g. only one class in `labels`.
        print(f"ROC-AUC undefined ({err}); reporting 0.5")
        auc_score = 0.5
    if np.isnan(auc_score):
        auc_score = 0.5

    preds = logits.argmax(axis=1)
    acc = (preds == labels).mean()
    return {"roc_auc": auc_score, "accuracy": float(acc)}

print("✓ Metrics function defined")


## Training Setup


In [None]:
# Training arguments
training_args = TrainingArguments(
    # Where checkpoints and logs go
    output_dir=OUTPUT_DIR,
    logging_dir="./logs",
    logging_steps=50,
    report_to=[],
    # Evaluate and checkpoint once per epoch; keep the best model by ROC-AUC
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="roc_auc",
    greater_is_better=True,
    # Batch sizes
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    # Optimisation schedule
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    fp16=FP16,
    seed=SEED,
)

# Stop once the monitored metric has failed to improve for two evaluations
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

for summary_line in (
    "✓ Trainer configured",
    f"  Training batch size: {PER_DEVICE_TRAIN_BATCH}",
    f"  Eval batch size: {PER_DEVICE_EVAL_BATCH}",
    f"  Learning rate: {LEARNING_RATE}",
    f"  Max epochs: {NUM_EPOCHS}",
    f"  Early stopping patience: 2",
):
    print(summary_line)


## Train Model


In [None]:
# Train model
rule = "=" * 60
print("Starting training...")
print(rule)

# Wall-clock timing around the full training run
started_at = time.time()
train_result = trainer.train()
elapsed_seconds = time.time() - started_at

duration_min = elapsed_seconds / 60.0
print(f"\n{rule}")
print(f"Training completed in {duration_min:.2f} minutes")
print(f"{rule}")

# Persist the model and the tokenizer side by side in OUTPUT_DIR
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"\n✓ Model saved to {OUTPUT_DIR}")


## Evaluate on Validation Set


In [None]:
# Evaluate on validation set
# A single prediction pass supplies both the metrics and the raw logits, so
# the validation set is not run through the model twice. The "eval" prefix
# keeps the metric keys named eval_*.
preds_out = trainer.predict(val_ds, metric_key_prefix="eval")
eval_result = preds_out.metrics
print("Evaluation result:")
print(eval_result)

# Probability of class 1 for each validation row, plus the true labels
logits = preds_out.predictions
probs = torch.softmax(torch.from_numpy(logits), dim=1).numpy()[:, 1]
labels = preds_out.label_ids

# Calculate ROC-AUC (fpr/tpr are reused by the ROC plot)
fpr, tpr, _ = roc_curve(labels, probs)
roc_auc_val = auc(fpr, tpr)
print(f"\nValidation ROC-AUC: {roc_auc_val:.4f}")


## Visualizations


In [None]:
# ROC curve, with the chance diagonal for reference
plt.figure(figsize=(7, 6))
plt.plot(fpr, tpr, label=f'ROC (AUC = {roc_auc_val:.4f})')
plt.plot([0, 1], [0, 1], '--', color='gray')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Validation ROC Curve")
plt.legend()
plt.grid(alpha=0.3)
plt.savefig(os.path.join(PLOT_DIR, "val_roc_curve.png"), dpi=150, bbox_inches='tight')
plt.show()

# Precision-Recall curve
precision, recall, _ = precision_recall_curve(labels, probs)
plt.figure(figsize=(7, 6))
plt.plot(recall, precision, label="PR curve")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Validation Precision-Recall Curve")
plt.legend()  # without this call the curve's label is never drawn
plt.grid(alpha=0.3)
plt.savefig(os.path.join(PLOT_DIR, "val_pr_curve.png"), dpi=150, bbox_inches='tight')
plt.show()

# Confusion matrix (threshold 0.5)
preds_bin = (probs >= 0.5).astype(int)
cm = confusion_matrix(labels, preds_bin)
disp = ConfusionMatrixDisplay(cm, display_labels=[0, 1])
disp.plot(cmap="Blues")
plt.title("Validation Confusion Matrix")
plt.savefig(os.path.join(PLOT_DIR, "val_confusion_matrix.png"), dpi=150, bbox_inches='tight')
plt.show()

print("✓ Validation plots saved")


## Training History Visualization


In [None]:
# Training log visualization
def _plot_logged_metric(frame, column, label):
    """Plot one logged metric against epoch; return True if a line was drawn.

    A metric column is NaN on every log row that did not record it. Those
    rows are dropped first, because matplotlib breaks a line at each NaN and
    would otherwise leave disconnected markers.
    """
    if column not in frame.columns:
        return False
    points = frame[["epoch", column]].dropna()
    if points.empty:
        return False
    plt.plot(points["epoch"], points[column], marker="o", label=label)
    return True


plots_saved = False
log_history = trainer.state.log_history
if log_history:
    df_logs = pd.DataFrame(log_history)
    # Only rows tagged with an epoch can be placed on the x-axis.
    if "epoch" in df_logs.columns:
        df_logs = df_logs[df_logs["epoch"].notna()]
    else:
        df_logs = df_logs.iloc[0:0]

    if not df_logs.empty:
        # Loss curves
        plt.figure(figsize=(9, 4))
        drew_train = _plot_logged_metric(df_logs, "loss", "train_loss")
        drew_eval = _plot_logged_metric(df_logs, "eval_loss", "eval_loss")
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.title("Training / Validation Loss")
        if drew_train or drew_eval:
            plt.legend()
        plt.grid(alpha=0.3)
        plt.savefig(os.path.join(PLOT_DIR, "losses.png"), dpi=150, bbox_inches='tight')
        plt.show()
        plots_saved = True

        # ROC-AUC by epoch
        if "eval_roc_auc" in df_logs.columns:
            plt.figure(figsize=(9, 4))
            if _plot_logged_metric(df_logs, "eval_roc_auc", "eval_roc_auc"):
                plt.legend()
            plt.xlabel("Epoch")
            plt.ylabel("ROC-AUC")
            plt.title("Validation ROC-AUC by Epoch")
            plt.grid(alpha=0.3)
            plt.savefig(os.path.join(PLOT_DIR, "val_roc_by_epoch.png"), dpi=150, bbox_inches='tight')
            plt.show()

if plots_saved:
    print("✓ Training history plots saved")
else:
    print("No training history available to plot")


## Save Validation Predictions


In [None]:
# Save validation predictions
# Attach the class-1 probability and the thresholded label to a copy of the
# validation frame, then write it out.
val_out = val_df.assign(pred_prob_ai=probs, pred_label=preds_bin)
val_out.to_csv("val_predictions_with_probs.csv", index=False)
print("✓ Saved val_predictions_with_probs.csv")

summary_columns = ["label", "pred_prob_ai", "pred_label"]
print(f"\nValidation predictions summary:")
print(val_out[summary_columns].describe())


## Predict on Test Set


In [None]:
# Predict on test set
print(f"Tokenized test set size: {len(test_ds)}")
print("Predicting on test set...")

# Probability of class 1 for every test row
test_preds = trainer.predict(test_ds)
test_logits = test_preds.predictions
test_probs = torch.softmax(torch.from_numpy(test_logits), dim=1).numpy()[:, 1]

# Retrieve original ids from test_df
ids = test_df["id"].tolist()
if len(ids) != len(test_probs):
    # A count mismatch means ids and predictions may not line up, so report
    # it instead of falling back silently.
    print(f"WARNING: {len(ids)} test ids but {len(test_probs)} predictions")
    if "orig_id" in test_ds.column_names:
        # Fallback: ids carried inside the dataset. Read through a plain
        # (unformatted) view so they come back as Python values rather than
        # in the dataset's torch output format.
        ids = list(test_ds.with_format(None)["orig_id"])
    else:
        print("WARNING: no 'orig_id' column; using sequential ids")
        ids = list(range(len(test_probs)))

# Create submission
submission_df = pd.DataFrame({"id": ids, "is_cheating": test_probs})
submission_csv = "submission_transfer_learning.csv"
submission_df.to_csv(submission_csv, index=False)

print(f"✓ Saved submission to {submission_csv} ({len(submission_df)} rows)")
print(f"\nSubmission statistics:")
print(submission_df["is_cheating"].describe())


## Test Set Probability Distribution


In [None]:
# Plot test distribution
# Histogram (with KDE overlay) of the predicted class-1 probabilities
test_hist_path = os.path.join(PLOT_DIR, "test_prob_dist.png")
plt.figure(figsize=(10, 5))
sns.histplot(test_probs, bins=50, kde=True)
plt.title("Test Set Predicted Probability Distribution (is_cheating)", fontweight='bold')
plt.xlabel("Predicted Probability")
plt.ylabel("Frequency")
plt.grid(alpha=0.3)
plt.savefig(test_hist_path, dpi=150, bbox_inches='tight')
plt.show()

# Closing summary of the artefacts produced by the run
rule = '=' * 60
for summary_line in (
    f"\n{rule}",
    "ALL DONE!",
    f"{rule}",
    f"✓ Model trained and evaluated",
    f"✓ Plots saved in {PLOT_DIR}",
    f"✓ Submission saved: {submission_csv}",
    f"✓ Model saved: {OUTPUT_DIR}",
    f"\nFinal Validation ROC-AUC: {roc_auc_val:.4f}",
):
    print(summary_line)
