In [2]:
# Mount Google Drive so the data files and checkpoints under MyDrive are reachable.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [1]:
pip install raid-bench



In [3]:
!pip install datasets

from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.nn import functional as F
from torch.optim import AdamW
from tqdm.auto import tqdm
from transformers import TrainingArguments, Trainer, get_linear_schedule_with_warmup, AutoTokenizer, AutoModelForSequenceClassification



# Data Pre-processing

## Importing our data

Let's start by importing the shared dataset. The dataset has label and source mislabeled. The first step is to switch the column names for label and source.

In [None]:
# Read the train/test splits from Drive and preview the first rows of each
train_df = pd.read_parquet('/content/drive/MyDrive/train_set_final.parquet')
test_df = pd.read_parquet('/content/drive/MyDrive/test_set_final.parquet')

for split_df in (train_df, test_df):
    print(split_df.head())

In [None]:
# For each split: rows per (source, label) pair, then the total rows per source
for split_name, split_df in (("train", train_df), ("test", test_df)):
    print(f"Distribution of {split_name} labels and sources:")
    print(split_df.groupby(['source', 'label']).size().unstack(fill_value=0))
    print("\nTotal counts:")
    print(split_df.groupby('source').size())

Distribution of train labels and sources:
label          0       1
source                  
facebook   22647   47454
reddit    217917  192596
twitter    72000   64720

Total counts:
source
facebook     70101
reddit      410513
twitter     136720
dtype: int64
Distribution of test labels and sources:
label         0      1
source                
facebook   5662  11863
reddit    54480  48149
twitter   18000  16180

Total counts:
source
facebook     17525
reddit      102629
twitter      34180
dtype: int64


# Baseline

Answerdotai-ModernBERT-base-ai-detector is based on ModernBERT-base, a lightweight and efficient BERT-based model.

It has been fine-tuned for AI-generated vs Human-written text classification, allowing it to distinguish between texts written by AI models (ChatGPT, DeepSeek, Claude, etc.) and human authors.


📊 Training and Evaluation Data

The model was fine-tuned on 35,894 training samples and 8,974 test samples.
The dataset consists of AI-generated text samples (ChatGPT, Claude, DeepSeek, etc.) and human-written samples (Wikipedia, books, articles).
Labels:
1 → AI-generated text
0 → Human-written text

Learn more here: https://huggingface.co/AICodexLab/answerdotai-ModernBERT-base-ai-detector

## Test model out of the box

In [4]:
def load_model_from_transformers(model_name: str, device: torch.device):
    """
    Fetch a sequence-classification checkpoint together with its tokenizer.

    The model is placed on `device` and switched to evaluation mode before the
    pair is handed back.

    Returns:
        tuple: `(tokenizer, model)`.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    classifier = AutoModelForSequenceClassification.from_pretrained(model_name)
    # Placement first, then inference mode
    classifier.to(device)
    classifier.eval()
    return tokenizer, classifier

def evaluate_predictions(true_labels, predictions):
    """
    Score predictions against the ground truth.

    The true labels are cast to `int` first. Returns a tuple of
    `(accuracy, classification_report, confusion_matrix)`.
    """
    y_true = list(map(int, true_labels))
    return (
        accuracy_score(y_true, predictions),
        classification_report(y_true, predictions),
        confusion_matrix(y_true, predictions),
    )

def get_predictions(model, tokenizer, texts, device, batch_size=32, max_length=256):
    """
    Predict a class id for every text, running the model in mini-batches.

    Args:
        model: Sequence-classification `torch.nn.Module` whose output exposes
            `.logits`; it must already be on `device`.
        tokenizer: Tokenizer that returns a dict of tensors when called with
            `return_tensors='pt'`.
        texts: Sliceable sequence of strings (e.g. a list).
        device: Device the tokenized inputs are moved to.
        batch_size (int): Number of texts per forward pass.
        max_length (int): Inputs are truncated to this many tokens.

    Returns:
        list[int]: Argmax class index per text, in input order (empty list for
        empty input).
    """
    all_predictions = []

    # Inference must run in eval mode (a model that has just been trained may
    # still be in training mode). The previous mode is restored afterwards so
    # the caller's model state is left as it was.
    was_training = model.training
    model.eval()
    try:
        for i in tqdm(range(0, len(texts), batch_size), desc="Predicting"):
            batch_texts = texts[i:i + batch_size]
            inputs = tokenizer(
                batch_texts,
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=max_length
            )
            inputs = {key: value.to(device) for key, value in inputs.items()}

            with torch.no_grad():
                logits = model(**inputs).logits

            # .tolist() yields plain Python ints rather than numpy scalars
            all_predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())
    finally:
        model.train(was_training)

    return all_predictions

def run_evaluation(model, tokenizer, test_df, save_file=False,
                   batch_size: int = 32,
                   max_length: int = 256,
                   save_path: str = '/content/drive/MyDrive/test_data_with_predictions.csv'):
    """
    Evaluate model on a labelled DataFrame and print the results.

    Prints overall accuracy, the classification report, the confusion matrix,
    the average token length of correctly vs. incorrectly classified texts and,
    when `test_df` has a 'source' column, the accuracy per source.

    Args:
        model: Sequence-classification model. It is moved to the GPU when one
            is available (otherwise the CPU) before inference.
        tokenizer: Tokenizer matching `model`.
        test_df (pd.DataFrame): Needs 'text' and 'label' columns; 'source' is
            optional.
        save_file (bool): When True, write `test_df` plus a 'predicted_label'
            column to `save_path` as CSV.
        batch_size (int): Batch size for inference and for token counting.
        max_length (int): Truncation length used for inference.
        save_path (str): Destination of the CSV written when `save_file` is True.

    Returns:
        None
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    # Inputs are sent to `device` during inference, so the model has to be
    # there as well; a CPU-resident model would otherwise fail on a GPU runtime.
    model.to(device)

    # --- Inference ---
    texts       = test_df['text'].tolist()
    true_labels = test_df['label'].tolist()
    predictions = get_predictions(model, tokenizer, texts, device,
                                  batch_size=batch_size, max_length=max_length)

    # --- Convert labels and predictions to int for consistency ---
    try:
        true_labels = [int(label) for label in true_labels]
        predictions = [int(pred) for pred in predictions]
    except Exception as e:
        print("Error converting labels to int:", e)
        raise

    # --- Global Metrics ---
    accuracy_val = accuracy_score(true_labels, predictions)
    report       = classification_report(true_labels, predictions, digits=3)
    cm           = confusion_matrix(true_labels, predictions)

    print(f"\nOVERALL ACCURACY: {accuracy_val:.4f}")
    print("\nClassification Report:\n", report)
    print("\nConfusion Matrix:\n", cm)

    # --- Average token lengths for correct vs. incorrect predictions ---
    # Token counts include special tokens. Texts are tokenized in batches
    # (no padding is requested, so each entry keeps its own length) instead of
    # one tokenizer call per row.
    token_lengths = []
    for start in range(0, len(texts), batch_size):
        encoded = tokenizer(texts[start:start + batch_size],
                            add_special_tokens=True, truncation=True)
        token_lengths.extend(len(ids) for ids in encoded['input_ids'])

    correct_token_lengths   = [n for n, true, pred in zip(token_lengths, true_labels, predictions) if true == pred]
    incorrect_token_lengths = [n for n, true, pred in zip(token_lengths, true_labels, predictions) if true != pred]

    avg_correct   = sum(correct_token_lengths)   / len(correct_token_lengths)   if correct_token_lengths else 0
    avg_incorrect = sum(incorrect_token_lengths) / len(incorrect_token_lengths) if incorrect_token_lengths else 0

    print("\nAverage Token Lengths:")
    print(f"Correct predictions average token length:   {avg_correct:.2f}")
    print(f"Incorrect predictions average token length: {avg_incorrect:.2f}")

    # --- Per‑source Breakdown (if available) ---
    if 'source' in test_df.columns:
        per_src = defaultdict(lambda: {'y_true': [], 'y_pred': []})
        for src, y_true, y_pred in zip(test_df['source'], true_labels, predictions):
            per_src[src]['y_true'].append(y_true)
            per_src[src]['y_pred'].append(y_pred)

        rows = [
            {'source': src, 'n': len(d['y_true']), 'accuracy': accuracy_score(d['y_true'], d['y_pred'])}
            for src, d in per_src.items()
        ]
        print("\nPer‑Source Accuracy")
        print(pd.DataFrame(rows).sort_values('accuracy', ascending=False).to_string(index=False))

    if save_file:
        # `assign` returns a new frame, so the caller's DataFrame is not modified
        test_df.assign(predicted_label=predictions).to_csv(save_path, index=False)



def show_misclassified_examples(tokenizer, tokenized_dataset, predictions, max_examples=5):
    """
    Display up to `max_examples` misclassified examples with decoded text.

    Parameters:
        tokenizer: HuggingFace tokenizer used to encode the data
        tokenized_dataset: Tokenized dataset (must expose 'input_ids' and 'label' columns)
        predictions: List or array of predicted labels, aligned with the dataset rows
        max_examples: Max number of misclassified examples to print; nothing is
            printed when this is 0 or negative
    """
    if max_examples <= 0:
        return

    # Read each column once. Looking the column up again inside the loop would
    # repeat the (potentially whole-column) access for every misclassified row.
    labels = tokenized_dataset['label']
    input_ids = tokenized_dataset['input_ids']

    wrong_count = 0
    for i, (true, pred) in enumerate(zip(labels, predictions)):
        if true != pred:
            print(f"Example {i+1}:")
            print(f"True: {true}, Predicted: {pred}")
            text = tokenizer.decode(input_ids[i], skip_special_tokens=True)
            print(f"Text: {text}\n")
            wrong_count += 1
            if wrong_count >= max_examples:
                break

def preprocess_df(df, tokenizer, max_length=512, remove_columns=("text", "source"), convert_labels=True):
    """
    Convert a pandas DataFrame to a tokenized Hugging Face Dataset and convert labels to integers.

    Args:
        df (pd.DataFrame): The input DataFrame with at least a 'text' column
            (and usually 'label').
        tokenizer (PreTrainedTokenizer): The tokenizer to use.
        max_length (int): Every text is padded/truncated to exactly this many tokens.
        remove_columns (str | Iterable[str]): Columns to drop after tokenization.
            Names that are not present in the dataset are ignored, so the
            default also works for frames without a 'source' column.
        convert_labels (bool): Whether to convert labels to integers.

    Returns:
        Dataset: A tokenized dataset formatted for PyTorch.
    """
    # Convert DataFrame to Hugging Face dataset
    dataset = Dataset.from_pandas(df)

    def _tokenize(batch):
        return tokenizer(
            batch["text"],
            padding="max_length",
            truncation=True,
            max_length=max_length
        )

    # Tokenize the dataset; using batched=True for efficiency
    tokenized_dataset = dataset.map(_tokenize, batched=True)

    # Drop the requested columns, skipping any that this dataset does not have
    if isinstance(remove_columns, str):
        remove_columns = [remove_columns]
    present = set(tokenized_dataset.column_names)
    to_drop = [name for name in (remove_columns or []) if name in present]
    if to_drop:
        tokenized_dataset = tokenized_dataset.remove_columns(to_drop)

    # Optionally, convert labels to integers
    if convert_labels and "label" in tokenized_dataset.column_names:
        tokenized_dataset = tokenized_dataset.map(
            lambda examples: {"label": [int(label) for label in examples["label"]]},
            batched=True
        )

    # Set the dataset format to torch for PyTorch models
    tokenized_dataset.set_format("torch")

    return tokenized_dataset

In [None]:
model_name = "AICodexLab/answerdotai-ModernBERT-base-ai-detector"

# Load the model onto the device `run_evaluation` sends its inputs to (CUDA
# whenever it is available); a model pinned to the CPU would otherwise clash
# with GPU inputs.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer, model = load_model_from_transformers(model_name, device)

# Fixed seed so the 500-row smoke-test sample is the same on every run
run_evaluation(
    model=model,
    tokenizer=tokenizer,
    test_df=test_df.sample(500, random_state=42),
    batch_size=2,
    max_length=128
)

Using device: cpu


Predicting:   0%|          | 0/250 [00:00<?, ?it/s]

KeyboardInterrupt: 

# Fine-tuning ModernBERT to our data

## Pre-processing by stratifying dataset

In [None]:
# Carve a stratified train/eval split out of the training data
from sklearn.model_selection import train_test_split

# One stratum per (label, source) pair, e.g. "1reddit"
df = train_df.assign(concatid=train_df['label'].astype('str') + train_df['source'])

train_data, eval_data = train_test_split(
    df,
    stratify=df['concatid'],
    train_size=400_000,
    test_size=100_000,
    random_state=42,
)

Next, I fine-tune the model on our data to improve its accuracy.

In [None]:
# Load the pretrained detector checkpoint and its tokenizer

checkpoint = "AICodexLab/answerdotai-ModernBERT-base-ai-detector"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.58M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/598M [00:00<?, ?B/s]

In [None]:
# Tokenize the train and eval splits with the same settings (512-token inputs)
train_tokenized, eval_tokenized = (
    preprocess_df(split_df, tokenizer, max_length=512)
    for split_df in (train_data, eval_data)
)
# test_tokenized = preprocess_df(test_df, tokenizer, max_length=512)

Map:   0%|          | 0/400000 [00:00<?, ? examples/s]

Map:   0%|          | 0/400000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [None]:
# Fine-tuning configuration. Evaluation and checkpointing share the same
# 500-step cadence, and only the two most recent checkpoints are kept.
train_config = dict(
    output_dir="train_output_1",
    # schedule
    num_train_epochs=3,
    learning_rate=3e-5,
    warmup_steps=500,
    weight_decay=0.01,
    # batching: 32 per device x 4 accumulation steps per optimizer update
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,
    fp16=True,
    # evaluation / checkpoints
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    # logging / reporting
    logging_steps=100,
    report_to="none",
    push_to_hub=False,
)
training_args = TrainingArguments(**train_config)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=eval_tokenized,
)





In [None]:
# Run fine-tuning; the returned TrainOutput is echoed as the cell result
train_output = trainer.train()
train_output

Step,Training Loss,Validation Loss
500,0.1365,0.091634
1000,0.1928,0.044599
1500,0.1574,0.035925
2000,0.1468,0.039154
2500,0.1601,0.029826
3000,0.1329,0.029056
3500,0.0639,0.03809
4000,0.0637,0.04033
4500,0.0725,0.044819
5000,0.0579,0.037646


TrainOutput(global_step=9375, training_loss=0.08287629855473837, metrics={'train_runtime': 9930.5709, 'train_samples_per_second': 120.839, 'train_steps_per_second': 0.944, 'total_flos': 4.089094152192e+17, 'train_loss': 0.08287629855473837, 'epoch': 3.0})

## Hyperparameter Optimization

In [None]:
def create_training_args(config):
    """
    Create a TrainingArguments object given a configuration dictionary.

    The notebook's default hyperparameters are used for every key the config
    does not set. Any other key in `config` (e.g. 'warmup_steps', 'eval_steps',
    'save_total_limit') is forwarded to `TrainingArguments` unchanged instead of
    being silently ignored.

    Args:
        config (dict): Hyperparameter overrides keyed by `TrainingArguments`
                       parameter name ('output_dir', 'learning_rate',
                       'per_device_train_batch_size', 'num_train_epochs', etc.).
                       The dictionary itself is not modified.

    Returns:
        TrainingArguments: The training arguments object to be used with Trainer.
    """
    defaults = {
        "output_dir": "train_output",
        "evaluation_strategy": "epoch",
        "per_device_train_batch_size": 16,
        "per_device_eval_batch_size": 16,
        "gradient_accumulation_steps": 4,
        "weight_decay": 0.01,
        "num_train_epochs": 3,
        "logging_steps": 50,
        "save_strategy": "epoch",
        "push_to_hub": False,
        "report_to": "none",
        "learning_rate": 2e-5,
        "fp16": True,
    }
    # Later entries win, so values from `config` override the defaults
    return TrainingArguments(**{**defaults, **config})

def train_model(model, train_dataset, eval_dataset, training_config):
    """
    Run one fine-tuning job described by a hyperparameter dictionary.

    Args:
        model: Model to fine-tune (pretrained or already fine-tuned).
        train_dataset: Tokenized training dataset.
        eval_dataset: Tokenized evaluation dataset.
        training_config (dict): Hyperparameter settings, turned into
            `TrainingArguments` by `create_training_args`.

    Returns:
        Trainer: The trainer after `train()` has completed, from which logs and
        metrics can be read.
    """
    fitted_trainer = Trainer(
        model=model,
        args=create_training_args(training_config),
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )
    fitted_trainer.train()
    return fitted_trainer

# Grid of runs to compare: each learning rate at each per-device batch size,
# three epochs per run. Extend either tuple to widen the search.
_LEARNING_RATES = (("2e-5", 2e-5), ("3e-5", 3e-5))  # (tag used in output_dir, value)
_BATCH_SIZES = (16, 32)

hyperparameter_configs = [
    {
        "output_dir": f"train_output_lr{lr_tag}_bs{batch_size}",
        "learning_rate": lr,
        "per_device_train_batch_size": batch_size,
        "per_device_eval_batch_size": batch_size,
        "num_train_epochs": 3,
    }
    for batch_size in _BATCH_SIZES
    for lr_tag, lr in _LEARNING_RATES
]


In [None]:

# Assumes `model`, `train_tokenized`, and `eval_tokenized` were defined by
# earlier cells.
import copy

# One summary row per configuration, for side-by-side comparison after the sweep
hpo_results = []

for config in hyperparameter_configs:
    print(f"Starting training with configuration: {config}")
    # Every run starts from its own copy of the current `model` weights, so the
    # configurations are compared from an identical starting point.
    model_copy = copy.deepcopy(model)
    # A dedicated name keeps the notebook-level `trainer` (used by the save
    # cell further down) from being replaced by whichever config runs last.
    hpo_trainer = train_model(model_copy, train_tokenized, eval_tokenized, config)
    eval_results = hpo_trainer.evaluate()
    hpo_results.append({"output_dir": config["output_dir"], **eval_results})
    print(f"Evaluation results for {config['output_dir']}: {eval_results}")
    print("-" * 40)

    # Drop this run's model before the next copy is made, to limit GPU memory use
    del hpo_trainer, model_copy
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

Starting training with configuration: {'output_dir': 'train_output_lr2e-5_bs16', 'learning_rate': 2e-05, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 16, 'num_train_epochs': 3}




Epoch,Training Loss,Validation Loss
1,1.5882,0.171359
2,0.1078,0.169324


Evaluation results for train_output_lr2e-5_bs16: {'eval_loss': 0.16932375729084015, 'eval_runtime': 10.6775, 'eval_samples_per_second': 468.275, 'eval_steps_per_second': 29.314, 'epoch': 2.9712460063897765}
----------------------------------------
Starting training with configuration: {'output_dir': 'train_output_lr3e-5_bs16', 'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 16, 'num_train_epochs': 3}




Epoch,Training Loss,Validation Loss
1,1.5872,0.141576
2,0.0723,0.169138


Evaluation results for train_output_lr3e-5_bs16: {'eval_loss': 0.16913793981075287, 'eval_runtime': 10.8886, 'eval_samples_per_second': 459.194, 'eval_steps_per_second': 28.746, 'epoch': 2.9712460063897765}
----------------------------------------
Starting training with configuration: {'output_dir': 'train_output_lr2e-5_bs32', 'learning_rate': 2e-05, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 32, 'num_train_epochs': 3}




Epoch,Training Loss,Validation Loss
1,No log,0.177145
2,0.368600,0.153765


Evaluation results for train_output_lr2e-5_bs32: {'eval_loss': 0.1537652611732483, 'eval_runtime': 9.4207, 'eval_samples_per_second': 530.747, 'eval_steps_per_second': 16.665, 'epoch': 2.9426751592356686}
----------------------------------------
Starting training with configuration: {'output_dir': 'train_output_lr3e-5_bs32', 'learning_rate': 3e-05, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 32, 'num_train_epochs': 3}




Epoch,Training Loss,Validation Loss
1,No log,0.171115
2,0.257600,0.148992


Evaluation results for train_output_lr3e-5_bs32: {'eval_loss': 0.14899218082427979, 'eval_runtime': 9.4286, 'eval_samples_per_second': 530.303, 'eval_steps_per_second': 16.652, 'epoch': 2.9426751592356686}
----------------------------------------


In [10]:
# Use the GPU when the runtime has one, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Put the model on that device (the bare expression echoes the model summary)
model.to(device)

ModernBertForSequenceClassification(
  (model): ModernBertModel(
    (embeddings): ModernBertEmbeddings(
      (tok_embeddings): Embedding(50368, 768, padding_idx=50283)
      (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (layers): ModuleList(
      (0): ModernBertEncoderLayer(
        (attn_norm): Identity()
        (attn): ModernBertAttention(
          (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
          (rotary_emb): ModernBertRotaryEmbedding()
          (Wo): Linear(in_features=768, out_features=768, bias=False)
          (out_drop): Identity()
        )
        (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): ModernBertMLP(
          (Wi): Linear(in_features=768, out_features=2304, bias=False)
          (act): GELUActivation()
          (drop): Dropout(p=0.0, inplace=False)
          (Wo): Linear(in_features=1152, out_features=768, bias=False)
        )
      

In [None]:
# Score the model on the complete test set and write the predictions CSV
full_test_eval = dict(test_df=test_df, batch_size=32, max_length=512, save_file=True)
run_evaluation(model=model, tokenizer=tokenizer, **full_test_eval)

Using device: cuda


Predicting:   0%|          | 0/4823 [00:00<?, ?it/s]


OVERALL ACCURACY: 0.9887

Classification Report:
               precision    recall  f1-score   support

           0      0.985     0.993     0.989     78142
           1      0.993     0.984     0.988     76192

    accuracy                          0.989    154334
   macro avg      0.989     0.989     0.989    154334
weighted avg      0.989     0.989     0.989    154334


Confusion Matrix:
 [[77578   564]
 [ 1187 75005]]

Average Token Lengths:
Correct predictions average token length:   102.45
Incorrect predictions average token length: 53.60

Per‑Source Accuracy
  source      n  accuracy
  reddit 102629  0.993101
 twitter  34180  0.990404
facebook  17525  0.959201


In [None]:
# Persist the trainer's model and the tokenizer side by side on Drive

save_dir = "/content/drive/MyDrive/"
trainer.model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('/content/drive/MyDrive/tokenizer_config.json',
 '/content/drive/MyDrive/special_tokens_map.json',
 '/content/drive/MyDrive/tokenizer.json')

In [None]:
# Save RAID Model

# trainer.model.save_pretrained("/content/drive/MyDrive/raid/")
# tokenizer.save_pretrained("/content/drive/MyDrive/raid/")

('/content/drive/MyDrive/raid/tokenizer_config.json',
 '/content/drive/MyDrive/raid/special_tokens_map.json',
 '/content/drive/MyDrive/raid/tokenizer.json')

In [8]:
# Restore the tokenizer and classifier from the RAID checkpoint folder on Drive
raid_checkpoint_dir = "/content/drive/MyDrive/raid/"
tokenizer = AutoTokenizer.from_pretrained(raid_checkpoint_dir)
model = AutoModelForSequenceClassification.from_pretrained(raid_checkpoint_dir)


# Testing model performance on RAID Dataset

In [2]:
pip install raid-bench

Collecting raid-bench
  Downloading raid_bench-0.1.0-py3-none-any.whl.metadata (14 kB)
Collecting numpy~=1.26.4 (from raid-bench)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn~=1.3.2 (from raid-bench)
  Downloading scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading raid_bench-0.1.0-py3-none-any.whl (11 kB)
Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m93.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m102.0 MB/s[0m eta [36m0:00:00

In [5]:
from raid.utils import load_data

# The labelled RAID *train* split serves as an external test set for our model.
# `include_adversarial=False` requests the variant without adversarial attacks,
# i.e. without samples crafted specifically to trick detectors.
raid_df = load_data(include_adversarial=False, split="train")

Downloading https://dataset.raid-bench.xyz/train_none.csv (801662741B) to /root/.cache/raid/train_none.csv


100%|██████████| 802M/802M [00:18<00:00, 42.6MB/s]


In [6]:
# RAID is large, so evaluate on a fixed-size random sample; the seed makes the
# sample (and therefore the reported metrics) reproducible.
RAID_SAMPLE_SIZE = 50_000

raid_test_df = (
    raid_df
    # Rename columns to the expected names
    .rename(columns={"generation": "text", "model": "label"})
    # Correct the labeling to our schema: "human" -> 0, any generator -> 1
    .assign(label=lambda frame: frame["label"].apply(lambda x: 0 if x == "human" else 1))
    .sample(RAID_SAMPLE_SIZE, random_state=42)
)




In [None]:
# `raid_test_df` already carries 0/1 labels from the cell that built it.
# Re-applying the mapping would compare integers with "human" and silently turn
# every label into 1, so only map while the column still holds generator names.
if not pd.api.types.is_numeric_dtype(raid_test_df["label"]):
    raid_test_df["label"] = raid_test_df["label"].apply(lambda x: 0 if x == "human" else 1)

# Convert labels to integers (if they aren’t already)
raid_test_df["label"] = raid_test_df["label"].astype(int)

raid_test_tokenized = preprocess_df(raid_test_df, tokenizer,
                                    remove_columns=['id', 'adv_source_id', 'source_id',
                                                    'decoding', 'repetition_penalty', 'attack',
                                                    'domain', 'title', 'prompt',
                                                    '__index_level_0__'], max_length=512)

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [None]:
# How many sampled rows carry label 1 versus label 0
label_counts = raid_test_df['label'].value_counts()
print(label_counts)


label
1    48519
0     1481
Name: count, dtype: int64


In [11]:
# Score the model on the RAID sample
raid_eval = dict(test_df=raid_test_df, batch_size=32, max_length=512)
run_evaluation(model=model, tokenizer=tokenizer, **raid_eval)

Using device: cuda


Predicting:   0%|          | 0/1563 [00:00<?, ?it/s]


OVERALL ACCURACY: 0.9944

Classification Report:
               precision    recall  f1-score   support

           0      0.872     0.932     0.901      1370
           1      0.998     0.996     0.997     48630

    accuracy                          0.994     50000
   macro avg      0.935     0.964     0.949     50000
weighted avg      0.995     0.994     0.994     50000


Confusion Matrix:
 [[ 1277    93]
 [  187 48443]]

Average Token Lengths:
Correct predictions average token length:   318.20
Incorrect predictions average token length: 271.06


In [None]:
# `run_evaluation` prints its report but does not hand the predictions back, so
# compute them here. `raid_test_tokenized` was built from `raid_test_df`, so the
# predictions line up with its rows.
predictions = get_predictions(
    model,
    tokenizer,
    raid_test_df["text"].tolist(),
    model.device,  # send inputs to wherever the model currently lives
    batch_size=32,
    max_length=512,
)

# Print the first five misclassified RAID examples
show_misclassified_examples(tokenizer, raid_test_tokenized, predictions, max_examples=5)

Example 346:
True: 1, Predicted: 0
Text: The unranked queue is filled with smurfs right now because there is literally no reason not to smurf. You can play 30+ games and literally get reported 0 times because nobody cares to report. If you grind the rank system hard, this results in a bunch of smearfasces still hanging around in the unranked queue, ruining the experience for new players or people who just want to play unranked for a bit for a change of pace.

I think a system similar to Overwatchs would be great. Occasionally when someone leaves a game, after the match is over you can verify if they were smurfing, and if so, mark them as such. Afterwards, you can look at your report history and see what your average reports are, and if they are too high compared to games played, you get put in a penalty box for a while where you have increased chances of being matched with other reported players and players who have earned a lot of ranking points recently. 

This would result in tons o

In [None]:
# Training configuration for the run on `combined_dataset`, evaluated and
# checkpointed once per epoch.
# NOTE(review): `combined_dataset` is not defined in any cell visible in this
# notebook; confirm where it is built before re-running.
combined_run_config = {
    "output_dir": "train_output_1",
    "num_train_epochs": 3,
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "gradient_accumulation_steps": 4,
    "fp16": True,  # if you're on a GPU with FP16 support
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "logging_steps": 250,
    "report_to": "none",
    "push_to_hub": False,
}
training_args = TrainingArguments(**combined_run_config)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=combined_dataset,
    eval_dataset=eval_tokenized,
)



In [None]:
# Launch the run configured in the previous cell; TrainOutput is the cell result
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss
1,0.1672,0.050627
2,0.0049,0.110077


TrainOutput(global_step=7029, training_loss=0.09242210898223481, metrics={'train_runtime': 3623.166, 'train_samples_per_second': 124.201, 'train_steps_per_second': 1.94, 'total_flos': 1.532810573263012e+17, 'train_loss': 0.09242210898223481, 'epoch': 2.9988266666666665})

## Testing Mistral Data

In [None]:
# Load the LinkedIn and Mistral validation sets from Drive
linkedin_raw_df, mistral_raw_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '/content/drive/MyDrive/matt-266-project/validation_linkedin.parquet',
        '/content/drive/MyDrive/matt-266-project/validation_mistral.parquet',
    )
)


In [None]:
# Preview the first rows of each raw validation frame
for raw_df in (linkedin_raw_df, mistral_raw_df):
    print(raw_df.head())

                                                text  label    source
0  Need a post idea? I gotchu... Suno is an AI-ge...  human  linkedin
1  I recently got called out .... Last week, I wr...  human  linkedin
2  🔍 Can you spot AI-generated content? It’s gett...  human  linkedin
3  👾 AI just got smarter! 👉 Here are the latest r...  human  linkedin
4  This week, I saw a master of processes in acti...  human  linkedin
                                                text  label  source
0  We have. The Chandra X-Ray telescope has been ...  human  reddit
1  Yes.\n\nThe only issue would be the sheer size...  human  reddit
2  The final scene involves future contracts.  Th...  human  reddit
3  Legislative bodies write their own rules. That...  human  reddit
4  There's a very approachable book which discuss...  human  reddit


In [None]:
# Re-encode the string labels to our schema: "human" -> 0, anything else -> 1
linkedin_df = linkedin_raw_df.assign(
    label=linkedin_raw_df["label"].ne("human").astype(int)
)
mistral_df = mistral_raw_df.assign(
    label=mistral_raw_df["label"].ne("human").astype(int)
)

In [None]:
# Evaluate the current model on both validation sets: Mistral first, then LinkedIn
for validation_df in (mistral_df, linkedin_df):
    run_evaluation(
        model=model,
        tokenizer=tokenizer,
        test_df=validation_df,
        batch_size=32,
        max_length=512,
    )


Using device: cpu


Predicting:   0%|          | 0/614 [00:00<?, ?it/s]

Compiling the model with `torch.compile` and using a `torch.cpu` device is not supported. Falling back to non-compiled mode.


KeyboardInterrupt: 

In [None]:
# Restore the tokenizer and classifier previously saved to the Drive root
saved_model_dir = "/content/drive/MyDrive/"
tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)
model = AutoModelForSequenceClassification.from_pretrained(saved_model_dir)

In [None]:
# Rebuild the 0/1-labelled frames from the raw validation data
# ("human" -> 0, anything else -> 1) ...
linkedin_df = linkedin_raw_df.copy()
mistral_df = mistral_raw_df.copy()
for labelled_df, raw_df in ((linkedin_df, linkedin_raw_df), (mistral_df, mistral_raw_df)):
    labelled_df["label"] = raw_df["label"].map(lambda x: 0 if x == "human" else 1).astype(int)

# ... then tokenize each one, dropping only the 'source' column
linkedin_tokenized, mistral_tokenized = (
    preprocess_df(frame, tokenizer, remove_columns=['source'], max_length=512)
    for frame in (linkedin_df, mistral_df)
)

NameError: name 'linkedin_raw_df' is not defined

In [None]:
# A Trainer is built here only so that `trainer.predict` can be used, so it is
# configured for inference: no training/evaluation datasets are attached (the
# raw pandas frames are not valid Trainer datasets) and training-only
# hyperparameters are left at their defaults.
training_args = TrainingArguments(
    output_dir="train_output_1",
    per_device_eval_batch_size=16,
    report_to="none",
    fp16=torch.cuda.is_available(),  # fp16 needs a GPU; stay in full precision on CPU
)

trainer = Trainer(
    model=model,
    args=training_args,
)



In [None]:
# Score both validation sets with `trainer.predict` and print, for each one,
# the accuracy followed by the classification report.

def _predict_and_report(display_name, tokenized_dataset):
    """Predict on `tokenized_dataset`, print accuracy and report, and return
    `(results, predictions, accuracy, report)`."""
    results = trainer.predict(tokenized_dataset)
    predicted = np.argmax(results.predictions, axis=1)
    labels = tokenized_dataset['label']
    accuracy = accuracy_score(labels, predicted)
    report = classification_report(labels, predicted)

    print(f"{display_name} Accuracy: {accuracy:.4f}")
    print(f"{display_name} Classification Report:")
    # Printed (not echoed) so the report renders as a table, not an escaped string
    print(report)
    return results, predicted, accuracy, report


# Evaluate on LinkedIn dataset
linkedin_results, linkedin_predictions, linkedin_accuracy, linkedin_report = \
    _predict_and_report("LinkedIn", linkedin_tokenized)

# Evaluate on Mistral dataset
mistral_results, mistral_predictions, mistral_accuracy, mistral_report = \
    _predict_and_report("Mistral", mistral_tokenized)


NameError: name 'trainer' is not defined

## LinkedIn Results

In [None]:
# Show up to 15 LinkedIn examples the model got wrong, using the predictions
# computed for `linkedin_tokenized` in the evaluation cell above.
show_misclassified_examples(
    tokenizer,
    linkedin_tokenized,
    linkedin_predictions,
    max_examples=15,
)

# Token Analysis

In [None]:
import pandas as pd

# Reload the test rows together with the saved predicted labels
df = pd.read_csv('/content/drive/MyDrive/test_data_with_predictions.csv')

preview = df.head()
print(preview)


                                                text   source  label  \
0  We still have a dominant foot as well. Think a...   reddit      0   
1  "Only 8 more days until I turn 20! My iPod see...  twitter      1   
2  The AC button controls the air conditioning co...   reddit      1   
3  "Yup, you're right, it's crazy that there's st...   reddit      1   
4  The worst is after being out all day on rough ...   reddit      0   

   predicted_label  
0                0  
1                1  
2                1  
3                1  
4                0  


In [13]:
# Reload the saved predictions (one row per test example, with its source)
df = pd.read_csv('/content/drive/MyDrive/test_data_with_predictions.csv')

# Tokenizer of the base checkpoint, used here to count tokens per text
tokenizer = AutoTokenizer.from_pretrained("AICodexLab/answerdotai-ModernBERT-base-ai-detector")

# source -> mean and standard deviation of the per-text token counts
token_stats = {}

for source in df['source'].unique():
    texts_for_source = df.loc[df['source'] == source, 'text']
    token_lengths = np.array([len(tokenizer.tokenize(text)) for text in texts_for_source])

    token_stats[source] = {
        'average_token_length': token_lengths.mean(),
        'std_dev_token_length': token_lengths.std(),
    }

# Report one three-line summary per source
for source, stats in token_stats.items():
    print(
        f"Source: {source}",
        f"  Average token length: {stats['average_token_length']:.2f}",
        f"  Std. deviation:       {stats['std_dev_token_length']:.2f}\n",
        sep="\n",
    )


Source: reddit
  Average token length: 134.81
  Std. deviation:       134.12

Source: twitter
  Average token length: 24.36
  Std. deviation:       11.74

Source: facebook
  Average token length: 42.78
  Std. deviation:       92.89

