# Prompt Shield: Distilguard

This project shows how a classifier helps detect prompt injection attacks and thus only allows benign prompts to be forwarded to the protected LLM.

The Jupyter notebook consists of several parts:
1) Training the classifier 
2) Evaluating the classifier
3) Using the classifier to filter prompts to the LLM

## 1) Training the classifier

In [None]:
# Import necessary libraries
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from peft import LoraConfig, get_peft_model
import numpy as np
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [2]:
# Load dataset from Hugging Face
dataset = load_dataset("xTRam1/safe-guard-prompt-injection")

In [3]:
# Split the train set into 90% train and 10% validation sets
split = dataset["train"].train_test_split(test_size=0.1)
dataset["train"] = split["train"]
dataset["validation"] = split["test"]
print("New splits:", {k: len(dataset[k]) for k in dataset})

New splits: {'train': 7412, 'test': 2060, 'validation': 824}


In [4]:
# Define text and label column names for tokenizer
text_col = "text"
label_col = "label"

In [9]:
# Choose model and tokenizer
# in this case DistilBERT uncased since it's lightweight and effective for understanding prompts 
MODEL_NAME = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# The maximum sequence length for DistilBERT, which will truncate about 2% of the dataset
MAX_LENGTH = 512


In [11]:

def preprocess(examples):
    return tokenizer(examples[text_col], truncation=True, max_length=MAX_LENGTH)

# Map tokenization
tokenized = dataset.map(preprocess,batched=True)

# Rename column to labels to fit Trainer API
tokenized = tokenized.rename_column(label_col, "labels")

# Convert to PyTorch tensors
tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
print(tokenized)


Map: 100%|██████████| 7412/7412 [00:00<00:00, 27466.79 examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 7412
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 2060
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 824
    })
})





Use LoRA on top of the base model for faster training and lower memory use without impacting performance.

In [None]:
NUM_LABELS = 2 # 0 for benign, 1 for malicious

# Load DistilBERT base model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

# LoRA configuration
peft_config = LoraConfig(
    r=8,                
    lora_alpha=32,
    target_modules=["q_lin", "v_lin"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS",
)

# Create a PEFT model by adding the LoRA adapter on top of the base model
model = get_peft_model(model, peft_config)

# Move model to device (MPS or CPU) - MPS for faster training on Mac
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)

print("Model loaded and moved to:", next(model.parameters()).device)


Calcuate evaluation metrics during training

In [14]:

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary", zero_division=0)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}


In [None]:
training_args = TrainingArguments(
    output_dir="outputs/prompt_shield", # Directory to save model checkpoints
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    eval_strategy="epoch", # run evaluation after each epoch
    save_strategy="epoch", # save checkpoint after each epoch
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=100,
    seed=42,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    fp16=False, # For MPS compatibility
)

# Dynamic padding of all sequences in a batch to the same length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [None]:
# Run training
trainer.train()

# Save the peft-wrapped weights and tokenizer
trainer.save_model("models/prompt_shield_distilbert_lora")
tokenizer.save_pretrained("models/prompt_shield_distilbert_lora")


## 2) Evaluate classifier 

In [None]:
final_test_metrics = trainer.evaluate(tokenized["test"])
print(final_test_metrics)

| Metric                   | Value                |
|--------------------------|----------------------|
| **eval_loss**            | 0.0390               |
| **eval_accuracy**        | 0.9854               |
| **eval_precision**       | 0.9769               |
| **eval_recall**          | 0.9769               |
| **eval_f1**              | 0.9769               |
| **eval_runtime (s)**     | 17.0447              |
| **samples/second**       | 120.858              |
| **steps/second**         | 7.568                |
| **epoch**                | 3.0                  |
