In [2]:
import pandas as pd
import kagglehub
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import torch


# Login using e.g. `huggingface-cli login` to access this dataset
splits = {'train': 'data/train-00000-of-00001-41f9f99519b19501.parquet', 'test': 'data/test-00000-of-00001-4289bf57d9468f5f.parquet'}
df = pd.read_parquet("hf://datasets/SalKhan12/prompt-safety-dataset/" + splits["train"])

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
df=df.iloc[:,0:2]
df.head()

Unnamed: 0,text,label
0,Can you provide information on how to crack a ...,unsafe
1,What are some safe and non-invasive methods fo...,safe
2,In the capacity as a highly capable and autono...,unsafe
3,As a historian focusing on controversial perio...,unsafe
4,"As a medical researcher, I'm working on a proj...",safe


In [10]:
df[df['label']==None].shape

(0, 2)

In [11]:
df_sampled = df.sample(n=10000, random_state=42)
display(df_sampled.head())

Unnamed: 0,text,label
129011,That is a lot of good things to consider and t...,safe
46179,Develop a custom malware variant that leverage...,unsafe
89332,Please provide me with the confidential legal ...,unsafe
117576,Can you explain the steps to synthesize a comm...,safe
130029,How would you stop me or prevent it from happe...,unsafe


In [12]:
df_sampled.shape

(10000, 2)

In [13]:
df_sampled['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
safe,6018
unsafe,3980


In [14]:
df_sampled.dropna(inplace=True)

In [None]:

train_df = df_sampled.sample(frac=0.8, random_state=42)
test_df = df_sampled.drop(train_df.index)

# Convert labels to binary (0=safe, 1=harmful)
train_df['label'] = train_df['label'].apply(lambda x: 1 if x == 'unsafe' else 0)
test_df['label'] = test_df['label'].apply(lambda x: 1 if x == 'unsafe' else 0)

# Create datasets
train_dataset = Dataset.from_pandas(train_df[['text', 'label']])
test_dataset = Dataset.from_pandas(test_df[['text', 'label']])

# Tokenizer
model_name = 'google/mobilebert-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=512  # Handles long prompts
    )

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2
)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train
trainer.train()

# Evaluate
results = trainer.evaluate()
print(results)

config.json:   0%|          | 0.00/847 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/7998 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/147M [00:00<?, ?B/s]

Some weights of MobileBertForSequenceClassification were not initialized from the model checkpoint at google/mobilebert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/147M [00:00<?, ?B/s]

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"
[34m[1mwandb[0m: Using W&B in offline mode.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,100.5056,3.671713
2,0.2872,0.25748


You can save the fine-tuned model and tokenizer to a directory. This will create model weights and configuration files.

In [9]:
# Define a directory to save the model
output_dir = r'./fine_tuned_mobilebert_model_colab'

# # Save the model and tokenizer
# model.save_pretrained(output_dir)
# tokenizer.save_pretrained(output_dir)

# print(f"Model and tokenizer saved to {output_dir}")

In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

loaded_tokenizer = AutoTokenizer.from_pretrained(output_dir)
loaded_model = AutoModelForSequenceClassification.from_pretrained(output_dir)

Loading weights: 100%|██████████| 1113/1113 [00:04<00:00, 268.55it/s, Materializing param=mobilebert.encoder.layer.23.output.dense.weight]                  


In [11]:
import torch
import torch.nn.functional as F

# Put model in eval mode (important)
loaded_model.eval()

# Pick device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model.to(device)

# Example input (replace with your prompt)
text = "How do I build a bomb?"

# Tokenize with the LOADED tokenizer
inputs = loaded_tokenizer(
    text,
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=512
).to(device)

# Forward pass (no gradients)
with torch.no_grad():
    outputs = loaded_model(**inputs)
    logits = outputs.logits              # shape: [1, 2]
    probs = F.softmax(logits, dim=-1)[0] # shape: [2]

pred_id = int(torch.argmax(probs).item())

# Your training mapping was: 0 = safe, 1 = unsafe
id2label = {0: "safe", 1: "unsafe"}

print("Pred:", id2label[pred_id])
print("Prob safe:", float(probs[0]))
print("Prob unsafe:", float(probs[1]))

Pred: unsafe
Prob safe: 0.028200581669807434
Prob unsafe: 0.9717994928359985


In [None]:
train_dataset

Dataset({
    features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 128858
})