In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load datasets
# Both CSV files are required by every later step, so a failed read must stop
# the cell. The error is re-raised instead of calling exit(1): inside a
# notebook exit() asks the kernel to shut down and the traceback is lost.
try:
    regular_prompts_df = pd.read_csv('regular_prompts.csv')
    jailbreak_prompts_df = pd.read_csv('jailbreak_prompts.csv')
except Exception as e:
    print("Error loading datasets:", e)
    raise

# Add labels to the datasets (0 = regular prompt, 1 = jailbreak prompt)
regular_prompts_df['label'] = 0
jailbreak_prompts_df['label'] = 1

# Combine datasets
# Rows without prompt text are dropped because the tokenizer expects strings,
# and the index is rebuilt so row labels from the two source frames do not
# collide.
all_prompts_df = (
    pd.concat([regular_prompts_df, jailbreak_prompts_df])
    .dropna(subset=['prompt'])
    .reset_index(drop=True)
)

# Split dataset into training and testing sets
# stratify keeps the regular/jailbreak ratio identical in both splits, so the
# test metric is not skewed by an unlucky draw on an imbalanced dataset.
X_train, X_test, y_train, y_test = train_test_split(
    all_prompts_df['prompt'],
    all_prompts_df['label'],
    test_size=0.2,
    random_state=42,
    stratify=all_prompts_df['label'],
)

# Tokenization
# The vocabulary is fitted on the training text only. An explicit OOV token is
# used so that words first seen in the test set map to a dedicated index
# instead of being silently dropped from the sequence.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# +1 because index 0 is never assigned to a word; it is the padding value.
vocab_size = len(tokenizer.word_index) + 1
# Every sample is padded to the longest training sequence.
# NOTE(review): a single very long prompt therefore sets the tensor width for
# the whole dataset (4904 tokens in the recorded run) - consider capping this.
max_len = max(len(seq) for seq in X_train_seq)

# Test sequences longer than max_len are truncated by pad_sequences.
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

# Build LSTM model
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks
# and batch shuffling are repeatable across runs.
tf.keras.utils.set_random_seed(42)

model = tf.keras.Sequential([
    # Explicit input shape replaces the deprecated Embedding(input_length=...)
    # argument and lets model.summary() report shapes before training.
    tf.keras.Input(shape=(max_len,)),
    # The sequences are right-padded with zeros (padding='post'); mask_zero
    # makes the LSTM skip those padded timesteps instead of running over long
    # stretches of padding after the real tokens.
    tf.keras.layers.Embedding(vocab_size, 100, mask_zero=True),
    # NOTE(review): 0.7 is an unusually aggressive dropout rate - confirm it
    # is intentional.
    tf.keras.layers.LSTM(64, dropout=0.7, recurrent_dropout=0.7),
    # Single sigmoid unit: probability that the prompt is a jailbreak (label 1).
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
# Train the model
# Validation uses the last 10% of the training split (train_test_split has
# already shuffled it), so the test set stays unseen until final evaluation.
# Labels are passed as a NumPy array so they are sliced positionally together
# with the padded inputs.
# A failure is re-raised instead of calling exit(1), which in a notebook asks
# the kernel to shut down and hides the traceback.
try:
    model.fit(
        X_train_pad,
        np.asarray(y_train),
        epochs=5,
        batch_size=64,
        validation_split=0.1,
    )
except Exception as e:
    print("Error during training:", e)
    raise

# Evaluate the model
# Errors are re-raised: swallowing them would leave `loss` / `accuracy`
# undefined (or stale from an earlier run) for any later cell.
try:
    loss, accuracy = model.evaluate(X_test_pad, y_test)
    print("Test Accuracy:", accuracy)
except Exception as e:
    print("Error during evaluation:", e)
    raise

# Labels are 0/1, so their mean is the share of jailbreak prompts. Accuracy
# only means something relative to always predicting the larger class.
positive_rate = float(np.mean(y_test))
print("Majority-class baseline accuracy:", max(positive_rate, 1.0 - positive_rate))


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4904, 100)         3464900   
                                                                 
 lstm (LSTM)                 (None, 64)                42240     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 3507205 (13.38 MB)
Trainable params: 3507205 (13.38 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 0.9045383334159851


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load datasets
# Both CSV files are required by every later step, so a failed read must stop
# the cell. The error is re-raised instead of calling exit(1): inside a
# notebook exit() asks the kernel to shut down and the traceback is lost.
try:
    regular_prompts_df = pd.read_csv('regular_prompts.csv')
    jailbreak_prompts_df = pd.read_csv('jailbreak_prompts.csv')
except Exception as e:
    print("Error loading datasets:", e)
    raise

# Add labels to the datasets (0 = regular prompt, 1 = jailbreak prompt)
regular_prompts_df['label'] = 0
jailbreak_prompts_df['label'] = 1

# Combine datasets
# Rows without prompt text are dropped because the tokenizer expects strings,
# and the index is rebuilt so row labels from the two source frames do not
# collide.
all_prompts_df = (
    pd.concat([regular_prompts_df, jailbreak_prompts_df])
    .dropna(subset=['prompt'])
    .reset_index(drop=True)
)

# Split dataset into training and testing sets
# stratify keeps the regular/jailbreak ratio identical in both splits, so the
# test metric is not skewed by an unlucky draw on an imbalanced dataset.
X_train, X_test, y_train, y_test = train_test_split(
    all_prompts_df['prompt'],
    all_prompts_df['label'],
    test_size=0.2,
    random_state=42,
    stratify=all_prompts_df['label'],
)

# Tokenization
# The vocabulary is fitted on the training text only. An explicit OOV token is
# used so that words first seen in the test set map to a dedicated index
# instead of being silently dropped from the sequence.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# +1 because index 0 is never assigned to a word; it is the padding value.
vocab_size = len(tokenizer.word_index) + 1
# Every sample is padded to the longest training sequence.
# NOTE(review): a single very long prompt therefore sets the tensor width for
# the whole dataset (4904 tokens in the recorded run) - consider capping this.
max_len = max(len(seq) for seq in X_train_seq)

# Test sequences longer than max_len are truncated by pad_sequences.
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

# Build RNN model
# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable across runs.
tf.keras.utils.set_random_seed(42)

model = tf.keras.Sequential([
    # Explicit input shape replaces the deprecated Embedding(input_length=...)
    # argument and lets model.summary() report shapes before training.
    tf.keras.Input(shape=(max_len,)),
    # The sequences are right-padded with zeros (padding='post'); mask_zero
    # makes the recurrent layer skip those padded timesteps instead of running
    # over long stretches of padding after the real tokens.
    tf.keras.layers.Embedding(vocab_size, 100, mask_zero=True),
    tf.keras.layers.SimpleRNN(256),
    # Single sigmoid unit: probability that the prompt is a jailbreak (label 1).
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
# Train the model
# Validation uses the last 10% of the training split (train_test_split has
# already shuffled it), so the test set stays unseen until final evaluation.
# Labels are passed as a NumPy array so they are sliced positionally together
# with the padded inputs.
# A failure is re-raised instead of calling exit(1), which in a notebook asks
# the kernel to shut down and hides the traceback.
try:
    model.fit(
        X_train_pad,
        np.asarray(y_train),
        epochs=5,
        batch_size=64,
        validation_split=0.1,
    )
except Exception as e:
    print("Error during training:", e)
    raise

# Evaluate the model
# Errors are re-raised: swallowing them would leave `loss` / `accuracy`
# undefined (or stale from an earlier run) for any later cell.
try:
    loss, accuracy = model.evaluate(X_test_pad, y_test)
    print("Test Accuracy:", accuracy)
except Exception as e:
    print("Error during evaluation:", e)
    raise

# Labels are 0/1, so their mean is the share of jailbreak prompts. Accuracy
# only means something relative to always predicting the larger class.
positive_rate = float(np.mean(y_test))
print("Majority-class baseline accuracy:", max(positive_rate, 1.0 - positive_rate))


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 4904, 100)         3464900   
                                                                 
 simple_rnn (SimpleRNN)      (None, 256)               91392     
                                                                 
 dense_1 (Dense)             (None, 1)                 257       
                                                                 
Total params: 3556549 (13.57 MB)
Trainable params: 3556549 (13.57 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 0.9045383334159851
