In [29]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout, TextVectorization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [26]:
# Read the dataset into a pandas dataframe
df = pd.read_csv("rnn_data.csv")

# Select spam messages only
df = df[df['Category'] == 'spam']['Message']

# Prepare the text data
train_data = df.sample(frac=0.8, random_state=42)
test_data = df.drop(train_data.index)

In [27]:
# Initialize TextVectorization layer
max_tokens = 20000  # Adjust based on vocabulary size
output_sequence_length = 30  # This controls the length of the sequences
vectorizer = TextVectorization(max_tokens=max_tokens, output_sequence_length=output_sequence_length)

# Adapt vectorizer on the training data
vectorizer.adapt(train_data)

# Vectorize the input text
X_train_vec = vectorizer(train_data)

In [30]:
# Generate sequences (shifted sequences to predict next word)
def generate_sequences(corpus):
    sequences = []
    for i in range(1, len(corpus)):
        sequences.append(corpus[:i+1])
    return sequences

# Vectorize the sequences and split predictors and labels
def generate_padded_sequences(sequences, maxlen):
    input_sequences = np.array(pad_sequences(sequences, maxlen=maxlen, padding='pre'))
    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    return predictors, label

# Convert to padded sequences
sequences = [generate_sequences(seq) for seq in X_train_vec]
sequences_flat = [item for sublist in sequences for item in sublist]  # Flatten the list of sequences
X_train, y_train = generate_padded_sequences(sequences_flat, output_sequence_length)

In [31]:
# Create the RNN model
model = Sequential([
    Embedding(max_tokens, 32, mask_zero=True),
    SimpleRNN(200),
    Dropout(0.1),
    Dense(max_tokens, activation='softmax'),
])

model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(
    min_delta=0.01,
    patience=5,
    restore_best_weights=True,
)

# Train the model
model.fit(X_train, y_train, epochs=100, verbose=1, callbacks=[early_stopping])

Epoch 1/100


[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 46ms/step - accuracy: 0.2224 - loss: 7.0916
Epoch 2/100
[1m  3/542[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m21s[0m 40ms/step - accuracy: 0.2448 - loss: 6.4883

  current = self.get_monitor_value(logs)


[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 38ms/step - accuracy: 0.2292 - loss: 6.3655
Epoch 3/100
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 46ms/step - accuracy: 0.2309 - loss: 6.3059
Epoch 4/100
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 47ms/step - accuracy: 0.2303 - loss: 6.2905
Epoch 5/100
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 40ms/step - accuracy: 0.2324 - loss: 6.2964
Epoch 6/100
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 45ms/step - accuracy: 0.2278 - loss: 6.3065
Epoch 7/100
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 46ms/step - accuracy: 0.2260 - loss: 6.2693
Epoch 8/100
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 39ms/step - accuracy: 0.2298 - loss: 6.0602
Epoch 9/100
[1m542/542[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 46ms/step - accuracy: 0.2368 - loss: 5.8870
Epoch 10/100
[1m542/542[0m

<keras.src.callbacks.history.History at 0x7fa43ca3ac80>

In [33]:
# Function to generate text
def generate_text(seed_text, next_words, model, maxlen, vectorizer):
    for _ in range(next_words):
        token_list = vectorizer([seed_text]).numpy()[0]
        token_list = pad_sequences([token_list], maxlen=maxlen-1, padding='pre')
        predicted = np.argmax(model.predict(token_list, verbose=0))

        output_word = vectorizer.get_vocabulary()[predicted]
        seed_text += " " + output_word
    return seed_text

# Generate text
generated_text = generate_text("Please", 10, model, output_sequence_length, vectorizer)
print(generated_text)

Please 1 chance to win a £250 cash every wk txt
