### Import Libraries

In [43]:
import pandas as pd
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

### Import Dataset

In [28]:
# Load the SMS dataset from spam.csv (path is relative to the working directory).
# The file is decoded as latin-1 -- presumably it is not valid UTF-8; confirm
# against the data source.
df = pd.read_csv(
    "spam.csv",
    encoding="latin-1",
)

### Clean Data

In [29]:
# Drop the three "Unnamed" columns (presumably empty spill-over columns from
# the CSV -- confirm against the raw file).
# errors="ignore" keeps this cell re-runnable: once the columns are gone, a
# second execution is a no-op instead of raising KeyError.
df = df.drop(
    columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"],
    errors="ignore",
)

In [30]:
def clean_text(text):
    """Normalize a single message string.

    Three steps are applied in order:

    1. lowercase the text;
    2. delete every character that is neither a word character (letter,
       digit or underscore) nor whitespace;
    3. trim leading and trailing whitespace.

    Whitespace inside the text is left untouched, and underscores are kept
    because the regex treats them as word characters.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return without_punctuation.strip()

# Normalize every message with clean_text.
df["v2"] = df["v2"].apply(clean_text)

# Encode the labels as integers: ham -> 0, spam -> 1.
# The explicit cast pins an integer dtype instead of relying on replace() to
# downcast the object column, a behavior pandas has deprecated. A leftover
# non-numeric label raises here rather than reaching the model.
# Re-running the cell is safe: values that are already 0/1 pass through as is.
df["v1"] = df["v1"].replace({"ham": 0, "spam": 1}).astype("int64")
df.head()

Unnamed: 0,v1,v2
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor u c already then say
4,0,nah i dont think he goes to usf he lives aroun...


**1. Recurrent Neural Network**

In [31]:
# Split the data into training and testing sets
texts = df["v2"]
labels = df["v1"]
texts_train, texts_test, labels_train, labels_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Preprocess the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts_train)
sequences_train = tokenizer.texts_to_sequences(texts_train)
sequences_test = tokenizer.texts_to_sequences(texts_test)
vocab_size = len(tokenizer.word_index) + 1
max_len = max([len(seq) for seq in sequences_train])
X_train = pad_sequences(sequences_train, maxlen=max_len)
X_test = pad_sequences(sequences_test, maxlen=max_len)

# Convert labels to binary format
y_train = labels_train.replace({"spam": 1, "ham": 0}).values
y_test = labels_test.replace({"spam": 1, "ham": 0}).values

In [32]:
# Define the SimpleRNN model
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_len))
model.add(SimpleRNN(64))
model.add(Dense(1, activation="sigmoid"))

In [33]:
# Compile and train the model
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(X_train, y_train, batch_size=32, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x221aa1f75b0>

In [35]:
# Make predictions on the test set
y_pred_probs = model.predict(X_test)
y_pred = np.where(y_pred_probs > 0.5, 1, 0)
y_pred = y_pred.flatten()



In [36]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9811659192825112
