In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import load_model


In [None]:
# Read the spam dataset (latin-1 encoded) and discard the three unnamed
# columns in a single chained expression
df = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
)

In [None]:
# Rename remaining columns
df = df.rename(columns={'v1': 'label', 'v2': 'text'})

# Convert labels to binary values.
# Series.map turns every value that is missing from the dict into NaN, so
# mapping a second time (re-running this cell) would wipe out the already
# encoded 0/1 labels. Only encode while the column is still non-numeric, and
# fail loudly on labels other than 'ham'/'spam' instead of letting NaN reach
# the training step.
if not pd.api.types.is_numeric_dtype(df['label']):
    encoded_labels = df['label'].map({'ham': 0, 'spam': 1})
    if encoded_labels.isna().any():
        unknown = sorted(df.loc[encoded_labels.isna(), 'label'].astype(str).unique())
        raise ValueError(f'Unexpected label values: {unknown}')
    df['label'] = encoded_labels.astype('int64')


In [None]:
# Preview the first rows to confirm the renamed columns and the 0/1 labels
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split identical between runs
train_data, test_data, train_labels, test_labels = train_test_split(
    df['text'],
    df['label'],
    test_size=0.3,
    random_state=0,
)


In [None]:
# Vocabulary cap: how many words to keep, ranked by word frequency
max_words = 10_000
# Build the tokenizer's word index from the training texts only
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_data)

In [None]:
# Encode both splits as sequences of integer token ids with the fitted tokenizer
train_seq, test_seq = map(tokenizer.texts_to_sequences, (train_data, test_data))

In [None]:
# Bring every sequence to the same fixed length so the splits become
# rectangular arrays. Note that train_data/test_data are rebound from raw
# text to padded arrays here.
maxlen = 100
train_data, test_data = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (train_seq, test_seq)
)


In [None]:
# Assemble the CNN text classifier one layer at a time
model = Sequential()
# Look up a 128-dimensional vector for each of the max_words token ids
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=maxlen))
# 64 convolution filters, each spanning 3 consecutive positions
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
# Collapse the sequence axis by max pooling
model.add(GlobalMaxPooling1D())
model.add(Dense(units=64, activation='relu'))
# Single sigmoid unit producing the spam score
model.add(Dense(units=1, activation='sigmoid'))


In [None]:
# Configure training: binary cross-entropy loss for the single sigmoid output,
# Adam optimizer, accuracy reported as the tracked metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train the model.
# Validation metrics are computed on a 10% slice taken from the training data
# instead of on the test set, so the test set is only used for the final
# evaluation and is never consulted while training is being monitored.
history = model.fit(
    train_data,
    np.asarray(train_labels),  # plain array so validation_split can slice it
    epochs=10,
    validation_split=0.1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Evaluate the model on the test data.
# The result is unpacked as (loss, accuracy): the compiled loss followed by
# the single 'accuracy' metric requested at compile time.
loss, accuracy = model.evaluate(test_data, test_labels)
print('Test accuracy:', accuracy)


Test accuracy: 0.9874401688575745


In [None]:
# Define a function to preprocess new text data
def preprocess_text(text, maxlen=100):
    """Convert raw message text into padded integer sequences.

    Uses the notebook-level ``tokenizer`` to encode the text and
    ``pad_sequences`` to bring every sequence to ``maxlen``.

    Args:
        text: A single message string, or an iterable of message strings.
        maxlen: Fixed length passed to ``pad_sequences``.

    Returns:
        The output of ``pad_sequences``: one row per input message.
    """
    # texts_to_sequences iterates over its argument, so a bare string would be
    # treated as one "message" per character. Wrap it to keep it as one message.
    if isinstance(text, str):
        text = [text]
    # Convert the text to a sequence of integers
    seq = tokenizer.texts_to_sequences(text)
    # Pad the sequence to a fixed length
    return pad_sequences(seq, maxlen=maxlen)

# Example messages to classify
new_text = [
    'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005',
    'Hi, this is John from the bank. Your account has been hacked.',
    'Hey, what are you up to today?'
]

# Encode and pad the examples with the same tokenizer used for training
new_text_processed = preprocess_text(new_text)

# One sigmoid score per message
predictions = model.predict(new_text_processed)

# Report a verdict per message: scores above 0.5 are labelled spam
for message, score in zip(new_text, predictions):
    verdict = 'Spam' if score > 0.5 else 'Not spam'
    print('Message:', message)
    print('Prediction:', verdict)


Message: Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005
Prediction: Spam
Message: Hi, this is John from the bank. Your account has been hacked.
Prediction: Not spam
Message: Hey, what are you up to today?
Prediction: Not spam
