In [39]:
import numpy as np #type: ignore
import pandas as pd
from sklearn.metrics import confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer # type: ignore
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense


In [40]:
# Load the dataset
data = pd.read_csv("spam.csv", encoding="latin-1")
messages = data["v2"]
labels = data["v1"]

In [41]:
# Preprocessing
tokenizer = Tokenizer()
tokenizer.fit_on_texts(messages)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(messages)
max_len = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')


In [42]:
# Label encoding
label_mapping = {"ham": 0, "spam": 1}
encoded_labels = np.array([label_mapping[label] for label in labels])

# Define the fNN model
model = Sequential([
    Embedding(len(word_index) + 1, 64, input_length=max_len),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])



In [43]:
# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(padded_sequences, encoded_labels, epochs=25, batch_size=128)


Epoch 1/25
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 32ms/step - accuracy: 0.8663 - loss: 0.3616
Epoch 2/25
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 39ms/step - accuracy: 0.9801 - loss: 0.0834
Epoch 3/25
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 44ms/step - accuracy: 0.9907 - loss: 0.0297
Epoch 4/25
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 45ms/step - accuracy: 0.9967 - loss: 0.0141
Epoch 5/25
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 43ms/step - accuracy: 0.9993 - loss: 0.0058
Epoch 6/25
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.9994 - loss: 0.0039
Epoch 7/25
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 41ms/step - accuracy: 0.9990 - loss: 0.0041
Epoch 8/25
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 37ms/step - accuracy: 0.9998 - loss: 0.0016
Epoch 9/25
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x176ba3a67e0>

In [44]:
# Function to predict ham or spam
def predict_message(message):
    # Preprocess the message
    sequence = tokenizer.texts_to_sequences([message])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post')
    # Predict probabilities
    probabilities = model.predict(padded_sequence)
    # Convert probabilities to class labels
    prediction = "Ham" if probabilities[0] < 0.5 else "Spam"
    return prediction


In [49]:
# Predict new messages
new_messages = ["Congratulations! You've won this match get your reward from our company named ALpha Network in pakistan", "Your Collecting price is 100$ collect from this link www.asd.com.pk"]
for message in new_messages:
    predicted_label = predict_message(message)
    print(f"Message: {message}\nPredicted Label: {predicted_label}\n")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
Message: Congratulations! You've won this match get your reward from our company named ALpha Network in pakistan
Predicted Label: Spam

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
Message: Your Collecting price is 100$ collect from this link www.asd.com.pk
Predicted Label: Spam



In [46]:
# Save the tokenizer and max_len
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
with open("max_len.pkl", "wb") as f:
    pickle.dump(max_len, f)

# Save the model
model.save("spam_detection_model.h5")



In [47]:
from tensorflow import keras

loaded_model = keras.models.load_model("spam_detection_model.h5")
with open("tokenizer.pkl", "rb") as f:
    loaded_tokenizer = pickle.load(f)
with open("max_len.pkl", "rb") as f:
    loaded_max_len = pickle.load(f)

