In [None]:
!pip install --upgrade --force-reinstall tensorflow --user

Collecting tensorflow
  Downloading tensorflow-2.19.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.2.2-py3-none-any.whl.metadata (2.6 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Using cached flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting packaging (from t

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

## Data Cleaning & Labeling

In [None]:
# Load the spam-collection CSV (relative path, decoded as latin1) into `df`.
# Later cells keep only its 'v1' and 'v2' columns.
df = pd.read_csv("SpamCollection.csv", encoding="latin1")
df.head()

In [None]:
df.columns

In [None]:
# Keep only the two columns used downstream and give them descriptive names.
# .copy() makes the result an independent frame, so the column assignment
# below writes to it directly instead of to a slice of the raw frame (which
# can trigger pandas' SettingWithCopyWarning).
df = df[['v1', 'v2']].copy()
df.columns = ['label', 'text']

# Encode the string labels as integers. LabelEncoder numbers the classes in
# sorted order, so with 'ham'/'spam' labels ham becomes 0 and spam becomes 1.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])
df.head()

In [None]:
df['label'].value_counts()

## Define the hyperparameters

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Define hyperparameters shared by the tokenizer, the padding step and the model
vocab_size = 5000          # Tokenizer num_words and Embedding input_dim: only keep top most common words
embedding_dim = 64         # Embedding size: each word is represented by a 64-number vector
max_length = 200           # Length every message is padded/truncated to (in words)
trunc_type = 'post'        # Cut off extra words at the end
padding_type = 'post'      # Pad 0 at the end of short messages
oov_token = "<OOV>"        # Placeholder token for words not in the vocabulary

In [None]:
# Build the word index from every message in the dataset.
# NOTE(review): the tokenizer is fitted before the train/test split, so its
# vocabulary also reflects messages that later end up in the test set.
tokenizer = Tokenizer(oov_token=oov_token, num_words=vocab_size)
tokenizer.fit_on_texts(df['text'])

# Turn each message into a list of integer word indexes ...
sequences = tokenizer.texts_to_sequences(df['text'])

# ... and bring all of them to exactly max_length entries.
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_length,
    truncating=trunc_type,
    padding=padding_type,
)

## Split data

In [None]:
# Features are the padded index sequences; targets are the encoded labels.
X = padded_sequences
y = df['label'].values   # These are your answers: 0 = ham, 1 = spam

# Hold out 20% of the messages for testing. stratify=y keeps the ham/spam
# ratio the same in both splits, and the fixed random_state makes the split
# itself reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

## Build the Long Short-Term Memory (LSTM) model

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense

In [None]:
from keras.models import Sequential
from keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout

# Stacked bidirectional-LSTM binary classifier.
# The sequence length is declared with an explicit Input layer rather than
# Embedding's `input_length` argument, which Keras 3 (bundled with recent
# TensorFlow releases) deprecates and warns about.
model = Sequential([
    Input(shape=(max_length,), dtype='int32'),          # padded word-index sequences
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    Bidirectional(LSTM(64, return_sequences=True)),     # keep the full sequence for the next LSTM
    Dropout(0.3),
    Bidirectional(LSTM(32)),                            # reduce to one vector per message
    Dropout(0.3),
    Dense(1, activation='sigmoid')                      # single sigmoid output (label 1 = spam)
])


In [None]:
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# 'balanced' mode derives one weight per class from the label frequencies in
# y_train; the result is ordered like `classes`, i.e. index 0 -> class 0 and
# index 1 -> class 1.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.array([0, 1]),
    y=y_train,
)

# Keras expects a {class_index: weight} mapping.
class_weight_dict = dict(enumerate(class_weights))

In [None]:
#!pip install livelossplot
from livelossplot import PlotLossesKerasTF

In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit with class weights (reuse the computed class_weight_dict) and keep the
# returned History object: the plotting cell further down calls
# plot_history(history), which fails with a NameError unless `history` is
# bound here.
# NOTE(review): the test split doubles as the validation data during training.
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=32,
    class_weight=class_weight_dict,
    validation_data=(X_test, y_test),
    callbacks=[PlotLossesKerasTF()],
)


In [None]:

# Then pass this to model.fit
#model.fit(X_train, y_train, class_weight=class_weight_dict, ...)


## Visualize & Evaluate Results

def plot_history(history):
    """Draw training vs. validation accuracy and loss in two side-by-side panels.

    `history` must expose a `.history` mapping containing the keys
    'accuracy', 'val_accuracy', 'loss' and 'val_loss'.

    NOTE(review): plot_history is defined again in the next code cell; that
    later definition is the one in effect when the function is called there.
    """
    # One row per panel:
    # (subplot index, train key, validation key, train label, validation label, title, y-axis label)
    panels = (
        (1, 'accuracy', 'val_accuracy',
         'Train Accuracy', 'Validation Accuracy',
         'Model Accuracy Over Epochs', 'Accuracy'),
        (2, 'loss', 'val_loss',
         'Train Loss', 'Validation Loss',
         'Model Loss Over Epochs', 'Loss'),
    )

    plt.figure(figsize=(12, 5))

    for position, train_key, val_key, train_label, val_label, title, y_label in panels:
        plt.subplot(1, 2, position)
        plt.plot(history.history[train_key], label=train_label)
        plt.plot(history.history[val_key], label=val_label)
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()

    plt.tight_layout()
    plt.show()

In [None]:
def plot_history(history):
    """Plot per-epoch accuracy and loss curves from a training run.

    Args:
        history: Object with a `.history` dict mapping metric names to
            per-epoch values, such as the object returned by `model.fit`.
            It must contain 'accuracy' and 'loss'. The 'val_accuracy' and
            'val_loss' curves are drawn only when those keys are present, so
            a run fitted without validation data can still be plotted.
    """
    metrics = history.history

    # One row per panel:
    # (train key, validation key, train label, validation label, title, y-axis label)
    panels = (
        ('accuracy', 'val_accuracy',
         'Train Accuracy', 'Validation Accuracy',
         'Model Accuracy Over Epochs', 'Accuracy'),
        ('loss', 'val_loss',
         'Train Loss', 'Validation Loss',
         'Model Loss Over Epochs', 'Loss'),
    )

    # Explicit figure/axes objects instead of the implicit pyplot "current
    # axes" state, so each panel is addressed directly.
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    for ax, (train_key, val_key, train_label, val_label, title, y_label) in zip(axes, panels):
        ax.plot(metrics[train_key], label=train_label)
        if val_key in metrics:
            ax.plot(metrics[val_key], label=val_label)
        ax.set_title(title)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(y_label)
        ax.legend()

    fig.tight_layout()
    plt.show()


plot_history(history)

In [None]:
# Evaluate the model on the test set
# (the model was compiled with a single 'accuracy' metric, so evaluate()
# yields the loss followed by the accuracy)
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=2)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# Evaluate the model on the training set
# (a large gap between these numbers and the test numbers above would point
# to overfitting)
train_loss, train_accuracy = model.evaluate(X_train, y_train, verbose=2)
print(f"Train Loss: {train_loss:.4f}")
print(f"Train Accuracy: {train_accuracy:.4f}")

In [None]:
# Predicted probabilities for the training split.
train_pred = model.predict(X_train)

# Confusion table: actual labels (rows) vs. labels obtained by thresholding
# the predicted probabilities at 0.5 (columns).
pd.crosstab(
    index=y_train,
    columns=(train_pred > 0.5).astype(int).ravel(),
    rownames=['Actual'],
    colnames=['Predicted'],
)

## Test Model

In [None]:
def predict_message(text, threshold=0.5):
    """Classify one message as spam or ham and print the result.

    The message is tokenized with the notebook's fitted `tokenizer` and padded
    with the same `max_length`, `padding_type` and `trunc_type` settings that
    were used to build the training sequences, so the input keeps matching
    what the model was trained on if those hyperparameters are changed.

    Args:
        text: Raw message string.
        threshold: Predicted probability above which the message is labelled
            "spam". Defaults to 0.5.

    Returns:
        None. The message and its predicted label are printed.
    """
    # Convert text to a (single-element) batch of word-index sequences
    sequence = tokenizer.texts_to_sequences([text])

    # Pad it exactly like the training data
    padded = pad_sequences(sequence, maxlen=max_length,
                           padding=padding_type, truncating=trunc_type)

    # The model has one sigmoid output per input row; take the only value
    prob = model.predict(padded)[0][0]
    label = "spam" if prob > threshold else "ham"

    print(f"Message: {text}")
    print(f"Prediction: {label}")

In [None]:
predict_message("Fine if thatåÕs the way u feel. ThatåÕs the way its gota b")

In [None]:
predict_message("England v Macedonia - dont miss the goals/team news. Txt ur national team to 87077 eg ENGLAND to 87077 Try:WALES, SCOTLAND 4txt/Ì¼1.20 POBOXox36504W45WQ 16+")

In [None]:
predict_message("Congratulations! You've won a free cruise. Call now!")

In [None]:
predict_message("Congrats! 1 year special cinema pass for 2 is yours. call 09061209465 now! C Suprman V, Matrix3, StarWars3, etc all 4 FREE! bx420-ip4-5we. 150pm. Dont miss out! ")

In [None]:
predict_message("Do you know what Mallika Sherawat did yesterday? Find out now @  &lt;URL&gt;")

In [None]:
predict_message("XXXMobileMovieClub: To use your credit, click the WAP link in the next txt message or click here>> http://wap. xxxmobilemovieclub.com?n=QJKGIGHJJGCBL")

In [None]:
# Score the model on every spam message in the dataset.
# The already-fitted tokenizer and the shared padding hyperparameters are
# reused instead of re-creating them with hard-coded values, so this check
# cannot drift from the configuration the model was trained with (and the
# global `tokenizer` is not silently replaced).
# NOTE(review): this includes spam rows from the training split, so the
# figure below is not a held-out estimate.
spam_df = df[df['label'] == 1].copy()
spam_sequences = tokenizer.texts_to_sequences(spam_df['text'])
spam_padded = pad_sequences(spam_sequences, maxlen=max_length,
                            padding=padding_type, truncating=trunc_type)

# Predict
spam_predictions = model.predict(spam_padded)
spam_pred_labels = (spam_predictions > 0.5).astype(int).flatten()

# Count correct predictions: every row here is spam, so predicting 1 is correct
num_correct_spam = (spam_pred_labels == 1).sum()
total_spam = len(spam_pred_labels)
accuracy_on_spam = num_correct_spam / total_spam   # i.e. recall on the spam class

print(f"Spam messages: {total_spam}")
print(f"Correctly predicted as spam: {num_correct_spam}")
print(f"Accuracy on spam: {accuracy_on_spam:.2%}")