In [None]:
# import libraries
try:
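  # Install the nightly TensorFlow build (intended for Colab).
  # NOTE: a failing `!pip` shell command does not raise a Python exception, so the except branch below is effectively never taken.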
  !pip install tf-nightly
except Exception:
  pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt
from keras import Sequential
from keras.src.layers import LSTM, Embedding

print(tf.__version__)

Collecting tf-nightly
  Downloading tf_nightly-2.19.0.dev20241227-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting tb-nightly~=2.19.0.a (from tf-nightly)
  Downloading tb_nightly-2.19.0a20250102-py3-none-any.whl.metadata (1.6 kB)
Collecting keras-nightly>=3.6.0.dev (from tf-nightly)
  Downloading keras_nightly-3.8.0.dev2025010303-py3-none-any.whl.metadata (5.8 kB)
Downloading tf_nightly-2.19.0.dev20241227-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (634.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m634.6/634.6 MB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading keras_nightly-3.8.0.dev2025010303-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m50.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tb_nightly-2.19.0a20250102-py3-none-any.whl (5.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m94.3 MB/s[0m 

In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv -P data/
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv -P data/

# Local paths of the two downloaded TSV files.
# Note: the "test" path points at valid-data.tsv.
train_file_path = "./data/train-data.tsv"
test_file_path = "./data/valid-data.tsv"

--2025-01-03 10:58:44--  https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 172.67.70.149, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 358233 (350K) [text/tab-separated-values]
Saving to: ‘data/train-data.tsv’


2025-01-03 10:58:44 (7.92 MB/s) - ‘data/train-data.tsv’ saved [358233/358233]

--2025-01-03 10:58:44--  https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 172.67.70.149, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118774 (116K) [text/tab-separated-values]
Saving to: ‘data/valid-data.tsv’


2025-01-03 10:58:44 (8.17 MB/s) - ‘data/valid-data.tsv’ saved [118774/118774]



In [None]:
# preprocess
# Training hyperparameters; train_model forwards both to model.fit().
EPOCHS = 10  # value passed as `epochs`
BATCH_SIZE = 32  # value passed as `batch_size`


def load_data(path: str) -> pd.DataFrame:
    """Load a headerless, tab-separated file into a two-column DataFrame.

    Args:
        path: Location of the TSV file to read.

    Returns:
        A DataFrame whose columns are named ``"spam"`` (first field of each
        row) and ``"sms"`` (second field of each row).

    Raises:
        Any exception raised by ``pd.read_csv`` (for example
        ``FileNotFoundError``) propagates to the caller unchanged.
    """
    # No try/except here: catching an exception only to re-raise it adds
    # nothing, so errors from read_csv are simply allowed to propagate.
    #
    # NOTE(review): pandas' default quoting is active. If the messages are raw
    # text that can contain unbalanced double quotes, rows may be merged or the
    # parse may fail -- confirm against the data and consider
    # quoting=csv.QUOTE_NONE.
    return pd.read_csv(
        path,
        sep="\t",
        header=None,
        names=["spam", "sms"],
    )


# Read both downloaded splits into DataFrames with columns "spam" and "sms".
train_data = load_data(train_file_path)
test_data = load_data(test_file_path)

In [None]:
# Model architecture and normalization of inputs


def create_text_vectorization_layer(
    train_x: pd.Series,
) -> keras.layers.TextVectorization:
    """Build a ``TextVectorization`` layer and adapt it to ``train_x``.

    The layer is configured with ``output_mode="int"`` and the
    ``"lower_and_strip_punctuation"`` standardization.

    Args:
        train_x: Series whose values are used to adapt the layer.

    Returns:
        The adapted ``TextVectorization`` layer.
    """
    corpus = np.array(train_x.values)
    text_layer = keras.layers.TextVectorization(
        standardize="lower_and_strip_punctuation",
        output_mode="int",
    )
    text_layer.adapt(corpus)
    return text_layer

# Module-level vectorizer adapted on the training messages; create_model reads it as a global.
vectorizer = create_text_vectorization_layer(train_data["sms"])

def create_model(train_x: pd.DataFrame) -> keras.Model:
    """Assemble and compile the stacked-LSTM binary classifier.

    The first layer is the module-level ``vectorizer``, followed by an
    embedding, two LSTM blocks (each with batch normalization and dropout)
    and a small dense head ending in a single sigmoid unit.

    Args:
        train_x: Not used by the body; kept in the signature for callers.

    Returns:
        A ``Sequential`` model compiled with Adam (exponentially decaying
        learning rate), binary cross-entropy loss and the accuracy metric.
    """
    n_tokens = len(vectorizer.get_vocabulary())
    # Embedding width: a quarter of the vocabulary size, capped at 100.
    embed_width = min(n_tokens // 4, 100)

    layer_stack = [
        vectorizer,
        keras.layers.Embedding(n_tokens, embed_width),
        keras.layers.BatchNormalization(),
        # Recurrent block 1 keeps the full sequence for the next LSTM.
        keras.layers.LSTM(64, return_sequences=True),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.2),
        # Recurrent block 2.
        keras.layers.LSTM(32),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.2),
        # Classification head with a single sigmoid output unit.
        keras.layers.Dense(16, activation="relu"),
        keras.layers.Dense(1, activation="sigmoid"),
    ]
    classifier = Sequential(layer_stack)

    # Exponential decay schedule: initial rate 0.001, decay rate 0.9,
    # decay steps 1000.
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.001,
        decay_steps=1000,
        decay_rate=0.9,
    )
    classifier.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return classifier


# Build the compiled model (create_model does not use its argument).
model = create_model(train_data)

AttributeError: module 'ml_dtypes' has no attribute 'float8_e3m4'


In [None]:
# Model training

def train_model(model: keras.Model, x: pd.DataFrame, y: pd.Series) -> keras.Model:
    """Fit ``model`` on the texts in ``x["sms"]`` against the labels in ``y``.

    Args:
        model: Object whose ``fit`` method is called with the encoded data.
        x: Frame that must contain an ``"sms"`` column.
        y: Labels with the same length as ``x``; ``"spam"`` is encoded as 1
            and every other value as 0.

    Returns:
        The same ``model`` object after ``fit`` has run.

    Raises:
        AssertionError: If ``x`` has no ``"sms"`` column or ``x`` and ``y``
            differ in length.
    """
    assert "sms" in x.columns, "SMS column not found in input data"
    assert len(x) == len(y), "Features and labels must have same length"

    texts = np.array(x["sms"].values)
    targets = np.array([int(label == "spam") for label in y.values])

    # Epoch count and batch size come from the module-level constants;
    # validation_split=0.2 is passed straight through to fit().
    fit_options = dict(
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        shuffle=True,
        validation_split=0.2,
    )
    model.fit(texts, targets, **fit_options)
    return model


# Features: a copy of the full frame (train_model only reads x["sms"]).
x = train_data.copy()
# Labels: select the column instead of pop()-ing it. pop() deletes "spam" from
# train_data in place, so re-running this cell would raise a KeyError and the
# shared frame would silently lose its label column.
y = train_data["spam"]

model = train_model(model, x, y)
model.summary()

Epoch 1/10
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 24ms/step - accuracy: 0.8693 - loss: 0.3430 - val_accuracy: 0.8959 - val_loss: 0.4688
Epoch 2/10
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - accuracy: 0.9750 - loss: 0.0975 - val_accuracy: 0.9007 - val_loss: 0.2731
Epoch 3/10
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.9845 - loss: 0.0685 - val_accuracy: 0.9677 - val_loss: 0.1102
Epoch 4/10
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.9908 - loss: 0.0397 - val_accuracy: 0.9653 - val_loss: 0.1171
Epoch 5/10
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - accuracy: 0.9986 - loss: 0.0112 - val_accuracy: 0.9617 - val_loss: 0.1268
Epoch 6/10
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - accuracy: 0.9872 - loss: 0.0375 - val_accuracy: 0.9737 - val_loss: 0.0949
Epoch 7/10
[1m105/1

In [None]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text):
    """Classify one message with the module-level ``model``.

    Args:
        pred_text: The message to classify, as a single string.

    Returns:
        A two-element list ``[score, label]``. ``score`` is the model output
        for the message as a built-in ``float``; ``label`` is ``"spam"`` when
        ``score > 0.5`` and ``"ham"`` otherwise.
    """
    # predict() is given a one-element object array holding the message.
    batch = np.array([pred_text], dtype=object)
    # Take the first value of the first row once, and convert the NumPy scalar
    # to a plain float so the returned list prints as a bare number.
    score = float(model.predict(batch)[0][0])
    return [score, "spam" if score > 0.5 else "ham"]

# Quick manual check: classify a single message and print [score, label].
pred_text = "how are you doing today?"

prediction = predict_message(pred_text)
print(prediction)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 293ms/step
[0.0022120704, 'ham']


In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  # Runs predict_message on seven fixed messages and compares the returned
  # label (element 1 of each result) with the expected answer.
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected labels, in the same order as test_messages.
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  # A single mismatch marks the whole run as failed; the loop still visits
  # every message.
  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

# Execute the check immediately when the cell runs.
test_predictions()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
You passed the challenge. Great job!
