In [None]:
# import libraries
try:
  # Install the nightly TensorFlow build (IPython shell escape) before importing tensorflow.
  !pip install tf-nightly
except Exception:
  pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt
import string

print(tf.__version__)

Collecting tf-nightly
  Downloading tf_nightly-2.20.0.dev20250418-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting tb-nightly~=2.19.0.a (from tf-nightly)
  Downloading tb_nightly-2.19.0a20250218-py3-none-any.whl.metadata (1.8 kB)
Collecting keras-nightly>=3.6.0.dev (from tf-nightly)
  Downloading keras_nightly-3.9.0.dev2025032003-py3-none-any.whl.metadata (6.1 kB)
Collecting ml-dtypes<1.0.0,>=0.5.1 (from tf-nightly)
  Downloading ml_dtypes-0.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (21 kB)
Downloading tf_nightly-2.20.0.dev20250418-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (646.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m646.7/646.7 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading keras_nightly-3.9.0.dev2025032003-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m56.0 MB/s[0m eta [36m0:00:00[0m
[?25hDo

In [None]:
# get data files
# Download the train and validation TSV files into the working directory
# (IPython shell escapes: needs network access and a wget binary).
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

# Local file names the downloads are saved under.
train_file_path = "train-data.tsv"
# The validation file is what later cells load as the "test" data.
test_file_path = "valid-data.tsv"

--2025-04-21 18:10:52--  https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 172.67.70.149, 104.26.3.33, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|172.67.70.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 358233 (350K) [text/tab-separated-values]
Saving to: ‘train-data.tsv’


2025-04-21 18:10:52 (56.1 MB/s) - ‘train-data.tsv’ saved [358233/358233]

--2025-04-21 18:10:52--  https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 172.67.70.149, 104.26.3.33, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|172.67.70.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118774 (116K) [text/tab-separated-values]
Saving to: ‘valid-data.tsv’


2025-04-21 18:10:53 (35.9 MB/s) - ‘valid-data.tsv’ saved [118774/118774]



In [None]:
# data loading and preprocessing

# Both files are tab-separated with no header row, so the two columns are
# named explicitly right after reading.
train_data = (
    pd.read_csv(train_file_path, sep='\t', header=None)
    .set_axis(['label', 'message'], axis=1)
)
test_data = (
    pd.read_csv(test_file_path, sep='\t', header=None)
    .set_axis(['label', 'message'], axis=1)
)

# lowercase
def preprocess_text(text):
    """Return ``text`` converted to lowercase.

    Args:
        text: The message string to normalise.

    Returns:
        The lowercased copy of ``text``.
    """
    return text.lower()

# Lowercase the message text of both splits.
train_data['message'] = train_data['message'].apply(preprocess_text)
test_data['message'] = test_data['message'].apply(preprocess_text)

# convert labels to numerical values
train_data['label_num'] = train_data['label'].map({'ham': 0, 'spam': 1})
test_data['label_num'] = test_data['label'].map({'ham': 0, 'spam': 1})

# Series.map turns every label that is missing from the dict into NaN without
# complaint. Stop here instead of letting NaN targets reach model training.
assert train_data['label_num'].notna().all(), \
    "train data contains labels other than 'ham'/'spam'"
assert test_data['label_num'].notna().all(), \
    "test data contains labels other than 'ham'/'spam'"

In [None]:
# text vectorization
# The vocabulary is capped at 10000 tokens and learned from the training
# messages only; every encoded message has a fixed length of 100 token ids.
vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=10000,
    output_sequence_length=100,
)
vectorizer.adapt(train_data['message'])

# Features are the vectorized messages, targets are the numeric labels.
X_train, y_train = vectorizer(train_data['message']), train_data['label_num']
X_test, y_test = vectorizer(test_data['message']), test_data['label_num']

# Compute class weights to handle imbalance
# Each class is weighted by (1 / class_count) * (total / 2), so the class with
# fewer samples gets the larger weight.
total = len(y_train)
half_total = total / 2.0
num_ham, num_spam = ((y_train == label).sum() for label in (0, 1))
weight_for_0, weight_for_1 = (1 / num_ham) * half_total, (1 / num_spam) * half_total
class_weight = {0: weight_for_0, 1: weight_for_1}

In [None]:
# build model
# Seed the Python, NumPy and backend RNGs so weight initialisation, dropout
# and batch shuffling start from the same state on every "Restart & Run All"
# (hardware-level nondeterminism may still cause small differences).
keras.utils.set_random_seed(42)

model = keras.Sequential([
    # input_dim equals the vectorizer's max_tokens (10000), so every token id
    # it can emit has an embedding row.
    keras.layers.Embedding(input_dim=10000, output_dim=32),
    keras.layers.Bidirectional(keras.layers.LSTM(32)),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(32, activation='relu'),
    # Single sigmoid unit: a score between 0 and 1, where 1 is the 'spam' label.
    keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Keep the History object for later inspection instead of letting its repr
# become the cell output. class_weight rebalances the loss between the classes.
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test),
                    class_weight=class_weight, verbose=0)

<keras.src.callbacks.history.History at 0x7ed881f80510>

In [None]:
# function to predict messages based on model
def predict_message(pred_text):
    """Classify a single message as 'ham' or 'spam' with the trained model.

    Args:
        pred_text: The raw message string to classify.

    Returns:
        A two-element list ``[score, label]``: ``score`` is the model's
        sigmoid output for the message and ``label`` is ``'spam'`` when that
        score is above 0.5, otherwise ``'ham'``.
    """
    # Apply the same lowercasing and vectorization used for the training data;
    # the vectorizer is given a one-element batch.
    cleaned = preprocess_text(pred_text)
    batch = vectorizer([cleaned])

    # predict returns one row per input with a single output column.
    score = model.predict(batch, verbose=0)[0][0]

    if score > 0.5:
        return [score, 'spam']
    return [score, 'ham']

In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check predict_message against seven fixed messages with known labels.

  For each message, the label at index 1 of predict_message's result is
  compared with the expected answer. Every mismatch prints the value at
  index 0 followed by the message text. One summary line is printed at the
  end saying whether all seven labels matched.
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      print(prediction[0])
      print(msg)
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()

You passed the challenge. Great job!
