In [17]:
import os
import requests
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.utils import to_categorical

In [18]:
# Define URLs and file paths
train_url = "https://cdn.freecodecamp.org/project-data/sms/train-data.tsv"
test_url = "https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv"

download_dir = "data"
os.makedirs(download_dir, exist_ok=True)

train_file_path = os.path.join(download_dir, "train-data.tsv")
test_file_path = os.path.join(download_dir, "valid-data.tsv")

# Function to download files
def download_file(url, file_path):
    response = requests.get(url)
    response.raise_for_status()
    with open(file_path, 'wb') as file:
        file.write(response.content)

# Download files
download_file(train_url, train_file_path)
download_file(test_url, test_file_path)

In [19]:
# Load data
train_data = pd.read_csv('data/train-data.tsv', sep='\t')
test_data = pd.read_csv('data/valid-data.tsv', sep='\t')

train_data.columns = ['label', 'text']
test_data.columns = ['label', 'text']

# Prepare data
X_train = train_data['text']
y_train = train_data['label']
X_test = test_data['text']
y_test = test_data['label']

In [20]:
# Tokenize text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences
max_len = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# Encode labels
y_train_encoded = y_train.map({'ham': 0, 'spam': 1})
y_test_encoded = y_test.map({'ham': 0, 'spam': 1})

In [21]:
# Build the model
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=max_len),
    LSTM(128),
    Dense(64),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_pad, 
                    y_train_encoded, 
                    epochs=10, 
                    validation_split=0.2, 
                    batch_size=32)

Epoch 1/10




[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 79ms/step - accuracy: 0.8819 - loss: 0.2764 - val_accuracy: 0.9821 - val_loss: 0.0506
Epoch 2/10
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 102ms/step - accuracy: 0.9908 - loss: 0.0234 - val_accuracy: 0.9844 - val_loss: 0.0653
Epoch 3/10
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 127ms/step - accuracy: 0.9990 - loss: 0.0060 - val_accuracy: 0.9904 - val_loss: 0.0449
Epoch 4/10
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 91ms/step - accuracy: 0.9997 - loss: 0.0016 - val_accuracy: 0.9880 - val_loss: 0.0509
Epoch 5/10
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 87ms/step - accuracy: 1.0000 - loss: 7.5040e-04 - val_accuracy: 0.9880 - val_loss: 0.0552
Epoch 6/10
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 97ms/step - accuracy: 0.9993 - loss: 9.2932e-04 - val_accuracy: 0.9868 - val_loss: 0.0569
Epoch 7/10
[1m105

In [22]:
# Predict function
def predict_message(pred_text):
    pred_seq = tokenizer.texts_to_sequences([pred_text])
    pred_pad = pad_sequences(pred_seq, maxlen=max_len)
    prediction = model.predict(pred_pad)[0][0]
    label = 'spam' if prediction > 0.5 else 'ham'
    return [prediction, label]

In [23]:
# Test predictions
def test_predictions():
    test_messages = ["how are you doing today",
                    "sale today! to stop texts call 98912460324",
                    "i dont want to go. can we try it a different day? available sat",
                    "our new mobile video service is live. just install on your phone to start watching.",
                    "you have won £1000 cash! call to claim your prize.",
                    "i'll bring it tomorrow. don't forget the milk.",
                    "wow, is your arm alright. that happened to me one time too"
                    ]

    test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
    passed = True

    for msg, ans in zip(test_messages, test_answers):
        prediction = predict_message(msg)
        if prediction[1] != ans:
            passed = False

    if passed:
        print("You passed the challenge. Great job!")
    else:
        print("You haven't passed yet. Keep trying.")

test_predictions()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 299ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
You passed the challenge. Great job!
