In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Get the data
# NOTE: the leading "!" is IPython/Jupyter shell-escape syntax, so these two
# lines only run inside a notebook (and need `wget` plus network access).
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

# Local file names matching the two URLs above (presumably where wget stores
# the downloads in the working directory); the "valid" file is used as the
# test split throughout this script.
train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

# Function to read and process the data
def read_data(file_path):
    """Load a tab-separated SMS dataset and encode its labels as integers.

    The file is expected to have no header row and two tab-separated
    columns: a label (``ham`` or ``spam``) followed by the message text.

    Args:
        file_path: Path (or any other source accepted by
            ``pandas.read_csv``) of the TSV file to load.

    Returns:
        A ``DataFrame`` with an integer ``label`` column (0 for ham, 1 for
        spam) and a text ``message`` column.

    Raises:
        ValueError: If any row carries a label other than ``ham``/``spam``.
    """
    import csv  # local import keeps this function self-contained

    df = pd.read_csv(
        file_path,
        sep='\t',
        header=None,
        names=['label', 'message'],
        # Messages are raw text, not CSV-quoted fields: a stray '"' must not
        # open a quoted field that swallows the rows that follow it.
        quoting=csv.QUOTE_NONE,
        # Keep every field as text so a message such as "NA" or "123" is not
        # converted to NaN or a number.
        dtype=str,
        keep_default_na=False,
    )
    # Convert labels to binary (0 for ham, 1 for spam)
    labels = df['label'].map({'ham': 0, 'spam': 1})
    unknown = labels.isna()
    if unknown.any():
        # Fail loudly instead of handing NaN targets to the training step.
        bad_labels = df.loc[unknown, 'label'].unique().tolist()
        raise ValueError(f"Unexpected label(s) in {file_path}: {bad_labels}")
    df['label'] = labels.astype('int64')
    return df

# Load both splits; if anything goes wrong, substitute a tiny synthetic corpus
try:
    train_data, test_data = read_data(train_file_path), read_data(test_file_path)
    print("Data loaded successfully!")
except Exception as load_error:
    print(f"Error loading data: {load_error}")
    # Loading failed, so build stand-in data purely for demonstration
    print("Creating sample data for demonstration...")

    # Pool of legitimate-looking messages to draw from
    ham_messages = [
        "Hey what's up?",
        "I'll be there in 10 minutes",
        "Can you pick up some milk on your way home?",
        "Don't forget we have dinner at 7",
        "How was your day?",
        "Call me when you get a chance",
    ]

    # Pool of spam-looking messages to draw from
    spam_messages = [
        "CONGRATULATIONS! You've won a $1000 gift card! Click here to claim now!",
        "FREE entry to concert this weekend! Reply YES to confirm",
        "Your account has been charged $350. If this was not you, call immediately",
        "You have won a free vacation! Call now to claim your prize",
        "50% OFF all items! Limited time offer!",
        "Your payment is due. Reply with STOP to unsubscribe",
    ]

    def _make_sample_split(n_ham, n_spam):
        """Draw n_ham ham and n_spam spam rows (with replacement) and shuffle them."""
        ham_part = pd.DataFrame({
            'label': [0] * n_ham,
            'message': np.random.choice(ham_messages, n_ham),
        })
        spam_part = pd.DataFrame({
            'label': [1] * n_spam,
            'message': np.random.choice(spam_messages, n_spam),
        })
        combined = pd.concat([ham_part, spam_part])
        # sample(frac=1) shuffles all rows; the index is then renumbered
        return combined.sample(frac=1).reset_index(drop=True)

    # 40 ham + 20 spam rows for training, 10 ham + 5 spam rows for testing
    train_data = _make_sample_split(40, 20)
    test_data = _make_sample_split(10, 5)

# Report the dimensions of each split
print(f"Training data shape: {train_data.shape}")
print(f"Testing data shape: {test_data.shape}")

# Report how many rows each label value (0 = ham, 1 = spam) has per split
print("\nClass distribution in training data:")
train_label_counts = train_data['label'].value_counts()
print(train_label_counts)
print("\nClass distribution in testing data:")
test_label_counts = test_data['label'].value_counts()
print(test_label_counts)

# Show the first three training messages of each class
print("\nSample ham messages:")
ham_rows = train_data['label'] == 0
print(train_data.loc[ham_rows, 'message'].head(3).values)
print("\nSample spam messages:")
spam_rows = train_data['label'] == 1
print(train_data.loc[spam_rows, 'message'].head(3).values)

# Prepare the data for the model
# Split each frame into raw message text and its 0/1 target array
train_sentences, train_labels = train_data['message'].values, train_data['label'].values
test_sentences, test_labels = test_data['message'].values, test_data['label'].values

# Tokenizer and padding settings
vocab_size = 10000      # upper bound on word ids kept by the tokenizer
max_length = 100        # every sequence is padded/truncated to this length
trunc_type = 'post'     # cut overly long sequences at the end
padding_type = 'post'   # append zeros to short sequences at the end
oov_tok = '<OOV>'       # placeholder token for words not seen during fitting

# Learn the vocabulary from the training messages only
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)

# Report how many distinct tokens were found
word_index = tokenizer.word_index
print(f"\nVocabulary size: {len(word_index)}")

# Map each message to a list of integer word ids
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

# Bring both splits to the same fixed length with identical settings
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)
train_padded = pad_sequences(train_sequences, **pad_options)
test_padded = pad_sequences(test_sequences, **pad_options)

# Build the model
embedding_dim = 16

model = tf.keras.Sequential([
    # Explicit input spec: one row of `max_length` integer token ids. This
    # replaces Embedding's `input_length` argument, which newer Keras
    # releases deprecate, and lets the model create its weights up front so
    # the summary below can report concrete shapes and parameter counts.
    tf.keras.Input(shape=(max_length,), dtype='int32'),
    # Look up a trainable `embedding_dim`-sized vector for every token id
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    # Average the token vectors into one fixed-size vector per message
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    # Single sigmoid unit: output is read as a probability for label 1 (spam)
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Binary cross-entropy matches the single sigmoid output and 0/1 labels
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Model summary
print("\nModel Summary:")
model.summary()

# Fit on the training split and track metrics on the held-out split
num_epochs = 30
held_out = (test_padded, test_labels)
history = model.fit(
    x=train_padded,
    y=train_labels,
    epochs=num_epochs,
    validation_data=held_out,
    verbose=1,
)

# Plot training history: accuracy on the left, loss on the right
plt.figure(figsize=(12, 5))

# (train key, validation key, panel title, y-axis label, legend position)
panels = [
    ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy', 'lower right'),
    ('loss', 'val_loss', 'Model Loss', 'Loss', 'upper right'),
]

for position, (train_key, val_key, title, y_label, legend_loc) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    # Training curve first, validation curve second, matching the legend order
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend(['Train', 'Validation'], loc=legend_loc)

plt.tight_layout()
plt.show()

# Score the trained model on the test split; evaluate() yields the loss
# followed by the compiled metric (accuracy)
evaluation = model.evaluate(test_padded, test_labels)
test_loss, test_accuracy = evaluation
print(f"\nTest Accuracy: {test_accuracy:.4f}")

# Create the prediction function
def predict_message(message):
    """Classify a single SMS text as ham or spam.

    Args:
        message: The message text to classify.

    Returns:
        A two-element list ``[probability, label]``: ``probability`` is the
        model's output for the message as a Python float, and ``label`` is
        ``"spam"`` when that output is greater than 0.5, otherwise ``"ham"``.
    """
    # Encode the text with the module-level tokenizer
    sequence = tokenizer.texts_to_sequences([message])
    # Pad/truncate with the module-level settings (max_length, padding_type, trunc_type)
    padded = pad_sequences(sequence, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    # verbose=0 silences the per-call progress bar, which otherwise floods the
    # output when this function is called once per message in a loop.
    prediction = float(model.predict(padded, verbose=0)[0][0])

    # Return probability and classification
    return [prediction, "spam" if prediction > 0.5 else "ham"]

# Try the prediction function on one ordinary and one promotional message
test_messages = [
    "Hey, how's it going?",
    "Congratulations! You've won a free cruise. Call now to claim your prize!",
]

for message in test_messages:
    # predict_message returns [probability, label]
    probability, label = predict_message(message)
    print(f"Message: {message}")
    print(f"Prediction: {label} (Probability: {probability:.4f})")
    print()

# The code below is for the FreeCodeCamp test cell
# You can test your model with:
def test_predictions():
    """Check predict_message against fixed examples and print the verdict.

    Every example is classified (no early exit on the first mismatch); one
    summary line is printed depending on whether all labels matched.
    """
    # (message, expected label) pairs
    cases = [
        ("how are you doing today", "ham"),
        ("sale today! to stop texts call 98912460324", "spam"),
        ("i dont want to go. can we try it a different day? available sat", "ham"),
        ("our new mobile video service is live. just install on your phone to start watching.", "spam"),
        ("you have won £1000 cash! call to claim your prize.", "spam"),
        ("i'll bring it tomorrow. don't forget the milk.", "ham"),
        ("wow, is your arm alright. that happened to me one time too", "ham"),
    ]

    # Build the full list first so each message is predicted exactly once,
    # even after a mismatch has already been seen.
    outcomes = [predict_message(text)[1] == expected for text, expected in cases]

    if all(outcomes):
        print("All tests passed!")
    else:
        print("Some tests failed.")

# Run the tests: classifies the fixed sample messages in test_predictions and
# prints whether every one of them received the expected label
test_predictions()