In [None]:
# import libraries
try:
  # Install the nightly TensorFlow build; the `!` shell escape requires IPython/Jupyter/Colab.
  !pip install tf-nightly
except Exception:
  pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)

In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

# Local filenames of the two downloaded TSV files (wget saves each URL under
# its basename in the current working directory).
# Note: the "test" path refers to the validation split, valid-data.tsv.
train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [None]:
# Data Loading and Initial Inspection

def _load_sms_tsv(path):
    """Read a headerless, tab-separated SMS file into a DataFrame.

    Args:
        path: Path of a .tsv file with two columns per row.

    Returns:
        DataFrame with the columns 'label' and 'message'.
    """
    # NOTE(review): pandas' default quoting is in effect, so a message that
    # begins with a double quote would be parsed as a quoted field -- confirm
    # the row counts printed below against the raw files.
    return pd.read_csv(path, sep='\t', header=None, names=['label', 'message'])


def _print_frame_overview(split_name, frame):
    """Print the head, column info and shape of `frame` under `split_name`."""
    print(f"{split_name} DataFrame Head:")
    print(frame.head())
    print(f"\n{split_name} DataFrame Info:")
    # DataFrame.info() writes its report to stdout itself and returns None;
    # wrapping it in print() would append a stray "None" line.
    frame.info()
    print(f"\n{split_name} DataFrame Shape:", frame.shape)


# Training split and validation split (the latter is used as the test set).
train_df = _load_sms_tsv(train_file_path)
test_df = _load_sms_tsv(test_file_path)

_print_frame_overview("Train", train_df)
print()
_print_frame_overview("Test", test_df)

# Check the distribution of labels in the training data
print("\nLabel distribution in training data:")
print(train_df['label'].value_counts())


In [None]:
# Text Preprocessing and Label Encoding

# scikit-learn utilities used for vectorising text and encoding labels
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# --- 1. Labels -> integers ---
# The encoder is fitted on the TRAINING labels only; the test labels are then
# mapped with that same fitted encoder so both splits share one encoding.
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_df['label'])
test_labels_encoded = label_encoder.transform(test_df['label'])

# Show which integer each label string was assigned (informational only).
# For the labels 'ham'/'spam' this is expected to be ham -> 0, spam -> 1.
print("Label Mapping:")
print({
    label: code
    for label, code in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
})

# --- 2. Messages -> TF-IDF feature vectors ---
# stop_words='english' drops common English function words.
# max_features=5000 caps the vocabulary size, which bounds the width of the
# feature matrix (and therefore the model's input layer).
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Vocabulary and IDF weights are learned from the training messages only
# (fit + transform in a single call); the test messages are projected with
# the already-fitted vectorizer.
train_messages_tfidf = tfidf_vectorizer.fit_transform(train_df['message'])
test_messages_tfidf = tfidf_vectorizer.transform(test_df['message'])

print("\nShape of TF-IDF transformed training messages:", train_messages_tfidf.shape)
print("Shape of TF-IDF transformed test messages:", test_messages_tfidf.shape)

# Names produced by this cell for the following cells:
#   train_messages_tfidf / test_messages_tfidf -> sparse TF-IDF feature matrices
#   train_labels_encoded / test_labels_encoded -> integer-encoded labels
#   tfidf_vectorizer / label_encoder           -> fitted transformers


In [None]:
# Model Definition, Compilation, and Training

from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

# Seed Python, NumPy and TensorFlow so that weight initialisation and batch
# shuffling are repeatable across "Restart & Run All".
SEED = 42
keras.utils.set_random_seed(SEED)

# Width of the input layer = number of TF-IDF columns.
num_features = train_messages_tfidf.shape[1]

# Small fully connected binary classifier.
model = Sequential([
    # Declare the input explicitly; passing `input_shape=` to a layer is
    # deprecated in current Keras releases.
    keras.Input(shape=(num_features,)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    # One sigmoid unit: the output is a value in (0, 1) that is trained
    # against the integer-encoded labels with binary cross-entropy.
    layers.Dense(1, activation='sigmoid'),
])

# Compile the model
optimizer = Adam(learning_rate=0.001)
loss_function = 'binary_crossentropy'
metrics_to_monitor = ['accuracy']

model.compile(optimizer=optimizer,
              loss=loss_function,
              metrics=metrics_to_monitor)

# Print the model summary
print("Model Summary:")
model.summary()

# Train the model.
# The held-out split is passed as validation_data only to monitor loss and
# accuracy after every epoch; it does not contribute to the weight updates.
print("\nStarting model training...")

history = model.fit(
    train_messages_tfidf,
    train_labels_encoded,
    epochs=15,      # number of passes over the training data
    batch_size=32,  # samples per gradient update
    verbose=1,      # 0 = silent, 1 = progress bar, 2 = one line per epoch
    validation_data=(test_messages_tfidf, test_labels_encoded),
)

print("Model training finished.")

# The model is now trained and ready to be used for predictions.


In [None]:
# function to predict messages based on model
# (should return list containing prediction probability and label, ex. [0.008318834938108921, 'ham'])

# Ensure the necessary objects from previous steps are available:
# tfidf_vectorizer (fitted)
# model (trained)
# label_encoder (fitted, to map back from 0/1 to 'ham'/'spam')

def predict_message(pred_text, threshold=0.5):
    """Classify a single SMS message with the trained model.

    Relies on the module-level `tfidf_vectorizer`, `model` and `label_encoder`
    created by the earlier cells.

    Args:
        pred_text: The message text to classify.
        threshold: Model outputs greater than or equal to this value are
            assigned encoded class 1, all others class 0. Defaults to 0.5.

    Returns:
        A two-element list `[probability, label]`: `probability` is the
        model's sigmoid output as a plain Python float and `label` is the
        decoded class name as a plain Python str
        (e.g. `[0.008318834938108921, 'ham']`).
    """
    # The vectorizer expects an iterable of documents, so wrap the single
    # message in a list; the result has exactly one row.
    features = tfidf_vectorizer.transform([pred_text])

    # predict() yields one row with one column for the single input message.
    # verbose=0 suppresses the per-call progress bar, and float() turns the
    # NumPy scalar into a plain Python float.
    probability = float(model.predict(features, verbose=0)[0][0])

    # Encoded class 1 when the output reaches the threshold, otherwise 0
    # (presumably 1 == 'spam' -- see the label mapping printed earlier).
    predicted_class = 1 if probability >= threshold else 0

    # Decode the integer class back to its original string label.
    predicted_label = str(label_encoder.inverse_transform([predicted_class])[0])

    return [probability, predicted_label]

# --- Example usage (Optional - you can remove this after pasting into your notebook) ---
# pred_text = "how are you doing today?"
# prediction = predict_message(pred_text)
# print(prediction)

# pred_text_spam = "sale today! call now for prize"
# prediction_spam = predict_message(pred_text_spam)
# print(prediction_spam)


In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check predict_message against seven fixed messages and print the verdict.

  Only the label (element 1 of each prediction) is compared; the probability
  is ignored. Prints a success message if every label matches, otherwise a
  "keep trying" message. Returns nothing.
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected label for each message above, in the same order.
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  # A single mismatch fails the whole challenge; the loop does not stop early.
  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()
