In [3]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
import re

In [4]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

--2025-10-05 09:49:05--  https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 172.67.70.149, 104.26.3.33, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|172.67.70.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 358233 (350K) [text/tab-separated-values]
Saving to: ‘train-data.tsv’


2025-10-05 09:49:05 (15.2 MB/s) - ‘train-data.tsv’ saved [358233/358233]

--2025-10-05 09:49:05--  https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 172.67.70.149, 104.26.3.33, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|172.67.70.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118774 (116K) [text/tab-separated-values]
Saving to: ‘valid-data.tsv’


2025-10-05 09:49:05 (13.8 MB/s) - ‘valid-data.tsv’ saved [118774/118774]



In [5]:
# Load the data
print("Loading data...")
train_df = pd.read_csv(train_file_path, sep='\t', header=None, names=['label', 'message'])
test_df = pd.read_csv(test_file_path, sep='\t', header=None, names=['label', 'message'])

print(f"Training data: {len(train_df)} messages")
print(f"Test data: {len(test_df)} messages")

# Check the distribution
print("\nTraining data distribution:")
print(train_df['label'].value_counts())
print("\nTest data distribution:")
print(test_df['label'].value_counts())

# Preprocess text function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply preprocessing
train_df['processed_message'] = train_df['message'].apply(preprocess_text)
test_df['processed_message'] = test_df['message'].apply(preprocess_text)


Loading data...
Training data: 4179 messages
Test data: 1392 messages

Training data distribution:
label
ham     3619
spam     560
Name: count, dtype: int64

Test data distribution:
label
ham     1205
spam     187
Name: count, dtype: int64


In [6]:
# Create and train the model
print("\nTraining model...")
model = make_pipeline(
    TfidfVectorizer(max_features=2000, stop_words='english', ngram_range=(1, 2)),
    MultinomialNB()
)

# Train the model
model.fit(train_df['processed_message'], train_df['label'])

# Test the model
test_predictions = model.predict(test_df['processed_message'])
test_accuracy = np.mean(test_predictions == test_df['label'])
print(f"Test accuracy: {test_accuracy:.4f}")


Training model...
Test accuracy: 0.9799


In [7]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text):
    # Preprocess the input text
    processed_text = preprocess_text(pred_text)

    # Get prediction probabilities
    probabilities = model.predict_proba([processed_text])[0]

    # Get the index for 'spam' and 'ham'
    classes = model.classes_
    spam_idx = np.where(classes == 'spam')[0][0]
    ham_idx = np.where(classes == 'ham')[0][0]

    spam_prob = probabilities[spam_idx]
    ham_prob = probabilities[ham_idx]

    # Return the required format
    if spam_prob > ham_prob:
        return [float(spam_prob), 'spam']
    else:
        return [float(ham_prob), 'ham']

In [8]:
# Test the function
print("\nTesting prediction function:")
pred_text = "how are you doing today?"
prediction = predict_message(pred_text)
print(f"Input: '{pred_text}'")
print(f"Output: {prediction}")


Testing prediction function:
Input: 'how are you doing today?'
Output: [0.989572716200215, 'ham']


In [9]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()


You passed the challenge. Great job!
