In [1]:
pip install pandas scikit-learn nltk




In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.corpus import stopwords
import re
import string

In [3]:
# --- 1. Load the messages and labels ---
# The SMS Spam Collection Dataset is typically a tab-separated file.
# We'll load it assuming the file is named 'sms_spam_collection.csv' 
# or use a common URL for demonstration if a local file isn't available.
# NOTE: Replace 'spam.csv' with the actual path to your downloaded dataset file.
try:
    # Assuming a common format for the UCI SMS Spam Collection
    df = pd.read_csv(
        'spam.csv', 
        encoding='latin-1', 
        usecols=[0, 1], 
        names=['label', 'message'], 
        header=None, 
        sep='\t'
    )
except FileNotFoundError:
    print("Error: 'spam.csv' not found. Please download the SMS Spam Collection Dataset and place it in the working directory.")
    # Create a small sample DataFrame for demonstration if the file is missing
    data = {'label': ['ham', 'spam', 'ham', 'spam', 'ham'],
            'message': ['Go until jurong point, crazy..', 'SIX chances to win CASH!', 
                        'Are you available tomorrow?', 'Had your mobile 11 months?', 
                        'I love the simple life!']}
    df = pd.DataFrame(data)

print(f"Dataset loaded with {len(df)} samples.")
print(df.head())

# Map 'spam' to 1 and 'ham' to 0 for numerical processing
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})
X = df['message']
y = df['label_num']

Error: 'spam.csv' not found. Please download the SMS Spam Collection Dataset and place it in the working directory.
Dataset loaded with 5 samples.
  label                         message
0   ham  Go until jurong point, crazy..
1  spam        SIX chances to win CASH!
2   ham     Are you available tomorrow?
3  spam      Had your mobile 11 months?
4   ham         I love the simple life!


In [4]:
# --- 2. Preprocess the text (Custom Preprocessing Function) ---

# Get English stopwords from NLTK (requires 'nltk.download('stopwords')' once)
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    # Fallback if nltk.download('stopwords') hasn't been run
    print("NLTK stopwords not found. Please run: import nltk; nltk.download('stopwords')")
    stop_words = set() 

def text_preprocess(text):
    """
    Performs lowercasing, removes punctuation, tokenizes (implicitly), 
    and removes stopwords.
    """
    # Lowercasing and removing punctuation
    text = text.lower()
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    
    # Tokenization and removing stopwords (implicit tokenization in the next step)
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words and word.isalpha()]
    
    return " ".join(tokens)

# Apply preprocessing to the messages
X_processed = X.apply(text_preprocess)
print("\nFirst 5 processed messages:")
print(X_processed.head())

NLTK stopwords not found. Please run: import nltk; nltk.download('stopwords')

First 5 processed messages:
0    go until jurong point crazy
1        six chances to win cash
2     are you available tomorrow
3         had your mobile months
4         i love the simple life
Name: message, dtype: object


In [5]:
# --- 3. Convert text into numeric features (TF-IDF) ---
# TF-IDF (Term Frequency-Inverse Document Frequency) is a great feature
# extraction method that weights words by their importance.
tfidf = TfidfVectorizer(max_features=5000) # Use the top 5000 words
X_features = tfidf.fit_transform(X_processed)
print(f"\nFeature matrix shape: {X_features.shape}")

# --- 4. Split into train/test sets ---
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y, test_size=0.25, random_state=42
)
print(f"Training set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")

# --- 5. Train a simple model (Naive Bayes) ---
# Multinomial Naive Bayes is excellent for text classification
model = MultinomialNB()
model.fit(X_train, y_train)
print("\nModel trained (Multinomial Naive Bayes).")
# Make predictions
y_pred = model.predict(X_test)


Feature matrix shape: (5, 22)
Training set size: 3, Test set size: 2

Model trained (Multinomial Naive Bayes).


In [7]:
# --- 6. Measure performance ---
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("\n--- Model Performance Metrics ---")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")
print("---------------------------------")

# Optional: Test with custom messages
def predict_message(message):
    processed = text_preprocess(message)
    features = tfidf.transform([processed])
    prediction = model.predict(features)[0]
    return "SPAM" if prediction == 1 else "HAM"

print("\nCustom Test Cases:")
print(f"Message 1: 'WINNER! You have won a free holiday. Call now!' -> {predict_message('WINNER! You have won a free holiday. Call now!')}")
print(f"Message 2: 'Hey, are we still meeting for lunch today?' -> {predict_message('Hey, are we still meeting for lunch today?')}")


--- Model Performance Metrics ---
Accuracy: 0.5000
Precision: 0.0000
Recall:    0.0000
F1 Score:  0.0000
---------------------------------

Custom Test Cases:
Message 1: 'WINNER! You have won a free holiday. Call now!' -> HAM
Message 2: 'Hey, are we still meeting for lunch today?' -> HAM


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
