In [2]:
# Naive Bayes Classifier with a Real Dataset: SMS Spam Collection

In [3]:
%pip install scikit-learn pandas numpy

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


- Step 1: Import Libraries

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

- Step 2: Load the Real Dataset
- Download the SMS Spam Collection Dataset from UCI Machine Learning Repository or use a hosted version (e.g., from Kaggle). For simplicity, I’ll assume you’ve downloaded the dataset as SMSSpamCollection (a tab-separated file).
-- https://archive.ics.uci.edu/dataset/228/sms+spam+collection

In [5]:
# Load the dataset (adjust the path to where you saved the file)
df = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['label', 'text'])

# Display the first few rows
print(df.head())

  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


- Step 3: Explore the Dataset

In [6]:
# Check dataset info
print("Dataset Info:")
print(df.info())

# Check class distribution
print("\nClass Distribution:")
print(df['label'].value_counts())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None

Class Distribution:
label
ham     4825
spam     747
Name: count, dtype: int64


- Step 4: Preprocess the Data

In [7]:
# Initialize CountVectorizer
vectorizer = CountVectorizer(stop_words='english')  # Remove common English stop words

# Convert text to word count matrix
X = vectorizer.fit_transform(df['text'])

# Labels
y = df['label']

- Step 5: Split the Data

In [8]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

- Step 6: Train the Naive Bayes Classifier
- We’ll use Multinomial Naive Bayes, which is well-suited for text classification with word counts.

In [9]:
# Initialize the classifier
nb_classifier = MultinomialNB()

# Train the model
nb_classifier.fit(X_train, y_train)

- Step 7: Make Predictions

In [10]:
# Predict on the test set
y_pred = nb_classifier.predict(X_test)

- Step 8: Evaluate the Model

In [11]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.98

Classification Report:
              precision    recall  f1-score   support

         ham       0.99      0.99      0.99       966
        spam       0.92      0.96      0.94       149

    accuracy                           0.98      1115
   macro avg       0.96      0.97      0.96      1115
weighted avg       0.98      0.98      0.98      1115


Confusion Matrix:
[[953  13]
 [  6 143]]


- Step 9: Test with New Data
- Let’s classify some new SMS messages to see how the model performs.

In [None]:
# New SMS messages
new_messages = [
    "Congratulations! You've won a $1000 gift card. Call now!",
    "Hey, are we meeting for lunch today?",
    "Free entry to a concert this weekend! Click here to claim."
]

# Transform the new messages using the same vectorizer
new_messages_transformed = vectorizer.transform(new_messages)

# Predict
predictions = nb_classifier.predict(new_messages_transformed)

# Display results
for msg, pred in zip(new_messages, predictions):
    print(f"Message: {msg}\nPrediction: {pred}\n")

Message: Congratulations! You've won a $1000 gift card. Call now!
Prediction: spam

Message: Hey, are we meeting for lunch today? and gift for you
Prediction: ham

Message: Free entry to a concert this weekend! Click here to claim.
Prediction: spam



- Step 10: Save the Model (Optional)
- To reuse the model later, you can save it using joblib.

In [13]:
import joblib

# Save the model and vectorizer
joblib.dump(nb_classifier, 'naive_bayes_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

# To load later:
# nb_classifier = joblib.load('naive_bayes_model.pkl')
# vectorizer = joblib.load('vectorizer.pkl')

['vectorizer.pkl']

### Explanation of Results
- The model achieved 98% accuracy on the test set, which is excellent for a simple Naive Bayes classifier.
- The Multinomial Naive Bayes classifier works well for text data because it models word frequencies - - effectively.
- The slight drop in performance for "spam" (lower recall/precision) is due to the imbalanced dataset (fewer - spam examples). In a real-world scenario, you could address this by:
- Oversampling the minority class (spam).
- Using techniques like TF-IDF instead of CountVectorizer.
- Adjusting class weights in the classifier.

- Use TF-IDF Vectorizer: Replace CountVectorizer with TfidfVectorizer to weigh words by importance.

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
from sklearn.model_selection import cross_val_score
scores = cross_val_score(nb_classifier, X, y, cv=5)
print(f"Cross-Validation Scores: {scores.mean():.2f} (± {scores.std():.2f})")

Cross-Validation Scores: 0.98 (± 0.00)
