In [1]:
import pandas as pd
import numpy as np
import string
import re

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


In [5]:
df = pd.read_excel("smsspamcollection.xlsx")

In [7]:
df.columns = ['label', 'message']

In [8]:
print("Dataset shape:", df.shape)
print(df.head())

Dataset shape: (5571, 2)
  label                                            message
0   ham                      Ok lar... Joking wif u oni...
1  spam  Free entry in 2 a wkly comp to win FA Cup fina...
2   ham  U dun say so early hor... U c already then say...
3   ham  Nah I don't think he goes to usf, he lives aro...
4  spam  FreeMsg Hey there darling it's been 3 week's n...


In [9]:
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

In [None]:
# Preprocess text: lowercase, remove numbers & punctuation
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)  
    text = text.translate(str.maketrans('', '', string.punctuation)) 
    return text.strip()

# Ensure all entries are strings
df['message'] = df['message'].astype(str).apply(preprocess_text)


In [12]:
X = df['message']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [13]:
vectorizer = TfidfVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [14]:
bernoulli_nb = BernoulliNB()
bernoulli_nb.fit(X_train_vec, y_train)
y_pred_bernoulli = bernoulli_nb.predict(X_test_vec)

print("\n--- Bernoulli Naive Bayes ---")
print("Accuracy:", accuracy_score(y_test, y_pred_bernoulli))
print("Precision:", precision_score(y_test, y_pred_bernoulli))
print("Recall:", recall_score(y_test, y_pred_bernoulli))
print("F1-score:", f1_score(y_test, y_pred_bernoulli))
print("Classification Report:\n", classification_report(y_test, y_pred_bernoulli))


--- Bernoulli Naive Bayes ---
Accuracy: 0.9695067264573991
Precision: 0.9915254237288136
Recall: 0.78
F1-score: 0.8731343283582089
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       0.99      0.78      0.87       150

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [15]:
multinomial_nb = MultinomialNB()
multinomial_nb.fit(X_train_vec, y_train)
y_pred_multinomial = multinomial_nb.predict(X_test_vec)

print("\n--- Multinomial Naive Bayes ---")
print("Accuracy:", accuracy_score(y_test, y_pred_multinomial))
print("Precision:", precision_score(y_test, y_pred_multinomial))
print("Recall:", recall_score(y_test, y_pred_multinomial))
print("F1-score:", f1_score(y_test, y_pred_multinomial))
print("Classification Report:\n", classification_report(y_test, y_pred_multinomial))


--- Multinomial Naive Bayes ---
Accuracy: 0.9632286995515695
Precision: 1.0
Recall: 0.7266666666666667
F1-score: 0.8416988416988417
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.73      0.84       150

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115



In [16]:
results = pd.DataFrame({
    "Model": ["BernoulliNB", "MultinomialNB"],
    "Accuracy": [
        accuracy_score(y_test, y_pred_bernoulli),
        accuracy_score(y_test, y_pred_multinomial)
    ],
    "Precision": [
        precision_score(y_test, y_pred_bernoulli),
        precision_score(y_test, y_pred_multinomial)
    ],
    "Recall": [
        recall_score(y_test, y_pred_bernoulli),
        recall_score(y_test, y_pred_multinomial)
    ],
    "F1-score": [
        f1_score(y_test, y_pred_bernoulli),
        f1_score(y_test, y_pred_multinomial)
    ]
})

print("\nModel Comparison:\n", results)


Model Comparison:
            Model  Accuracy  Precision    Recall  F1-score
0    BernoulliNB  0.969507   0.991525  0.780000  0.873134
1  MultinomialNB  0.963229   1.000000  0.726667  0.841699


Bernoulli NB models the probability of a feature being present or not. So it’s well aligned with binary features.
Multinomial NB models the probability of word frequencies, making it better when the number of occurrences matters.