In [1]:
import pandas as pd


In [3]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"


In [4]:
import zipfile
import urllib.request


In [5]:
urllib.request.urlretrieve(url, "smsspamcollection.zip")


('smsspamcollection.zip', <http.client.HTTPMessage at 0x7df7a559ea50>)

In [6]:
with zipfile.ZipFile("smsspamcollection.zip", "r") as zip_ref:
    zip_ref.extractall()


In [7]:
df = pd.read_csv(
    "SMSSpamCollection",
    sep="\t",
    header=None,
    names=["label", "message"]
)


In [8]:
df.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
df.shape


(5572, 2)

In [10]:
df['label'].value_counts()


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
ham,4825
spam,747


Convert Text to Lowercase

In [12]:
df['message'] = df['message'].str.lower()


In [13]:
df['message'].head()


Unnamed: 0,message
0,"go until jurong point, crazy.. available only ..."
1,ok lar... joking wif u oni...
2,free entry in 2 a wkly comp to win fa cup fina...
3,u dun say so early hor... u c already then say...
4,"nah i don't think he goes to usf, he lives aro..."


Remove Punctuation

In [14]:
import string


In [15]:
df['message'] = df['message'].str.translate(
    str.maketrans('', '', string.punctuation)
)


In [16]:
df['message'].head()


Unnamed: 0,message
0,go until jurong point crazy available only in ...
1,ok lar joking wif u oni
2,free entry in 2 a wkly comp to win fa cup fina...
3,u dun say so early hor u c already then say
4,nah i dont think he goes to usf he lives aroun...


Remove Numbers

In [17]:
df['message'] = df['message'].str.replace(r'\d+', '', regex=True)


In [18]:
df['message'].head()


Unnamed: 0,message
0,go until jurong point crazy available only in ...
1,ok lar joking wif u oni
2,free entry in a wkly comp to win fa cup final...
3,u dun say so early hor u c already then say
4,nah i dont think he goes to usf he lives aroun...


Remove Extra Spaces

In [19]:
df['message'] = df['message'].str.replace(r'\s+', ' ', regex=True).str.strip()


In [20]:
df['message'].head()


Unnamed: 0,message
0,go until jurong point crazy available only in ...
1,ok lar joking wif u oni
2,free entry in a wkly comp to win fa cup final ...
3,u dun say so early hor u c already then say
4,nah i dont think he goes to usf he lives aroun...


Split Data into Train & Test

In [22]:
from sklearn.model_selection import train_test_split

X = df['message']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [23]:
print(X_train.shape, X_test.shape)


(4457,) (1115,)


Text → Numbers

Bag of Words (CountVectorizer)

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

print(X_train_vec.shape)
print(X_test_vec.shape)


(4457, 7582)
(1115, 7582)


In [25]:
from sklearn.naive_bayes import MultinomialNB

model_nb = MultinomialNB()
model_nb.fit(X_train_vec, y_train)


In [26]:
y_pred = model_nb.predict(X_test_vec)


In [27]:
from sklearn.metrics import accuracy_score

print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.9883408071748879


Model Evaluation (Confusion Matrix)

This will show:

True Spam detected correctly

Ham wrongly classified as spam (false positive)

Spam wrongly classified as ham (false negative)

In [28]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
cm


array([[966,   0],
       [ 13, 136]])

| Actual \ Predicted | Ham | Spam |
| ------------------ | --- | ---- |
| **Ham**            | 966 | 0    |
| **Spam**           | 13  | 136  |


Train Logistic Regression

In [29]:
from sklearn.linear_model import LogisticRegression

model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(X_train_vec, y_train)

y_pred_lr = model_lr.predict(X_test_vec)

from sklearn.metrics import accuracy_score
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))


Logistic Regression Accuracy: 0.9838565022421525


Compare with Confusion Matrix for Logistic Regression

In [30]:
from sklearn.metrics import confusion_matrix
cm_lr = confusion_matrix(y_test, y_pred_lr)
cm_lr


array([[964,   2],
       [ 16, 133]])

| Actual \ Predicted | Ham | Spam |
| ------------------ | --- | ---- |
| **Ham**            | 964 | 2    |
| **Spam**           | 16  | 133  |


Naive Bayes had 0 false spam and 13 false ham

Logistic Regression had 2 false spam and 16 false ham

**Calculate Metrics for Naive Bayes**

In [31]:
from sklearn.metrics import classification_report

print("Naive Bayes Metrics:\n")
print(classification_report(y_test, y_pred))


Naive Bayes Metrics:

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99       966
        spam       1.00      0.91      0.95       149

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



**Metrics for Logistic Regression**

In [32]:
print("Logistic Regression Metrics:\n")
print(classification_report(y_test, y_pred_lr))


Logistic Regression Metrics:

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       0.99      0.89      0.94       149

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



**Naive Bayes** is better for this dataset

Because:

 Spam Recall is higher (0.91 vs 0.89),
 Spam Precision is perfect (1.00),
 F1-score is higher for spam (0.95 vs 0.94)



**Conclusion:**
Naive Bayes is the best model here

It detects spam more accurately and makes fewer mistakes.