In [9]:
# Spam Classifier using BoW and TF-IDF

# 📦 Step 1: Imports
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

# 📁 Step 2: Load Dataset (UCI SMS Spam Collection)
# The TSV is read without a header row: column 0 is named `label`, column 1 `message`.
url = 'https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv'
df = (
    pd.read_csv(url, sep='\t', header=None, names=['label', 'message'])
    # Encode the textual classes numerically: ham -> 0, spam -> 1.
    .assign(label=lambda frame: frame['label'].map({'ham': 0, 'spam': 1}))
)
df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jangcha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# 🧹 Step 3: Preprocessing Function
def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loading it only once."""
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        # Build the collection a single time and keep a hashed copy so that
        # membership tests are O(1) instead of a linear scan of a list.
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def preprocess(text, stop_words=None):
    """Lower-case ``text``, strip ASCII punctuation and drop stopwords.

    Parameters
    ----------
    text : str
        Raw message text.
    stop_words : iterable of str, optional
        Words to remove. When omitted, NLTK's English stopword list is used
        (loaded once and reused across calls).

    Returns
    -------
    str
        The surviving whitespace-separated tokens joined by single spaces.

    Notes
    -----
    Punctuation is removed *before* the stopword lookup, so a token such as
    "don't" is compared as "dont"; a stopword entry that is spelled with an
    apostrophe therefore cannot match it.
    """
    # Previously the stopword list was rebuilt for every token of every
    # message; resolve it once per call from the cached set instead.
    stop_set = _english_stopwords() if stop_words is None else frozenset(stop_words)
    text = text.lower()
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    return ' '.join(word for word in text.split() if word not in stop_set)

# Apply preprocessing
# Store the normalised text in a new column; the raw text stays in `message`.
df['cleaned'] = df['message'].map(preprocess)

# ✂️ Step 4: Split Dataset
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

In [11]:
# 🧠 Step 5a: BoW Model
# Learn the vocabulary on the training split only, then reuse it for the test split.
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and training are chained.
bow_model = MultinomialNB().fit(X_train_bow, y_train)
y_pred_bow = bow_model.predict(X_test_bow)

# Report the confusion matrix followed by per-class precision/recall/F1
# (labels: 0 = ham, 1 = spam).
print(
    "\nBoW Model Evaluation:",
    confusion_matrix(y_test, y_pred_bow),
    classification_report(y_test, y_pred_bow),
    sep="\n",
)


BoW Model Evaluation:
[[962   4]
 [ 12 137]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.97      0.92      0.94       149

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [12]:
# 🧠 Step 5b: TF-IDF Model
# Fit the TF-IDF weighting on the training split only, then apply it to the test split.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and training are chained.
tfidf_model = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_tfidf = tfidf_model.predict(X_test_tfidf)

# Report the confusion matrix followed by per-class precision/recall/F1
# (labels: 0 = ham, 1 = spam).
print(
    "\nTF-IDF Model Evaluation:",
    confusion_matrix(y_test, y_pred_tfidf),
    classification_report(y_test, y_pred_tfidf),
    sep="\n",
)



TF-IDF Model Evaluation:
[[966   0]
 [ 32 117]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       1.00      0.79      0.88       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [13]:
# 📌 Step 6: Compare Top Features
def show_top_features(vectorizer, model, n=10):
    """Print the ``n`` highest log-probability features for each class.

    Parameters
    ----------
    vectorizer
        Fitted vectorizer exposing ``get_feature_names_out()``.
    model
        Fitted classifier exposing ``feature_log_prob_`` with shape
        (n_classes, n_features); row 1 is reported as spam, row 0 as ham.
    n : int, default 10
        Number of features listed per class.

    Notes
    -----
    Each list is ranked by that class's own log-probability, in ascending
    order (the strongest feature is printed last). The two rankings are
    independent, so a token that is frequent in both classes can appear in
    both lists.
    """
    vocab = np.array(vectorizer.get_feature_names_out())
    log_probs = model.feature_log_prob_
    sections = (
        ("\nTop spam indicators:", 1),
        ("Top ham indicators:", 0),
    )
    for heading, class_row in sections:
        ranked = np.argsort(log_probs[class_row])
        print(heading)
        print(vocab[ranked[-n:]])

# Inspect the features the TF-IDF Naive Bayes model weights most heavily per class.
show_top_features(vectorizer=tfidf_vectorizer, model=tfidf_model)


Top spam indicators:
['prize' 'ur' 'reply' 'text' 'stop' 'claim' 'mobile' 'txt' 'free' 'call']
Top ham indicators:
['got' 'good' 'dont' 'ltgt' 'call' 'come' 'get' 'ill' 'im' 'ok']
