In [12]:
import pandas as pd
import numpy as np
import re
import string

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [14]:
# -----------------------------------
# 1. Preprocessing Function
# -----------------------------------
# Built once: the function is applied to every row of the dataset, so
# rebuilding the translation table / re-parsing the regex per call is waste.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
_DIGITS_RE = re.compile(r'\d+')


def preprocess_text(text):
    """Normalise a message before vectorisation.

    Steps, in order: lowercase, delete every character listed in
    ``string.punctuation`` (ASCII punctuation only; characters are removed,
    not replaced by a space), then delete every run of digits.

    Parameters
    ----------
    text : str or None
        Raw message. ``None`` and float NaN (how pandas represents a missing
        cell) are treated as an empty message; any other non-string value is
        converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text. Whitespace is left untouched, so removing a token
        such as a number can leave consecutive spaces behind.
    """
    # `text != text` is only true for NaN; avoids crashing on missing cells.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    lowered = text.lower()
    without_punct = lowered.translate(_PUNCT_TABLE)
    return _DIGITS_RE.sub('', without_punct)

In [15]:
# -----------------------------------
# 2. Load and Prepare Dataset
# -----------------------------------
# Keep only the label/text columns (v1, v2), give them readable names,
# encode the label as 0 (ham) / 1 (spam) and normalise the message text.
df = (
    pd.read_csv("spam.csv", encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
    .assign(
        label=lambda frame: frame['label'].map({'ham': 0, 'spam': 1}),
        message=lambda frame: frame['message'].apply(preprocess_text),
    )
)

In [16]:
# -----------------------------------
# 3. Split Data
# -----------------------------------
# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class proportions the same in both partitions; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)


In [17]:
# -----------------------------------
# 4. TF-IDF Vectorization
# -----------------------------------
# Vocabulary capped at 7000 terms; no stop-word list is passed.
vectorizer = TfidfVectorizer(max_features=7000)

# Learn vocabulary/IDF weights from the training split only, then reuse them
# unchanged for the test split (the left element is evaluated first).
X_train_tfidf, X_test_tfidf = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

In [18]:
# -----------------------------------
# 5. Train Models
# -----------------------------------
# Naive Bayes
# fit() returns the estimator itself, so construction and training chain.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
nb_model  # bare name keeps the fitted-estimator repr as the cell's output

In [19]:
# Logistic Regression (balanced)
# class_weight='balanced' reweights the classes during training; max_iter is
# raised to 1000 iterations.
lr_model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
).fit(X_train_tfidf, y_train)
lr_model  # bare name keeps the fitted-estimator repr as the cell's output


In [20]:
# SVM (balanced)
# Linear-kernel support vector classifier with balanced class weights.
svm_model = SVC(
    kernel='linear',
    C=1.0,
    class_weight='balanced',
).fit(X_train_tfidf, y_train)
svm_model  # bare name keeps the fitted-estimator repr as the cell's output

In [21]:
# 6. Evaluate Models
# One classification report per fitted model, all scored on the same
# held-out TF-IDF matrix, printed in the order NB -> LR -> SVM.
for heading, fitted in (
    ("=== Naive Bayes ===", nb_model),
    ("=== Logistic Regression ===", lr_model),
    ("=== SVM ===", svm_model),
):
    print(heading)
    print(classification_report(y_test, fitted.predict(X_test_tfidf)))

=== Naive Bayes ===
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       966
           1       1.00      0.66      0.79       149

    accuracy                           0.95      1115
   macro avg       0.97      0.83      0.88      1115
weighted avg       0.96      0.95      0.95      1115

=== Logistic Regression ===
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       966
           1       0.91      0.92      0.92       149

    accuracy                           0.98      1115
   macro avg       0.95      0.95      0.95      1115
weighted avg       0.98      0.98      0.98      1115

=== SVM ===
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       966
           1       0.96      0.91      0.94       149

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg  

In [22]:
# -----------------------------------
# 7. Predict New Messages (in 0/1 format)
# -----------------------------------
def predict_message_raw(message):
    """Print the 0/1 class each of the three models assigns to ``message``.

    The message is cleaned with ``preprocess_text`` and vectorised with the
    already-fitted module-level ``vectorizer`` before being passed to
    ``nb_model``, ``lr_model`` and ``svm_model``. Each prediction is printed
    as the array returned by ``predict`` (e.g. ``[1]``). Nothing is returned.
    """
    # transform() expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([preprocess_text(message)])

    # Same order as the printed lines below: NB, then LR, then SVM.
    nb_pred, lr_pred, svm_pred = (
        clf.predict(features) for clf in (nb_model, lr_model, svm_model)
    )

    print(f"Message: {message}")
    print(f"Predicted class (Naive Bayes): {nb_pred}")
    print(f"Predicted class (Logistic Regression): {lr_pred}")
    print(f"Predicted class (SVM): {svm_pred}\n")


In [23]:
# Test messages
# Hand-picked sample texts for a quick sanity check of the trained models.
# NOTE(review): the last two entries look like rows copied from the dataset
# (note the "å£" mojibake) -- confirm they are not in the training split.
messages = [
    "Congratulations! You have won a free lottery ticket!",
    "Hey, are we still on for coffee tomorrow?",
    "Free entry in a weekly competition! Text WIN to 80088 now!",
    "Can you send me the report by tonight?",
    "Please call our customer service representative on 0800 169 6031 between 10am-9pm as you have WON a guaranteed å£1000 cash or å£5000 prize!",
    "Havent planning to buy later. I check already lido only got 530 show in e afternoon. U finish work already?",
]

In [24]:
# Banner first, then one printed prediction block per sample message.
print("\n=== Predictions (0 = Ham, 1 = Spam) ===\n")
for sample in messages:
    predict_message_raw(sample)


=== Predictions (0 = Ham, 1 = Spam) ===

Message: Congratulations! You have won a free lottery ticket!
Predicted class (Naive Bayes): [1]
Predicted class (Logistic Regression): [1]
Predicted class (SVM): [1]

Message: Hey, are we still on for coffee tomorrow?
Predicted class (Naive Bayes): [0]
Predicted class (Logistic Regression): [0]
Predicted class (SVM): [0]

Message: Free entry in a weekly competition! Text WIN to 80088 now!
Predicted class (Naive Bayes): [1]
Predicted class (Logistic Regression): [1]
Predicted class (SVM): [1]

Message: Can you send me the report by tonight?
Predicted class (Naive Bayes): [0]
Predicted class (Logistic Regression): [0]
Predicted class (SVM): [0]

Message: Please call our customer service representative on 0800 169 6031 between 10am-9pm as you have WON a guaranteed å£1000 cash or å£5000 prize!
Predicted class (Naive Bayes): [1]
Predicted class (Logistic Regression): [1]
Predicted class (SVM): [1]

Message: Havent planning to buy later. I check alr

In [25]:
# Second batch of sample texts to classify; the final two entries repeat the
# last two items of the earlier `messages` list.
messages_to_check = [
    "Had your contract mobile 11 Mnths? Latest Motorola, Nokia etc. all FREE! Double Mins & Text on Orange tariffs. TEXT YES for callback, no to remove from records.",
    "No, I was trying it all weekend ;V",
    "You know, wot people wear. T shirts, jumpers, hat, belt, is all we know. We r at Cribbs",
    "Cool, what time you think you can get here?",
    "Wen did you get so spiritual and deep. That's great",
    "Have a safe trip to Nigeria. Wish you happiness and very soon company to share moments with",
    "Hahaha..use your brain dear",
    "Please call our customer service representative on 0800 169 6031 between 10am-9pm as you have WON a guaranteed å£1000 cash or å£5000 prize!",
    "Havent planning to buy later. I check already lido only got 530 show in e afternoon. U finish work already?",
]


In [26]:
# Classify every message in `messages_to_check` with all three models.
# predict_message_raw already performs the clean -> vectorise -> predict ->
# print sequence and emits exactly the same four lines per message, so it is
# reused here instead of repeating that logic inline.
print("=== Predictions on Messages from Image (0 = Ham, 1 = Spam) ===\n")
for msg in messages_to_check:
    predict_message_raw(msg)


=== Predictions on Messages from Image (0 = Ham, 1 = Spam) ===

Message: Had your contract mobile 11 Mnths? Latest Motorola, Nokia etc. all FREE! Double Mins & Text on Orange tariffs. TEXT YES for callback, no to remove from records.
Predicted class (Naive Bayes): [1]
Predicted class (Logistic Regression): [1]
Predicted class (SVM): [1]

Message: No, I was trying it all weekend ;V
Predicted class (Naive Bayes): [0]
Predicted class (Logistic Regression): [0]
Predicted class (SVM): [0]

Message: You know, wot people wear. T shirts, jumpers, hat, belt, is all we know. We r at Cribbs
Predicted class (Naive Bayes): [0]
Predicted class (Logistic Regression): [0]
Predicted class (SVM): [0]

Message: Cool, what time you think you can get here?
Predicted class (Naive Bayes): [0]
Predicted class (Logistic Regression): [0]
Predicted class (SVM): [0]

Message: Wen did you get so spiritual and deep. That's great
Predicted class (Naive Bayes): [0]
Predicted class (Logistic Regression): [0]
Predicted