# **Naive Bayes Spam Classifier**

**1. Data Collection**

In [1]:
import pandas as pd
from google.colab import files

uploaded = files.upload()

data = pd.read_csv('spam.csv', encoding="latin1")

print(data.head())

Saving spam.csv to spam.csv
     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


**2. Data Cleaning / Preprocessing**

In [5]:
import re
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def clean_data(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r"\d+", " ", text)
    text = text.encode('ascii', errors='ignore').decode()
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]

    return tokens


data = data.rename(columns={'v1': 'label', 'v2': 'email'})

data['clean_data'] = data['email'].apply(clean_data)

print(data[['label','email','clean_data']].head(), "\n")

  label                                              email  \
0   ham  Go until jurong point, crazy.. Available only ...   
1   ham                      Ok lar... Joking wif u oni...   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...   
3   ham  U dun say so early hor... U c already then say...   
4   ham  Nah I don't think he goes to usf, he lives aro...   

                                          clean_data  
0  [go, jurong, point, crazy, available, bugis, n...  
1                     [ok, lar, joking, wif, u, oni]  
2  [free, entry, wkly, comp, win, fa, cup, final,...  
3      [u, dun, say, early, hor, u, c, already, say]  
4  [nah, dont, think, goes, usf, lives, around, t...   



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**3. Feature Extraction (Bag of Words)**

In [6]:
vocabulary = set()
for tokens in data['clean_data']:
    vocabulary.update(tokens)

vocabulary = list(vocabulary)

print("Vocabulary size:", len(vocabulary))
print("First 20 words:", vocabulary[:20])

Vocabulary size: 8345
First 20 words: ['orh', 'planning', 'optouthf', 'luckily', 'opposed', 'tantrums', 'medicine', 'answers', 'meetgreet', 'voda', 'ff', 'card', 'tests', 'cuddling', 'trusting', 'kwish', 'texted', 'computers', 'weddingfriend', 'valuemorning']


In [7]:
def bow_features(tokens, vocabulary):
    return [tokens.count(word) for word in vocabulary]

data['bow_vector'] = data['clean_data'].apply(lambda x: bow_features(x, vocabulary))

print(data[['label','bow_vector']])

     label                                         bow_vector
0      ham  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1      ham  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2     spam  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3      ham  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4      ham  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
...    ...                                                ...
5567  spam  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
5568   ham  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
5569   ham  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
5570   ham  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
5571   ham  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...

[5572 rows x 2 columns]


**4. Prior Probabilities**

In [13]:
import numpy as np

vocab_index = {word: i for i, word in enumerate(vocabulary)}
V = len(vocabulary)

spam_word_counts = np.zeros(V)
ham_word_counts  = np.zeros(V)

num_spam_emails = 0
num_ham_emails  = 0
total_spam_words = 0
total_ham_words  = 0

for bow, label in zip(data['bow_vector'], data['label']):
    if label == 'spam':
        num_spam_emails += 1
        for i, count in enumerate(bow):
            spam_word_counts[i] += count
            total_spam_words += count
    else:
        num_ham_emails += 1
        for i, count in enumerate(bow):
            ham_word_counts[i] += count
            total_ham_words += count

num_emails = len(data)
P_spam = num_spam_emails / num_emails
P_ham  = num_ham_emails / num_emails

print("Number of emails:", num_emails)
print("Number of spam emails:", num_spam_emails)
print("Number of ham emails:", num_ham_emails)
print(f"P(Spam) = {P_spam:.3f}")
print(f"P(Ham)  = {P_ham:.3f}")
print()

Number of emails: 5572
Number of spam emails: 747
Number of ham emails: 4825
P(Spam) = 0.134
P(Ham)  = 0.866



**5. Likelihood Probabilities**

In [15]:
spam_likelihood = [(count + 1) / (total_spam_words + V) for count in spam_word_counts]
ham_likelihood  = [(count + 1) / (total_ham_words  + V) for count in ham_word_counts]

print("\nLikelihood Probabilities")
for i, word in enumerate(vocabulary[:20]):
    print(f"P({word}|Spam)={spam_likelihood[i]:.8f}, P({word}|Ham)={ham_likelihood[i]:.8f}")


Likelihood Probabilities
P(orh|Spam)=0.00005171, P(orh|Ham)=0.00004241
P(planning|Spam)=0.00005171, P(planning|Ham)=0.00019086
P(optouthf|Spam)=0.00010342, P(optouthf|Ham)=0.00002121
P(luckily|Spam)=0.00005171, P(luckily|Ham)=0.00004241
P(opposed|Spam)=0.00005171, P(opposed|Ham)=0.00004241
P(tantrums|Spam)=0.00005171, P(tantrums|Ham)=0.00004241
P(medicine|Spam)=0.00005171, P(medicine|Ham)=0.00008483
P(answers|Spam)=0.00005171, P(answers|Ham)=0.00010603
P(meetgreet|Spam)=0.00015513, P(meetgreet|Ham)=0.00002121
P(voda|Spam)=0.00031027, P(voda|Ham)=0.00002121
P(ff|Spam)=0.00010342, P(ff|Ham)=0.00002121
P(card|Spam)=0.00025856, P(card|Ham)=0.00029689
P(tests|Spam)=0.00005171, P(tests|Ham)=0.00008483
P(cuddling|Spam)=0.00005171, P(cuddling|Ham)=0.00006362
P(trusting|Spam)=0.00005171, P(trusting|Ham)=0.00004241
P(kwish|Spam)=0.00005171, P(kwish|Ham)=0.00004241
P(texted|Spam)=0.00005171, P(texted|Ham)=0.00010603
P(computers|Spam)=0.00005171, P(computers|Ham)=0.00004241
P(weddingfriend|Spam)=

**6. Classification Step**

In [16]:
import math

def classify_email(email_text):
    tokens = clean_data(email_text)

    log_prob_spam = math.log(P_spam)
    log_prob_ham  = math.log(P_ham)

    for word in tokens:
        if word in vocab_index:
            i = vocab_index[word]
            log_prob_spam += math.log(spam_likelihood[i])
            log_prob_ham  += math.log(ham_likelihood[i])
        else:
            # Laplace smoothing fallback
            log_prob_spam += math.log(1 / (total_spam_words + V))
            log_prob_ham  += math.log(1 / (total_ham_words + V))

    print(f"\nEmail: '{email_text}'")
    print(f"log P(Spam|email) = {log_prob_spam:.3f}")
    print(f"log P(Ham|email)  = {log_prob_ham:.3f}")

    if log_prob_spam > log_prob_ham:
        print("Classification: SPAM")
        return 'spam'
    else:
        print("Classification: HAM")
        return 'ham'

**7. Testing on New Emails**

In [19]:
new_emails = [
    "Congratulations! You won a free iPhone. Click here to claim now!",
    "Get rich quick! Invest in our program and double your money!",
    "Limited time offer: Buy one, get one free on all products!",
    "You have been selected for a $1000 gift card. Act fast!",
    "Earn $5000 per week working from home. No experience required!",
    "Hey, are we still on for coffee tomorrow?",
    "Please find attached the report for our meeting.",
    "Happy birthday! Wishing you a wonderful day with your family.",
    "Can you review the document and send me your feedback?",
    "Let's schedule a call to discuss the project updates.",
]

classified_labels = []
for email in new_emails:
    label = classify_email(email)
    classified_labels.append(label)


Email: 'Congratulations! You won a free iPhone. Click here to claim now!'
log P(Spam|email) = -36.720
log P(Ham|email)  = -47.371
Classification: SPAM

Email: 'Get rich quick! Invest in our program and double your money!'
log P(Spam|email) = -62.168
log P(Ham|email)  = -58.580
Classification: HAM

Email: 'Limited time offer: Buy one, get one free on all products!'
log P(Spam|email) = -68.970
log P(Ham|email)  = -64.120
Classification: HAM

Email: 'You have been selected for a $1000 gift card. Act fast!'
log P(Spam|email) = -42.707
log P(Ham|email)  = -44.221
Classification: SPAM

Email: 'Earn $5000 per week working from home. No experience required!'
log P(Spam|email) = -62.160
log P(Ham|email)  = -56.275
Classification: HAM

Email: 'Hey, are we still on for coffee tomorrow?'
log P(Spam|email) = -35.220
log P(Ham|email)  = -27.052
Classification: HAM

Email: 'Please find attached the report for our meeting.'
log P(Spam|email) = -44.093
log P(Ham|email)  = -38.846
Classification: HAM



**8. Accuracy Evaluation**

In [20]:
true_labels = [
    "spam","spam","spam","spam","spam",
    "ham","ham","ham","ham","ham"
]

correct = 0
for pred, true in zip(classified_labels, true_labels):
    if pred == true:
        correct += 1

accuracy = correct / len(true_labels) * 100
print(f"Model accuracy: {accuracy:.2f}%")

Model accuracy: 60.00%
