In [1]:
from tabulate import tabulate
import  re, string
import pandas as pd
from nltk.corpus import stopwords
import nltk
import numpy as np
import math

In [2]:
# Fetch the NLTK stop-word corpus. Re-running is harmless: the captured output
# below shows NLTK reporting the package as already up-to-date.
nltk.download('stopwords')
# Keep the English stop words in a set; preprocess_text tests membership against it.
stop_words = set(stopwords.words('english'))

# Load the labelled message dataset, decoding the file as Latin-1.
# Later cells read the class label from column 'v1' and the raw text from column 'v2'.
df = pd.read_csv("spam.csv", encoding='latin1')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\srbuh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text, stopword_set=None):
    """Lower-case a message, strip punctuation, and return its non-stop-word tokens.

    Parameters
    ----------
    text : str
        Raw message. Non-string values (e.g. ``None`` or the NaN that pandas
        uses for an empty cell) produce an empty token list instead of raising.
    stopword_set : collection of str, optional
        Words to drop. Defaults to the notebook-level ``stop_words``.

    Returns
    -------
    list of str
        Whitespace-separated tokens in their original order with stop words
        removed. Digits are kept, and punctuation is deleted rather than
        replaced by a space, so "don't" becomes "dont".
    """
    if not isinstance(text, str):
        return []
    if stopword_set is None:
        stopword_set = stop_words

    cleaned = text.lower().translate(_PUNCT_TABLE)
    # No alphabetic-only filter is applied: a former re.findall(r"[a-z]+", ...)
    # call computed one but its result was never used, so it was dropped to
    # keep the tokens (and therefore the vocabulary) exactly as before.
    return [word for word in cleaned.split() if word not in stopword_set]

# Tokenise every message in column 'v2' and store the resulting token list per row.
df['tokens'] = df['v2'].apply(preprocess_text)

In [4]:
def build_vocab(token_lists, min_freq=1):
    """Map each sufficiently frequent word to a consecutive integer index.

    Parameters
    ----------
    token_lists : iterable of iterable of str
        One token sequence per document.
    min_freq : int, default 1
        A word is kept only if it occurs at least this many times in total.

    Returns
    -------
    dict
        ``{word: index}`` with indices 0..N-1 assigned in order of each kept
        word's first appearance.
    """
    occurrences = {}
    for document in token_lists:
        for term in document:
            if term in occurrences:
                occurrences[term] += 1
            else:
                occurrences[term] = 1

    # Dicts preserve insertion order, so enumerate() numbers the surviving
    # words by first appearance.
    survivors = (term for term, seen in occurrences.items() if seen >= min_freq)
    return {term: position for position, term in enumerate(survivors)}

# Build the word -> index mapping over every tokenised message (the default
# min_freq=1 keeps all words) and report how many distinct words it holds.
vocab = build_vocab(df['tokens'])
print(f"Vocabulary size: {len(vocab)}")

Vocabulary size: 9431


In [5]:
def doc_to_bow(tokens, vocab):
    """Turn a token sequence into a bag-of-words count vector.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one document.
    vocab : dict
        ``{word: index}`` mapping; its length fixes the vector size.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(vocab)`` where position ``vocab[w]`` holds
        the number of times ``w`` occurs in ``tokens``. Words missing from
        ``vocab`` are ignored.
    """
    bow = np.zeros(len(vocab))
    for term in tokens:
        slot = vocab.get(term)
        if slot is None:
            # Out-of-vocabulary word: nothing to count.
            continue
        bow[slot] += 1
    return bow

# Attach a dense count vector (one slot per vocabulary word) to every row.
# NOTE(review): none of the cells visible below read 'bow_vector'. With the
# 9431-word vocabulary reported above, each row stores a 9431-element float
# array — consider dropping this column or using a sparse representation if
# memory becomes a concern.
df['bow_vector'] = df['tokens'].apply(lambda x: doc_to_bow(x, vocab))
#print(df[['v1','v2','bow_vector']].head())

In [6]:
# Split the corpus by the class label stored in column 'v1'.
spam_df = df[df['v1']=='spam']
ham_df = df[df['v1']=='ham']

# Class priors P(class): the fraction of messages carrying each label.
# (These are priors, not posteriors — the posterior is what classify() compares.)
P_spam = len(spam_df)/len(df)
P_ham = len(ham_df)/len(df)

spam_tokens_lists = spam_df['tokens'].tolist()
ham_tokens_lists = ham_df['tokens'].tolist()


def _count_tokens(token_lists, vocab):
    """Return per-vocabulary-index occurrence counts summed over all documents."""
    counts = np.zeros(len(vocab))
    for tokens in token_lists:
        for token in tokens:
            if token in vocab:
                counts[vocab[token]] += 1
    return counts


# Per-word occurrence counts within each class, indexed by vocabulary position.
spam_counts = _count_tokens(spam_tokens_lists, vocab)
ham_counts = _count_tokens(ham_tokens_lists, vocab)

# Total number of in-vocabulary tokens per class; used as the denominator of
# the smoothed word likelihoods.
spam = spam_counts.sum() #total
ham = ham_counts.sum()
vocab_len = len(vocab)

def p_word_given_class(index, counts, total, V, alpha=1):
    """Additively smoothed estimate of P(word | class).

    Parameters
    ----------
    index : int
        Vocabulary index of the word.
    counts : sequence of numbers
        Per-word occurrence counts for the class, indexed by vocabulary index.
    total : number
        Sum of all word counts for the class.
    V : int
        Vocabulary size.
    alpha : number, default 1
        Pseudo-count added to every word. The default of 1 is Laplace
        (add-one) smoothing and reproduces the previous behaviour exactly.
        With ``alpha=0`` an unseen word gets probability 0, whose logarithm
        is undefined, so callers taking logs should keep ``alpha > 0``.

    Returns
    -------
    float
        ``(counts[index] + alpha) / (total + alpha * V)``.
    """
    return (counts[index] + alpha) / (total + alpha * V)

In [7]:
def classify(tokens):
    """Label a tokenised message as "spam" or "ham" with Naive Bayes.

    Each class score starts at the log of its prior (``P_spam`` / ``P_ham``)
    and accumulates the log of the smoothed word likelihood for every token
    found in ``vocab``; tokens outside the vocabulary are skipped. Working in
    log space replaces a product of many small probabilities with a sum.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of the message.

    Returns
    -------
    str
        "spam" if the spam score is strictly greater than the ham score,
        otherwise "ham" (ties therefore resolve to "ham").
    """
    spam_score = math.log(P_spam)
    ham_score = math.log(P_ham)

    known_ids = (vocab[term] for term in tokens if term in vocab)
    for word_id in known_ids:
        spam_score += math.log(p_word_given_class(word_id, spam_counts, spam, vocab_len))
        ham_score += math.log(p_word_given_class(word_id, ham_counts, ham, vocab_len))

    if spam_score > ham_score:
        return "spam"
    return "ham"

In [8]:
# Hand-written messages intended to look like spam; used only as a quick
# qualitative check of the classifier, not as a measured evaluation.
spam_emails = [
    "Congratulations! You have won a free iPhone!",
    "Get rich quick with this one simple trick!",
    "Claim your free gift card now!",
    "Limited time offer, buy now and save 50%",
    "You have been selected for a cash reward"
]

# Hand-written messages intended to look legitimate.
ham_emails = [
    "Hey, are we meeting for coffee tomorrow?",
    "Please find attached the report for our meeting",
    "Can you send me the notes from class?",
    "Let's have lunch next week",
    "Don't forget about the project deadline on Friday"
]

# Run each sample through the same preprocessing used for training, classify
# it, and print the message with its predicted label followed by a separator.
for email in spam_emails + ham_emails:
    tokens = preprocess_text(email)
    prediction = classify(tokens)
    print(f"Email: {email}")
    print(f"Prediction: {prediction}")
    print("."*50)

Email: Congratulations! You have won a free iPhone!
Prediction: spam
..................................................
Email: Get rich quick with this one simple trick!
Prediction: ham
..................................................
Email: Claim your free gift card now!
Prediction: spam
..................................................
Email: Limited time offer, buy now and save 50%
Prediction: ham
..................................................
Email: You have been selected for a cash reward
Prediction: spam
..................................................
Email: Hey, are we meeting for coffee tomorrow?
Prediction: ham
..................................................
Email: Please find attached the report for our meeting
Prediction: ham
..................................................
Email: Can you send me the notes from class?
Prediction: ham
..................................................
Email: Let's have lunch next week
Prediction: ham
...........................