In [1]:
import numpy as np
import pandas as pd
import os
from collections import defaultdict

In [2]:
# Dataset location. Set the EMAIL_DATASET_PATH environment variable to point at
# your own copy of the CSV; the fallback is the path this notebook was
# originally written against, so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "EMAIL_DATASET_PATH",
    "/Users/felipepesantez/Documents/development/datasets/email_classification.csv",
)
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,email,label
0,Upgrade to our premium plan for exclusive acce...,ham
1,Happy holidays from our team! Wishing you joy ...,ham
2,We're hiring! Check out our career opportuniti...,ham
3,Your Amazon account has been locked. Click her...,spam
4,Your opinion matters! Take our survey and help...,ham


In [4]:
# Column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179 entries, 0 to 178
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   email   179 non-null    object
 1   label   179 non-null    object
dtypes: object(2)
memory usage: 2.9+ KB


In [5]:
# Per-column summary: row count, number of distinct values, the most frequent
# value and how often it occurs.
df.describe()

Unnamed: 0,email,label
count,179,179
unique,150,2
top,You've been selected for a free trial of our p...,ham
freq,4,100


In [6]:
# Class balance: how many emails carry each label.
df['label'].value_counts()

label
ham     100
spam     79
Name: count, dtype: int64

In [8]:
# Show the first rows again before preprocessing starts.
df.head()

Unnamed: 0,email,label
0,Upgrade to our premium plan for exclusive acce...,ham
1,Happy holidays from our team! Wishing you joy ...,ham
2,We're hiring! Check out our career opportuniti...,ham
3,Your Amazon account has been locked. Click her...,spam
4,Your opinion matters! Take our survey and help...,ham


In [9]:
import string
#preprocessing the data
def preprocess_text(text):
    """Return ``text`` lower-cased with every ``string.punctuation`` character removed."""
    # A translation table with an empty mapping and a "delete" set drops the
    # punctuation characters without inserting anything in their place.
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return text.lower().translate(drop_punctuation)

In [10]:
# Normalise every email. This overwrites the raw text in the 'email' column,
# so the CSV has to be re-read to get the original strings back.
df['email'] = df['email'].map(preprocess_text)

In [11]:
# Check the result of preprocessing on the first rows.
df.head()

Unnamed: 0,email,label
0,upgrade to our premium plan for exclusive acce...,ham
1,happy holidays from our team wishing you joy a...,ham
2,were hiring check out our career opportunities...,ham
3,your amazon account has been locked click here...,spam
4,your opinion matters take our survey and help ...,ham


In [12]:
#tokenize
def tokenize(text):
    """Split ``text`` on runs of whitespace and return the resulting list of tokens."""
    words = text.split()
    return words

# Store the token list of each email in a new 'tokens' column.
df['tokens'] = df['email'].map(tokenize)

In [13]:
# Inspect the new 'tokens' column alongside the text it was built from.
df.head()

Unnamed: 0,email,label,tokens
0,upgrade to our premium plan for exclusive acce...,ham,"[upgrade, to, our, premium, plan, for, exclusi..."
1,happy holidays from our team wishing you joy a...,ham,"[happy, holidays, from, our, team, wishing, yo..."
2,were hiring check out our career opportunities...,ham,"[were, hiring, check, out, our, career, opport..."
3,your amazon account has been locked click here...,spam,"[your, amazon, account, has, been, locked, cli..."
4,your opinion matters take our survey and help ...,ham,"[your, opinion, matters, take, our, survey, an..."


In [14]:
# Vocabulary: every distinct token that appears in any email.
vocab = {word for tokens in df['tokens'] for word in tokens}


In [15]:
# Empty counters for the training pass.
# word_counts maps a word to a {'ham': n, 'spam': n} dict, created on first access;
# class_counts holds one integer per label.
word_counts = defaultdict(lambda: dict(ham=0, spam=0))
class_counts = dict.fromkeys(('ham', 'spam'), 0)

In [16]:
# Training pass: count emails per label and word occurrences per label.
#
# The counters are rebuilt from scratch here so that re-running this cell
# never adds a second pass on top of totals left over from a previous run.
word_counts = defaultdict(lambda: {'ham': 0, 'spam': 0})
class_counts = {'ham': 0, 'spam': 0}

# Walk the two columns in lockstep instead of DataFrame.iterrows(), which
# builds a full Series object for every row.
for class_label, tokens in zip(df['label'], df['tokens']):
    class_counts[class_label] += 1
    for word in tokens:
        word_counts[word][class_label] += 1

In [21]:
# Prior probability of each class: its share of all rows in df.
total_samples = len(df)
prior_ham, prior_spam = (
    class_counts[label] / total_samples for label in ('ham', 'spam')
)

In [22]:
# Per-word likelihoods for multinomial Naive Bayes with add-one (Laplace) smoothing:
#
#     P(word | label) = (count(word, label) + 1) / (total words in label + |vocab|)
#
# The denominator has to be the number of word occurrences seen for the label.
# Dividing by the number of emails (class_counts) instead yields values that do
# not sum to 1 over the vocabulary and distorts the ham/spam comparison.
vocab_size = len(vocab)
total_words = {
    label: sum(word_counts[word][label] for word in vocab)
    for label in ('ham', 'spam')
}

# The (misspelled) name is kept as-is because predict() looks it up by this name.
likelikehood_probs = {
    word: {
        label: (word_counts[word][label] + 1) / (total_words[label] + vocab_size)
        for label in ('ham', 'spam')
    }
    for word in vocab
}

In [28]:
#prediction function
def predict(email, threshold=0.6):
    """Classify ``email`` as ``'ham'`` or ``'spam'`` with the fitted Naive Bayes model.

    Parameters
    ----------
    email : str
        Raw email text. It is normalised with ``preprocess_text`` and split
        with ``tokenize`` before scoring.
    threshold : float, default 0.6
        Minimum posterior probability of ham, P(ham | email), required to
        return ``'ham'``. Must lie strictly between 0 and 1; 0.5 selects the
        more probable class, larger values favour ``'spam'``.

    Returns
    -------
    str
        ``'ham'`` or ``'spam'``.

    Raises
    ------
    ValueError
        If ``threshold`` is not strictly between 0 and 1.
    """
    if not 0 < threshold < 1:
        raise ValueError(f"threshold must be strictly between 0 and 1, got {threshold!r}")

    tokens = tokenize(preprocess_text(email))
    # Words never seen in training carry no evidence for either class: skip them.
    known = [likelikehood_probs[word] for word in tokens if word in likelikehood_probs]

    log_prob_ham = np.log(prior_ham) + sum(np.log(probs['ham']) for probs in known)
    log_prob_spam = np.log(prior_spam) + sum(np.log(probs['spam']) for probs in known)

    # P(ham | email) > threshold  <=>  log-odds(ham : spam) > logit(threshold).
    # Comparing the log-odds with log(threshold) alone would treat the
    # threshold as an odds ratio, so 0.5 would accept ham even when spam is
    # up to twice as likely.
    log_odds = log_prob_ham - log_prob_spam
    decision_boundary = np.log(threshold) - np.log1p(-threshold)
    return 'ham' if log_odds > decision_boundary else 'spam'


In [None]:
# Inference on one hand-written example, passing threshold=0.5 explicitly
# rather than relying on predict's default of 0.6.
new_email = "Win bitcoin today, click here: "
predicted_label = predict(email=new_email, threshold=0.5)
print("Predicted label: ", predicted_label)