In [6]:
import math  
import numpy as np  
from collections import Counter  
import pandas as pd  
import os

In [7]:
# Read the CSV file 
current_file = os.getcwd()
file_path = os.path.join(current_file, "spam.csv")
data = pd.read_csv(file_path, encoding = "ISO-8859-1")

# Select only the first two columns of the dataset
data = data.iloc[:, 0:2]

# Convert the 'v1' column to binary encoding (1 for 'ham', 0 for 'spam')
data["v1"] = data["v1"].apply(lambda x: 1 if x == "ham" else 0)

# Extract real and spam sentences from the dataset for training and testing
real_sentences = data[data["v1"] == 1]["v2"]
real_sentences_train = real_sentences[int(len(real_sentences)*0.3):]
real_sentences_test = real_sentences[:int(len(real_sentences)*0.3)]

spam_sentences = data[data["v1"] == 0]["v2"]
spam_sentences_train = spam_sentences[int(len(spam_sentences)*0.3):]
spam_sentences_test = spam_sentences[:int(len(spam_sentences)*0.3)]

# Split sentences into individual words
real_words = " ".join(real_sentences_train).split(" ")
spam_words = " ".join(spam_sentences_train).split(" ")

# Combine all words from real and spam sentences
unified_words = real_words + spam_words

# Find words that appear in both real and spam sentences
intersection_words = set(real_words).intersection(set(spam_words))

# Count occurrences of words in real and spam sentences
occurrences_real = dict(Counter(real_words))
occurrences_spam = dict(Counter(spam_words))

# Calculate probability of each word given that it is in a real or spam sentence
prob_of_w_given_real = {x: y / len(real_words) for x, y in occurrences_real.items()}
prob_of_w_given_spam = {x: y / len(spam_words) for x, y in occurrences_spam.items()}

# Calculate the prior probabilities of real and spam sentences
p_real = len(real_words) / (len(real_words) + len(spam_words))
p_spam = len(spam_words) / (len(real_words) + len(spam_words))

# Function to determine if a sentence is real or spam
def sentence_is_real(sentence):
    analyzed_words = [x for x in sentence.split(" ") if x in intersection_words]

    real_and_this_sentence = math.log(p_real)
    real_and_this_sentence += sum([np.log(prob_of_w_given_real[x]) for x in analyzed_words])

    spam_and_this_sentence = math.log(p_spam)
    spam_and_this_sentence += sum([np.log(prob_of_w_given_spam[x]) for x in analyzed_words])

    return real_and_this_sentence > spam_and_this_sentence

# Calculate accuracy for real sentences
quantity = 0
for x in real_sentences_test:
    if sentence_is_real(x):
        quantity += 1
print(f"{quantity / len(real_sentences_test)} of words are correctly predicted as real")

# Calculate accuracy for spam sentences
quantity = 0
for x in spam_sentences_test:
    if not sentence_is_real(x):
        quantity += 1
print(f"{quantity / len(spam_sentences_test)} of words are correctly predicted as spam")


0.9571527297857636 of words are correctly predicted as real
0.8973214285714286 of words are correctly predicted as spam
