### Importing Data

In [5]:
import nltk
import pandas as pd

df = pd.read_csv('filtered_data.csv', index_col=0)
df = df[~df['message'].isna()]
df = df.sample(frac=1).reset_index(drop=True)

train_data = df[:int(df.shape[0]*0.75)]
test_data = df[int(df.shape[0]*0.75):].reset_index(drop=True)

### My Implementation of Naive-Bayes

In [6]:
from collections import Counter

all_ham_words = train_data[train_data['label'] == False]['message'].tolist()
all_ham_words = " ".join([str(word) for word in all_ham_words])

ham_word_count = Counter(list(filter(lambda x: len(x) < 30, all_ham_words.split(" ")))).most_common()
ham_indexed = {}
total_ham = 0
for word, count in ham_word_count:
    ham_indexed[word] = count
    total_ham += count

In [7]:
all_spam_words = train_data[train_data['label'] == True]['message'].tolist()
all_spam_words = " ".join([str(word) for word in all_spam_words])

spam_word_count = Counter(list(filter(lambda x: len(x) < 30, all_spam_words.split(" ")))).most_common()
spam_indexed = {}
total_spam = 0
for word, count in spam_word_count:
    spam_indexed[word] = count
    total_spam += count

In [8]:
def calculate_spam_score (message):
    indexer = Counter(message.split(" ")).most_common()
    spam_score = total_spam / (total_ham + total_spam)
    for word, count in indexer:
        if word in spam_indexed:
            spam_score *= ((spam_indexed[word]+1)/total_spam) * count
        else:
            spam_score *= (1/total_spam) * count
            
    return spam_score

def calculate_ham_score (message):
    indexer = Counter(message.split(" ")).most_common()
    ham_score = total_ham / (total_ham + total_spam)
    for word, count in indexer:
        if word in ham_indexed:
            ham_score *= ((ham_indexed[word]+1)/total_ham) * count
        else:
            ham_score *= (1/total_ham) * count
            
    return ham_score

In [9]:
r = 0
t = 0
for index, row in test_data.iterrows():
    c = False
    try:
        if calculate_spam_score(row['message']) > calculate_ham_score(row['message']):
            c = True
    except:
        print(row['message'])
    if c == row['label']:
        r += 1
    t += 1

In [10]:
print(f"Accuracy: {r/t*100}%")

Accuracy: 76.59880107185282%


### NLTK's implementation of Naive-Bayes

In [21]:
def feature_generator (message):
    feature_dict = {}
    for word, count in Counter(message.split(' ')).most_common():
            feature_dict[word] = count
    return feature_dict

In [22]:
train_set = []
for index, row in train_data.iterrows():
    train_set.append((feature_generator(row['message']), row['label']))

In [23]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [24]:
r = 0
t = 0
for index, row in test_data.iterrows():
    if classifier.classify(feature_generator(row['message'])) == row['label']:
        r += 1
    t += 1

In [25]:
print(f"Accuracy: {r/t*100}%")

Accuracy: 95.02196193265007%
