## Training the Naive Bayes model

In [144]:
import csv
import math
import re

import pandas as pd
from nltk.corpus import stopwords


# Load the training data. Each line is expected to be "<label>\t<message>",
# so the whole line is first read into a single column named SPAM.
# NOTE(review): read_csv still uses its default ',' delimiter here, so a raw
# line containing commas or double quotes would not arrive as a single field
# -- confirm the training file is free of them.
df = pd.read_csv('training_data.txt', names=['SPAM'])

# Split every line on the first tab only: label -> SPAM, message text -> SMS.
# `n` is passed by keyword because pandas 2.0 made it keyword-only.
df[['SPAM', 'SMS']] = df['SPAM'].str.split('\t', n=1, expand=True)

# Keep only alphabetical characters and whitespace; everything else becomes
# a space. Raw strings keep `\s` from being treated as a string escape.
df['SMS'] = df['SMS'].apply(
    lambda x: re.sub(r'[^a-z\s]+', ' ', x, flags=re.IGNORECASE)
)

# Lowercase the text and collapse each run of whitespace into one space.
df['SMS'] = df['SMS'].apply(lambda x: re.sub(r'(\s+)', ' ', x.lower()))

# Build a stop-word-free copy of each message. `stop` stays a list; the set
# is only used to make the per-word membership test cheap.
# NOTE(review): the bag-of-words counts below are built from 'SMS', not from
# this filtered column -- confirm that is intended.
stop = stopwords.words('english')
stop_lookup = set(stop)
df['SMS_without_stop_words'] = df['SMS'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in stop_lookup])
)

# Partition the training rows by their (string) label.
spam_dataframe = df[df['SPAM'] == 'True']

non_spam_dataframe = df[df['SPAM'] == 'False']


def _bag_of_words(messages):
    """Return a Series mapping each whitespace-separated word to its count."""
    return pd.Series(
        [word for message in messages for word in message.split()]
    ).value_counts()


# Per-class word frequencies.
spam_bag_of_words = _bag_of_words(spam_dataframe['SMS'])

non_spam_bag_of_words = _bag_of_words(non_spam_dataframe['SMS'])

# Word frequencies over every training message, regardless of label.
to_count = _bag_of_words(df['SMS'])

# Number of distinct words seen in training (the vocabulary size).
count = to_count.shape[0]



## Testing spam messages

In [143]:
# Load the test data. Each line is expected to be "<example number>\t<message>",
# so the whole line is first read into a single column named TEST.
test_dataframe = pd.read_csv('test_data.txt', names=['TEST'])

# Split every line on the first tab only: id -> example_number, text -> SMS.
# `n` is passed by keyword because pandas 2.0 made it keyword-only.
test_dataframe[['example_number', 'SMS']] = test_dataframe['TEST'].str.split(
    '\t', n=1, expand=True
)

# Keep only alphabetical characters and whitespace; everything else becomes
# a space. Raw strings keep `\s` from being treated as a string escape.
test_dataframe['SMS'] = test_dataframe['SMS'].apply(
    lambda x: re.sub(r'[^a-z\s]+', ' ', x, flags=re.IGNORECASE)
)

# Lowercase the text and collapse each run of whitespace into one space.
test_dataframe['SMS'] = test_dataframe['SMS'].apply(
    lambda x: re.sub(r'(\s+)', ' ', x.lower())
)

# Class priors: the share of labelled training rows that fall in each class.
total_trained_examples = spam_dataframe.shape[0] + non_spam_dataframe.shape[0]

spam_probability = spam_dataframe.shape[0] / total_trained_examples

non_spam_probability = non_spam_dataframe.shape[0] / total_trained_examples


def _log_or_neg_inf(probability):
    """Return log(probability), or -inf when the probability is zero."""
    return math.log(probability) if probability > 0 else float('-inf')


# Scores are accumulated as sums of logs instead of products of
# probabilities: a long message multiplies many tiny factors, which would
# underflow to 0.0 for both classes and force every such message to "False".
log_spam_prior = _log_or_neg_inf(spam_probability)
log_non_spam_prior = _log_or_neg_inf(non_spam_probability)

# Smoothing denominators do not depend on the message, so compute them once
# rather than re-summing the bag of words for every word.
spam_denominator = spam_bag_of_words.sum() + count + 1
non_spam_denominator = non_spam_bag_of_words.sum() + count + 1

with open('results.csv', 'w', newline='') as results_file:

    writer = csv.writer(results_file)

    # Header row.
    writer.writerow(['id', 'label'])

    for example_number, test_string in zip(
        test_dataframe['example_number'], test_dataframe['SMS']
    ):
        log_spam = log_spam_prior
        log_non_spam = log_non_spam_prior

        for word in test_string.split():
            # A word never seen in a class counts as 0 occurrences there.
            times_word_found_in_spam = int(spam_bag_of_words.get(word, 0))
            times_word_found_in_non_spam = int(non_spam_bag_of_words.get(word, 0))

            # Add-one smoothing keeps every factor strictly positive, so the
            # logarithm is always defined.
            log_spam += math.log(
                (times_word_found_in_spam + 1) / spam_denominator
            )
            log_non_spam += math.log(
                (times_word_found_in_non_spam + 1) / non_spam_denominator
            )

        # Ties go to "False", as with the original strict comparison.
        label = "True" if log_spam > log_non_spam else "False"
        writer.writerow([example_number, label])
