In [1]:
# Importing important libraries
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import math

In [2]:
data = pd.read_csv('spam.csv',encoding='latin1')
del data['Unnamed: 2']
del data['Unnamed: 3']
del data['Unnamed: 4']
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
data.groupby(data['v1']).count()['v2']

v1
ham     4828
spam     751
Name: v2, dtype: int64

In [4]:
# train_spam = list(data.loc[data.v1 == 'spam']['v2'])
# train_ham = list(data.loc[data.v1 == 'ham']['v2'])
# Training data for naive bayes text classifier
train_spam = ['send us your password', 'review our website', 'send your password', 'send us your account']
train_ham = ['Your activity report','benefits physical activity', 'the importance vows']

In [18]:
# Make a vocabulary of unique words that occur in known spam emails
vocab_words_spam = []

for sentence in train_spam:
    sentence_as_list = sentence.split()
    for word in sentence_as_list:
        vocab_words_spam.append(word)
        
vocab_unique_words_spam = list(dict.fromkeys(vocab_words_spam))
print(vocab_unique_words_spam)
print(vocab_unique_words_ham)

['send', 'us', 'your', 'password', 'review', 'our', 'website', 'account']
['Your', 'activity', 'report', 'benefits', 'physical', 'the', 'importance', 'vows']


In [6]:
# Calculating probability of words occuring in spam emails
dict_spamicity = {}
for w in vocab_unique_words_spam:
    emails_with_w = 0 
    for sentence in train_spam:
        if w in sentence:
            emails_with_w+=1
            
    print(f"Number of spam emails with the word {w}: {emails_with_w}")
    total_spam = len(train_spam)
    spamicity = (emails_with_w+1)/(total_spam+2)
    print(f"Spamicity of the word '{w}': {spamicity} \n")
    dict_spamicity[w.lower()] = spamicity

Number of spam emails with the word send: 3
Spamicity of the word 'send': 0.6666666666666666 

Number of spam emails with the word us: 2
Spamicity of the word 'us': 0.5 

Number of spam emails with the word your: 3
Spamicity of the word 'your': 0.6666666666666666 

Number of spam emails with the word password: 2
Spamicity of the word 'password': 0.5 

Number of spam emails with the word review: 1
Spamicity of the word 'review': 0.3333333333333333 

Number of spam emails with the word our: 4
Spamicity of the word 'our': 0.8333333333333334 

Number of spam emails with the word website: 1
Spamicity of the word 'website': 0.3333333333333333 

Number of spam emails with the word account: 1
Spamicity of the word 'account': 0.3333333333333333 



In [7]:
# make a vocabulary of unique words that occur in known ham emails
vocab_words_ham = []
for sentence in train_ham:
    sentence_as_list = sentence.split()
    for word in sentence_as_list:
        vocab_words_ham.append(word)
        
vocab_unique_words_ham = list(dict.fromkeys(vocab_words_ham))

In [8]:
# Calculating probability of words occuring in ham emails
dict_hamicity = {}
for w in vocab_unique_words_ham:
    emails_with_w = 0  
    for sentence in train_ham:
        if w in sentence:
            emails_with_w += 1
            
    print(f"Number of ham emails with the word '{w}': {emails_with_w}")
    total_ham = len(train_ham)
    Hamicity = (emails_with_w+1)/(total_ham+2)      
    print(f"Hamicity of the word '{w}': {Hamicity} ")
    dict_hamicity[w.lower()] = Hamicity

Number of ham emails with the word 'Your': 1
Hamicity of the word 'Your': 0.4 
Number of ham emails with the word 'activity': 2
Hamicity of the word 'activity': 0.6 
Number of ham emails with the word 'report': 1
Hamicity of the word 'report': 0.4 
Number of ham emails with the word 'benefits': 1
Hamicity of the word 'benefits': 0.4 
Number of ham emails with the word 'physical': 1
Hamicity of the word 'physical': 0.4 
Number of ham emails with the word 'the': 1
Hamicity of the word 'the': 0.4 
Number of ham emails with the word 'importance': 1
Hamicity of the word 'importance': 0.4 
Number of ham emails with the word 'vows': 1
Hamicity of the word 'vows': 0.4 


In [9]:
prob_spam = len(train_spam) / (len(train_spam)+(len(train_ham)))
print('Probability of Spam: ',prob_spam)
prob_ham = len(train_ham) / (len(train_spam)+(len(train_ham)))
print('Probability of Ham: ',prob_ham)

Probability of Spam:  0.5714285714285714
Probability of Ham:  0.42857142857142855


In [11]:
# Using the split data to test the algorithm
test_emails = {'spam':['renew your password', 'renew your vows','hello'], 
               'ham':['benefits of our account', 'the importance of physical activity']}
tests = []
for i in test_emails['spam']:
    tests.append(i)
    
for i in test_emails['ham']:
    tests.append(i)

# split emails into distinct words
distinct_words_as_sentences_test = []
for sentence in tests:
    sentence_as_list = sentence.split()
    senten = []
    for word in sentence_as_list:
        senten.append(word)
    distinct_words_as_sentences_test.append(senten)

test_spam_tokenized = [distinct_words_as_sentences_test[0], distinct_words_as_sentences_test[1]]
test_ham_tokenized = [distinct_words_as_sentences_test[2], distinct_words_as_sentences_test[3]]
print(test_spam_tokenized)

[['renew', 'your', 'password'], ['renew', 'your', 'vows']]


In [12]:
# Reducing dataset based on test data
print('Reducing for spam:')
reduced_sentences_spam_test = []
for sentence in test_spam_tokenized:
    words_ = []
    for word in sentence:
        if word in vocab_unique_words_spam:
            print(f"'{word}', ok")
            words_.append(word)
        elif word in vocab_unique_words_ham:
            print(f"'{word}', ok")
            words_.append(word)
        else:
            print(f"'{word}', word not present in labelled spam training data")
    reduced_sentences_spam_test.append(words_)
print(reduced_sentences_spam_test)

print('\nReducing for ham:')
reduced_sentences_ham_test = []                   
for sentence in test_ham_tokenized:
    words_ = []
    for word in sentence:
        if word in vocab_unique_words_ham:
            print(f"'{word}', ok")
            words_.append(word)
        elif word in vocab_unique_words_spam:
            print(f"'{word}', ok")
            words_.append(word)
        else:
            print(f"'{word}', word not present in labelled ham training data")
    reduced_sentences_ham_test.append(words_)
print(reduced_sentences_ham_test)

Reducing for spam:
'renew', word not present in labelled spam training data
'your', ok
'password', ok
'renew', word not present in labelled spam training data
'your', ok
'vows', ok
[['your', 'password'], ['your', 'vows']]

Reducing for ham:
'hello', word not present in labelled ham training data
'benefits', ok
'of', word not present in labelled ham training data
'our', ok
'account', ok
[[], ['benefits', 'our', 'account']]


In [13]:
test_spam_stemmed = []
non_key = ['us','the', 'of','your']       # non-key words, gathered from spam,ham and test sentences
for email in reduced_sentences_spam_test:
    email_stemmed=[]
    for word in email:
        if word in non_key:
            print('remove')
        else:
            email_stemmed.append(word)
    test_spam_stemmed.append(email_stemmed)
            
print(test_spam_stemmed)

remove
remove
[['password'], ['vows']]


In [14]:
test_ham_stemmed = []
non_key = ['us',  'the', 'of', 'your'] 
for email in reduced_sentences_ham_test:
    email_stemmed=[]
    for word in email:
        if word in non_key:
            print('remove')
        else:
            email_stemmed.append(word)
    test_ham_stemmed.append(email_stemmed)

In [15]:
def mult(list_) :        # function to multiply all word probs together 
    total_prob = 1
    for i in list_: 
         total_prob = total_prob * i  
    return total_prob

def Bayes(email):
    probs = []
    for word in email:
        Pr_S = prob_spam
        print('prob of spam in general ',Pr_S)
        try:
            pr_WS = dict_spamicity[word]
            print(f'prob "{word}"  is a spam word : {pr_WS}')
        except KeyError:
            pr_WS = 1/(total_spam+2)  # Apply smoothing for word not seen in spam training data, but seen in ham training 
            print(f"prob '{word}' is a spam word: {pr_WS}")
            
        Pr_H = prob_ham
        print('prob of ham in general ', Pr_H)
        try:
            pr_WH = dict_hamicity[word]
            print(f'prob "{word}" is a ham word: ',pr_WH)
        except KeyError:
            pr_WH = (1/(total_ham+2))  # Apply smoothing for word not seen in ham training data, but seen in spam training
            print(f"WH for {word} is {pr_WH}")
            print(f"prob '{word}' is a ham word: {pr_WH}")
        
        prob_word_is_spam_BAYES = (pr_WS*Pr_S)/((pr_WS*Pr_S)+(pr_WH*Pr_H))
        print('')
        print(f"Using Bayes, prob the the word '{word}' is spam: {prob_word_is_spam_BAYES}")
        print('###########################')
        probs.append(prob_word_is_spam_BAYES)
    print(f"All word probabilities for this sentence: {probs}")
    final_classification = mult(probs)
    if final_classification >= 0.5:
        print(f'email is SPAM: with spammy confidence of {final_classification*100}%')
    else:
        print(f'email is HAM: with spammy confidence of {final_classification*100}%')
    return final_classification
for email in test_spam_stemmed:
    print(f"\nTesting stemmed SPAM email {email} :")
    print('Test word by word: ')
    all_word_probs = Bayes(email)
    print(all_word_probs)


Testing stemmed SPAM email ['password'] :
Test word by word: 
prob of spam in general  0.5714285714285714
prob "password"  is a spam word : 0.5
prob of ham in general  0.42857142857142855
WH for password is 0.2
prob 'password' is a ham word: 0.2

Using Bayes, prob the the word 'password' is spam: 0.7692307692307692
###########################
All word probabilities for this sentence: [0.7692307692307692]
email is SPAM: with spammy confidence of 76.92307692307692%
0.7692307692307692

Testing stemmed SPAM email ['vows'] :
Test word by word: 
prob of spam in general  0.5714285714285714
prob 'vows' is a spam word: 0.16666666666666666
prob of ham in general  0.42857142857142855
prob "vows" is a ham word:  0.4

Using Bayes, prob the the word 'vows' is spam: 0.35714285714285715
###########################
All word probabilities for this sentence: [0.35714285714285715]
email is HAM: with spammy confidence of 35.714285714285715%
0.35714285714285715


In [16]:
for email in test_ham_stemmed:
    print(f"\nTesting stemmed HAM email {email} :")
    print('Test word by word: ')
    all_word_probs = Bayes(email)
    print(all_word_probs)


Testing stemmed HAM email [] :
Test word by word: 
All word probabilities for this sentence: []
email is SPAM: with spammy confidence of 100%
1

Testing stemmed HAM email ['benefits', 'our', 'account'] :
Test word by word: 
prob of spam in general  0.5714285714285714
prob 'benefits' is a spam word: 0.16666666666666666
prob of ham in general  0.42857142857142855
prob "benefits" is a ham word:  0.4

Using Bayes, prob the the word 'benefits' is spam: 0.35714285714285715
###########################
prob of spam in general  0.5714285714285714
prob "our"  is a spam word : 0.8333333333333334
prob of ham in general  0.42857142857142855
WH for our is 0.2
prob 'our' is a ham word: 0.2

Using Bayes, prob the the word 'our' is spam: 0.847457627118644
###########################
prob of spam in general  0.5714285714285714
prob "account"  is a spam word : 0.3333333333333333
prob of ham in general  0.42857142857142855
WH for account is 0.2
prob 'account' is a ham word: 0.2

Using Bayes, prob the the