In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import string
import re
from nltk.stem.porter import PorterStemmer
import random
from sklearn.model_selection import train_test_split

In [2]:
# Read the Data
data = pd.read_csv('spam.csv', encoding = 'latin-1')
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [3]:
# Drop the Unnecessary Columns
if len(data.columns) > 3: 
    data = data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1)

# Rename the Remaining Columns
if data.columns[0] != 'label':
    data = data.rename(columns = {'v1': 'label', 'v2': 'text'})

data

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
data.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [5]:
# Convert Label to Numerical Values
data['label'] = data.label.map({'ham': 0, 'spam': 1})
data

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [6]:
# Mining Texts
texts = []
for index, row in data.iterrows():
    texts.append((row['text'], row['label']))
texts

,
 ('\\Petey boy whereare you me and all your friendsare in theKingshead come down if you canlove Nic\\""',
  0),
 ('Ok i msg u b4 i leave my house.', 0),
 ('\\Gimme a few\\" was  &lt;#&gt;  minutes ago"', 0),
 ('Last Chance! Claim ur å£150 worth of discount vouchers today! Text SHOP to 85023 now! SavaMob, offers mobile! T Cs SavaMob POBOX84, M263UZ. å£3.00 Sub. 16',
  1),
 ("Appt is at &lt;TIME&gt; am. Not my fault u don't listen. I told u twice",
  0),
 ('FREE for 1st week! No1 Nokia tone 4 ur mobile every week just txt NOKIA to 8077 Get txting and tell ur mates. www.getzed.co.uk POBox 36504 W45WQ 16+ norm150p/tone',
  1),
 ('You have won a guaranteed å£200 award or even å£1000 cashto claim UR award call free on 08000407165 (18+) 2 stop getstop on 88222 PHP. RG21 4JX',
  1),
 ("K I'll be there before 4.", 0),
 ('I dled 3d its very imp', 0),
 ("sure, but make sure he knows we ain't smokin yet", 0),
 ('Boooo you always work. Just quit.', 0),
 ('I am taking half day leave bec i am not w

In [7]:
# Remove Whitespace and Punctuation
tokenized = []
for t in texts:
    m = t[0]
    text = re.sub('[' + string.punctuation + ']', ' ', m) # Regular Expression
    text = re.sub('[\n\t\r]', '', text)
    words = text.split()
    tokenized.append((words, t[1]))
tokenized[2020]

(['Anything',
  'lor',
  'but',
  'toa',
  'payoh',
  'got',
  'place',
  '2',
  'walk',
  'meh'],
 0)

In [8]:
# Remove Stopwords
stopwords = []
try:
    f = open('stopwords.txt', 'r') # Read Only
    stopwords = f.read().split('\n')
except IOError:
    print('Problem opening file')
finally:
    f.close()
print('Sentence before stopwrods removed: \n', tokenized[2020])

filtered = []
for t in tokenized:
    text = t[0]
    f_text = []
    for word in text:
        if word not in stopwords and len(word) > 2:
            f_text.append(word)
    filtered.append((f_text, t[1]))

print('\nSentence after stopwords removed: \n', filtered[2020])

Sentence before stopwrods removed: 
 (['Anything', 'lor', 'but', 'toa', 'payoh', 'got', 'place', '2', 'walk', 'meh'], 0)

Sentence after stopwords removed: 
 (['Anything', 'lor', 'toa', 'payoh', 'place', 'walk', 'meh'], 0)


In [9]:
# Stem the Words
stemmer = PorterStemmer() # Deal with Participles
stemmed = []
for t in filtered:
    text = t[0]
    stemmed_text = []
    for word in text:
        stemmed_word = stemmer.stem(word.lower())
        stemmed_text.append(stemmed_word)
    stemmed.append((stemmed_text, t[1]))

stemmed[2020]

(['anyth', 'lor', 'toa', 'payoh', 'place', 'walk', 'meh'], 0)

In [10]:
# Counting Frequency of Words
word_count = {}
for t in stemmed:
    text = t[0]
    already_counted = []
    for word in text:
        if word not in word_count:
            word_count[word] = 1
        elif word not in already_counted:
            word_count[word] += 1
            already_counted.append(word)

#  Removing the Words that Occur Once
for i in range(len(stemmed)):
    stemmed[i] = (list(filter(lambda x: word_count[x] > 4, stemmed[i][0])), stemmed[i][1])

In [11]:
# Split Data
totaltexts = data.label.value_counts()
total = totaltexts[0] + totaltexts[1]
test_number = int(0.20 * total)

In [12]:
# Train Test Split
test_set = []
taken = {}

while len(test_set) < test_number:
    num = random.randint(0, test_number - 1)
    if num not in taken.keys():
        test_set.append(stemmed.pop(num))
        taken[num] = 1

train_set = stemmed

number_of_hams = data.label.value_counts()[0]
number_of_spams = data.label.value_counts()[1]

len(train_set) / total, len(test_set) / total

(0.8000717875089735, 0.19992821249102657)

Recall the Bayes Theorem, 
$$
\mathbb{P}(\text{Spam} \vert \text{Word}) = \frac{\mathbb{P}(\text{Word} \vert \text{Spam}) \cdot \mathbb{P}(\text{Spam})}{\mathbb{P}(\text{Word} \vert \text{Spam}) \cdot \mathbb{P}(\text{Spam}) + \mathbb{P}(\text{Word} \vert \text{Ham}) \cdot \mathbb{P}(\text{Ham})}.
$$

In [13]:
def p_appears_in_spam(word):
    count = 0
    total_spams = 0

    for t in train_set:
        text = t[0]
        if t[1] == 1:
            total_spams += 1
            if word in text:
                count += 1

    return count / total_spams

In [14]:
def p_appears_in_ham(word):
    count = 0
    total_hams = 0

    for t in train_set:
        text = t[0]
        if t[1] == 0:
            total_hams += 1
            if word in text:
                count += 1

    return count / total_hams

In [15]:
def total_spams_and_hams(tset):
    spams = 0
    hams = 0

    for t in tset:
        spams += 1 if t[1] == 1 else 0
        hams += 1 if t[1] == 0 else 0
    
    return spams, hams

In [16]:
p_spam = total_spams_and_hams(train_set)[0] / len(train_set)
p_ham = total_spams_and_hams(train_set)[1] / len(train_set)

In [17]:
def p_is_spam_given_word(word):
    return (p_appears_in_spam(word) * p_spam) / ((p_appears_in_spam(word) * p_spam + p_appears_in_ham(word) * p_ham))

In [18]:
word = 'discount'
print('Probability that a message is spam given the word "{}" is: {}'.format(word, p_is_spam_given_word(word)))

Probability that a message is spam given the word "discount" is: 0.75


In [19]:
probabilities = {}

for t in train_set:
    text = t[0]

    for word in text:
        if word not in probabilities:
            p = p_is_spam_given_word(word)

            if p == 0:
                probabilities[word] = 0.001 # To deal with the zero probability problem. Tweaking this value
            elif p == 1:
                probabilities[word] = 0.999 # Tweaking this value
            else:
                probabilities[word] = p

In [20]:
from functools import reduce

def p_is_spam(words):
    probs = []
    for word in words:
        if word in probabilities:
            probs.append(probabilities[word])
        else:
            probs.append(0.5) # for Unseen Words
    
    probs_not = list(map(lambda prob: 1 - prob, probs))
    product = reduce(lambda x, y: x * y, probs, 1) 
    product_not = reduce(lambda x, y: x * y, probs_not, 1)
    return product / (product + product_not)

In [21]:
total_correct = 0
true_spam_as_spam = 0
true_spam_as_ham = 0
true_ham_as_ham = 0
true_ham_as_spam = 0

false_positives = []

for t in test_set:
    guess = -1
    words = t[0]
    answer = t[1]
    p_spam = p_is_spam(words)
    # If p > 0.95, predict 'yes' (is spam)
    guess = 1 if p_spam > 0.95 else 0
    if guess == answer:
        total_correct += 1
        if answer == 0: # true negative
            true_ham_as_ham += 1
        else: # true positive
            true_spam_as_spam += 1 
    else:
        if answer == 0: # false positive
            true_ham_as_spam += 1
            false_positives.append((words, p_spam))
        else: # true negative
            true_spam_as_ham += 1

true_spams = total_spams_and_hams(test_set)[0]
true_hams = total_spams_and_hams(test_set)[1]

print('Total test texts: ', len(test_set))
print('Number of correct: ', total_correct)
print('Accuracy: ', total_correct * 100 / (true_spams + true_hams))
print('-------------------------------')
print('Ham precision: ', true_ham_as_ham / (true_ham_as_ham + true_spam_as_ham))
print('Ham recall: ', true_ham_as_ham / (true_ham_as_ham + true_ham_as_spam))
print('Spam precision: ', true_spam_as_spam / (true_spam_as_spam + true_ham_as_spam)) # Most important 
print('Spam recall: ', true_spam_as_spam / (true_spam_as_spam + true_spam_as_ham))
print('-------------------------------')
print('False Positives (hams that got labeled as spam):')

for i, (text, p) in enumerate(false_positives):
    print('{}: Words in text: {} | Degree of certainty: {}'.format(i + 1, text, p))

Total test texts:  1114
Number of correct:  1087
Accuracy:  97.57630161579893
-------------------------------
Ham precision:  0.9735234215885947
Ham recall:  0.9989550679205852
Spam precision:  0.9924242424242424
Spam recall:  0.8343949044585988
-------------------------------
False Positives (hams that got labeled as spam):
1: Words in text: ['from', 'the', 'award', 'month', 'current', 'month'] | Degree of certainty: 0.9853495632267187
