In [1]:
from collections import Counter, defaultdict
import math, random, re, glob

def tokenize(message):
    """Return the set of distinct lowercase tokens found in ``message``.

    The message is lowercased first; a token is then any maximal run of the
    characters a-z, 0-9 and apostrophe.  Returning a set means every token
    appears at most once, however often it occurs in the message.
    """
    lowered = message.lower()
    return {match.group(0) for match in re.finditer("[a-z0-9']+", lowered)}


def count_words(training_set):
    """Tally, per token, how many spam and non-spam messages contain it.

    training_set is an iterable of (message, is_spam) pairs.  The result is a
    defaultdict mapping each token to a two-element list
    [spam_count, non_spam_count].  tokenize() returns a set, so one message
    adds at most 1 to a given token's count.
    """
    tallies = defaultdict(lambda: [0, 0])
    for text, flagged in training_set:
        for token in tokenize(text):
            if flagged:
                tallies[token][0] += 1      # seen in a spam message
            else:
                tallies[token][1] += 1      # seen in a non-spam message
    return tallies

def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    """Convert per-word message counts into smoothed conditional probabilities.

    counts maps each word to a (spam_count, non_spam_count) pair.  For every
    word the function emits a triplet

        (word, p(word | spam), p(word | ~spam))

    where each probability is (count + k) / (class_total + 2 * k); k is the
    smoothing pseudocount.  The triplets are returned as a list, in the
    iteration order of ``counts``.
    """
    spam_denominator = total_spams + 2 * k
    non_spam_denominator = total_non_spams + 2 * k

    triplets = []
    for word, (spam_count, non_spam_count) in counts.items():
        p_word_given_spam = (spam_count + k) / spam_denominator
        p_word_given_non_spam = (non_spam_count + k) / non_spam_denominator
        triplets.append((word, p_word_given_spam, p_word_given_non_spam))
    return triplets

def spam_probability(word_probs, message):
    """Estimate P(spam | message) from per-word conditional probabilities.

    word_probs is an iterable of (word, p(word | spam), p(word | ~spam))
    triplets.  Every word in word_probs contributes to both class
    likelihoods: its probability when it occurs in the message, and one minus
    its probability when it does not.  No class prior enters the formula, so
    the two classes are weighted equally.

    Returns a float in [0, 1].

    Raises ValueError (from math.log) if any probability is exactly 0 or 1.
    """
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.0

    for word, prob_if_spam, prob_if_not_spam in word_probs:

        # for each word in the message,
        # add the log probability of seeing it
        if word in message_words:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)

        # for each word that's not in the message
        # add the log probability of _not_ seeing it
        else:
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)

    # With a = log P(message | spam) and b = log P(message | ~spam):
    #     e^a / (e^a + e^b)  ==  1 / (1 + e^(b - a))
    # Exponentiating a and b separately underflows to 0.0 for large
    # vocabularies and then divides 0.0 by 0.0; working with the difference
    # keeps the result finite.  The branch makes sure math.exp only ever
    # receives a non-positive argument, so it cannot overflow either.
    log_odds = log_prob_if_spam - log_prob_if_not_spam
    if log_odds >= 0:
        return 1.0 / (1.0 + math.exp(-log_odds))
    odds = math.exp(log_odds)
    return odds / (1.0 + odds)


class NaiveBayesClassifier:
    """Spam classifier that wires together the module-level helpers.

    train() learns per-word probabilities from labelled messages;
    classify() scores a new message with them.
    """

    def __init__(self, k=0.5):
        # smoothing pseudocount forwarded to word_probabilities() by train()
        self.k = k
        # (word, p(word | spam), p(word | ~spam)) triplets; empty until train()
        self.word_probs = []

    def train(self, training_set):
        """Learn word probabilities from (message, is_spam) pairs.

        training_set is iterated twice and passed to len(), so it must be a
        sized, re-iterable collection such as a list.
        """
        spam_total = sum(1 for _, is_spam in training_set if is_spam)
        non_spam_total = len(training_set) - spam_total

        self.word_probs = word_probabilities(count_words(training_set),
                                             spam_total,
                                             non_spam_total,
                                             self.k)

    def classify(self, message):
        """Return spam_probability() of ``message`` under the trained model."""
        return spam_probability(self.word_probs, message)


-

    classified.sort(key=lambda row: row[2])
    spammiest_hams = list(filter(lambda row: not row[1], classified))[-5:]
    hammiest_spams = list(filter(lambda row: row[1], classified))[:5]

    print("spammiest_hams", spammiest_hams)
    print("hammiest_spams", hammiest_spams)

    words = sorted(classifier.word_probs, key=p_spam_given_word)

    spammiest_words = words[-5:]
    hammiest_words = words[:5]

    print("spammiest_words", spammiest_words)
    print("hammiest_words", hammiest_words)

def split_data(data, prob):
    """Randomly partition ``data`` into two lists.

    One random.random() draw is made per row: the row goes to the first list
    when the draw is below ``prob`` and to the second list otherwise, so the
    split is approximately [prob, 1 - prob].  Returns (first, second).
    """
    selected, remainder = [], []
    for item in data:
        if random.random() < prob:
            selected.append(item)
        else:
            remainder.append(item)
    return selected, remainder

In [41]:
# NOTE(review): hard-coded absolute path to a local download; point this at
# your own copy of the SMSSpamCollection file before running.
file = r"/Users/ioanwilliams/Downloads/smsspamcollection/SMSSpamCollection"

# Read all lines inside a context manager so the file handle is closed
# deterministically, and decode as UTF-8 explicitly instead of relying on the
# platform default (matching process_file further down).
with open(file, encoding="utf-8") as sms_file:
    messages = [line.rstrip() for line in sms_file]

# data_set holds (message, is_spam) pairs
data_set = []
for raw_line in messages:
    if not raw_line:
        continue  # tolerate blank lines instead of failing on the unpack below
    # Split on the first tab only: the label precedes it and everything after
    # it is the message text, which may itself contain tab characters.
    indicator, message = raw_line.split("\t", 1)
    data_set.append((message, indicator == "spam"))

# TextBlob's .words needs the NLTK tokenizer data; the recorded run of this
# cell stopped here with MissingCorpusError.  Install the data first with
#     python -m textblob.download_corpora
from textblob import TextBlob
text_blobs = [TextBlob(message) for message, _ in data_set]
cleaned_blobs = [blob.words for blob in text_blobs]


**********************************************************************
  Resource 'tokenizers/punkt/PY3/english.pickle' not found.
  Please use the NLTK Downloader to obtain the resource:  >>>
  nltk.download()
  Searched in:
    - '/Users/ioanwilliams/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


MissingCorpusError: 
Looks like you are missing some required data for this feature.

To download the necessary data, simply run

    python -m textblob.download_corpora

or use the NLTK downloader to download the missing data: http://nltk.org/data.html
If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.


In [33]:
# Class balance of the full data set: total, spam and non-spam message counts.
num_spams = sum(1 for _, is_spam in data_set if is_spam)
num_non_spams = len(data_set) - num_spams
print(len(data_set), num_spams, num_non_spams)

5574 747 4827


In [29]:
# Send roughly 75% of the messages to the training split (the split is random
# and unseeded, so the sizes vary between runs), then report the size and
# class balance of the training split.
train_data, test_data = split_data(data_set, 0.75)
num_spams = sum(1 for _, is_spam in train_data if is_spam)
num_non_spams = len(train_data) - num_spams
print(len(train_data), num_spams, num_non_spams)

4171 567 3604


In [17]:
# Per-word [spam_count, non_spam_count] message tallies over the training split.
word_counts = count_words(train_data)

In [18]:
# Rank words by the number of spam messages containing them, highest first.
# Each entry is (spam_count, non_spam_count, word).
sorted_counts = sorted(
    ((spam_count, non_spam_count, word)
     for word, (spam_count, non_spam_count) in word_counts.items()),
    key=lambda entry: entry[0],
    reverse=True,
)
sorted_counts[:10]

[(341, 895, 'to'),
 (255, 166, 'call'),
 (221, 657, 'a'),
 (175, 968, 'you'),
 (162, 266, 'your'),
 (153, 223, 'now'),
 (135, 169, 'or'),
 (130, 320, 'for'),
 (120, 639, 'the'),
 (119, 39, 'free')]

In [19]:
# Copy word_counts into a list of (word, (spam_count, non_spam_count)) pairs.
# NOTE(review): `doris` is not referenced by any other visible cell — confirm
# whether this scratch variable is still needed, and give it a descriptive name.
doris = [(w,(spam, non_spam)) for w, (spam, non_spam) in word_counts.items()]

In [10]:
import string
def process_file(filename):
    """Build a word-frequency histogram from a UTF-8 text file.

    Every line of the file is passed to process_line(), which updates the
    histogram in place.

    Returns a dict mapping each word to the number of times it occurs.
    Raises FileNotFoundError (from open) when ``filename`` does not exist.
    """
    hist = {}

    # The context manager closes the file even if process_line() raises;
    # the handle is no longer left open for the garbage collector.
    with open(filename, encoding="utf-8") as fp:
        for line in fp:
            process_line(line, hist)

    return hist

def process_line(line, hist):
    """Add the words of one line of text to the histogram ``hist`` in place.

    Hyphens are treated as word separators.  Each whitespace-separated token
    has leading and trailing punctuation/whitespace stripped and is
    lowercased before being counted.  Tokens that consist only of
    punctuation (for example "...") strip down to the empty string and are
    skipped, so "" never appears as a word in the histogram.

    Returns None; ``hist`` (word -> count) is modified.
    """
    strippables = string.punctuation + string.whitespace

    # replace hyphens with spaces before splitting
    for raw_word in line.replace('-', ' ').split():
        # remove punctuation and convert to lowercase
        word = raw_word.strip(strippables).lower()
        if not word:
            continue  # nothing left once the punctuation is removed

        # update the histogram
        hist[word] = hist.get(word, 0) + 1
        
def most_common(hist):
    """Return the histogram as (count, word) pairs, most frequent first.

    Pairs are ordered by descending count; words with equal counts appear in
    descending word order, because whole tuples are compared.
    """
    pairs = [(count, word) for word, count in hist.items()]
    return sorted(pairs)[::-1]

In [11]:
# Word histogram of a reference text, later used to derive stop words.
# NOTE(review): hard-coded absolute Windows path; the recorded run of this cell
# raised FileNotFoundError, so sh_count was never created — point this at an
# existing file before running the cells that depend on it.
sh_count = process_file(r"C:\Users\iwilliam\Downloads\SH.txt")

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\iwilliam\\Downloads\\SH.txt'

In [None]:
# Treat the 20 most frequent words of the reference text as stop words.
common_sh = most_common(sh_count)
common_stop = [word for _count, word in common_sh[:20]]
common_stop

In [20]:
# Rank words by the number of spam messages containing them, highest first,
# leaving out the stop words.  Each entry is (spam_count, non_spam_count, word).
sorted_counts2 = sorted(
    ((spam_count, non_spam_count, word)
     for word, (spam_count, non_spam_count) in word_counts.items()
     if word not in common_stop),
    key=lambda entry: entry[0],
    reverse=True,
)
sorted_counts2[:10]

NameError: name 'common_stop' is not defined

In [32]:
# Fit the classifier on the training split.
classifier = NaiveBayesClassifier()
classifier.train(train_data)

# One (text, actual label, predicted spam probability) triple per test message.
classified = [
    (subject, is_spam, classifier.classify(subject))
    for subject, is_spam in test_data
]

# Confusion counts keyed by (actual, predicted); a message is predicted to be
# spam when its probability exceeds 0.5.
counts = Counter(
    (actual, probability > 0.5)
    for _, actual, probability in classified
)

print(counts)

Counter({(False, False): 1221, (True, True): 165, (True, False): 15, (False, True): 2})


In [24]:
# Keys of `counts` are (actual is_spam, predicted is_spam):
# (False, False) - non-spam correctly labelled as non-spam (ham as ham)
# (True, True)   - spam correctly labelled as spam
# (True, False)  - spam incorrectly labelled as non-spam (spam as ham)
# (False, True)  - non-spam incorrectly labelled as spam (ham as spam)

In [25]:
# Order the learned (word, p(w | spam), p(w | ~spam)) triplets using
# p_spam_given_word as the sort key, then keep the five entries at each end.
# NOTE(review): p_spam_given_word is not defined in any visible cell — confirm
# it is defined before this cell runs.
words = sorted(classifier.word_probs, key=p_spam_given_word)
hammiest_words, spammiest_words = words[:5], words[-5:]

In [26]:
# Display the last five triplets of the p_spam_given_word ordering.
spammiest_words

[('150p', 0.0885509838998211, 0.00013812154696132598),
 ('won', 0.10644007155635063, 0.00013812154696132598),
 ('prize', 0.11717352415026834, 0.00013812154696132598),
 ('www', 0.13148479427549195, 0.00013812154696132598),
 ('claim', 0.1475849731663685, 0.00013812154696132598)]

In [27]:
# Display the first five triplets of the p_spam_given_word ordering.
hammiest_words

[('gt', 0.0008944543828264759, 0.04848066298342541),
 ('lt', 0.0008944543828264759, 0.04792817679558011),
 ("i'll", 0.0008944543828264759, 0.03604972375690608),
 ('lor', 0.0008944543828264759, 0.030248618784530387),
 ('he', 0.0008944543828264759, 0.029972375690607733)]