In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw SMS collection: a tab-separated file with no header row.
data_path = (
    "https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
    "master/sms_spam_collection/SMSSpamCollection"
)
sms_raw = pd.read_csv(data_path, sep='\t', header=None)
sms_raw.columns = ['spam', 'message']

# Words whose presence we treat as a hint that a message is spam.
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

# One boolean feature column per keyword (case-insensitive).
# The pattern is padded with a space on each side, so a keyword only counts
# when it is surrounded by spaces; a keyword at the very start or end of a
# message, or directly next to punctuation, is not matched.
for word in keywords:
    sms_raw[str(word)] = sms_raw['message'].str.contains(
        ' {} '.format(word),
        case=False,
    )

# Extra feature: does str.isupper() hold for the whole message?
sms_raw['allcaps'] = sms_raw['message'].str.isupper()

# Turn the label column into booleans (True where the label is 'spam').
sms_raw['spam'] = sms_raw['spam'].eq('spam')

# Feature matrix (keyword flags + allcaps) and target vector.
data = sms_raw[[*keywords, 'allcaps']]
target = sms_raw['spam']

from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
# The model is fit and then asked to predict on the same rows,
# so y_pred holds in-sample predictions.
bnb.fit(data, target)
y_pred = bnb.predict(data)

In [17]:
# Calculate the accuracy of the model.
# Count the mismatches once and reuse the result for both printed lines.
n_total = data.shape[0]
n_mislabeled = (target != y_pred).sum()

print('Number of mislabeled points out of a total {} points: {}'.format(
    n_total,
    n_mislabeled
))

# Round the final accuracy rather than the error rate: subtracting an
# already-rounded value from 100 can surface floating-point residue
# (e.g. 100 - 99.9 evaluates to 0.09999999999999432).
accuracy = round((n_total - n_mislabeled) / n_total * 100, 2)
print('Accuracy is {}%'.format(accuracy))

Number of mislabeled points out of a total 5572 points: 604
Accuracy is 89.16%


In [18]:
# Use the built-in confusion matrix from sklearn.
# The columns are predictions, the rows are actual values.
# The function is imported under an alias because a later cell in this
# notebook assigns a hand-built list to the name `confusion_matrix`; sharing
# one name would let whichever cell ran last silently replace the other.
from sklearn.metrics import confusion_matrix as sk_confusion_matrix
sk_confusion_matrix(target, y_pred)

array([[4770,   55],
       [ 549,  198]])

We learn the majority of our error is coming from times where we failed to identify a spam message. 549 of our 604 errors are from failing to identify spam. So we need to get a little bit better at identifying spam messages.

Let's assume our goal is to identify spam (rather than identify ham).

A false positive is when we identify a message as spam when it is not. In this case we had 55 of these. This is sometimes also called a "Type I Error" or a "false alarm".

A false negative is therefore when we mistakenly identify a message as not spam when it actually is spam. We had 549 of these. This is also called a "Type II Error" or a "miss".

Sensitivity is the percentage of positives correctly identified, in our case 198/747 or 27%. This shows how good we are at catching positives, or how sensitive our model is to identifying positives.

Specificity is the counterpart measure for negatives: the percentage of negatives correctly identified, in our case 4770/4825 or 99%.

In [44]:
# Build your confusion matrix without sklearn
# Counter names read <actual>_<predicted>: e.g. spam_ham counts messages
# that are actually spam but were predicted as ham.
ham_ham = ham_spam = spam_ham = spam_spam = 0

# Walk the actual labels and the predictions in lockstep.
for actual, predicted in zip(target, y_pred):
    if actual:
        if predicted:
            spam_spam += 1
        else:
            spam_ham += 1
    else:
        if predicted:
            ham_spam += 1
        else:
            ham_ham += 1

# Rows are the actual class (ham, spam); columns are the predicted class.
confusion_matrix = [
    [ham_ham, ham_spam],
    [spam_ham, spam_spam],
]

confusion_matrix

[[4770, 55], [549, 198]]

In [49]:
# Calculate sensitivity and specificity as percentages, to one decimal place.
# Sensitivity: share of actual spam that was predicted as spam.
# Specificity: share of actual ham that was predicted as ham.
actual_spam = spam_ham + spam_spam
actual_ham = ham_ham + ham_spam

sensitivity = round(spam_spam / actual_spam * 100, 1)
specificity = round(ham_ham / actual_ham * 100, 1)

print(confusion_matrix)
print('Sensitivity: {}\nSpecificity: {}'.format(sensitivity, specificity))

[[4770, 55], [549, 198]]
Sensitivity: 26.5
Specificity: 98.9
