In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the SMS spam collection, derive boolean features, and fit the classifier.
from sklearn.naive_bayes import BernoulliNB

# The raw file is tab-separated and has no header row, so the two columns
# are named explicitly after loading.
data_path = (
    "https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
    "master/sms_spam_collection/SMSSpamCollection"
)
sms_raw = pd.read_csv(data_path, sep='\t', header=None)
sms_raw.columns = ['spam', 'message']

# Words treated as spam indicators; each one becomes a boolean feature column.
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

for word in keywords:
    # Case-insensitive search for the keyword with a space on each side.
    # NOTE: a keyword at the very start or end of a message, or directly next
    # to punctuation, is not flagged by this pattern.
    spaced_word = ' ' + word + ' '
    sms_raw[word] = sms_raw['message'].str.contains(spaced_word, case=False)

# Flag messages written entirely in upper case.
sms_raw['allcaps'] = sms_raw['message'].str.isupper()

# Recode the label column as a boolean: True for 'spam', False otherwise.
sms_raw['spam'] = sms_raw['spam'].eq('spam')

# Feature matrix (keyword flags plus the all-caps flag) and target vector.
data = sms_raw[keywords + ['allcaps']]
target = sms_raw['spam']

# Train on the full data set and predict on the same rows (no hold-out split).
bnb = BernoulliNB()
bnb.fit(data, target)
y_pred = bnb.predict(data)

In [3]:
# Fit the model and classify the same rows it was trained on,
# keeping the predictions in y_pred for the evaluation cells below.
y_pred = bnb.fit(data, target).predict(data)

# Report how many predictions disagree with the true labels.
n_points = len(data)
n_mislabeled = (target != y_pred).sum()
print("Number of mislabeled points out of a total {} points : {}".format(
    n_points,
    n_mislabeled
))

Number of mislabeled points out of a total 5572 points : 604


# Success Rate

- Note which data are directly comparable for model evaluation: the `target` and `y_pred` variables
- target: actual outcome
- y_pred: predicted outcomes from the classifier


In [5]:
# Accuracy: the fraction of messages whose predicted label matches the true label.
from sklearn.metrics import accuracy_score

accuracy_score(y_true=target, y_pred=y_pred)

0.8916008614501076

In [6]:
# accuracy_score is already imported in the preceding cell, so it is not
# imported again here.
# normalize=False returns the number of correctly classified messages
# instead of the fraction.
accuracy_score(target, y_pred, normalize=False)

4968

The accuracy score of this model is 89.16%, with 4968 correctly classified messages and 604 incorrectly classified messages.

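Success rates alone are not sufficient because:
- Not all errors are created equal (a false positive and a false negative can carry different costs)
- A single overall rate can hide outcomes that are not predicted accurately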

**Confusion Matrix**

- columns: predicted class
- rows: actual class

In [4]:
# Cross-tabulate actual labels (rows) against predicted labels (columns).
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=target, y_pred=y_pred)

array([[4770,   55],
       [ 549,  198]])

# DRILL

Build your confusion matrix and calculate sensitivity and specificity here.

In [7]:
# Hand-entered 2x2 matrix of counts used for the drill calculations below.
X = np.array([
    [350, 85],
    [2408, 1200],
])
X

array([[ 350,   85],
       [2408, 1200]])

In [8]:
# Spot-check one entry: second row, first column of X.
X[1][0]

2408

In [9]:
# Share of the second row's total that sits on the diagonal
# (reported as the sensitivity of X in the summary cell).
X[1, 1] / X[1].sum()

0.3325942350332594

In [10]:
# Share of the first row's total that sits on the diagonal
# (reported as the specificity of X in the summary cell).
X[0, 0] / X[0].sum()

0.8045977011494253

In [11]:
# Each rate is a diagonal entry of X divided by the total of its row.
sensitivity = X[1, 1] / X[1].sum()
specificity = X[0, 0] / X[0].sum()

print("Sensitivity of X: ", sensitivity)

print("Specificity of X: ", specificity)


Sensitivity of X:  0.3325942350332594
Specificity of X:  0.8045977011494253
