In [1]:
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Grad and process the raw data
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
sms_raw.columns = ['spam', 'message']

# Enumerate our spammy keywords
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

for key in keywords:
    sms_raw[str(key)] = sms_raw.message.str.contains(
        ' ' + str(key) + ' ',
        case=False
)
    
sms_raw['allcaps'] = sms_raw.message.str.isupper()

sms_raw['spam'] = (sms_raw['spam'] == 'spam')

data = sms_raw[keywords + ['allcaps']]
target = sms_raw['spam']

from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
y_pred = bnb.fit(data, target).predict(data)

In [3]:
# Test your model with different holdout groups
from sklearn.model_selection import train_test_split

# Use train_test_split to create the necessary training and testing groups
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(data, target).score(data, target)))

With 20% Holdout: 0.884304932735426
Testing on Sample: 0.8916008614501076


These scores look really consistent! It doesn't seem like our model is overfitting. Part of the reason for that is that it's so simple (more on that in a bit).

In [4]:
# Cross Validation
from sklearn.model_selection import cross_val_score
cross_val_score(bnb, data, target, cv=10)

array([0.89784946, 0.89426523, 0.89426523, 0.890681  , 0.89605735,
       0.89048474, 0.88150808, 0.89028777, 0.88489209, 0.89568345])

That's exactly what we'd hope to see. The array that cross_val_score returns is a series of accuracy scores with a different hold out group each time. If our model is overfitting at a variable amount, those scores will fluctuate. Instead, ours are relatively consistent.

In [49]:
# Implement your own cross validation with your spam model

# Create test and train sets using 20% holdout
# X is data and y is target
mask = np.random.rand(len(data)) < 0.8

X_train = data[mask]
y_train = target[mask]
X_test = data[~mask]
y_test = target[~mask]

print(len(X_train), len(y_train), len(X_test), len(y_test))

# Train model on train set and test on test set
# Compare to model trained and tested on full dataset
y_pred_train = bnb.fit(X_train, y_train).predict(X_test)
y_pred_data = bnb.fit(data, target).predict(data)

# Calculate the accuracy of the model manually
print('-' * 30)
print('With 20% Holdout: {}%'.format(100 - round((y_test != y_pred_train).sum() / X_test.shape[0] * 100, 2)))
print('Testing on Data: {}%'.format(100 - round((target != y_pred_data).sum() / data.shape[0] * 100, 2)))

# Calculate the accuracy using .score()
print('-' * 30)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(data, target).score(data, target)))

4441 4441 1131 1131
------------------------------
With 20% Holdout: 89.03999999999999%
Testing on Data: 89.16%
------------------------------
With 20% Holdout: 0.8903625110521662
Testing on Sample: 0.8916008614501076


In [56]:
# Do a more in depth evalution of the model looking at the 
# kind of errors we're generating and what accuracy we'd get
# if we just randomly guessed. You may want to use a
# confusion matrix to show different kinds of errors.

from sklearn.metrics import confusion_matrix
conf_20 = confusion_matrix(y_test, y_pred_train)
sens_20 = conf_20[1][1] / conf_20[1].sum()
spec_20 = conf_20[0][0] / conf_20[0].sum()
rand_20 = conf_20[1].sum() / conf_20[0].sum()
print('With 20% holdout:')
print(conf_20)
print('Sensitivity: {}\nSpecificity: {}'.format(sens_20, spec_20))
print('Percent Spam: {}'.format(rand_20))

print('-' * 30)
conf_100 = confusion_matrix(target, y_pred_data)
sens_100 = conf_100[1][1] / conf_100[1].sum()
spec_100 = conf_100[0][0] / conf_100[0].sum()
rand_100 = conf_100[1].sum() / conf_100[0].sum()
print('Testing on Data:')
print(conf_100)
print('Sensitivity: {}\nSpecificity: {}'.format(sens_100, spec_100))
print('Percent Spam: {}'.format(rand_100))

With 20% holdout:
[[974  11]
 [113  33]]
Sensitivity: 0.22602739726027396
Specificity: 0.9888324873096447
Percent Spam: 0.14822335025380712
------------------------------
Testing on Data:
[[4770   55]
 [ 549  198]]
Sensitivity: 0.26506024096385544
Specificity: 0.9886010362694301
Percent Spam: 0.15481865284974095
