In [44]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

In [24]:
# Grab and process the raw data.
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
sms_raw.columns = ['spam', 'message']

# Enumerate our spammy keywords.
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

for key in keywords:
    sms_raw[str(key)] = sms_raw.message.str.contains(
        ' ' + str(key) + ' ',
        case=False
)

sms_raw['allcaps'] = sms_raw.message.str.isupper()
sms_raw['spam'] = (sms_raw['spam'] == 'spam')
data = sms_raw[keywords + ['allcaps']]
target = sms_raw['spam']

from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
y_pred = bnb.fit(data, target).predict(data)

In [25]:
# Test your model with different holdout groups.

from sklearn.model_selection import train_test_split
# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(data, target).score(data, target)))

With 20% Holdout: 0.884304932735
Testing on Sample: 0.89160086145


In [27]:
data_sample = data.sample(frac=.1, replace=False)
data_sample_target = data_sample.index.values
target_for_data = target.take(data_sample_target)

In [72]:
conMatrix = []
for x in range(0, 10):
    data_sample = data.sample(frac=.1, replace=False)
    data_sample_target = data_sample.index.values
    target_for_data = target.take(data_sample_target)
    
    X_train, X_test, y_train, y_test = train_test_split(data_sample, target_for_data, test_size=random.randrange(1, 50)/100, random_state=20)
    #X_train, X_test, y_train, y_test = data_sample.sample()
    
    print('With 20% Holdout {}: '.format(x) + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
    #print('Testing on Sample {}: '.format(x) + str(bnb.fit(data, target).score(data, target)))
    
    y_pred = bnb.fit(X_train, y_train).predict(X_test)
    
    k = confusion_matrix(target.take(y_test.index.values), y_pred)
    
    print(confusion_matrix(target.take(y_test.index.values), y_pred))
    
    print('percentage of true positive: ' + str(k[0][0]/(k[0][0] + k[0][1])))
    print('percentage of true negative: ' + str(k[1][0]/(k[1][0] + k[1][1])) + '\n')



With 20% Holdout 0: 0.849816849817
[[223   2]
 [ 39   9]]
percentage of true positive: 0.991111111111
percentage of true negative: 0.8125

With 20% Holdout 1: 0.91061452514
[[158   1]
 [ 15   5]]
percentage of true positive: 0.993710691824
percentage of true negative: 0.75

With 20% Holdout 2: 0.833333333333
[[9 1]
 [1 1]]
percentage of true positive: 0.9
percentage of true negative: 0.5

With 20% Holdout 3: 0.871559633028
[[185   3]
 [ 25   5]]
percentage of true positive: 0.984042553191
percentage of true negative: 0.833333333333

With 20% Holdout 4: 0.899253731343
[[235   3]
 [ 24   6]]
percentage of true positive: 0.987394957983
percentage of true negative: 0.8

With 20% Holdout 5: 0.866666666667
[[37  0]
 [ 6  2]]
percentage of true positive: 1.0
percentage of true negative: 0.75

With 20% Holdout 6: 0.903448275862
[[128   0]
 [ 14   3]]
percentage of true positive: 1.0
percentage of true negative: 0.823529411765

With 20% Holdout 7: 0.883720930233
[[110   0]
 [ 15   4]]
percentag

In [37]:
from sklearn.model_selection import cross_val_score
cross_val_score(bnb, data, target, cv=10)

array([ 0.89784946,  0.89426523,  0.89426523,  0.890681  ,  0.89605735,
        0.89048474,  0.88150808,  0.89028777,  0.88489209,  0.89568345])