In [19]:
import math
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn

In [2]:
# Load the tab-separated SMS spam collection. The file has no header row,
# so the two columns are named right after reading.
data_path = (
    "https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
    "master/sms_spam_collection/SMSSpamCollection"
)
sms_raw = pd.read_csv(data_path, sep='\t', header=None)
sms_raw.columns = ['spam', 'message']

# Words whose presence is used as a spam signal.
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

# One boolean column per keyword: True when the keyword occurs in the
# message with a single space on each side (case-insensitive match).
for key in keywords:
    spaced_key = ' ' + str(key) + ' '
    sms_raw[str(key)] = sms_raw['message'].str.contains(spaced_key, case=False)

# Flag messages for which `str.isupper()` is true.
sms_raw['allcaps'] = sms_raw['message'].str.isupper()

# Turn the 'spam'/'ham' label column into a boolean (True means spam).
sms_raw['spam'] = sms_raw['spam'].eq('spam')

# Feature matrix (keyword flags + allcaps) and label vector.
data = sms_raw[[*keywords, 'allcaps']]
target = sms_raw['spam']

# Fit a Bernoulli naive Bayes classifier on every row and predict those
# same rows back.
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
bnb.fit(data, target)
y_pred = bnb.predict(data)

In [7]:
# Compare the score on a held-out 20% of the rows with the score obtained
# on the very rows the model was trained on.

from sklearn.model_selection import train_test_split

# Reserve 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=1,
)

# Train on the 80% part, score on the untouched 20% part.
bnb.fit(X_train, y_train)
print('With 20% Holdout: ' + str(bnb.score(X_test, y_test)))

# Refit on every row and score on those same rows.
bnb.fit(data, target)
print('Testing on Sample: ' + str(bnb.score(data, target)))

With 20% Holdout: 0.8923766816143498
Testing on Sample: 0.8916008614501076


In [4]:
from sklearn.model_selection import cross_val_score

# Score the classifier with scikit-learn's cross-validation, using cv=10.
cross_val_score(estimator=bnb, X=data, y=target, cv=10)

array([0.89784946, 0.89426523, 0.89426523, 0.890681  , 0.89605735,
       0.89048474, 0.88150808, 0.89028777, 0.88489209, 0.89568345])

In [8]:
# implementing my own cross validation:
def joe_cross_val_score(model, data, target, cv):
    """Score ``model`` on ``cv`` repeated random hold-out splits.

    Every round draws a 70/30 train/test split with ``train_test_split``
    (seeded with ``2 * round_number`` so repeated calls give the same
    splits), refits ``model`` on the training part and scores it on the
    held-out part.

    Note that the rounds are independent random splits, not disjoint folds:
    the test sets of different rounds may overlap.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``score(X, y)``. It is refitted in
        place on every round.
    data, target
        Features and labels; passed straight to ``train_test_split``.
    cv : int
        Number of split / fit / score rounds.

    Returns
    -------
    list
        One score per round, in round order (empty when ``cv`` is 0).
    """
    piece_scores = []
    for i in range(cv):
        X_train, X_test, y_train, y_test = train_test_split(
            data, target, test_size=0.3, random_state=i * 2
        )
        # Fit the estimator that was passed in. The previous version used the
        # notebook-global `bnb` here and silently ignored `model`.
        model.fit(X_train, y_train)
        piece_scores.append(model.score(X_test, y_test))
    return piece_scores

In [10]:
# Run two rounds of the hand-rolled validation loop on the spam classifier.
joe_cross_val_score(bnb, data, target, cv=2)

[0.8893540669856459, 0.8893540669856459]

In [17]:
# Number of rows in the feature matrix.
data.shape[0]

5572

In [27]:
# A few row labels used to try out list-based selection.
a = [2, 3, 5]

In [30]:
# Select the rows whose index labels are listed in `a`, keeping every column.
data.loc[a, :]

Unnamed: 0,click,offer,winner,buy,free,cash,urgent,allcaps
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False


In [45]:
def joe_train_test_split(data, target, split_percentage, random_state=None):
    """Randomly split ``data`` and ``target`` into a test part and a train part.

    Rows are drawn without replacement and selected by position (``.iloc``),
    so the split works for any index, not only the default 0..n-1 one.

    Parameters
    ----------
    data, target : pandas objects of equal length
        Features and labels; row ``i`` of ``data`` belongs to row ``i`` of
        ``target``.
    split_percentage : float
        Fraction of the rows, between 0 and 1 inclusive, that goes into the
        test part. The resulting row count is rounded up.
    random_state : int, optional
        Seed for a private random generator, making the split reproducible.
        When omitted, the module-level ``random`` generator is used.

    Returns
    -------
    tuple
        ``(data_test, data_train, target_test, target_train)``. The test
        parts come first. Rows keep their original relative order.

    Raises
    ------
    ValueError
        If ``split_percentage`` is outside ``[0, 1]`` (the earlier loop-based
        version never terminated for values above 1), or if ``data`` and
        ``target`` differ in length.
    """
    if not 0 <= split_percentage <= 1:
        raise ValueError(
            "split_percentage must be between 0 and 1, got %r" % (split_percentage,)
        )

    n_rows = len(data)
    if len(target) != n_rows:
        raise ValueError("data and target must have the same number of rows")

    # Same size as the original `while len(test) < n * p` loop produced.
    n_test = math.ceil(n_rows * split_percentage)

    rng = random if random_state is None else random.Random(random_state)
    test = sorted(rng.sample(range(n_rows), n_test))

    # A set gives O(1) membership checks while building the complement.
    held_out = set(test)
    train = [i for i in range(n_rows) if i not in held_out]

    return data.iloc[test], data.iloc[train], target.iloc[test], target.iloc[train]

In [52]:
# Hold out 40% of the rows. The helper returns the test parts before the
# train parts, hence the unpacking order.
X_test, X_train, y_test, y_train = joe_train_test_split(
    data, target, split_percentage=0.4
)