In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import sklearn
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [21]:
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter = '\t', header = None)
sms_raw.columns = ['spam', 'message']

In [22]:
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent', 'win']

for key in keywords:
    sms_raw[str(key)] = sms_raw.message.str.contains(
        ' ' + str(key) + ' ',
        case=False
    )

In [23]:
#adding a column with allcaps 
sms_raw['allcaps'] = sms_raw.message.str.isupper()
sms_raw.columns.unique()

Index(['spam', 'message', 'click', 'offer', 'winner', 'buy', 'free', 'cash',
       'urgent', 'win', 'allcaps'],
      dtype='object')

In [24]:
#Turning this into a boolean to prepare for modeling
sms_raw['spam'] = (sms_raw['spam'] == 'spam')

In [25]:
data = sms_raw[keywords + ['allcaps']]
target = sms_raw['spam']

In [26]:
#import the Bernoulli classifier since data is binary/boolean
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(data, target)

#Classify storing the result in a new variable.
y_pred = bnb.predict(data)

print("Number of mislabeled points out of a total of {} points : {}".format(
     data.shape[0],
     (target != y_pred).sum()
))

Number of mislabeled points out of a total of 5572 points : 576


In [27]:
#Guassian classifier
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
y_pred = gnb.fit(data, target).predict(data)
print("Number of mislabeled points out of a total {} points :{}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 5572 points :575


In [28]:
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
X = data
y = target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(4457, 9) (4457,)
(1115, 9) (1115,)


In [29]:
from sklearn.naive_bayes import BernoulliNB
#Cross validation model
from sklearn.model_selection import KFold # import KFold
kf = KFold(n_splits = 5, shuffle=True, random_state = 40)
bnb = BernoulliNB() 
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    bnb = bnb.fit(X_train, y_train)
    print("Score:", bnb.score(X_test, y_test))

Score: 0.893273542601
Score: 0.917488789238
Score: 0.892280071813
Score: 0.879712746858
Score: 0.899461400359


In [83]:
#keywords1 = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent', 'win']
#case = False - case does not matter

keywords1 = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent', 'win', 'boss', 'strings', 'opportunity',
           'double','easy', '08717205546','08714712388', '09050002311', 'girl']

for key1 in keywords1:
    sms_raw[str(key1)] = sms_raw.message.str.contains(
        str(key1), case=False
    )

In [84]:
data1 = sms_raw[keywords1]
target1 = sms_raw['spam']

In [85]:
#import the Bernoulli classifier since data is binary/boolean
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(data1, target1)

#Classify storing the result in a new variable.
y_pred = bnb.predict(data1)

print("Number of mislabeled points out of a total of {} points : {}".format(
     data1.shape[0],
     (target1 != y_pred).sum()
))

Number of mislabeled points out of a total of 5572 points : 505


In [86]:
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
X1 = data1
y1 = target1
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2)
print(X1_train.shape, y1_train.shape)
print(X1_test.shape, y1_test.shape)

(4457, 17) (4457,)
(1115, 17) (1115,)


In [87]:
from sklearn.naive_bayes import BernoulliNB
#Cross validation model
from sklearn.model_selection import KFold # import KFold
kf = KFold(n_splits = 5, shuffle=True, random_state = 40)
bnb = BernoulliNB() 
for train_index, test_index in kf.split(X1):
    X1_train, X1_test = X1.iloc[train_index], X1.iloc[test_index]
    y1_train, y1_test = y1.iloc[train_index], y1.iloc[test_index]
    bnb = bnb.fit(X1_train, y1_train)
    print("Score:", bnb.score(X1_test, y1_test))

Score: 0.904035874439
Score: 0.931838565022
Score: 0.906642728905
Score: 0.894973070018
Score: 0.910233393178
