##Goal:  Train a Naive Bayes model to classify future SMS messages as either spam or ham.

Steps:

1.  Convert the words ham and spam to a binary indicator variable(0/1)

2.  Convert the txt to a sparse matrix of TFIDF vectors

3.  Fit a Naive Bayes Classifier

4.  Measure your success using roc_auc_score



In [63]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score

In [64]:
df= pd.read_csv("SMSSpamCollection",sep='\t', names=['spam', 'txt'])

In [65]:
df.head()

Unnamed: 0,spam,txt
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [66]:
df.spam.value_counts()

ham     4825
spam     747
Name: spam, dtype: int64

In [67]:
# Convert the words ham and spam to a binary indicator variable(0/1)
def spam_bin(x):
    if x == 'ham':
        return 0
    else:
        return 1
df.spam = df.spam.map(spam_bin)

In [68]:
df.spam.value_counts()

0    4825
1     747
Name: spam, dtype: int64

In [69]:
#nltk.download('stopwords') - $HOME/ntlk_data

In [70]:
#TFIDF Vectorizer, just like before
stopset = set(stopwords.words('english'))
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents='ascii', stop_words=stopset)

In [71]:
#assign Y 
y = df.spam

In [72]:
X= vectorizer.fit_transform(df.txt)

In [73]:
X.shape

(5572, 8587)

In [74]:
y.shape

(5572,)

In [75]:
#Test Train Split as usual
X_train, X_test,y_train, y_test = train_test_split(X, y, random_state=42)

In [76]:
#we will train a naive_bayes classifier
clf = naive_bayes.MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [77]:
# Accuracy of the model
roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])

0.98589322144123448

In [78]:
import numpy as np

In [84]:
test_run_array=np.array(["U dun say so early hor... U c already then say...", "how you doin?", "whatsup?",
                         "Flibbertidigibbett", "Honorificabilitudinitatibus",
                         "Pneumonomicroscopicsilicovolcanoconiosis", "Buy 1 get 1 free offer", "free entry", "prize"])

test_run_vector = vectorizer.transform(test_run_array)

print clf.predict(test_run_vector)

[0 0 0 0 0 0 0 1 1]
