# Spam Classifier

In [99]:
import pandas as pd

# Location of the SMS Spam Collection data file (read below without a header row).
# Raw string: the backslashes are literal Windows path separators. Without the
# r prefix, "\U" is parsed as a (malformed) unicode escape on Python 3 and the
# cell fails with a SyntaxError.
# NOTE: machine-specific absolute path - point this at your own copy of the file.
DATA_PATH = r"C:\Users\James\Desktop\Coding\Data_Science\Udacity\SMSSpamCollection"

# The file has no header line, so the two columns are named explicitly
df = pd.read_table(DATA_PATH, header=None, names=['label', 'sms_message'])
df.head(5)

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [100]:
# Encode the text labels as integers: ham -> 0, spam -> 1
LABEL_CODES = {'ham': 0, 'spam': 1}
# Only map while raw text labels are still present. Without this guard,
# re-running the cell would push the already-encoded 0/1 values through the
# dict and silently turn every label into NaN.
if df['label'].isin(list(LABEL_CODES)).any():
    df['label'] = df['label'].map(LABEL_CODES)
df.head(5)

Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [101]:
# Sanity-check the size of the loaded dataset
df.shape  # (rows, columns)

(5572, 2)

## Creating a Manual Count Vectorizer

In [102]:
# Example docs to create a distribution matrix over.
# This four-document toy corpus is reused by the following cells, first to
# build word counts by hand and then to demonstrate CountVectorizer.
documents = ['Hello, how are you!',
             'Win money, win from home.',
             'Call me now.',
             'Hello, Call hello you tomorrow?']

In [103]:
# Remove uppercase letters so that e.g. 'Hello' and 'hello' count as the same word
lower_case_documents = [content.lower() for content in documents]
# Single-argument print(...) produces the same output on Python 2 and Python 3
print(lower_case_documents)

['hello, how are you!', 'win money, win from home.', 'call me now.', 'hello, call hello you tomorrow?']


In [104]:
# Remove punctuation
import string
# Filter characters rather than calling content.translate(None, string.punctuation):
# that two-argument form only exists for Python 2 byte strings and raises a
# TypeError for unicode text and on Python 3.
punctuation_chars = set(string.punctuation)
sans_punctuation_documents = [
    ''.join(ch for ch in content if ch not in punctuation_chars)
    for content in lower_case_documents
]
print(sans_punctuation_documents)

['hello how are you', 'win money win from home', 'call me now', 'hello call hello you tomorrow']


In [105]:
# Tokenize strings by splitting them on whitespace (one list of words per document)
preprocessed_documents = [text.split() for text in sans_punctuation_documents]
# Single-argument print(...) produces the same output on Python 2 and Python 3
print(preprocessed_documents)

[['hello', 'how', 'are', 'you'], ['win', 'money', 'win', 'from', 'home'], ['call', 'me', 'now'], ['hello', 'call', 'hello', 'you', 'tomorrow']]


In [106]:
# Count occurrences of each word in each document
import collections as cltn
# One Counter (word -> number of occurrences) per tokenized document
frequency_list = [cltn.Counter(tokens) for tokens in preprocessed_documents]
frequency_list

[Counter({'are': 1, 'hello': 1, 'how': 1, 'you': 1}),
 Counter({'from': 1, 'home': 1, 'money': 1, 'win': 2}),
 Counter({'call': 1, 'me': 1, 'now': 1}),
 Counter({'call': 1, 'hello': 2, 'tomorrow': 1, 'you': 1})]

## Using Count Vectorizer Proper

In [107]:
# Import count vectorizer
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()
# Print the instance to see its default parameter settings
# (single-argument print(...) works on both Python 2 and Python 3)
print(count_vector)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [108]:
# Fit docs dataset to countvectorizer: learns the vocabulary of `documents`
count_vector.fit(documents)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
if hasattr(count_vector, 'get_feature_names_out'):
    feature_names = count_vector.get_feature_names_out().tolist()
else:
    feature_names = count_vector.get_feature_names()
feature_names

[u'are',
 u'call',
 u'from',
 u'hello',
 u'home',
 u'how',
 u'me',
 u'money',
 u'now',
 u'tomorrow',
 u'win',
 u'you']

In [109]:
# Create distribution matrix: count each vocabulary word of the fitted
# count_vector in every document (one row per document, one column per word)
dist_mat = count_vector.transform(documents)
# Convert to a plain array so the counts can be displayed directly
doc_array = dist_mat.toarray()
doc_array

array([[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 2, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1]], dtype=int64)

In [110]:
# Transform this to a df: rows are documents, columns are vocabulary words.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
if hasattr(count_vector, 'get_feature_names_out'):
    vocabulary_words = count_vector.get_feature_names_out()
else:
    vocabulary_words = count_vector.get_feature_names()
dist_df = pd.DataFrame(data=doc_array, columns=vocabulary_words)
dist_df

Unnamed: 0,are,call,from,hello,home,how,me,money,now,tomorrow,win,you
0,1,0,0,1,0,1,0,0,0,0,0,1
1,0,0,1,0,1,0,0,1,0,0,2,0
2,0,1,0,0,0,0,1,0,1,0,0,0
3,0,1,0,2,0,0,0,0,0,1,0,1


## Training / Test Sets

In [111]:
from sklearn.model_selection import train_test_split
# Features are the raw message texts; targets are the 0/1 (ham/spam) labels
X = df['sms_message']
y = df['label']
# Hold out 25% of the messages for testing; the fixed random_state makes the
# split reproducible from run to run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
# len() reports the number of rows. Series.count() skips missing values, so it
# would under-report if any label were NaN.
# A single formatted argument keeps print(...) identical on Python 2 and 3.
print("training dataset row count: %i, test dataset row count: %i, total dataset row count: %i"
      % (len(y_train), len(y_test), len(y_train) + len(y_test)))


training dataset row count: 4179, test dataset row count: 1393, total dataset row count: 5572


In [112]:
# Udacity code:
# Instantiate a fresh CountVectorizer. This rebinds `count_vector`, replacing
# the instance that was fitted on the toy `documents` earlier in the notebook.
count_vector = CountVectorizer()

# Fit on the training messages only, then return their count matrix
training_data = count_vector.fit_transform(X_train)

# Transform the testing data with the already-fitted vectorizer and return the
# matrix. Note we are NOT fitting on the testing data: only transform() is
# called here, so X_test does not influence what was fitted on X_train.
testing_data = count_vector.transform(X_test)

In [113]:
# Since we are dealing with discrete data use MultinomialNB
# If the input is continuous normally distributed data use Gaussian NB
from sklearn.naive_bayes import MultinomialNB 

In [114]:
# Train a multinomial Naive Bayes classifier on the training count matrix and
# the matching 0/1 (ham/spam) labels
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [115]:
# Background reading on Naive Bayes text classification:
# http://blog.datumbox.com/machine-learning-tutorial-the-naive-bayes-text-classifier/

# Predict a label for every message in the held-out test matrix
predictions = naive_bayes.predict(testing_data)


Accuracy measures how often the classifier makes the correct prediction. 
It’s the ratio of the number of correct predictions to the total number of predictions (the number of test data points).

Precision tells us what proportion of the messages we classified as spam actually were spam.
It is the ratio of true positives (messages classified as spam that actually are spam) to all positives (all messages
classified as spam, irrespective of whether that was the correct classification); in other words, it is the ratio
[True Positives/(True Positives + False Positives)]

Recall (sensitivity) tells us what proportion of the messages that actually were spam were classified by us as spam.
It is the ratio of true positives (messages classified as spam that actually are spam) to all the messages that actually were spam; in other words, it is the ratio
[True Positives/(True Positives + False Negatives)]

In [116]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Report the standard classification metrics for the test-set predictions.
# Each line is built as ONE formatted string so that print(...) behaves the
# same on Python 2 and Python 3 (a multi-argument print statement is a
# SyntaxError on Python 3, and print('a', b) prints a tuple on Python 2).
for metric_name, metric_fn in [('Accuracy', accuracy_score),
                               ('Precision', precision_score),
                               ('Recall', recall_score),
                               ('F1', f1_score)]:
    print('%s score:  %s' % (metric_name, metric_fn(y_test, predictions)))

# F1 score = 2 * precision * recall / (precision + recall)

Accuracy score:  0.988513998564
Precision score:  0.972067039106
Recall score:  0.940540540541
F1 score:  0.956043956044
