In [1]:
import pandas as pd
import string
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the tab-separated spam corpus into a two-column frame; the column
# names are supplied explicitly (label first, then the message text).
df_spam_collections = pd.read_csv(
    'SpamCollection',
    sep='\t',
    names=['response', 'messages'],
)
# Summary statistics (count / unique / top / freq) for both columns.
df_spam_collections.describe()


Unnamed: 0,response,messages
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [3]:
# Add the character count of every message as a new column.
# The vectorized `.str.len()` accessor gives the same counts as `.apply(len)`
# for string values, but yields NaN for a missing message instead of raising
# TypeError on a float.
df_spam_collections['Message length'] = df_spam_collections['messages'].str.len()
# Compare the length statistics of the two response classes side by side.
df_spam_collections.groupby('response').describe()


Unnamed: 0_level_0,Message length,Message length,Message length,Message length,Message length,Message length,Message length,Message length
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
response,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
ham,4825.0,71.482487,58.440652,2.0,33.0,52.0,93.0,910.0
spam,747.0,138.670683,28.873603,13.0,133.0,149.0,157.0,223.0


In [4]:
# Stopword sets keyed by language. Looking the list up inside the token loop
# re-evaluated `stopwords.words(...)` for every single word and then searched
# the resulting list linearly; caching a frozenset does the lookup once and
# makes each membership test constant-time.
_STOPWORDS_BY_LANGUAGE = {}


def _stopword_set(language='english'):
    """Return the NLTK stopwords for `language` as a cached frozenset."""
    cached = _STOPWORDS_BY_LANGUAGE.get(language)
    if cached is None:
        cached = frozenset(stopwords.words(language))
        _STOPWORDS_BY_LANGUAGE[language] = cached
    return cached


# define a function to get rid of stopwords present in the messages
def eliminateStopWords(msg):
    """Tokenize a message, dropping punctuation and English stopwords.

    Args:
        msg: The raw message text.

    Returns:
        list[str]: The whitespace-separated tokens of `msg` after every
        character in `string.punctuation` has been removed, excluding tokens
        whose lowercase form is an English stopword. The kept tokens retain
        their original casing.
    """
    # 1 remove punctuation
    no_punc = ''.join(char for char in msg if char not in string.punctuation)
    # 2 remove stopwords (case-insensitive match against the cached set)
    stop_words = _stopword_set('english')
    return [word for word in no_punc.split() if word.lower() not in stop_words]

# Sanity check: run the tokenizer over the first five messages only.
df_spam_collections['messages'].head(5).apply(eliminateStopWords)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: messages, dtype: object

In [5]:
# --- bag of words
# Build the vocabulary: eliminateStopWords is passed as the custom analyzer,
# so each message is tokenized by it while the vectorizer is fitted on the
# full 'messages' column.
bag_of_words_transformer = CountVectorizer(
    analyzer=eliminateStopWords,
).fit(df_spam_collections['messages'])
# Report how many distinct tokens the fitted vocabulary_ holds.
print(len(bag_of_words_transformer.vocabulary_))

11425


In [6]:
# Encode every message with the fitted vectorizer and keep the result.
messages_bag_of_words = bag_of_words_transformer.transform(
    df_spam_collections['messages']
)

In [7]:
# Fit a TF-IDF transformer on the bag-of-words matrix of all messages.
tfidf_transformer = (
    TfidfTransformer()
    .fit(messages_bag_of_words)
)

In [8]:
# Re-weight the bag-of-words matrix with the fitted TF-IDF transformer,
# then show the dimensions of the result.
message_tfidf = tfidf_transformer.transform(
    messages_bag_of_words
)
print(message_tfidf.shape)

(5572, 11425)


In [9]:
# Train a multinomial naive Bayes classifier: the TF-IDF matrix supplies the
# features and the 'response' column supplies the labels.
# NOTE(review): the model is fitted on every row, so any message taken from
# this frame afterwards is one it was trained on - consider a held-out split.
spam_detect_model = MultinomialNB().fit(
    message_tfidf,
    df_spam_collections['response'],
)

In [10]:
# check model for the predicted and expected value say for message#2 and message#84
# The same transform -> predict -> report sequence applies to every message,
# so it is written once in a loop instead of being copy-pasted per message.
for position, message_index in enumerate((2, 84)):
    if position:
        # Blank line separating consecutive reports.
        print()
    # 1 transform the message using bag of words
    msg = df_spam_collections.loc[message_index, 'messages']
    bag_of_words_for_msg = bag_of_words_transformer.transform([msg])
    # 2 transform the message using tfidf
    tfidf = tfidf_transformer.transform(bag_of_words_for_msg)
    # Label recorded in the data set for this message.
    actual = df_spam_collections.loc[message_index, 'response']
    print(f'Message {message_index}\npred = {spam_detect_model.predict(tfidf)}'
          f'\nactual = {actual}')

Message 2
pred = ['spam']
actual = spam

Message 84
pred = ['ham']
actual = ham
