In [16]:
# read spam.csv dataset from kaggle - https://www.kaggle.com/uciml/sms-spam-collection-dataset
import pandas as pd
import numpy as np
import nltk


# Load the SMS spam collection (read with latin-1 encoding), discard the three
# unnamed spill-over columns, and give the remaining two columns readable names.
# The intermediate `df_sms.head()` was removed: only a cell's last expression is
# displayed, so it had no effect.
df_sms = (
    pd.read_csv("spam.csv", encoding='latin-1')
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
    .rename(columns={"v1": "label", "v2": "sms"})
)

In [17]:
# Preview the first five rows of the cleaned frame.
df_sms.head(n=5)

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [18]:
# Class balance: number of messages per label.
df_sms['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [19]:
# Summary statistics for each column of the frame.
sms_summary = df_sms.describe()
sms_summary

Unnamed: 0,label,sms
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [20]:
# Character count of every message, computed with the vectorized string
# accessor instead of a per-row Python call to len().
df_sms['msg_length'] = df_sms['sms'].str.len()
df_sms

Unnamed: 0,label,sms,msg_length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,161
5568,ham,Will Ì_ b going to esplanade fr home?,37
5569,ham,"Pity, * was in mood for that. So...any other s...",57
5570,ham,The guy did some bitching but I acted like i'd...,125


In [24]:
# Inspect the messages that are exactly two characters long.
is_two_chars = df_sms['msg_length'] == 2
df_sms.loc[is_two_chars]

Unnamed: 0,label,sms,msg_length
1924,ham,Ok,2
3049,ham,Ok,2
4496,ham,Ok,2
5357,ham,Ok,2


In [25]:
# Preview the first five rows, now including the msg_length column.
df_sms.head(n=5)

Unnamed: 0,label,sms,msg_length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [26]:
# Encode the text labels as integers: ham -> 0, spam -> 1.
# The dtype guard makes the cell safe to re-run: mapping an already-encoded
# column a second time would find no 'ham'/'spam' keys and turn every label
# into NaN.
label_codes = {'ham': 0, 'spam': 1}
if not pd.api.types.is_numeric_dtype(df_sms['label']):
    df_sms['label'] = df_sms['label'].map(label_codes)
df_sms.head()

Unnamed: 0,label,sms,msg_length
0,0,"Go until jurong point, crazy.. Available only ...",111
1,0,Ok lar... Joking wif u oni...,29
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,0,U dun say so early hor... U c already then say...,49
4,0,"Nah I don't think he goes to usf, he lives aro...",61


In [27]:
# Class balance after encoding: number of messages per integer label.
df_sms['label'].value_counts()

0    4825
1     747
Name: label, dtype: int64

In [28]:
# Bag-of-words walkthrough on a small toy corpus.

# 1. Convert every sentence to lowercase so that "Hello" and "hello"
#    are treated as the same word.
documents = list(map(str.lower, [
    'Hello, how are you!',
    'Win money, win from home.',
    'Call me now.',
    'Hello, Call hello you tomorrow?',
]))
documents

['hello, how are you!',
 'win money, win from home.',
 'call me now.',
 'hello, call hello you tomorrow?']

In [29]:
# 2. Remove punctuation from every lowercased sentence.
import string

# One translation table that deletes every character in string.punctuation.
strip_punctuation = str.maketrans("", "", string.punctuation)
docs_without_punctuation = [doc.translate(strip_punctuation) for doc in documents]
docs_without_punctuation

['hello how are you',
 'win money win from home',
 'call me now',
 'hello call hello you tomorrow']

In [30]:
# 3. Split each cleaned sentence into a list of whitespace-separated tokens.
tokenized_docs = list(map(str.split, docs_without_punctuation))
tokenized_docs

[['hello', 'how', 'are', 'you'],
 ['win', 'money', 'win', 'from', 'home'],
 ['call', 'me', 'now'],
 ['hello', 'call', 'hello', 'you', 'tomorrow']]

In [31]:
# 4. Count how often each token occurs within each document.
import pprint
from collections import Counter

# One Counter per tokenized document, in document order.
freq_list = list(map(Counter, tokenized_docs))
freq_list

[Counter({'hello': 1, 'how': 1, 'are': 1, 'you': 1}),
 Counter({'win': 2, 'money': 1, 'from': 1, 'home': 1}),
 Counter({'call': 1, 'me': 1, 'now': 1}),
 Counter({'hello': 2, 'call': 1, 'you': 1, 'tomorrow': 1})]

In [32]:
# sklearn does this automatically: fit a CountVectorizer on the raw sentences
# and list the vocabulary it learned.
docu = ['Hello, how are you!',
        'Win money, win from home.',
        'Call me now.',
        'Hello, Call hello you tomorrow?']

from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()
count_vector.fit(docu)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back on old releases.
# .tolist() keeps the displayed value a plain list of strings either way.
if hasattr(count_vector, "get_feature_names_out"):
    feature_names = count_vector.get_feature_names_out().tolist()
else:
    feature_names = count_vector.get_feature_names()
feature_names

['are',
 'call',
 'from',
 'hello',
 'home',
 'how',
 'me',
 'money',
 'now',
 'tomorrow',
 'win',
 'you']

In [33]:
# Encode each toy sentence as a row of token counts, then convert the
# transformer's output into a dense array for display.
doc_counts = count_vector.transform(docu)
doc_array = doc_counts.toarray()
doc_array

array([[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 2, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1]])

In [34]:
# Wrap the count array in a DataFrame whose columns are the vocabulary terms.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so use get_feature_names_out() when available and fall back on old releases.
if hasattr(count_vector, "get_feature_names_out"):
    vocab_terms = count_vector.get_feature_names_out()
else:
    vocab_terms = count_vector.get_feature_names()
freq_matrix = pd.DataFrame(doc_array, columns=vocab_terms)
freq_matrix

Unnamed: 0,are,call,from,hello,home,how,me,money,now,tomorrow,win,you
0,1,0,0,1,0,1,0,0,0,0,0,1
1,0,0,1,0,1,0,0,1,0,0,2,0
2,0,1,0,0,0,0,1,0,1,0,0,0
3,0,1,0,2,0,0,0,0,0,1,0,1


In [40]:
# Apply the same bag-of-words encoding to the SMS dataset.
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for testing; random_state pins the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    df_sms['sms'],
    df_sms['label'],
    test_size=0.20,
    random_state=1,
)

# Fit the vocabulary on the training messages only, then encode the test
# messages with that same vocabulary.
count_vector = CountVectorizer()
train_data = count_vector.fit_transform(x_train)
test_data = count_vector.transform(x_test)

In [42]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the training count matrix.
# fit() returns the estimator itself, so it can be bound in one step.
naive_bayes = MultinomialNB().fit(train_data, y_train)
naive_bayes

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [45]:
# Predict a label for every held-out message with the fitted classifier.
predictions = naive_bayes.predict(X=test_data)

In [46]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Each report line pairs its message template with the metric that fills it.
metric_reports = [
    ('Accuracy score: {}', accuracy_score),
    ('Precision score: {}', precision_score),
    ('Recall score: {}', recall_score),
    ('F1 score: {}', f1_score),
]
for template, metric in metric_reports:
    print(template.format(metric(y_test, predictions)))

Accuracy score: 0.9847533632286996
Precision score: 0.9420289855072463
Recall score: 0.935251798561151
F1 score: 0.9386281588447652
