# Ham/Spam Classification

In [31]:
import pandas as pd
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
import numpy as np

In [32]:
df = pd.read_table('SMSSpamCollection',sep='\t',header=None,names = ['label','message'])

In [33]:
values = df.get_values()
print(values.shape)
values[0,:]
print(df['message'])

(5572, 2)
0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
5       FreeMsg Hey there darling it's been 3 week's n...
6       Even my brother is not like to speak with me. ...
7       As per your request 'Melle Melle (Oru Minnamin...
8       WINNER!! As a valued network customer you have...
9       Had your mobile 11 months or more? U R entitle...
10      I'm gonna be home soon and i don't want to tal...
11      SIX chances to win CASH! From 100 to 20,000 po...
12      URGENT! You have won a 1 week FREE membership ...
13      I've been searching for the right words to tha...
14                    I HAVE A DATE ON SUNDAY WITH WILL!!
15      XXXMobileMovieClub: To use your credit, click ...
16                             Oh k...i'm watching here:)
17  

#### Changing labels by numbers

In [34]:
df['label'] = df.label.map({'ham':0,'spam':1})
values = df.get_values()
print(values.shape)
print(values[0,:])
print(values[13,:])


(5572, 2)
[0
 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...']
[0
 "I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times."]


#### convert all characters in the message to lower case

In [35]:
df['message'] = df['message'].str.lower()
print(df['message'])

0       go until jurong point, crazy.. available only ...
1                           ok lar... joking wif u oni...
2       free entry in 2 a wkly comp to win fa cup fina...
3       u dun say so early hor... u c already then say...
4       nah i don't think he goes to usf, he lives aro...
5       freemsg hey there darling it's been 3 week's n...
6       even my brother is not like to speak with me. ...
7       as per your request 'melle melle (oru minnamin...
8       winner!! as a valued network customer you have...
9       had your mobile 11 months or more? u r entitle...
10      i'm gonna be home soon and i don't want to tal...
11      six chances to win cash! from 100 to 20,000 po...
12      urgent! you have won a 1 week free membership ...
13      i've been searching for the right words to tha...
14                    i have a date on sunday with will!!
15      xxxmobilemovieclub: to use your credit, click ...
16                             oh k...i'm watching here:)
17      eh u r

#### Remove any punctuation

In [36]:
df['message'] = df.message.str.replace('[^\w\s]','')
print(df['message'])

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in 2 a wkly comp to win fa cup fina...
3             u dun say so early hor u c already then say
4       nah i dont think he goes to usf he lives aroun...
5       freemsg hey there darling its been 3 weeks now...
6       even my brother is not like to speak with me t...
7       as per your request melle melle oru minnaminun...
8       winner as a valued network customer you have b...
9       had your mobile 11 months or more u r entitled...
10      im gonna be home soon and i dont want to talk ...
11      six chances to win cash from 100 to 20000 poun...
12      urgent you have won a 1 week free membership i...
13      ive been searching for the right words to than...
14                      i have a date on sunday with will
15      xxxmobilemovieclub to use your credit click th...
16                                   oh kim watching here
17      eh u r

#### Tokenizing the messages

First, we have to import and download the tokenizer from the console:
An installation window will appear. Go to the "Models" tab and select "punkt" from the "Identifier" column. Then click "Download" and it will install the necessary files. 

In [37]:
import nltk
#nltk.download()

Now we can apply the tokenization:

In [38]:
df['message'] = df['message'].apply(nltk.word_tokenize)
print(df['message'])

0       [go, until, jurong, point, crazy, available, o...
1                          [ok, lar, joking, wif, u, oni]
2       [free, entry, in, 2, a, wkly, comp, to, win, f...
3       [u, dun, say, so, early, hor, u, c, already, t...
4       [nah, i, dont, think, he, goes, to, usf, he, l...
5       [freemsg, hey, there, darling, its, been, 3, w...
6       [even, my, brother, is, not, like, to, speak, ...
7       [as, per, your, request, melle, melle, oru, mi...
8       [winner, as, a, valued, network, customer, you...
9       [had, your, mobile, 11, months, or, more, u, r...
10      [im, gon, na, be, home, soon, and, i, dont, wa...
11      [six, chances, to, win, cash, from, 100, to, 2...
12      [urgent, you, have, won, a, 1, week, free, mem...
13      [ive, been, searching, for, the, right, words,...
14             [i, have, a, date, on, sunday, with, will]
15      [xxxmobilemovieclub, to, use, your, credit, cl...
16                              [oh, kim, watching, here]
17      [eh, u

In [39]:
df['message'][0]

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

#### Stemming the messages using the Porter Stemmer algorithm

In [40]:
stemmer = PorterStemmer()
df['message'] = df['message'].apply(lambda x: [stemmer.stem(y) for y in x])
print(df['message'])

0       [go, until, jurong, point, crazi, avail, onli,...
1                            [ok, lar, joke, wif, u, oni]
2       [free, entri, in, 2, a, wkli, comp, to, win, f...
3       [u, dun, say, so, earli, hor, u, c, alreadi, t...
4       [nah, i, dont, think, he, goe, to, usf, he, li...
5       [freemsg, hey, there, darl, it, been, 3, week,...
6       [even, my, brother, is, not, like, to, speak, ...
7       [as, per, your, request, mell, mell, oru, minn...
8       [winner, as, a, valu, network, custom, you, ha...
9       [had, your, mobil, 11, month, or, more, u, r, ...
10      [im, gon, na, be, home, soon, and, i, dont, wa...
11      [six, chanc, to, win, cash, from, 100, to, 200...
12      [urgent, you, have, won, a, 1, week, free, mem...
13      [ive, been, search, for, the, right, word, to,...
14             [i, have, a, date, on, sunday, with, will]
15      [xxxmobilemovieclub, to, use, your, credit, cl...
16                                 [oh, kim, watch, here]
17      [eh, u

#### Transforming data into occurrences

In [41]:
df['message'] = df['message'].apply(lambda x: ' '.join(x))
count_vect = CountVectorizer()
counts = count_vect.fit_transform(df['message'])
print(counts)
print(df['message'])
print(count_vect.get_feature_names())

  (0, 7715)	1
  (0, 1146)	1
  (0, 3388)	1
  (0, 7130)	1
  (0, 2029)	1
  (0, 1748)	1
  (0, 4273)	1
  (0, 7925)	1
  (0, 3425)	1
  (0, 1750)	1
  (0, 3872)	1
  (0, 5292)	1
  (0, 1340)	1
  (0, 2248)	1
  (0, 5635)	1
  (0, 4128)	1
  (0, 7497)	1
  (0, 3336)	1
  (1, 5289)	1
  (1, 7835)	1
  (1, 4094)	1
  (1, 4308)	1
  (1, 5257)	1
  (2, 71)	1
  (2, 1220)	1
  :	:
  (5570, 2760)	1
  (5570, 1777)	1
  (5570, 6596)	1
  (5570, 1773)	1
  (5570, 7534)	1
  (5570, 2492)	1
  (5570, 5048)	1
  (5570, 1463)	1
  (5570, 7109)	1
  (5570, 3105)	1
  (5570, 6587)	1
  (5570, 4396)	1
  (5570, 3823)	1
  (5570, 1160)	1
  (5570, 7754)	1
  (5570, 3987)	1
  (5570, 3559)	1
  (5570, 7236)	1
  (5570, 3148)	1
  (5570, 3872)	1
  (5571, 6114)	1
  (5571, 7366)	1
  (5571, 4970)	1
  (5571, 3987)	2
  (5571, 7236)	1
0       go until jurong point crazi avail onli in bugi...
1                                   ok lar joke wif u oni
2       free entri in 2 a wkli comp to win fa cup fina...
3             u dun say so earli hor u c alread

In [12]:
print(df['message'][0])

go until jurong point crazi avail onli in bugi n great world la e buffet cine there got amor wat


#### Using Term Frequency Inverse Document Frequency

In [13]:
transformer = TfidfTransformer().fit(counts)
counts = transformer.transform(counts)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


## Training the model

#### Splitting data into training and test sets 

In [42]:
x_train, x_test, y_train, y_test = train_test_split(counts, df['label'], test_size=.1, random_state = 60)
counts

<5572x8169 sparse matrix of type '<class 'numpy.int64'>'
	with 72500 stored elements in Compressed Sparse Row format>

In [43]:
x_train

<5014x8169 sparse matrix of type '<class 'numpy.int64'>'
	with 64959 stored elements in Compressed Sparse Row format>

#### Initializing Multinomial Bayes Classifier

In [16]:
model = MultinomialNB()
model.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

## Evaluating the model

In [17]:
predicted = model.predict(x_test)
print(np.mean(predicted == y_test))

0.9605734767025089


#### Looking at the confusion matrix

In [18]:
print(confusion_matrix(y_test, predicted))

[[487   1]
 [ 21  49]]
