In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline 

# Draw all figure text, axis labels and tick marks in a single colour
# (presumably chosen for a dark notebook theme — confirm).
COLOR = 'white'
plt.rcParams.update({
    'text.color': COLOR,
    'axes.labelcolor': COLOR,
    'xtick.color': COLOR,
    'ytick.color': COLOR,
})

# Single float display format for the notebook: thousands separators and
# two decimals. Setting this option more than once only keeps the last value.
pd.options.display.float_format = '{:,.2f}'.format

In [55]:
# Load the dataset from the CSV file in the working directory.
spam = pd.read_csv('spam.csv')
# Summarise the data per Category value (count, unique, top, freq) to see how
# the classes are balanced.
spam.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [56]:
spam 
# Both columns (Category and Message) hold text. Machine learning models work on
# numbers, not raw text, so both have to be converted to a numeric representation.

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


We can now add a `spam` column: its value is 1 when the row's Category is 'spam' and 0 otherwise.

In [57]:
# Encode the label as a binary target: 1 where Category is 'spam', 0 otherwise.
# The comparison is vectorized over the whole column instead of calling a
# Python lambda once per row; the cast keeps the column as int64.
spam['spam'] = (spam['Category'] == 'spam').astype('int64')
spam

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [58]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the messages for evaluation.
# random_state pins the shuffle so the split (and every score derived from it)
# is reproducible across kernel restarts; stratify keeps the ham/spam ratio
# the same in the training and test partitions, which matters because the
# classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    spam.Message,
    spam.spam,
    test_size=0.25,
    random_state=42,
    stratify=spam.spam,
)

![](words_as_features.png)

# Example

![](count_vectorizer_example.png)

In [59]:
from sklearn.feature_extraction.text import CountVectorizer
# Vectorizer created with default settings; it is fitted on the training
# messages in a later cell.
v = CountVectorizer()

In [60]:
# Build the word-count matrix for the training messages: fit the vectorizer on
# them, transform them, and convert the result to a dense array in one step.
X_train_count = v.fit_transform(X_train.values).toarray()

In [61]:
from sklearn.naive_bayes import (
    BernoulliNB,
    CategoricalNB,
    ComplementNB,
    GaussianNB,
    MultinomialNB,
)

# One default-configured instance of each Naive Bayes variant.
gnb, mnb, compnb, bnb, catnb = (
    GaussianNB(),
    MultinomialNB(),
    ComplementNB(),
    BernoulliNB(),
    CategoricalNB(),
)

# The same instances collected in a list, in the order they were created.
models = [gnb, mnb, compnb, bnb, catnb]

In [66]:

# Encode the held-out messages with the already-fitted vectorizer
# (transform only, so the test set does not influence the vocabulary).
X_test_count = v.transform(X_test)

# Train the multinomial model on the training counts, then display its
# score on the held-out set as the cell output.
mnb.fit(X_train_count, y_train)
mnb.score(X_test_count, y_test)

0.9863603732950467