In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile

In [2]:
# Load the spam / not-spam dataset. The path is relative to the notebook's
# working directory; later cells read the `email` and `label` columns.
data = pd.read_csv("spam_or_not_spam.csv")

In [3]:
# Peek at the last rows to check the columns and how labels are encoded.
data.tail()

Unnamed: 0,email,label
2995,abc s good morning america ranks it the NUMBE...,1
2996,hyperlink hyperlink hyperlink let mortgage le...,1
2997,thank you for shopping with us gifts for all ...,1
2998,the famous ebay marketing e course learn to s...,1
2999,hello this is chinese traditional 子 件 NUMBER世...,1


In [4]:
# Turn the numeric labels into readable class names (0 -> "ham", 1 -> "spam").
# A single `replace` builds a new object-dtype column instead of writing strings
# into an integer column through `.loc`, which pandas >= 2.1 deprecates as
# silent upcasting. It is also a no-op when the cell is re-run on converted data.
data["label"] = data["label"].replace({0: "ham", 1: "spam"})

In [5]:
# Hold out 30% of the messages for evaluation. Missing email bodies are replaced
# with a single space so the vectorizer always receives strings; the fixed
# random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.email.fillna(' '),
    data.label,
    test_size=0.3,
    random_state=10,
)

In [6]:
# Display the training emails (raw text, indexed by original row number).
X_train

804     gary lawrence murphy wrote although it s like ...
977      i don t understand how does sorting one folde...
755     at NUMBER NUMBER pm NUMBER on NUMBER NUMBER NU...
2002    url URL date NUMBER NUMBER NUMBERtNUMBER NUMBE...
505     on NUMBER NUMBER NUMBER NUMBER NUMBER pm gary ...
                              ...                        
2009                       url URL date not supplied URL 
1180     on wed NUMBER oct NUMBER NUMBER NUMBER NUMBER...
1344     URL additional comments from daniel URL NUMBE...
527     just thought i d pass this on my favorite radi...
1289    i know this is simple but do you have usr src ...
Name: email, Length: 2100, dtype: object

In [7]:
# Number of messages in the training split.
X_train.shape

(2100,)

In [8]:
# TF-IDF vectorizer with default settings; it is fitted on the training split below.
vect = TfidfVectorizer()

In [9]:
# Fit the vectorizer on the training emails only, then reuse the fitted
# vectorizer for the test emails so nothing is learned from the held-out data.
X_train_trans = vect.fit_transform(X_train)
X_test_trans = vect.transform(X_test)

In [10]:
# Vocabulary terms learned by the vectorizer, as a plain Python list.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on old installations.
if hasattr(vect, "get_feature_names_out"):
    feature_name = vect.get_feature_names_out().tolist()
else:
    feature_name = vect.get_feature_names()



In [11]:
# Show only the first few vocabulary terms; dumping the whole list floods the
# notebook output without adding information.
feature_name[:30]

['__',
 '___',
 '____',
 '_____',
 '______',
 '_______',
 '________',
 '_________',
 '__________',
 '______________',
 '_______________',
 '____________________',
 '_______________________',
 '________________________',
 '_________________________',
 '__________________________',
 '______________________________',
 '_______________________________',
 '_________________________________',
 '______________________________________',
 '_____________________________________________',
 '______________________________________________',
 '_______________________________________________',
 '_________________________________________________',
 '__________________________________________________',
 '___________________________________________________',
 '____________________________________________________',
 '_____________________________________________________',
 '______________________________________________________',
 '________________________________________________________',
 '____________

In [12]:
# Re-derive the TF-IDF matrices from the fitted vectorizer instead of reading
# X_train_trans / X_test_trans, which this cell overwrites. This keeps the cell
# idempotent: re-running it previously fitted the selector on the already
# reduced dense arrays and then failed on `.toarray()`.
X_train_tfidf = vect.transform(X_train)
X_test_tfidf = vect.transform(X_test)

# Keep the top 10% of features according to SelectPercentile's default scoring
# function, fitted on the training split only.
selector = SelectPercentile(percentile=10)
selector.fit(X_train_tfidf, y_train)

# Densify the selected features: GaussianNB (used below) does not accept
# sparse input.
X_train_trans = selector.transform(X_train_tfidf).toarray()
X_test_trans = selector.transform(X_test_tfidf).toarray()

In [13]:
# Dense training matrix after feature selection (numpy abbreviates the repr).
X_train_trans

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
# Dense test matrix after feature selection (numpy abbreviates the repr).
X_test_trans

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Apply Naive Bayes

## Gaussian NB

In [15]:
# Train a Gaussian naive Bayes classifier on the selected TF-IDF features and
# label the held-out messages. `fit` returns the estimator itself, so
# construction and training are chained.
model_Gaussian = GaussianNB().fit(X_train_trans, y_train)
y_predict = model_Gaussian.predict(X_test_trans)

In [16]:
# Share of held-out messages the Gaussian model labelled correctly.
accuracy_score(y_true=y_test, y_pred=y_predict)

0.8966666666666666

In [17]:
# Rows are true classes, columns are predicted classes; with no explicit
# `labels`, classes appear in sorted order.
confusion_matrix(y_true=y_test, y_pred=y_predict)

array([[670,  83],
       [ 10, 137]], dtype=int64)

## Bernoulli NB

In [18]:
# Train a Bernoulli naive Bayes classifier on the same features and label the
# held-out messages. `fit` returns the estimator itself, so construction and
# training are chained.
model_Bernoulli = BernoulliNB().fit(X_train_trans, y_train)
y_predict = model_Bernoulli.predict(X_test_trans)

In [19]:
# Share of held-out messages the Bernoulli model labelled correctly.
accuracy_score(y_true=y_test, y_pred=y_predict)

0.9511111111111111

In [20]:
# Rows are true classes, columns are predicted classes; with no explicit
# `labels`, classes appear in sorted order.
confusion_matrix(y_true=y_test, y_pred=y_predict)

array([[734,  19],
       [ 25, 122]], dtype=int64)

## Multinomial NB

In [21]:
# Train a multinomial naive Bayes classifier on the same features and label the
# held-out messages. `fit` returns the estimator itself, so construction and
# training are chained.
model_Multinomial = MultinomialNB().fit(X_train_trans, y_train)
y_predict = model_Multinomial.predict(X_test_trans)

In [22]:
# Share of held-out messages the multinomial model labelled correctly.
accuracy_score(y_true=y_test, y_pred=y_predict)

0.9133333333333333

In [23]:
# Rows are true classes, columns are predicted classes; with no explicit
# `labels`, classes appear in sorted order.
confusion_matrix(y_true=y_test, y_pred=y_predict)

array([[753,   0],
       [ 78,  69]], dtype=int64)

# New Email - Manual testing

In [24]:
# A one-message Series used as ad-hoc input for a manual prediction.
newemail = pd.Series(["hyperlink"])

In [25]:
# Display the message that is about to be classified.
newemail

0    hyperlink
dtype: object

In [26]:
# Push the new message through the same fitted pipeline as the training data:
# TF-IDF vectorizer -> percentile selector -> dense array.
newemail_trans = vect.transform(newemail)
# `.toarray()` matches the dense format the models were trained on, so the
# vector can be passed to any of them (GaussianNB rejects sparse input).
newemail_vect = selector.transform(newemail_trans).toarray()

In [27]:
# Classify the new message with the Bernoulli model.
model_Bernoulli.predict(newemail_vect)

array(['ham'], dtype='<U4')