# Naive Bayes Classifiers

In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

## Naive Bayes
### Using Naive Bayes to predict spam

In [3]:
# Load the labelled messages. The file contains bytes that are not valid UTF-8,
# so it is decoded as Latin-1 instead of pandas' default encoding.
data = pd.read_csv("spam.csv",encoding='latin-1')

In [4]:
# (rows, columns) of the loaded frame.
data.shape

(5572, 2)

In [5]:
# Preview the first five rows: a label column (`type`) and the message text (`email`).
data.head()

Unnamed: 0,type,email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Tally how many messages carry each label to see the ham/spam balance.
from collections import Counter

Counter(data["type"])

Counter({'ham': 4825, 'spam': 747})

In [7]:
# Features are the raw message text; the target is the ham/spam label.
X, y = data["email"], data["type"]

In [8]:
# Sanity-check that X holds the raw message text before it is vectorized.
X.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: email, dtype: object

## Vectorization: Transforming Text to Vectors

In [9]:
# Fit a TF-IDF vectorizer on the message text and encode each message as one row
# of term weights; X is rebound from the raw text Series to that matrix.
# NOTE(review): the vectorizer is fitted on all messages before the train/test
# split further down, so the vocabulary and IDF statistics also reflect the test
# rows -- consider fitting on the training split only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on releases that predate it.
# .tolist() keeps feature_names a plain Python list, as the old method returned.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

In [10]:
# Vocabulary size, i.e. the number of columns in the vectorized matrix.
len(feature_names)

8672

In [11]:
# Peek at ten consecutive vocabulary entries (indices 2000-2009) as a spot check.
feature_names[2000:2010]

['chez',
 'chg',
 'chgs',
 'chic',
 'chick',
 'chicken',
 'chickened',
 'chief',
 'chik',
 'chikku']

In [12]:
# Convert the vectorizer output to a dense NumPy array, rebinding X.
# NOTE(review): at the shape reported in the next cell (5572 x 8672) a dense
# array is roughly 390 MB if the dtype is float64, and re-running this cell on
# its own fails because an ndarray has no .toarray(). Presumably the classifier
# could consume the sparse matrix directly -- confirm before changing.
X = X.toarray()

In [13]:
# (messages, vocabulary terms) of the dense feature matrix.
X.shape

(5572, 8672)

In [14]:
# Display the dense feature matrix; NumPy abbreviates the repr for large arrays.
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [15]:
# Number of labels; should match the number of rows in X.
y.shape

(5572,)

In [16]:
# Hold out 30% of the messages for evaluation; the fixed random_state makes the
# split reproducible across runs.
# NOTE(review): the split is not stratified even though the label counts above
# are imbalanced -- consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

In [17]:
# Fit a Bernoulli Naive Bayes classifier on the training split and predict labels
# for the held-out messages. BernoulliNB is already imported in the notebook's
# import cell, so it is not re-imported here.
# With its default binarize=0.0 threshold, BernoulliNB treats every TF-IDF weight
# as "term present" / "term absent" rather than using the weight's magnitude.
model = BernoulliNB()
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

In [18]:
# Report overall accuracy followed by per-class precision/recall/F1 on the
# held-out split.
print(
    accuracy_score(y_test, y_predict),
    classification_report(y_test, y_predict),
    sep="\n",
)
# Confusion table: rows are the actual labels, columns the predicted labels.
pd.crosstab(y_test, y_predict)

0.9796650717703349
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1446
        spam       1.00      0.85      0.92       226

    accuracy                           0.98      1672
   macro avg       0.99      0.92      0.95      1672
weighted avg       0.98      0.98      0.98      1672



col_0,ham,spam
type,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,1446,0
spam,34,192


## Checking new email for spam

In [19]:
# A single unseen message to classify. For a ham-like counter-example, swap the
# text for: "Hi team, We have meeting tomorrow"
NewEmail = pd.Series(
    [
        "**FREE MESSAGE**Thanks for using the Auction Subscription Service. "
        "18 . 150p/MSGRCVD 2 Skip an Auction txt OUT. "
        "2 Unsubscribe txt STOP CustomerCare 08718726270"
    ]
)
NewEmail


0    **FREE MESSAGE**Thanks for using the Auction S...
dtype: object

In [20]:
# Encode the new message with the already-fitted vectorizer. transform() is used
# (not fit_transform()) so the columns line up with the vocabulary the model saw.
NewEmail_transformed = vectorizer.transform(NewEmail)

In [21]:
# Classify the new message; the result holds one predicted label per input row.
model.predict(NewEmail_transformed)

array(['spam'], dtype='<U4')

In [47]:
# NOTE(review): leftover inspection cell -- its execution count is out of sequence
# with the cells above, and it only echoes the held-out feature matrix.
# Consider removing it or showing X_test.shape instead.
X_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])