## Naive Bayes for Spam Classification - Scikit Learn

scikit-learn ships with ready-made Naive Bayes classifiers. This notebook uses `MultinomialNB` to classify e-mails as spam or not spam.



In [50]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer # Will help us to generate our vocabulary
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB # Naive Bayes Classifier (why not try random forest?)

from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

# Location of the pre-processed e-mail corpus (JSON) that the next cell loads.
# NOTE(review): absolute path — looks like a Colab Google Drive mount; adjust when running elsewhere.
DATA_JSON_FILE = '/content/drive/My Drive/SpamData/01_Processing/email-text-data.json'

In [51]:
# Load the e-mail corpus from its JSON file into a pandas DataFrame.
data = pd.read_json(DATA_JSON_FILE) # JSON is a common interchange format on the web and is easy to read back

# Report (rows, columns), then preview the last five records.
print('Data Frame Shape:', data.shape)
data.tail()

Data Frame Shape: (5797, 3)


Unnamed: 0,Message,Category,File_Name
5792,"On Fri, Jul 19, 2002 at 03:35:36PM +0100 or so...",0,00109.1d90dc88e6b0591ee91e3cd605ec778a
5793,"| Date: Wed, 31 Jul 2002 15:13:23 +0100\n\n ...",0,00190.598d2a83744a3a7ac536e36ca56d7e65
5794,"gcc, glibc and binutils, which the lfs site sa...",0,00123.3921de802520cfe7a5b3e0777aa4affc
5795,--==_Exmh_-2079003886P\n\nContent-Type: text/p...,0,00008.6b73027e1e56131377941ff1db17ff12
5796,"On Sat, Jul 27, 2002 at 03:06:15PM +0100, Step...",0,00125.e6d80b873b71ae5324679a4dbefe4eaf


In [52]:
# Order the rows by their index label.
# Reassigning the sorted copy (instead of inplace=True) keeps the step chainable
# and avoids mutating the frame object that the previous cell displayed.
data = data.sort_index()

In [53]:
# Generating our vocabulary

# CountVectorizer tokenizes the text and builds the vocabulary when it is fitted (next cell);
# stop_words='english' drops common English stop words using scikit-learn's built-in list.
vectorizer = CountVectorizer(stop_words='english') 

In [54]:
# Learn the vocabulary from every message and encode each message as a row of token counts
# (fit_transform fits the vocabulary and builds the document-term matrix in one step).
# NOTE(review): the vocabulary is fitted on the full corpus before the train/test split further down,
# so tokens that occur only in test e-mails still shape the feature space.
all_features = vectorizer.fit_transform(data.Message)

all_features.shape
# 5797 >> Rows (one per e-mail)
# 102694 >> Columns (one per distinct token in the vocabulary)

(5797, 102694)

In [55]:
# Vocabulary learned by scikit-learn: a dict mapping each token to its column index in the feature matrix.
# NOTE(review): this displays the entire mapping; consider previewing a small slice to keep the output short.
vectorizer.vocabulary_

{'html': 48472,
 'head': 47011,
 'title': 87725,
 'digital': 34034,
 'publishing': 74032,
 'tools': 88162,
 'free': 42773,
 'software': 83094,
 'alert': 18591,
 'meta': 61701,
 'http': 48497,
 'equiv': 38991,
 'content': 30249,
 'type': 89371,
 'text': 86991,
 'charset': 27796,
 'iso': 53094,
 '8859': 12828,
 'body': 24390,
 'bgcolor': 23297,
 'ffffff': 41323,
 '000000': 4,
 'center': 27405,
 'table': 86120,
 'width': 95488,
 '582': 9382,
 'border': 24581,
 'cellspacing': 27383,
 'cellpadding': 27375,
 'bordercolor': 24584,
 '0077cc': 261,
 'tr': 88442,
 'td': 86548,
 'colspan': 29390,
 'align': 18634,
 '8749mtdf5': 12778,
 '466pbl14': 8223,
 'href': 48390,
 '3dpageturningebook': 7150,
 'com': 29405,
 'style': 84723,
 'decoration': 32851,
 'font': 42257,
 'face': 40498,
 'verdana': 92415,
 'arial': 20116,
 'helvetica': 47220,
 'sans': 80256,
 'serif': 81304,
 'size': 82347,
 'color': 29367,
 'publish': 74027,
 'like': 58501,
 'professional': 73364,
 '204': 3709,
 'valign': 92046,
 '000

In [65]:
# Shuffle the e-mails and split them into a training set (70%) and a test set (30%).
# A fixed random_state seeds the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    all_features,     # features: the document-term matrix
    data.Category,    # labels
    test_size=0.3,
    random_state=88,
)

In [57]:
# Show the dimensions of both feature matrices produced by the split.
for caption, matrix in (('X train shape:', X_train), ('X test shape:', X_test)):
    print(caption, matrix.shape)

X train shape: (4057, 102694)
X test shape: (1740, 102694)


In [58]:
# Create a multinomial Naive Bayes classifier with default settings.
classifier = MultinomialNB()

# Train it on the training split; fit() is the cell's last expression, so the notebook echoes the fitted estimator.
classifier.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [59]:
# Count how many held-out e-mails the model labels correctly and incorrectly.

predicted_labels = classifier.predict(X_test)
is_correct = (y_test == predicted_labels)  # boolean mask, True where prediction matches the label

nr_correct = is_correct.sum()
nr_incorrect = (~is_correct).sum()

print(f'{nr_correct} documents classified correctly')
print(f'{nr_incorrect} documents classified incorrectly')

1650 documents classified correctly
90 documents classified incorrectly


In [60]:
# Accuracy = correctly classified documents / all classified documents.
nr_total = nr_correct + nr_incorrect
accuracy = nr_correct / nr_total
print(f'Model Accuracy is: {accuracy:.2%}')

Model Accuracy is: 94.83%


In [61]:
# The classifier's built-in score() method gives the same accuracy on the test split
classifier.score(X_test, y_test)

0.9482758620689655

In [88]:
# Predict once and reuse the result for every metric below
# (the model was previously re-run on X_test for each score).
test_predictions = classifier.predict(X_test)

# Recall: share of the actual positive-class e-mails that the model caught
recall = recall_score(y_test, test_predictions)
print('Recall Score: {:.2%}'.format(recall))

# Precision: share of the e-mails flagged as positive that really are positive
precision = precision_score(y_test, test_predictions)
print('Precision Score: {:.2%}'.format(precision))

# F1 Score: harmonic mean of precision and recall.
# Using the already-imported f1_score avoids the division by zero that the
# hand-written 2*p*r/(p+r) formula hits when precision + recall == 0.
f_score = f1_score(y_test, test_predictions)
print('F1 Score: {:.2%}'.format(f_score))

Recall Score: 85.46%
Precision Score: 98.14%
F1 Score: 91.36%


In [89]:
# A handful of hand-written messages, not taken from the dataset, used to sanity-check the trained model.
# NOTE(review): the strings are model input and are left exactly as written (including spelling).

example = ['Get viagra for free now',
           'Need a mortgage?  Reply to arrange a call with a specialist and get a quote',
           'Could you please help me with the project tommorrow?',
           'Hey man, How about a game of golf tomorrow? Please let me know',
           'Ski jumping is a winter sport in which competitors aim to achieve the longest jump after descending from a specially designed ramp on their skis. Along with jump length, competitor\'s style and other factors affect the final score. Ski jumping was first contested in Norway in the late 19th century, and later spread through Europe and North America in the early 20th century. Along with cross-country skiing, it constitutes the traditional group of Nordic skiing disciplines.']

In [90]:
# First, encode the new messages with the already-fitted vectorizer.
# transform() (not fit_transform) reuses the existing vocabulary, so the columns match the training features.
doc_term_matrix = vectorizer.transform(example)

In [91]:
# Predict a label for each example message; the output order matches the order of `example`.
# NOTE(review): presumably 1 = spam and 0 = not spam, following the Category column — confirm.
classifier.predict(doc_term_matrix)

array([1, 1, 0, 0, 0])