# Naive Bayes (the easy way)

We'll cheat by using sklearn.naive_bayes to train a spam classifier! Most of the code is just loading our training data into a pandas DataFrame that we can play with:

In [42]:
# Import the required modules
import os
import io
import numpy
import pandas as pd
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB


def readFiles(path):
    """Yield ``(file_path, message)`` for every file found under ``path``.

    The directory tree is traversed with ``os.walk``. Each file is decoded as
    latin1 and split at the first line that is exactly ``"\\n"``: everything
    before it (and the blank line itself) is skipped, and the lines after it
    form the message. A file that contains no such blank line yields an empty
    message.

    Args:
        path: Root directory to search.

    Yields:
        Tuples of the file's full path and its extracted message text.
    """
    for root, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            # Use a separate name so the ``path`` argument is not rebound
            # while the walk over it is still in progress.
            file_path = os.path.join(root, filename)

            in_body = False
            body_lines = []
            # The context manager closes the file even if reading raises.
            with io.open(file_path, "r", encoding="latin1") as handle:
                for line in handle:
                    if in_body:
                        body_lines.append(line)
                    elif line == "\n":
                        # First blank line: everything after it is the body.
                        in_body = True

            # Each collected line still ends with its own newline, so joining
            # with "\n" leaves an extra blank line between consecutive lines.
            # This is kept as-is so the messages match what callers already
            # receive.
            yield file_path, "\n".join(body_lines)


def dataFrameFromDirectory(path, classification):
    """Build a DataFrame of the messages found under ``path``.

    Every ``(filename, message)`` pair produced by ``readFiles(path)`` becomes
    one row. The row's ``message`` column holds the message text, its
    ``class`` column holds ``classification``, and the filename is used as the
    row's index label.

    Args:
        path: Directory handed to ``readFiles``.
        classification: Label stored in the ``class`` column of every row.

    Returns:
        A DataFrame with one row per file, indexed by filename.
    """
    emails = list(readFiles(path))
    file_names = [name for name, _ in emails]
    records = [
        {"message": body, "class": classification} for _, body in emails
    ]
    return DataFrame(records, index=file_names)


# Start from an empty frame that already has the two expected columns.
data = DataFrame({"message": [], "class": []})

# Append the labelled emails one directory at a time: spam first, then ham.
for directory, label in (("emails/spam", "spam"), ("emails/ham", "ham")):
    data = pd.concat([data, dataFrameFromDirectory(directory, label)])

Let's have a look at that DataFrame:

In [43]:
data

Unnamed: 0,message,class
emails/spam/00249.5f45607c1bffe89f60ba1ec9f878039a,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",spam
emails/spam/00373.ebe8670ac56b04125c25100a36ab0510,ATTENTION: This is a MUST for ALL Computer Use...,spam
emails/spam/00214.1367039e50dc6b7adb0f2aa8aba83216,This is a multi-part message in MIME format.\n...,spam
emails/spam/00210.050ffd105bd4e006771ee63cabc59978,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,spam
emails/spam/00033.9babb58d9298daa2963d4f514193d7d6,This is the bottom line. If you can GIVE AWAY...,spam
...,...,...
emails/ham/01037.6b42b5f3d3d9e6293bf24af66b250655,"On Thu, 2002-08-29 at 00:02, Ville Skyttä wrot...",ham
emails/ham/02056.7bc7703e40a24dda665d4ce7b0cba710,"URL: http://www.newsisfree.com/click/-6,835599...",ham
emails/ham/01782.278f53b8f65fcd422cb26c5bbe74599d,use Perl Daily Newsletter\n\n\n\nIn this issue...,ham
emails/ham/00043.d2673a72d215cbdd747dc98cde41fbd2,< >\n\n> I downloaded a driver from the nVidia...,ham


Now we will use a CountVectorizer to split up each message into its list of words, and throw that into a MultinomialNB classifier. Call fit() and we've got a trained spam filter ready to go! It's just that easy.

In [44]:
# Labels ("spam" / "ham") for every email, in the same row order as the messages
targets = data["class"].values

# Raw email texts taken from the "message" column
messages = data["message"].values

# CountVectorizer learns a vocabulary from the messages and turns each one
# into a row of per-word counts
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(messages)

# Multinomial Naive Bayes classifier, trained on the word counts and their labels
classifier = MultinomialNB()
classifier.fit(counts, targets)

Let's try it out:

In [45]:
# Two hand-written emails to run through the trained filter
examples = [
    "Free Viagra now!!!",
    "Hi Bob, how about a game of golf tomorrow?",
]

# Convert the examples to word counts with the already-fitted vectorizer,
# so they are encoded with the vocabulary learned from the training data
example_counts = vectorizer.transform(examples)

# Ask the trained classifier for one label per example
predictions = classifier.predict(example_counts)

# Show the predicted labels as the cell output
predictions

array(['spam', 'ham'], dtype='<U4')

## Activity

Our data set is small, so our spam classifier isn't actually very good. Try running some different test emails through it and see if you get the results you expect.

If you really want to challenge yourself, try applying a train/test split to this spam classifier and see how well it can predict a held-out subset of the ham and spam emails.

In [46]:
# sklearn helpers for splitting the data and scoring the predictions
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the emails for testing; the fixed random_state makes the
# split reproducible.
# NOTE(review): `counts` comes from a vectorizer that was fitted on the full
# dataset, so the vocabulary already includes words from the held-out emails.
X_train, X_test, y_train, y_test = train_test_split(
    counts, targets, test_size=0.2, random_state=42
)

# Re-fit the existing classifier on the training portion only.
# NOTE(review): this replaces the model that was previously fitted on all data.
classifier.fit(X_train, y_train)

# Predicted labels for the held-out emails
y_pred = classifier.predict(X_test)

# Accuracy: share of held-out emails whose predicted label is correct
accuracy = accuracy_score(y_test, y_pred)

# Baseline: share of the more frequent label, i.e. the accuracy of always
# guessing the majority class.
# NOTE(review): this is measured over all of `targets`, not just `y_test`.
spam_total = numpy.count_nonzero(targets == "spam")
ham_total = numpy.count_nonzero(targets == "ham")
baseline = max(spam_total, ham_total) / len(targets)

# Report accuracy next to the baseline, both as percentages
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Baseline: {baseline * 100:.2f}%")

# Precision: of the emails predicted as spam, the share that really are spam
precision = precision_score(y_test, y_pred, pos_label="spam")

# Recall: of the emails that really are spam, the share the classifier caught
recall = recall_score(y_test, y_pred, pos_label="spam")

# F1-score: harmonic mean of precision and recall
f1 = f1_score(y_test, y_pred, pos_label="spam")

# Report the spam-class metrics as percentages
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-score: {f1 * 100:.2f}%")

Accuracy: 94.50%
Baseline: 83.33%
Precision: 100.00%
Recall: 72.03%
F1-score: 83.74%
