Firstly, we import the data.

In [1]:
import pandas as pd

# Read the training data from the CSV file and preview the first rows.
df = pd.read_csv('train_emails.csv')
df.head()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


Check the shape of the data

In [2]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(5172, 3002)

Then we clean the data. Remove the `Email No.` column.

In [3]:
# Name the column through `columns=`: passing `axis` positionally to
# DataFrame.drop was deprecated in pandas 1.3 and raises TypeError in pandas 2.0.
df = df.drop(columns='Email No.')
df.head()

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,0,0,1,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8,13,24,6,6,2,102,1,27,18,...,0,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,8,0,0,4,...,0,0,0,0,0,0,0,0,0,0
3,0,5,22,0,5,1,51,2,10,1,...,0,0,0,0,0,0,0,0,0,0
4,7,6,17,1,5,2,57,0,9,3,...,0,0,0,0,0,0,0,1,0,0


Check if there's any NaN.

In [4]:
# True if at least one cell in the frame is missing, otherwise False.
df.isnull().values.any()

False

We can see that the data does not have NaNs.

Now that we have a clean dataset, we can use `Multinomial Naive Bayes` to classify the emails.

In [5]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    df.drop(columns='Prediction'),  # features: every column except the label
    df['Prediction'],               # label column
    test_size=0.2,
    random_state=42,
)

In [6]:
from sklearn.naive_bayes import MultinomialNB
# alpha is the additive smoothing parameter of MultinomialNB.
# NOTE(review): the value 1.9 is not explained in this notebook — presumably tuned; confirm.
model = MultinomialNB(alpha=1.9)
model.fit(train_x, train_y)

MultinomialNB(alpha=1.9)

Now we use `score` method to determine the mean accuracy of the model:

In [7]:
# Mean accuracy of the fitted model on the held-out test split.
model.score(test_x, test_y)

0.9536231884057971

Accuracy alone can be misleading when the classes are imbalanced, so we also use the `Area Under the Receiver Operating Characteristic Curve` ("ROC AUC"), which measures how well the predicted probabilities separate the two classes.

In [8]:
from sklearn.metrics import roc_auc_score
# predict_proba returns one column per class, ordered as in model.classes_.
probabilities = model.predict_proba(test_x)
# Column 1 is the probability of the second class (presumably label 1 — confirm against model.classes_).
roc_auc_score(test_y, probabilities[:, 1])

0.9789137073474016

Moreover, we can use `confusion matrix` to see how many emails are classified correctly or incorrectly.

In [9]:
from sklearn.metrics import confusion_matrix
# Hard class predictions for the held-out test split.
predicted = model.predict(test_x)
# Rows are the true labels, columns are the predicted labels.
confusion_matrix(test_y, predicted)

array([[703,  36],
       [ 12, 284]])

As we can see, the upper-left and lower-right cells (correct classifications) hold much larger numbers than the upper-right and lower-left cells (misclassifications). Therefore, this model is accurate.

Then we can also check the precision and recall scores. Note that the two cells below compute them on the training set.

In [10]:
from sklearn.metrics import precision_score

# Predictions on the training split; the recall cell reuses this variable.
# NOTE(review): these are training-set metrics, unlike the test-set scores in the cells before — confirm this is intended.
train_predictions = model.predict(train_x)
precision_score(train_y, train_predictions)

0.8701996927803379

In [11]:
from sklearn.metrics import recall_score

# Recall on the training split; relies on `train_predictions` computed in the precision cell.
recall_score(train_y, train_predictions)

0.9410299003322259



The following code is used to predict emails from another dataset.

In [12]:
import re

# Split the email and count the frequency
def get_word_frequency(content, vocabulary=None):
    """Count how often each vocabulary word occurs in an email.

    Parameters
    ----------
    content : str
        Raw email text. It is lower-cased and split into runs of ASCII
        letters; digits and punctuation act as separators.
    vocabulary : iterable of str, optional
        Words to count, in output order. Defaults to every column of the
        global training frame ``df`` except the last one (the label column).

    Returns
    -------
    dict
        Maps each vocabulary word to its number of occurrences in
        ``content`` (0 when absent), in vocabulary order. Keying by word
        gives a DataFrame built from these dicts the same column names as
        the training features.
    """
    # Local import keeps this cell runnable on its own.
    from collections import Counter

    if vocabulary is None:
        words = list(df.columns)[:-1]
    else:
        words = list(vocabulary)

    # Tokenise once and tally once, instead of rescanning the token list
    # for every vocabulary word.
    tokens = re.findall(r'[a-zA-Z]+', content.lower())
    counts = Counter(tokens)

    # Counter returns 0 for words that never occurred.
    return {word: counts[word] for word in words}

In [13]:
# Predict a single email
def predict_spam(content):
    """Return the fitted model's predicted label for one email's text."""
    # One-row frame holding the word counts of this email.
    features = pd.DataFrame([get_word_frequency(content)])
    prediction = model.predict(features)
    return prediction[0]

In [15]:
# Predict emails from another dataset
test_df = pd.read_csv("test_emails.csv")
test_df = test_df[['text', 'label_num']]

test_input = []
for i in range(test_df.shape[0]):
    test_input.append(get_word_frequency(test_df.iloc[i][0]))
test_input = pd.DataFrame(test_input)
test_result = test_df['label_num']

test_input.head()

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,enhancements,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry
0,1,2,0,0,1,0,1,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,25,16,0,17,6,11,11,8,0,5,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,3,0,0,0,0,2,1,0,0,...,0,0,0,0,0,0,0,0,0,0


Then check the accuracy of the model.

In [16]:
# Mean accuracy of the already-fitted model on the features built from the second dataset.
model.score(test_input, test_result)

0.9400502804099787

In [26]:
# Class probabilities for the second dataset; this overwrites the earlier `probabilities`.
probabilities = model.predict_proba(test_input)
# Column 1 is the probability of the second class (presumably label 1 — confirm against model.classes_).
roc_auc_score(test_result, probabilities[:, 1])

0.9929796153136223

Above scores show that the model has a good accuracy on this dataset.

Then we check the confusion matrix to see how the predictions are distributed (whether spam and ham emails are classified correctly or not).

In [22]:
# Hard class predictions for the second dataset; this overwrites the earlier `predicted`.
predicted = model.predict(test_input)
# Rows are the true labels, columns are the predicted labels.
confusion_matrix(test_result, predicted)

array([[3628,   44],
       [ 266, 1233]])

In [19]:
from sklearn.metrics import precision_score

# Compute the predictions in this cell so the score does not depend on a
# `predicted` left over from another cell. The original stored them in an
# unused, misnamed `train_predictions`.
predicted = model.predict(test_input)
precision_score(test_result, predicted)

0.9655442443226312

In [20]:
# Recall on the second dataset; relies on `predicted` computed in an earlier cell.
recall_score(test_result, predicted)

0.8225483655770514

Since the upper-left and lower-right cells hold much larger numbers than the upper-right and lower-left cells, we can say that this model can be used to classify emails.