# Naive Bayes classifier for multinomial models

In [1]:
#Load pandas and sklearn libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
#Load general synthethic emails
df_general = pd.read_csv("/content/drive/MyDrive/Colab_Notebooks/Classification1/data_general.csv")

In [3]:
#Load wrongdoings emails
df_wrongdoings = pd.read_csv("/content/drive/MyDrive/Colab_Notebooks/Classification1/data_wrongdoing.csv")

In [4]:
#Set label 0 to general emails
df_general['1'] = 0

In [5]:
#Check loaded general emails
df_general.head()

Unnamed: 0,0,1
0,"Dear John,\n\nI hope this email finds you well...",0
1,"Dear CEO,\n\nI am writing to recommend candida...",0
2,"Dear John,\n\nI hope this email finds you well...",0
3,"Dear CEO,\n\nI hope this message finds you wel...",0
4,"Dear Ava Miller,\n\nI hope this email finds yo...",0


In [6]:
#Set label 1 to wrongdoings emails
df_wrongdoings['1'] = 1

In [7]:
#Check loaded wrongdoings emails
df_wrongdoings.head()

Unnamed: 0,0,1
0,"Dear Helen,\n\nI hope this email finds you in ...",1
1,"Dear Sarah,\n\nI am writing to propose the imp...",1
2,"Dear Mark,\n\nI hope this email finds you in g...",1
3,"Dear Jane Smith,\n\nAs the CEO of Acme Corpora...",1
4,"Dear CEO,\n\nI hope this email finds you well....",1


In [8]:
#Combine all emails into one dataset
data = [df_general, df_wrongdoings]

In [9]:
#Check combined data
data

[                                                     0  1
 0    Dear John,\n\nI hope this email finds you well...  0
 1    Dear CEO,\n\nI am writing to recommend candida...  0
 2    Dear John,\n\nI hope this email finds you well...  0
 3    Dear CEO,\n\nI hope this message finds you wel...  0
 4    Dear Ava Miller,\n\nI hope this email finds yo...  0
 ..                                                 ... ..
 512  Dear CEO,\n\nI hope this email finds you well....  0
 513  Dear Stephen,\n\nI hope this email finds you w...  0
 514  Dear Katherine Jenkins,\n\nI hope this email f...  0
 515  Dear CEO,\n\nI am writing to request a new cap...  0
 516  Dear John,\n\nI hope this email finds you well...  0
 
 [517 rows x 2 columns],
                                                      0  1
 0    Dear Helen,\n\nI hope this email finds you in ...  1
 1    Dear Sarah,\n\nI am writing to propose the imp...  1
 2    Dear Mark,\n\nI hope this email finds you in g...  1
 3    Dear Jane Smith,\n\nAs 

In [10]:
#Concatenate data into one dataframe
df_synthetic = pd.concat(data)

In [11]:
#Check dataframe with synthetic data
df_synthetic

Unnamed: 0,0,1
0,"Dear John,\n\nI hope this email finds you well...",0
1,"Dear CEO,\n\nI am writing to recommend candida...",0
2,"Dear John,\n\nI hope this email finds you well...",0
3,"Dear CEO,\n\nI hope this message finds you wel...",0
4,"Dear Ava Miller,\n\nI hope this email finds yo...",0
...,...,...
512,"Dear Jake,\n\n I hope this email finds you wel...",1
513,"Dear John,\n\nI hope this email finds you well...",1
514,"Dear Ms. Smith,\n\nI hope this letter finds yo...",1
515,"Dear Eleanor Jacobs,\n\nI hope this email find...",1


In [12]:
#Separate texts
emails = df_synthetic['0']

In [13]:
#Check texts
emails

0      Dear John,\n\nI hope this email finds you well...
1      Dear CEO,\n\nI am writing to recommend candida...
2      Dear John,\n\nI hope this email finds you well...
3      Dear CEO,\n\nI hope this message finds you wel...
4      Dear Ava Miller,\n\nI hope this email finds yo...
                             ...                        
512    Dear Jake,\n\n I hope this email finds you wel...
513    Dear John,\n\nI hope this email finds you well...
514    Dear Ms. Smith,\n\nI hope this letter finds yo...
515    Dear Eleanor Jacobs,\n\nI hope this email find...
516    Dear CEO,\n\nI hope this email finds you well....
Name: 0, Length: 1034, dtype: object

In [14]:
#Separate labels
labels = df_synthetic["1"]

In [15]:
#Check labels
labels

0      0
1      0
2      0
3      0
4      0
      ..
512    1
513    1
514    1
515    1
516    1
Name: 1, Length: 1034, dtype: int64

In [16]:
#Convert a collection of text documents to a matrix of token counts
cv = CountVectorizer()
features = cv.fit_transform(emails)

In [17]:
#Check vectorised features
features

<1034x5097 sparse matrix of type '<class 'numpy.int64'>'
	with 119228 stored elements in Compressed Sparse Row format>

In [18]:
#Split into test, train datasets
x_train, x_test,y_train, y_test = train_test_split(features,labels,test_size = 0.2)

In [19]:
#Check test labels
y_test

385    1
298    1
114    1
228    1
198    1
      ..
383    1
81     0
162    1
117    0
508    0
Name: 1, Length: 207, dtype: int64

In [20]:
#Load and train Naive Bayes classifier for multinomial models
mnb = MultinomialNB()
mnb.fit(x_train,y_train)

In [21]:
#Check accuracy
print("Accuracy: {}".format(mnb.score(x_test,y_test)))

Accuracy: 1.0


In [22]:
#Load Enron corpus to dataframe
df_enron = pd.read_csv("/content/drive/MyDrive/Colab_Notebooks/Classification1/emails.csv")
df_enron.head(5)

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [23]:
#Parse Enron corpus
#Credit: https://www.kaggle.com/code/lixa168/data-science-email

def get_message(Series: pd.Series):
    result = pd.Series(index=Series.index)
    for row, message in enumerate(Series):
        message_words = message.split('\n')
        del message_words[:15]
        result.iloc[row] = ''.join(message_words).strip()
    return result

def get_date(Series: pd.Series):
    result = pd.Series(index=Series.index)
    for row, message in enumerate(Series):
        message_words = message.split('\n')
        del message_words[0]
        del message_words[1:]
        result.iloc[row] = ''.join(message_words).strip()
        result.iloc[row] = result.iloc[row].replace('Date: ', '')
    print('Done parsing, converting to datetime format..')
    return pd.to_datetime(result)

def get_sender_and_receiver(Series: pd.Series):
    sender = pd.Series(index = Series.index)
    recipient1 = pd.Series(index = Series.index)
    recipient2 = pd.Series(index = Series.index)
    recipient3 = pd.Series(index = Series.index)

    for row,message in enumerate(Series):
        message_words = message.split('\n')
        sender[row] = message_words[2].replace('From: ', '')
        recipient1[row] = message_words[3].replace('To: ', '')
        recipient2[row] = message_words[10].replace('X-cc: ', '')
        recipient3[row] = message_words[11].replace('X-bcc: ', '')

    return sender, recipient1, recipient2, recipient3

def get_subject(Series: pd.Series):
    result = pd.Series(index = Series.index)

    for row, message in enumerate(Series):
        message_words = message.split('\n')
        message_words = message_words[4]
        result[row] = message_words.replace('Subject: ', '')
    return result

def get_folder(Series: pd.Series):
    result = pd.Series(index = Series.index)

    for row, message in enumerate(Series):
        message_words = message.split('\n')
        message_words = message_words[12]
        result[row] = message_words.replace('X-Folder: ', '')
    return result

In [24]:
#Parse Enron corpus into dataframe
#Credit: https://www.kaggle.com/code/lixa168/data-science-email

df_enron['text'] = get_message(df_enron.message)
df_enron['sender'], df_enron['recipient1'], df_enron['recipient2'], df_enron['recipient3'] = get_sender_and_receiver(df_enron.message)
df_enron['Subject'] = get_subject(df_enron.message)
df_enron['folder'] = get_folder(df_enron.message)
df_enron['date'] = get_date(df_enron.message)

Done parsing, converting to datetime format..


  return pd.to_datetime(result)


In [25]:
#Check parsed Enron data
df_enron.head()

Unnamed: 0,file,message,text,sender,recipient1,recipient2,recipient3,Subject,folder,date
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,Here is our forecast,phillip.allen@enron.com,tim.belden@enron.com,,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",2001-05-14 16:39:00-07:00
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,Traveling to have a business meeting takes the...,phillip.allen@enron.com,john.lavorato@enron.com,,,Re:,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",2001-05-04 13:51:00-07:00
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,test successful. way to go!!!,phillip.allen@enron.com,leah.arsdall@enron.com,,,Re: test,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,2000-10-18 03:00:00-07:00
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,"Randy, Can you send me a schedule of the salar...",phillip.allen@enron.com,randall.gay@enron.com,,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,2000-10-23 06:13:00-07:00
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,Let's shoot for Tuesday at 11:45.,phillip.allen@enron.com,greg.piper@enron.com,,,Re: Hello,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,2000-08-31 05:07:00-07:00


In [26]:
#Check whether the initial email is identified
df_enron[df_enron['text'].str.contains("a potential sale of Mariner")]

Unnamed: 0,file,message,text,sender,recipient1,recipient2,recipient3,Subject,folder,date
88788,delainey-d/_sent_mail/55.,Message-ID: <15880209.1075854480433.JavaMail.e...,X-Origin: Delainey-DX-FileName: ddelain.nsfKen...,david.delainey@enron.com,kenneth.lay@enron.com,X-From: David W Delainey,X-To: Kenneth Lay,Mariner,"X-cc: Jeff Donahue, Raymond Bowen",2000-11-28 12:34:00-08:00
89954,delainey-d/all_documents/82.,Message-ID: <29039550.1075854430593.JavaMail.e...,X-Origin: Delainey-DX-FileName: ddelain.nsfKen...,david.delainey@enron.com,kenneth.lay@enron.com,X-From: David W Delainey,X-To: Kenneth Lay,Mariner,"X-cc: Jeff Donahue, Raymond Bowen",2000-11-28 12:34:00-08:00
90738,delainey-d/discussion_threads/561.,Message-ID: <4546772.1075854459062.JavaMail.ev...,X-Origin: Delainey-DX-FileName: ddelain.nsfKen...,david.delainey@enron.com,kenneth.lay@enron.com,X-From: David W Delainey,X-To: Kenneth Lay,Mariner,"X-cc: Jeff Donahue, Raymond Bowen",2000-11-28 12:34:00-08:00
91474,delainey-d/sent/55.,Message-ID: <749272.1075854463849.JavaMail.eva...,X-Origin: Delainey-DX-FileName: ddelain.nsfKen...,david.delainey@enron.com,kenneth.lay@enron.com,X-From: David W Delainey,X-To: Kenneth Lay,Mariner,"X-cc: Jeff Donahue, Raymond Bowen",2000-11-28 12:34:00-08:00
269113,lay-k/all_documents/919.,Message-ID: <2288971.1075840226228.JavaMail.ev...,"X-Origin: LAY-KX-FileName: klay.nsfKen, in res...",david.delainey@enron.com,kenneth.lay@enron.com,X-From: David W Delainey,X-To: Kenneth Lay,Mariner,"X-cc: Jeff Donahue, Raymond Bowen",2000-11-28 12:34:00-08:00
271085,lay-k/discussion_threads/779.,Message-ID: <13008745.1075840252008.JavaMail.e...,"X-Origin: LAY-KX-FileName: klay.nsfKen, in res...",david.delainey@enron.com,kenneth.lay@enron.com,X-From: David W Delainey,X-To: Kenneth Lay,Mariner,"X-cc: Jeff Donahue, Raymond Bowen",2000-11-28 12:34:00-08:00
273315,lay-k/notes_inbox/688.,Message-ID: <25440421.1075840275667.JavaMail.e...,"X-Origin: LAY-KX-FileName: klay.nsfKen, in res...",david.delainey@enron.com,kenneth.lay@enron.com,X-From: David W Delainey,X-To: Kenneth Lay,Mariner,"X-cc: Jeff Donahue, Raymond Bowen",2000-11-28 12:34:00-08:00


In [27]:
#Check that all copies of initial emails have been identified
(df_enron[df_enron['text'].str.contains("a potential sale of Mariner")]).shape

(7, 10)

In [28]:
#Vectorize Enron emails
features_predict = cv.transform(df_enron['text'])

In [29]:
#Predict classes for Enron dataset
y_pred = mnb.predict(features_predict)

In [30]:
#Check dimensions
print(df_enron.shape)
print(len(y_pred))

(517401, 10)
517401


In [31]:
#Check type of predicted data
type(y_pred)

numpy.ndarray

In [32]:
#Check how many suspicious emails identified
import numpy as np

found = np.where(y_pred == 1)[0]

In [33]:
#Check how many suspicious emails identified
len(found)

41144

In [34]:
#Load suspicious emails into dataframe
wrongdoings_found = df_enron.iloc[found]

In [35]:
#Check columns
wrongdoings_found.columns

Index(['file', 'message', 'text', 'sender', 'recipient1', 'recipient2',
       'recipient3', 'Subject', 'folder', 'date'],
      dtype='object')

In [36]:
#Check whether the initial email is identified
wrongdoings_found[wrongdoings_found['text'].str.contains("a potential sale of Mariner")]

Unnamed: 0,file,message,text,sender,recipient1,recipient2,recipient3,Subject,folder,date
88788,delainey-d/_sent_mail/55.,Message-ID: <15880209.1075854480433.JavaMail.e...,X-Origin: Delainey-DX-FileName: ddelain.nsfKen...,david.delainey@enron.com,kenneth.lay@enron.com,X-From: David W Delainey,X-To: Kenneth Lay,Mariner,"X-cc: Jeff Donahue, Raymond Bowen",2000-11-28 12:34:00-08:00
89954,delainey-d/all_documents/82.,Message-ID: <29039550.1075854430593.JavaMail.e...,X-Origin: Delainey-DX-FileName: ddelain.nsfKen...,david.delainey@enron.com,kenneth.lay@enron.com,X-From: David W Delainey,X-To: Kenneth Lay,Mariner,"X-cc: Jeff Donahue, Raymond Bowen",2000-11-28 12:34:00-08:00
90738,delainey-d/discussion_threads/561.,Message-ID: <4546772.1075854459062.JavaMail.ev...,X-Origin: Delainey-DX-FileName: ddelain.nsfKen...,david.delainey@enron.com,kenneth.lay@enron.com,X-From: David W Delainey,X-To: Kenneth Lay,Mariner,"X-cc: Jeff Donahue, Raymond Bowen",2000-11-28 12:34:00-08:00
91474,delainey-d/sent/55.,Message-ID: <749272.1075854463849.JavaMail.eva...,X-Origin: Delainey-DX-FileName: ddelain.nsfKen...,david.delainey@enron.com,kenneth.lay@enron.com,X-From: David W Delainey,X-To: Kenneth Lay,Mariner,"X-cc: Jeff Donahue, Raymond Bowen",2000-11-28 12:34:00-08:00
269113,lay-k/all_documents/919.,Message-ID: <2288971.1075840226228.JavaMail.ev...,"X-Origin: LAY-KX-FileName: klay.nsfKen, in res...",david.delainey@enron.com,kenneth.lay@enron.com,X-From: David W Delainey,X-To: Kenneth Lay,Mariner,"X-cc: Jeff Donahue, Raymond Bowen",2000-11-28 12:34:00-08:00
271085,lay-k/discussion_threads/779.,Message-ID: <13008745.1075840252008.JavaMail.e...,"X-Origin: LAY-KX-FileName: klay.nsfKen, in res...",david.delainey@enron.com,kenneth.lay@enron.com,X-From: David W Delainey,X-To: Kenneth Lay,Mariner,"X-cc: Jeff Donahue, Raymond Bowen",2000-11-28 12:34:00-08:00
273315,lay-k/notes_inbox/688.,Message-ID: <25440421.1075840275667.JavaMail.e...,"X-Origin: LAY-KX-FileName: klay.nsfKen, in res...",david.delainey@enron.com,kenneth.lay@enron.com,X-From: David W Delainey,X-To: Kenneth Lay,Mariner,"X-cc: Jeff Donahue, Raymond Bowen",2000-11-28 12:34:00-08:00


In [37]:
#Check that all copies of initial emails have been identified
(wrongdoings_found[wrongdoings_found['text'].str.contains("a potential sale of Mariner")]).shape

(7, 10)