In [1]:
import os
import pandas as pd

import preprocessing as util

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

[nltk_data] Downloading package punkt to /home/ichanis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ichanis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Input file names, in the order they are loaded below
data_files = ['dataset_1.csv', 'dataset_2.csv']

# Directory holding the CSV files, resolved against the current working directory
cwd = os.getcwd()
csv_path = os.path.join(cwd, 'data/csv/')

In [3]:
# Load the first dataset; the first CSV column is used as the row index.
dataset_1 = pd.read_csv(
    os.path.join(csv_path, data_files[0]),
    index_col=0,
    encoding='latin-1',
    dtype={'Body': 'object', 'Class': 'bool'},
)
dataset_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3440 entries, 0 to 3439
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Body    3440 non-null   object
 1   Class   3440 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 57.1+ KB


In [4]:
# Load the second dataset with the same options as the first one.
dataset_2 = pd.read_csv(
    os.path.join(csv_path, data_files[1]),
    index_col=0,
    encoding='latin-1',
    dtype={'Body': 'object', 'Class': 'bool'},
)
dataset_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18920 entries, 0 to 18919
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Body    18920 non-null  object
 1   Class   18920 non-null  bool  
dtypes: bool(1), object(1)
memory usage: 314.1+ KB


## Preprocessing

We need to convert the text data into a format more suitable for use with machine learning algorithms.<br>
The process will follow the steps below:

### 1. Cleanup HTML and whitespace

A percentage of the emails extracted are either in HTML format or they have the same message in both plaintext and HTML, as part of a `multipart/alternative` content-type message.<br>
In order to extract the text, the HTML formatting has to be removed, along with unnecessary whitespace and any duplicated text created by the aforementioned multipart emails.

In [5]:
# Clean each body with strip_characters, then pass the result through deduplicate_text.
dataset_1['Body'] = (
    dataset_1['Body']
    .apply(util.strip_characters)
    .apply(util.deduplicate_text)
)

In [6]:
# Same cleanup as for dataset_1: strip_characters first, deduplicate_text second.
dataset_2['Body'] = (
    dataset_2['Body']
    .apply(util.strip_characters)
    .apply(util.deduplicate_text)
)

### 2. Replacing addresses

A lot of the emails contain either **web addresses** (URLs) or **email addresses** that need to be removed in order for the frequency of certain domains to not influence the results.<br>
So that this information is not lost completely, however, those addresses will be replaced by the strings `<urladdress>` and `<emailaddress>` respectively. Those strings are chosen because they do not normally occur in the emails.

In [7]:
# Replace email addresses first, then URLs (order kept from the original cell).
dataset_1['Body'] = (
    dataset_1['Body']
    .apply(util.replace_email)
    .apply(util.replace_url)
)

In [8]:
# Replace email addresses first, then URLs (order kept from the original cell).
dataset_2['Body'] = (
    dataset_2['Body']
    .apply(util.replace_email)
    .apply(util.replace_url)
)

### 3. Tokenization and stopword removal

Tokenization is the process of splitting text into individual words. This is useful because generally speaking, the meaning of the text can easily be interpreted by analyzing the words present in the text.<br>
Along with this process, letters are also converted to lowercase and punctuation or other special characters are removed.<br>
Since there are some words (called **stopwords**) that do not contribute very much in meaning (like pronouns or simple verbs), they can be removed to reduce the noise.

In [9]:
# Tokenize every email body; each Body entry becomes a list of tokens.
dataset_1['Body'] = (
    dataset_1['Body']
    .apply(util.tokenize)
)
# NOTE: stopword removal is currently disabled, so stopwords
# (e.g. 'your', 'the') remain in the token lists shown below.
# dataset_1['Body'] = dataset_1['Body'].apply(util.remove_stopwords)

In [10]:
# Inspect the tokens produced for the first row.
dataset_1.loc[0, 'Body']

['dear',
 'user',
 'your',
 'mail-box',
 'has',
 'exceeded',
 'the',
 'storage',
 'limit',
 'you',
 'can',
 'not',
 'send',
 'or',
 'receive',
 'new',
 'messages',
 'until',
 'you',
 're-validate',
 'your',
 'mail',
 'to',
 're-validate',
 'the',
 'mailbox',
 'click',
 'here',
 'to',
 're-validate',
 'thank',
 'you',
 'mail',
 'server']

In [11]:
# Inspect the tokens produced for row 3436.
dataset_1.loc[3436, 'Body']

['dear',
 'usaa',
 'member',
 'this',
 'message',
 'is',
 'to',
 'notify',
 'you',
 'that',
 'you',
 'have',
 'an',
 'incoming',
 'e-payment',
 'transfer',
 'which',
 'is',
 'awaiting',
 'your',
 'approval',
 'for',
 'security',
 'reasons',
 'a',
 'quick',
 'verification',
 'is',
 'required',
 'as',
 'a',
 'means',
 'to',
 'accept',
 'the',
 'incoming',
 'payment',
 'this',
 'is',
 'to',
 'verify',
 'the',
 'payment',
 'between',
 'the',
 'sender',
 'and',
 'the',
 'receiver',
 'click',
 'here',
 'to',
 'approve',
 'your',
 'payment',
 'will',
 'be',
 'posted',
 'into',
 'your',
 'account',
 'within',
 'the',
 'next',
 '48hours',
 'after',
 'verification',
 'thank',
 'you',
 'usaa']

In [12]:
# Row 394: a body that reduces to a single URL placeholder token.
dataset_1.loc[394, 'Body']

['urladdress']

In [13]:
# Last row (index 3439): contains the email-address placeholder token.
dataset_1.loc[3439, 'Body']

['dear',
 'emailaddress',
 '1969mb',
 '2000mb',
 'your',
 'e-mail',
 'account',
 'is',
 'running',
 'on',
 'a',
 'low',
 'storage',
 'space',
 'verify',
 'your',
 'account',
 'now',
 'to',
 'increase',
 'storage',
 'space',
 'other',
 'wise',
 'your',
 'account',
 'shall',
 'be',
 'locked',
 'out',
 'click',
 'here',
 'to',
 'verify',
 'your',
 'account',
 'notice',
 'failure',
 'to',
 'verify',
 'your',
 'e-mail',
 'account',
 'shall',
 'result',
 'to',
 'account',
 'lock',
 'out',
 'thanks',
 'account',
 'service']

In [14]:
# Preview the tokenized frame; the rendered output elides the middle rows.
dataset_1

Unnamed: 0,Body,Class
0,"[dear, user, your, mail-box, has, exceeded, th...",True
1,"[please, forward, any, book, requests, to, me,...",False
2,"[i, will, not, give, out, my, i, have, not, be...",False
3,"[start, date, hourahead, hour, 6, no, ancillar...",False
4,"[confirm, your, account, dear, member, please,...",True
...,...,...
3435,"[to, ensure, delivery, to, your, inbox, please...",True
3436,"[dear, usaa, member, this, message, is, to, no...",True
3437,"[just, to, keep, you, in, the, loop, i, have, ...",False
3438,"[suspicious, activity, has, been, detected, on...",True
