In [1]:
import raw_utils as util
import os

import pandas as pd
import numpy as np

import random
# Seed NumPy's global RNG once, up front. The `.sample()` calls later in the
# notebook pass no `random_state`, so this seed is what makes them repeatable.
RANDOM_SEED = 1746
np.random.seed(RANDOM_SEED)

In [2]:
# Directory layout: every CSV is read from / written to <working dir>/data/csv/
cwd = os.getcwd()
csv_path = os.path.join(cwd, 'data/csv/')

# Input files: the phishing corpus, then the legitimate corpora in increasing size
nazario_csv = 'nazario_recent.csv'
enron_csv = [
    'enron_text_2000.csv',
    'enron_text_20000.csv',
    'enron_text_100000.csv',
]

## Phishing

First, read the csv with the recent emails.

In [3]:
# Load the phishing emails: the first CSV column becomes the row index and
# the Body column is read with object dtype.
nazario_file = os.path.join(csv_path, nazario_csv)
phishing_text_raw = pd.read_csv(
    nazario_file,
    index_col=0,
    encoding='latin-1',
    dtype={'Body': 'object'},
)

In [4]:
# Summarise the raw phishing frame: column dtypes and non-null counts
phishing_text_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1916 entries, 0 to 1915
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Body    1875 non-null   object
dtypes: object(1)
memory usage: 29.9+ KB


### Cleanup

#### Remove Uninformative Rows

We see that there are some rows with no body text. These were emails that contained only attachments, so it is safe to drop them.

In [5]:
# Discard every row that has a missing value (pandas' default axis/how, spelled out)
phishing_text = phishing_text_raw.dropna(axis=0, how='any')

Next, we can see that the mbox files begin with some computer-generated messages, which we also need to remove.

In [6]:
# Placeholder text found in the computer-generated messages at the start of the mbox files.
mbox_placeholder = "This text is part of the internal format of your mail folder, and is not\na real message."

# Match the sentence literally: with the default regex=True, the '.' characters
# in the text would act as wildcards rather than as full stops.
is_placeholder = phishing_text["Body"].str.contains(mbox_placeholder, regex=False)

# Keep only the rows that do not contain the placeholder text.
phishing_text = phishing_text[~is_placeholder]

In [7]:
# (rows, columns) of the phishing frame at this point in the cleanup
phishing_text.shape

(1869, 1)

Finally, we will remove the duplicate messages.

In [8]:
# Keep only the first occurrence of each repeated row
phishing_text = phishing_text.drop_duplicates(keep='first')

In [9]:
# (rows, columns) of the phishing frame once duplicates are gone
phishing_text.shape

(1720, 1)

We see that we have 1720 emails to work with.

## Legitimate

We will repeat this process with the two smaller legitimate email datasets (since we aim for ratios of 1:1 and 1:10).

In [10]:
# Load the first (smallest) legitimate-email file with the same reader settings
# used for the phishing CSV, then summarise its columns.
legit_small_file = os.path.join(csv_path, enron_csv[0])
legit_text_small_raw = pd.read_csv(
    legit_small_file,
    index_col=0,
    encoding='latin-1',
    dtype={'Body': 'object'},
)
legit_text_small_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 1999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Body    1991 non-null   object
dtypes: object(1)
memory usage: 31.2+ KB


In [11]:
# Load the second legitimate-email file with the same reader settings
# used for the phishing CSV, then summarise its columns.
legit_big_file = os.path.join(csv_path, enron_csv[1])
legit_text_big_raw = pd.read_csv(
    legit_big_file,
    index_col=0,
    encoding='latin-1',
    dtype={'Body': 'object'},
)
legit_text_big_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 0 to 19999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Body    19928 non-null  object
dtypes: object(1)
memory usage: 312.5+ KB


### Cleanup

#### Remove Uninformative Rows

In [12]:
# Discard every row that has a missing value (pandas' default axis/how, spelled out),
# then show the remaining (rows, columns).
legit_text_small = legit_text_small_raw.dropna(axis=0, how='any')
legit_text_small.shape

(1991, 1)

In [13]:
# Discard every row that has a missing value (pandas' default axis/how, spelled out),
# then show the remaining (rows, columns).
legit_text_big = legit_text_big_raw.dropna(axis=0, how='any')
legit_text_big.shape

(19928, 1)

This dataset contains no such computer-generated emails, so only the duplicates need to be removed.

In [14]:
# Keep only the first occurrence of each repeated row, then show the new size
legit_text_small = legit_text_small.drop_duplicates(keep='first')
legit_text_small.shape

(1972, 1)

In [15]:
# Keep only the first occurrence of each repeated row, then show the new size
legit_text_big = legit_text_big.drop_duplicates(keep='first')
legit_text_big.shape

(19092, 1)

## Mixed Datasets

Finally, we will create two mixed datasets, adding an extra `Class` column that labels each email (1 = phishing, 0 = legitimate).

In [16]:
# Label every phishing email with Class 1; `assign` builds a new frame
# rather than adding the column to the existing one in place.
phishing_text = phishing_text.assign(Class=1)

#### 1:1 ratio

In [17]:
# Draw exactly as many legitimate emails as there are phishing emails, so the
# two classes end up in a 1:1 ratio. The size is derived from `phishing_text`
# instead of being hard-coded, so the ratio still holds if the cleanup steps
# produce a different phishing count.
n_phishing = len(phishing_text)
legit_text_small = (
    legit_text_small
    .sample(n=n_phishing)  # raises ValueError if fewer rows than requested are available
    .assign(Class=0)       # 0 labels the legitimate class
)

In [18]:
# Stack both classes, shuffle all rows, and replace the old labels with a fresh unique index
dataset_1 = (
    pd.concat([phishing_text, legit_text_small])
    .sample(frac=1)
    .reset_index(drop=True)
)

In [19]:
# Write the 1:1 dataset into csv_path through the project helper
# (judging by the cell output, the helper prints the destination path).
util.save_to_csv(dataset_1, csv_path, 'dataset_1.csv')

Saving to /home/ichanis/projects/phishing_public/data/csv/dataset_1.csv


#### 1:10 ratio

In [20]:
# Draw ten legitimate emails per phishing email, so the classes end up in a
# 1:10 ratio. The size is derived from `phishing_text` instead of being
# hard-coded, so the ratio still holds if the cleanup steps produce a
# different phishing count.
LEGIT_PER_PHISHING = 10
n_legit = LEGIT_PER_PHISHING * len(phishing_text)
legit_text_big = (
    legit_text_big
    .sample(n=n_legit)  # raises ValueError if fewer rows than requested are available
    .assign(Class=0)    # 0 labels the legitimate class
)

In [21]:
# Stack both classes, shuffle all rows, and replace the old labels with a fresh unique index
dataset_2 = (
    pd.concat([phishing_text, legit_text_big])
    .sample(frac=1)
    .reset_index(drop=True)
)

In [22]:
# Write the 1:10 dataset into csv_path through the project helper
# (judging by the cell output, the helper prints the destination path).
util.save_to_csv(dataset_2, csv_path, 'dataset_2.csv')

Saving to /home/ichanis/projects/phishing_public/data/csv/dataset_2.csv
