In [1]:
import os
import re
import pandas as pd
# Widen pandas' display limits so wide feature tables and long email bodies
# stay readable in rendered cell output.
pd.set_option('display.max_columns', 250)
pd.set_option('display.max_colwidth', 160)

import features as util
from raw_utils import save_to_csv
from preprocessing import tokenize

### Read Data

In [2]:
# Input location: the CSV files are read from <working directory>/data/csv/.
cwd = os.getcwd()
csv_path = os.path.join(cwd, 'data/csv/')

# File names of the text datasets, one list for train and one for test.
train_text, test_text = (
    ['train_balanced_text.csv', 'train_imbalanced_text.csv'],
    ['test_balanced_text.csv', 'test_imbalanced_text.csv'],
)

#### Email Text

In [3]:
# Load the first (balanced) train/test files; the first CSV column becomes the index.
train_balanced_text, test_balanced_text = (
    pd.read_csv(os.path.join(csv_path, file_name), index_col=0)
    for file_name in (train_text[0], test_text[0])
)

In [4]:
# Load the second (imbalanced) train/test files; the first CSV column becomes the index.
train_imbalanced_text, test_imbalanced_text = (
    pd.read_csv(os.path.join(csv_path, file_name), index_col=0)
    for file_name in (train_text[1], test_text[1])
)

After the preprocessing, the data look like this:

In [5]:
# Preview the first five rows of the balanced test split.
test_balanced_text.head(n=5)

Unnamed: 0,id,body,class
0,2377,please print\n----- Forwarded by Steven J Kean/NA/Enron on 10/16/2000 10:27 AM -----\n\n\tCynthia Sandherr\n\t10/12/2000 07:43 PM\n\t\t \n\t\t To: Thomas E ...,False
1,592,Server Message \n \n\n \n Dear <emailaddress> Our record indicates that you recently made a request to deactivate email And this request will be process...,True
2,1049,Please see the attached articles:,False
3,1087,Please Click Here<urladdress> to Update e-mail Password\n\n\n\nIT Security immediately/\n\n________________________________\nSEED IS PROUD TO BE A 21st CENT...,True
4,725,"Wednesday afternoon Febuary 6th, 2002, the Enron building experienced a brief power outage. The building is powered by one of two Reliant circuits. Yester...",False


#### Email Tokens

Because the .csv files containing the already tokenized emails were subjected to further preprocessing, such as lemmatization and stopword removal, a plain tokenization of the raw email bodies is also run here for the purposes of feature extraction.

In [6]:
# Tokenize the raw body of every split and store the result in a 'tokens' column.
for _text_split in (
    train_balanced_text,
    test_balanced_text,
    train_imbalanced_text,
    test_imbalanced_text,
):
    _text_split['tokens'] = _text_split['body'].apply(tokenize)

# Stylometric Features

A useful marker of whether an email is phishing or not should stem from the writing style of the author.<br>
With this in mind, several features that have been used in the literature will be extracted, in order to be compared and combined with the text-only baseline.

In [7]:
# Start each stylometric feature table as an independent copy of the
# identifier and label columns of the matching text split.
(
    train_balanced_style,
    test_balanced_style,
    train_imbalanced_style,
    test_imbalanced_style,
) = (
    text_split[['id', 'class']].copy()
    for text_split in (
        train_balanced_text,
        test_balanced_text,
        train_imbalanced_text,
        test_imbalanced_text,
    )
)

### Simple Counts

The simplest kind of features would be simple counts of various parts of the emails, like characters and words.

In [8]:
def add_count_features(style_df, text_df):
    """Return a copy of ``style_df`` extended with the simple count features.

    The same recipe used to be copy-pasted once per split; keeping it in a
    single function guarantees that every split gets identical columns.

    Parameters
    ----------
    style_df : pd.DataFrame
        Feature table to extend. It is not modified; a copy is returned.
    text_df : pd.DataFrame
        Text split providing the 'body' and 'tokens' columns. It must share
        its index with ``style_df``.

    Returns
    -------
    pd.DataFrame
        ``style_df`` plus the columns 'num_chars', 'num_newlines',
        'num_words', 'num_unique_words', 'num_sentences',
        'num_upper_sentences' and 'num_lower_sentences', in that order.
    """
    sentence_columns = ['num_sentences', 'num_upper_sentences', 'num_lower_sentences']
    bodies = text_df['body']
    tokens = text_df['tokens']

    featured = style_df.copy()
    featured['num_chars'] = bodies.apply(util.count_chars)
    featured['num_newlines'] = bodies.apply(util.count_newlines)
    featured['num_words'] = tokens.apply(util.count_words)
    featured['num_unique_words'] = tokens.apply(util.count_unique_words)

    # util.count_sentences is assumed to return three values per email (the
    # three target columns imply this - confirm against features.py). Naming
    # the columns up front expands them without a temporary 'sentences'
    # column and keeps an empty split from failing on the assignment.
    sentence_counts = pd.DataFrame(
        bodies.apply(util.count_sentences).tolist(),
        index=bodies.index,
        columns=sentence_columns,
    )
    featured[sentence_columns] = sentence_counts
    return featured


# Re-running this cell is safe: existing count columns are simply overwritten.
train_balanced_style = add_count_features(train_balanced_style, train_balanced_text)
test_balanced_style = add_count_features(test_balanced_style, test_balanced_text)
train_imbalanced_style = add_count_features(train_imbalanced_style, train_imbalanced_text)
test_imbalanced_style = add_count_features(test_imbalanced_style, test_imbalanced_text)

In [None]:
# Preview the first five rows of the balanced test feature table.
test_balanced_style.head(n=5)