In [1]:
import os
import pandas as pd
# Widen pandas' notebook rendering so the wide feature matrices below are readable
pd.set_option('display.max_columns', 250)
pd.set_option('display.max_colwidth', 160)

import features as util
from raw_utils import save_to_csv
from preprocessing import dataset_add_columns

from ast import literal_eval

### Read Data

In [2]:
# File names of the tokenized splits: index 0 is the balanced set, index 1 the imbalanced one
train_tokens = [
    'train_balanced_tokens.csv',
    'train_imbalanced_tokens.csv',
]
test_tokens = [
    'test_balanced_tokens.csv',
    'test_imbalanced_tokens.csv',
]

# All CSV inputs and outputs live under <current working directory>/data/csv/
cwd = os.getcwd()
csv_path = os.path.join(cwd, 'data/csv/')

#### Tokenized emails

In [3]:
# Balanced split: the first CSV column becomes the index, and every 'body' cell is
# passed through literal_eval so it is loaded as a Python object instead of raw text.
train_balanced_tokens = pd.read_csv(
    os.path.join(csv_path, train_tokens[0]),
    index_col=0,
    converters={'body': literal_eval},
)
test_balanced_tokens = pd.read_csv(
    os.path.join(csv_path, test_tokens[0]),
    index_col=0,
    converters={'body': literal_eval},
)

In [4]:
# Imbalanced split: the first CSV column becomes the index, and every 'body' cell is
# passed through literal_eval so it is loaded as a Python object instead of raw text.
train_imbalanced_tokens = pd.read_csv(
    os.path.join(csv_path, train_tokens[1]),
    index_col=0,
    converters={'body': literal_eval},
)
test_imbalanced_tokens = pd.read_csv(
    os.path.join(csv_path, test_tokens[1]),
    index_col=0,
    converters={'body': literal_eval},
)

After the preprocessing, the data look like this:

In [5]:
# Preview the first five rows of the balanced training tokens
train_balanced_tokens.head(n=5)

Unnamed: 0,id,body,class
0,895,"[yes, keep, post, placed, call, customer, morning, heard, back, yet, geoff, message, jeff, enronxgate, enron, ee, send, monday, november, 26, 2001, frazier,...",False
1,381,"[vega, send, regard]",False
2,1104,"[eol, deal, enpower, desk, total, deal, total, mwh, desk, total, deal, total, mwh, epmi, long, term, california, 454, epmi, long, term, california, 94, epmi...",False
3,2113,"[payment, status, change, follow, report, status, last, change, system, expense, report, name, 2001, april, report, total, amount, due, employee, amount, ap...",False
4,1437,"[start, date, hourahead, hour, 18, hourahead, schedule, download, fail, manual, intervention, require]",False


# Feature Extraction

Before inputting the emails to the machine learning algorithms, they have to be converted to numerical matrices.<br>
This process is called **feature extraction**. Different methods of achieving this will be tried, in order to compare their results.

## Text Vectorization

The baseline feature set will simply consist of numerical representations of the text data. This process is also called **vectorization**. 

### TF-IDF

One of the most basic ways is to calculate the **tf-idf** (term frequency-inverse document frequency) score of the emails.<br>
In order to have a lower dimensionality and since not all words from the corpus will be of importance, only the top 500 most frequent terms are used.

In [6]:
# TF-IDF features for the balanced split, capped at 500 features with min_df=5
tfidf_balanced = util.tfidf_features(
    train_balanced_tokens['body'],
    test_balanced_tokens['body'],
    min_df=5,
    max_features=500,
)

In [7]:
# Pull the train matrix, the test matrix and the vectorizer out of the result dict
tfidf_train_balanced, tfidf_test_balanced, tfidf_model_balanced = (
    tfidf_balanced['tfidf_train'],
    tfidf_balanced['tfidf_test'],
    tfidf_balanced['vectorizer'],
)

In [8]:
# TF-IDF features for the imbalanced split, using the same settings as the balanced one
tfidf_imbalanced = util.tfidf_features(
    train_imbalanced_tokens['body'],
    test_imbalanced_tokens['body'],
    min_df=5,
    max_features=500,
)

In [9]:
# Pull the train matrix, the test matrix and the vectorizer out of the result dict
tfidf_train_imbalanced, tfidf_test_imbalanced, tfidf_model_imbalanced = (
    tfidf_imbalanced['tfidf_train'],
    tfidf_imbalanced['tfidf_test'],
    tfidf_imbalanced['vectorizer'],
)

As an example, here is a part of the calculated matrix for the balanced train set:

In [10]:
# Preview the first five rows of the balanced TF-IDF training matrix
tfidf_train_balanced.head(n=5)

Unnamed: 0,0px,10,100,11,12,14,15,16,20,2000,2001,2002,2015,2016,2018,2019,24,25,30,713,853,able,absa,accept,access,account,action,active,activity,add,additional,address,adjust,administrator,agreement,alert,allow,already,also,america,american,another,app,apple,application,approve,area,arial,ask,attach,attachment,august,auto,available,avoid,back,background,bank,banking,base,believe,best,bill,billion,block,book,border,bottom,box,br,browser,business,buy,ca,california,call,cancel,capacity,capital,card,care,case,cc,center,change,charge,chase,check,chris,click,close,code,collapse,color,come,comment,commission,committee,communication,company,complete,concern,conference,confidential,confirm,confirmation,contact,contain,content,continue,contract,copy,copyright,cost,could,create,credit,current,customer,daily,data,database,date,david,day,...,report,request,require,reserve,respond,response,result,return,review,rgb,right,risk,run,safe,sale,san,sans,sara,say,schedule,secure,security,see,select,sell,send,sender,sent,september,serif,serve,server,service,set,share,show,sign,since,sincerely,sit,site,size,smith,software,solution,soon,space,spam,staff,standard,start,state,statement,step,steve,still,stock,storage,street,style,subject,support,sure,system,table,take,talk,tbody,td,team,tell,term,texas,text,thank,thanks,thing,think,three,thru,thursday,time,today,top,total,tr,trade,trading,transaction,transfer,try,tuesday,two,type,united,update,upgrade,urladdress,usaa,use,user,utility,validate,value,verification,verify,version,via,view,visit,want,way,web,wednesday,week,well,width,within,without,word,work,world,would,write,year
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.077039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.145353,0.0,0.0,0.0,0.0,0.0,0.04557,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.199729,0.0,0.0,0.0,0.0,0.0,0.0,0.112632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0457,0.0,0.0,0.0,0.226315,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04981,0.0,0.0,0.0,0.0,0.0,0.0,0.054635,0.0,0.0,0.0,0.0,0.0,0.0,0.031791,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.116736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.093603,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.069532,0.0,0.0,0.0,0.0,0.0,0.0,0.045002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04557,0.0,0.0,0.0,0.052707,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.586204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.339758,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.248352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.155079,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.622199,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083002,0.0,0.0,0.42713,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.124843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.364896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.268707,0.0,0.0,0.0,0.0,0.096788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.632361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142577,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.134968,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.177919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.089577,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.37179,0.0,0.0,...,0.0,0.0,0.386935,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.41362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.443437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Word2Vec

A more advanced technique is **Word Embedding**, which calculates a high-dimensional vector for each word based on the probability distribution of this word appearing before or after another. In other words, words belonging to the same context usually appear close to each other in the corpus, so they will be closer in the vector space as well.<br>
The chosen implementation is **Word2Vec**.

After the word vectors are calculated, the vectors of all the words in an email are averaged, resulting in a single vector for each email.

In [11]:
# Word2Vec features for the balanced split: 100-dimensional vectors, min_count=5
# NOTE(review): no random seed is passed here -- confirm whether util.word2vec_features
# fixes one internally, otherwise the embeddings may differ between runs.
word2vec_balanced = util.word2vec_features(
    train_balanced_tokens['body'],
    test_balanced_tokens['body'],
    vector_size=100,
    min_count=5,
)

In [12]:
# Pull the train features, the test features and the fitted model out of the result dict
word2vec_train_balanced, word2vec_test_balanced, word2vec_model_balanced = (
    word2vec_balanced['word2vec_train'],
    word2vec_balanced['word2vec_test'],
    word2vec_balanced['vectorizer'],
)

In [13]:
# Word2Vec features for the imbalanced split, using the same settings as the balanced one
# NOTE(review): no random seed is passed here -- confirm whether util.word2vec_features
# fixes one internally, otherwise the embeddings may differ between runs.
word2vec_imbalanced = util.word2vec_features(
    train_imbalanced_tokens['body'],
    test_imbalanced_tokens['body'],
    vector_size=100,
    min_count=5,
)

In [14]:
# Pull the train features, the test features and the fitted model out of the result dict
word2vec_train_imbalanced, word2vec_test_imbalanced, word2vec_model_imbalanced = (
    word2vec_imbalanced['word2vec_train'],
    word2vec_imbalanced['word2vec_test'],
    word2vec_imbalanced['vectorizer'],
)

The resulting feature sets are like the following:

In [15]:
# Preview the first five rows of the balanced Word2Vec training features
word2vec_train_balanced.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,-0.181471,-0.013212,-0.212966,0.102428,-0.0461,0.564205,0.084792,-0.136815,-0.286497,-0.366036,0.232329,-0.231615,-0.344562,-0.258595,0.126629,0.324958,-0.214511,0.298363,0.025658,0.126203,-0.347304,-0.248482,0.739751,-0.009832,-0.629327,0.218767,-0.147309,0.560471,0.100813,-0.434819,0.224567,-0.11033,0.160675,-0.005174,-0.239979,0.113594,0.086343,-0.05288,-0.264629,-0.495257,0.232085,0.391872,0.214378,0.468009,-0.783528,0.671119,-0.589896,-0.341695,-0.091668,0.377631,0.114003,-0.047962,0.103307,-0.39061,-0.229621,0.155337,0.256022,-0.54776,-0.121965,0.184937,-0.391727,-0.185402,0.232536,0.29268,0.258107,-0.755681,-0.082941,0.353917,0.200538,-0.629897,-0.216557,0.366558,0.189095,-0.141663,-0.01327,-0.582285,-0.242842,-0.549415,0.237581,-0.024087,-0.094452,-0.231246,0.221699,0.343867,-0.281402,-0.015229,-0.144878,0.398505,0.239335,0.000363,-0.340667,-0.076905,-0.260328,0.179025,-0.139381,0.29635,-0.378977,-0.503308,0.356107,-0.163854
1,-0.590803,0.103948,-0.508577,-0.515212,-0.529945,0.50643,0.468798,-0.023388,-0.429914,-0.534801,1.14332,-0.159613,-0.132213,0.002652,0.162228,-0.305747,-0.701688,0.014196,-0.235202,-0.309855,-0.106905,-0.446561,0.994502,0.115006,-0.525095,0.255954,-0.312214,0.403712,0.00494,-0.791724,0.78153,-0.581528,0.436233,0.179422,-0.825473,0.196159,0.205024,-0.357146,-0.146573,-0.646365,-0.35141,0.98904,0.729398,0.34741,-0.962672,0.644998,-0.441342,-0.458283,-0.523234,0.245516,-0.257614,-0.129819,-0.136123,0.083119,0.273604,0.552031,0.190188,-0.383991,-0.015966,-0.169815,-0.457257,-0.224127,-0.142136,0.290191,0.336322,-1.508233,0.354753,0.356357,0.510811,-0.759417,-0.438354,0.334849,0.093703,-0.358079,-0.189649,-0.605199,-0.5923,0.049456,0.004256,-0.225673,-0.009333,0.353423,-0.004652,0.513259,-0.676484,0.095856,0.462108,0.150346,0.332201,-0.304717,0.271897,0.116133,0.056405,0.288863,-0.50359,0.038846,-0.262468,-0.768463,0.115533,-0.672975
2,0.23122,-0.007019,-0.208322,0.225785,0.395101,0.06182,-0.210598,-0.040641,-0.252576,-0.221131,-0.085992,-0.369351,-0.153867,0.307611,-0.167472,-0.000974,-0.236259,0.330118,0.025523,-0.071108,0.072083,0.097724,-0.087454,-0.223248,-0.252679,0.105968,-0.169126,0.471791,0.054549,-0.26138,0.403243,-0.340328,-0.354011,0.058686,-0.327665,-0.407821,0.363925,-0.19282,-0.30967,-0.666293,0.020566,-0.166661,-0.111941,0.032086,-0.636689,0.493175,-0.402752,0.046785,0.33569,0.06586,0.363885,0.30135,0.282396,-0.64176,-0.303937,0.017611,0.076539,-0.424221,0.176038,0.166796,-0.314241,0.147592,0.452459,0.094363,0.06934,-0.247687,-0.181063,0.148561,-0.12737,-0.379549,-0.000588,0.094391,0.028729,-0.222031,0.129215,-0.245772,0.238552,-0.393442,0.312249,-0.155226,0.220294,0.044468,0.106791,0.305153,0.390621,0.113542,-0.267077,-0.0199,0.141197,0.001866,-0.602598,0.092404,-0.04665,-0.044422,-0.173118,0.279627,-0.673828,-0.255664,0.056521,-0.111429
3,0.203495,-0.187653,-0.35717,-0.040669,0.059333,0.12389,-0.017554,-0.015815,-0.157866,-0.530621,0.389525,-0.177966,0.091407,0.200101,0.011085,-0.094335,-0.356843,0.084046,-0.061564,-0.182568,0.305484,0.165141,0.01733,0.292658,-0.226717,0.248477,-0.177986,0.159905,-0.180657,-0.496332,0.485427,-0.452567,-0.496989,-0.000554,-0.6883,-0.240995,0.179902,-0.419554,-0.383663,-0.493751,-0.171573,0.214127,0.21306,0.286363,-0.320953,0.791081,-0.251785,-0.131733,0.094802,0.074086,0.324328,0.218448,0.40355,-0.443009,-0.240911,0.353045,0.143939,-0.233231,0.212771,0.441109,-0.426941,-0.1952,0.239214,-0.128366,0.087253,-0.592999,0.016396,0.029576,0.014427,-0.667162,-0.032693,0.335568,0.375594,-0.157294,-0.062005,-0.178229,0.061678,-0.375873,0.085051,-0.066034,0.234673,0.328523,0.114654,0.213085,-0.157665,-0.039962,0.109904,0.146921,0.144526,-0.141556,-0.153758,0.172803,0.121204,0.201996,-0.495559,0.220414,-0.500419,-0.812826,0.419515,-0.510215
4,0.240746,-0.082494,-0.249111,0.074216,0.20476,-0.13643,-0.094592,-0.073516,-0.294005,-0.366604,0.221303,-0.155005,-0.089589,0.221637,-0.112202,-0.190533,-0.157868,0.188591,-0.031683,-0.137178,0.240209,0.346761,-0.060355,0.185346,-0.241067,0.278263,-0.224337,0.250936,-0.139168,-0.482639,0.362542,-0.621513,-0.460738,0.106773,-0.810205,-0.340928,0.283203,-0.362211,-0.369736,-0.607982,-0.084392,0.021484,0.110122,0.234698,-0.33066,0.560508,-0.373467,0.015996,-0.005209,-0.02511,0.106359,0.105908,0.385981,-0.489372,-0.121642,0.448326,0.422355,-0.218007,0.380575,0.213857,-0.201645,-0.094901,0.398201,-0.161407,0.004544,-0.511446,-0.162326,-0.172804,-0.106824,-0.446215,-0.275618,0.218838,0.077595,-0.210133,0.070917,-0.215175,0.062549,-0.349036,-0.081481,-0.303579,-0.033076,0.217112,0.081223,0.401558,-0.0183,0.036956,-0.009654,-0.006215,0.179414,-0.236886,-0.121111,-0.04153,0.321334,0.140173,-0.42232,0.190067,-0.453052,-0.698817,0.253899,-0.408928


It should be noted that in this case the individual columns are not interpretable in the way a tf-idf column, which corresponds to one word, is. This representation is purely for convenience and consistency; it won't matter during the prediction step.

# Feature Selection

In order to further reduce the dimensions of the feature matrix, the number of selected features will be halved using the top features according to the **chi-squared** feature selection method.

## Vectorization Features

### TF-IDF

In [16]:
# Chi-squared selection on the balanced TF-IDF features, scored against the training labels
selected_tfidf_balanced = util.chi2_feature_selection(
    tfidf_train_balanced,
    train_balanced_tokens['class'],
    tfidf_test_balanced,
    percentile=50,
)

In [17]:
# Pull the reduced train/test feature sets and the selector out of the result dict
tfidf_sel_train_balanced, tfidf_sel_test_balanced, tfidf_sel_model_balanced = (
    selected_tfidf_balanced['features_train'],
    selected_tfidf_balanced['features_test'],
    selected_tfidf_balanced['selector'],
)

In [18]:
# Chi-squared selection on the imbalanced TF-IDF features, scored against the training labels
selected_tfidf_imbalanced = util.chi2_feature_selection(
    tfidf_train_imbalanced,
    train_imbalanced_tokens['class'],
    tfidf_test_imbalanced,
    percentile=50,
)

In [19]:
# Pull the reduced train/test feature sets and the selector out of the result dict
tfidf_sel_train_imbalanced, tfidf_sel_test_imbalanced, tfidf_sel_model_imbalanced = (
    selected_tfidf_imbalanced['features_train'],
    selected_tfidf_imbalanced['features_test'],
    selected_tfidf_imbalanced['selector'],
)

The now-reduced train set:

In [20]:
# Preview the first five rows of the chi2-reduced balanced TF-IDF training set
tfidf_sel_train_balanced.head(n=5)

Unnamed: 0,0px,10,2000,2001,2002,2015,2016,2018,2019,30,713,853,absa,access,account,active,activity,address,administrator,agreement,alert,already,also,american,app,apple,area,attach,august,auto,available,avoid,bank,banking,bill,block,browser,buy,california,call,cancel,card,cc,center,chase,chris,click,come,comment,commission,committee,company,conference,confirm,continue,contract,copyright,could,customer,daily,david,de,deal,dear,december,delete,deliver,delivery,desk,detail,device,download,draft,due,ect,ee,email,employee,energy,enron,enronxgate,error,even,event,expire,express,failure,fax,federal,final,forward,friday,fw,gas,get,give,go,good,great,group,hear,hello,help,hope,houston,id,immediately,important,inbox,incoming,information,instruction,interest,january,jeff,john,jose,july,kindly,know,last,let,letter,like,limit,link,list,login,long,look,mail,mailbox,mailto,many,march,mark,market,meet,meeting,microsoft,mike,million,monday,month,much,news,north,notice,notification,november,october,offer,one,online,option,outage,party,password,payment,paypal,pending,people,per,personal,phone,plan,pm,point,power,president,price,privacy,process,program,project,promise,protect,put,question,quota,rate,re,receive,recent,recently,reply,report,request,reserve,right,risk,safe,sara,say,schedule,secure,security,see,sell,sender,september,server,service,sign,since,sincerely,sit,size,smith,spam,start,step,steve,still,stock,storage,street,subject,support,talk,team,tell,thanks,thing,think,thursday,time,today,trade,trading,tuesday,two,update,upgrade,urladdress,usaa,use,user,utility,validate,verification,verify,view,want,wednesday,week,within,work,would,year
0,0.0,0.0,0.0,0.077039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.199729,0.0,0.0,0.112632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0457,0.226315,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.129429,0.0,0.0,0.0,0.035758,0.777739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.036475,0.0,0.039493,0.0,0.0,0.047822,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.264096,0.0,0.0,0.0,0.0,0.073079,0.0,0.08057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.230121,0.0,0.144397,0.0,0.055163,0.0,0.0,0.0,0.0,0.103682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.035282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031791,0.0,0.0,0.0,0.0,0.0,0.116736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.093603,0.0,0.0,0.0,0.0,0.069532,0.0,0.0,0.0,0.0,0.045002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04557,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.248352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.168238,0.0,0.0,0.0,0.0,0.027633,0.114556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.366999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029418,0.0,0.0,0.027584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025326,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.124843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.144087,0.0,0.0,0.0,0.19148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.149404,0.0,0.0,0.0,0.0,0.125703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.302371,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.632361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.089577,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.439354,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.41362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.443437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Final Dataset Creation

Before using the features for classification with the machine learning algorithms, it is best to tidy up the datasets and keep them consistent by concatenating the features, the id and the class columns in the same DataFrame.

In [21]:
# Names for the label and id columns added to every final dataset. They are renamed
# from 'class' and 'id' in case those words also appear in the token list.
column_names = ['email_class', 'email_id']

### TF-IDF

In [22]:
# Attach the class label and email id (named per column_names) to the balanced TF-IDF features
final_tfidf_train_balanced = dataset_add_columns(
    tfidf_sel_train_balanced,
    [train_balanced_tokens['class'], train_balanced_tokens['id']],
    column_names,
)
final_tfidf_test_balanced = dataset_add_columns(
    tfidf_sel_test_balanced,
    [test_balanced_tokens['class'], test_balanced_tokens['id']],
    column_names,
)

In [23]:
# Attach the class label and email id (named per column_names) to the imbalanced TF-IDF features
final_tfidf_train_imbalanced = dataset_add_columns(
    tfidf_sel_train_imbalanced,
    [train_imbalanced_tokens['class'], train_imbalanced_tokens['id']],
    column_names,
)
final_tfidf_test_imbalanced = dataset_add_columns(
    tfidf_sel_test_imbalanced,
    [test_imbalanced_tokens['class'], test_imbalanced_tokens['id']],
    column_names,
)

### Word2Vec

In [24]:
# Attach the class label and email id (named per column_names) to the balanced Word2Vec features
final_word2vec_train_balanced = dataset_add_columns(
    word2vec_train_balanced,
    [train_balanced_tokens['class'], train_balanced_tokens['id']],
    column_names,
)
final_word2vec_test_balanced = dataset_add_columns(
    word2vec_test_balanced,
    [test_balanced_tokens['class'], test_balanced_tokens['id']],
    column_names,
)

In [25]:
# Attach the class label and email id (named per column_names) to the imbalanced Word2Vec features
final_word2vec_train_imbalanced = dataset_add_columns(
    word2vec_train_imbalanced,
    [train_imbalanced_tokens['class'], train_imbalanced_tokens['id']],
    column_names,
)
final_word2vec_test_imbalanced = dataset_add_columns(
    word2vec_test_imbalanced,
    [test_imbalanced_tokens['class'], test_imbalanced_tokens['id']],
    column_names,
)

In [26]:
# Preview the first five rows of the final balanced TF-IDF training dataset
final_tfidf_train_balanced.head(n=5)

Unnamed: 0,email_id,email_class,0px,10,2000,2001,2002,2015,2016,2018,2019,30,713,853,absa,access,account,active,activity,address,administrator,agreement,alert,already,also,american,app,apple,area,attach,august,auto,available,avoid,bank,banking,bill,block,browser,buy,california,call,cancel,card,cc,center,chase,chris,click,come,comment,commission,committee,company,conference,confirm,continue,contract,copyright,could,customer,daily,david,de,deal,dear,december,delete,deliver,delivery,desk,detail,device,download,draft,due,ect,ee,email,employee,energy,enron,enronxgate,error,even,event,expire,express,failure,fax,federal,final,forward,friday,fw,gas,get,give,go,good,great,group,hear,hello,help,hope,houston,id,immediately,important,inbox,incoming,information,instruction,interest,january,jeff,john,jose,july,kindly,know,last,let,letter,...,link,list,login,long,look,mail,mailbox,mailto,many,march,mark,market,meet,meeting,microsoft,mike,million,monday,month,much,news,north,notice,notification,november,october,offer,one,online,option,outage,party,password,payment,paypal,pending,people,per,personal,phone,plan,pm,point,power,president,price,privacy,process,program,project,promise,protect,put,question,quota,rate,re,receive,recent,recently,reply,report,request,reserve,right,risk,safe,sara,say,schedule,secure,security,see,sell,sender,september,server,service,sign,since,sincerely,sit,size,smith,spam,start,step,steve,still,stock,storage,street,subject,support,talk,team,tell,thanks,thing,think,thursday,time,today,trade,trading,tuesday,two,update,upgrade,urladdress,usaa,use,user,utility,validate,verification,verify,view,want,wednesday,week,within,work,would,year
0,895,False,0.0,0.0,0.0,0.077039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.199729,0.0,0.0,0.112632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0457,0.226315,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.129429,0.0,0.0,0.0,0.035758,0.777739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.036475,0.0,0.039493,0.0,0.0,0.047822,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.264096,0.0,0.0,0.0,0.0,0.073079,0.0,0.08057,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.230121,0.0,0.144397,0.0,0.055163,0.0,0.0,0.0,0.0,0.103682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.035282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031791,0.0,0.0,0.0,0.0,0.0,0.116736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.093603,0.0,0.0,0.0,0.0,0.069532,0.0,0.0,0.0,0.0,0.045002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04557,0.0,0.0,0.0,0.0
1,381,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1104,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.248352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.168238,0.0,0.0,0.0,0.0,0.027633,0.114556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.366999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029418,0.0,0.0,0.027584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025326,0.0,0.0,0.0,0.0
3,2113,False,0.0,0.0,0.0,0.124843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.144087,0.0,0.0,0.0,0.19148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.149404,0.0,0.0,...,0.125703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.302371,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.632361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.089577,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1437,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.439354,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.41362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.443437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Saving the Results

In [27]:
# Write the chi2-selected TF-IDF datasets to csv_path: balanced pair first, then imbalanced
for dataset, file_name in [
    (final_tfidf_train_balanced, 'tfidf_chi2_train_balanced.csv'),
    (final_tfidf_test_balanced, 'tfidf_chi2_test_balanced.csv'),
    (final_tfidf_train_imbalanced, 'tfidf_chi2_train_imbalanced.csv'),
    (final_tfidf_test_imbalanced, 'tfidf_chi2_test_imbalanced.csv'),
]:
    save_to_csv(dataset, csv_path, file_name)

Saving to /home/ichanis/projects/phishing_public/data/csv/tfidf_chi2_train_balanced.csv
Saving to /home/ichanis/projects/phishing_public/data/csv/tfidf_chi2_test_balanced.csv
Saving to /home/ichanis/projects/phishing_public/data/csv/tfidf_chi2_train_imbalanced.csv
Saving to /home/ichanis/projects/phishing_public/data/csv/tfidf_chi2_test_imbalanced.csv


In [28]:
# Write the Word2Vec datasets to csv_path: balanced pair first, then imbalanced
for dataset, file_name in [
    (final_word2vec_train_balanced, 'word2vec_train_balanced.csv'),
    (final_word2vec_test_balanced, 'word2vec_test_balanced.csv'),
    (final_word2vec_train_imbalanced, 'word2vec_train_imbalanced.csv'),
    (final_word2vec_test_imbalanced, 'word2vec_test_imbalanced.csv'),
]:
    save_to_csv(dataset, csv_path, file_name)

Saving to /home/ichanis/projects/phishing_public/data/csv/word2vec_train_balanced.csv
Saving to /home/ichanis/projects/phishing_public/data/csv/word2vec_test_balanced.csv
Saving to /home/ichanis/projects/phishing_public/data/csv/word2vec_train_imbalanced.csv
Saving to /home/ichanis/projects/phishing_public/data/csv/word2vec_test_imbalanced.csv
