In [1]:
import os
import pandas as pd
pd.options.display.max_columns = 250

import features as util
from raw_utils import save_to_csv

from ast import literal_eval

### Read Data

In [2]:
# Directory holding the preprocessed CSV files, resolved against the working directory
cwd = os.getcwd()
csv_path = os.path.join(cwd, 'data/csv/')

# File names of the two train/test splits (index 0 -> split 1, index 1 -> split 2)
train = ['train_{}.csv'.format(split) for split in (1, 2)]
test = ['test_{}.csv'.format(split) for split in (1, 2)]

In [3]:
# Load the first train/test split. The first CSV column becomes the index and
# the 'body' column is parsed with literal_eval.
train_1 = pd.read_csv(
    os.path.join(csv_path, train[0]),
    converters={'body': literal_eval},
    index_col=0,
)
test_1 = pd.read_csv(
    os.path.join(csv_path, test[0]),
    converters={'body': literal_eval},
    index_col=0,
)

In [4]:
# Load the second train/test split. The first CSV column becomes the index and
# the 'body' column is parsed with literal_eval.
train_2 = pd.read_csv(
    os.path.join(csv_path, train[1]),
    converters={'body': literal_eval},
    index_col=0,
)
test_2 = pd.read_csv(
    os.path.join(csv_path, test[1]),
    converters={'body': literal_eval},
    index_col=0,
)

After the preprocessing, the data look like this:

In [5]:
# Preview the first five rows of the first test split
test_1.head(n=5)

Unnamed: 0,body,class,id
0,"[jim, attach, several, example, implementation...",False,1457
1,"[explain, english, talk, july, 6, day, since, ...",False,3146
2,"[dear, value, capital, one, member, capital, o...",True,755
3,"[andrew, holiday, shopping, andrew, oops, two,...",False,1616
4,"[update, information, efcu, survive, layoff, w...",False,2429


## Feature Extraction

Before the emails can be fed into the machine learning algorithms, they have to be converted to numbers.<br>
This process is called **feature extraction**, or **vectorization**. We will try different methods of achieving this, in order to compare their results.

### TF-IDF

One of the most basic ways is to calculate the **tf-idf** (term frequency-inverse document frequency) score of the emails.<br>
In order to have a lower dimensionality and since not all words from the corpus will be of importance, we use only the top 500 most frequent terms.

In [6]:
# tf-idf features for split 1, built from the train and test email bodies
tfidf_1 = util.tfidf_features(
    train_1['body'],
    test_1['body'],
    max_features=500,
    min_df=5,
)

In [7]:
# Unpack the split-1 tf-idf result: train features, test features and the vectorizer
tfidf_train_1, tfidf_test_1, tfidf_model_1 = (
    tfidf_1[key] for key in ('tfidf_train', 'tfidf_test', 'vectorizer')
)

In [8]:
# tf-idf features for split 2, built from the train and test email bodies
tfidf_2 = util.tfidf_features(
    train_2['body'],
    test_2['body'],
    max_features=500,
    min_df=5,
)

In [9]:
# Unpack the split-2 tf-idf result: train features, test features and the vectorizer
tfidf_train_2, tfidf_test_2, tfidf_model_2 = (
    tfidf_2[key] for key in ('tfidf_train', 'tfidf_test', 'vectorizer')
)

As an example, we can see a part of the calculated matrix for the first test set:

In [10]:
# Preview the first five rows of the split-1 tf-idf test features
tfidf_test_1.head(n=5)

Unnamed: 0,10,11,12,14,15,16,20,2000,2001,2002,2015,2016,2018,2019,24,27,30,713,853,able,absa,accept,access,account,action,activity,add,additional,address,administrator,advise,agree,agreement,alert,allow,already,also,america,american,amount,another,answer,app,apple,application,approval,approve,area,ask,attach,attachment,august,available,avoid,back,bank,banking,base,believe,best,bill,billion,block,bond,book,box,browser,business,buy,ca,california,call,cancel,capacity,capital,card,care,case,cause,cc,center,change,charge,chase,check,choose,chris,click,close,code,come,comment,communication,company,complete,concern,conference,confidential,confirm,confirmation,consider,contact,contain,content,continue,contract,copy,copyright,cost,could,create,credit,current,currently,customer,daily,data,database,date,david,day,de,deal,dear,december,...,reply,report,request,require,research,reserve,response,result,return,review,right,risk,road,run,safe,sale,san,sara,say,schedule,scott,secure,security,see,select,sell,send,sender,sent,september,serve,server,service,set,share,short,show,sign,since,sincerely,sit,site,smith,software,someone,soon,space,spam,special,staff,standard,start,state,statement,step,steve,still,stock,storage,street,subject,summary,supply,support,sure,susan,system,take,talk,team,technology,tell,term,texas,text,thank,thanks,thing,think,three,thursday,time,today,total,trade,trading,transaction,transfer,try,tuesday,two,type,united,unsubscribe,update,upgrade,urladdress,usaa,use,user,utility,validate,value,verification,verify,version,via,view,visit,wait,want,way,web,wednesday,week,well,window,within,without,work,world,would,write,year,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.389012,0.452369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.406018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.334496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.321855,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.292842,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.267199,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.32642,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.06309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.046822,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057681,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.053282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.069525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.067692,0.04535,0.0,0.050052,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055364,0.0,0.0,0.061849,0.0,0.0,0.213605,0.0,0.0,0.206905,0.0,0.049012,0.0,0.181198,0.0,0.0,...,0.0,0.059072,0.0,0.0,0.0,0.0,0.0,0.062401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12965,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.061527,0.0,0.0,0.0,0.037755,0.0716,0.0,0.0,0.0,0.0,0.0,0.0,0.066776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055727,0.201682,0.0,0.0,0.0,0.0,0.0,0.05944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039619,0.0,0.0,0.0,0.480843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176352,0.0,0.0,0.070329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.074047,0.167806,0.0,0.0,0.072982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.656607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.140824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.049466,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057313,0.0,0.0,0.0,0.044159,0.0,0.0,0.0,0.0,0.0,0.051677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.089982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.067682,0.064069,0.0,0.065075,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.167449,0.0,0.0,0.0,0.0,0.074147,0.0,0.0,0.079335,0.08178,0.068655,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.065998,0.0,0.119417,0.0,0.0,0.0
3,0.0,0.0,0.04584,0.0,0.0,0.0,0.044932,0.044717,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104853,0.0,0.030051,0.0,0.0,0.0,0.0,0.043802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038689,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.079109,0.0,0.0,0.0,0.044301,0.040786,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.215195,0.0,0.0,0.0,0.043705,0.0,0.034747,0.036998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.034787,0.037514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096449,0.0,0.039619,0.0,0.0,...,0.030631,0.0,0.034005,0.0,0.0,0.032632,0.0,0.0,0.0,0.0,0.030077,0.0,0.0,0.0,0.0,0.138239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031963,0.0,0.0,0.175561,0.0,0.0,0.0,0.0,0.0,0.025682,0.0,0.0,0.0,0.042352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039684,0.0,0.0,0.098014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040359,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03184,0.0,0.03234,0.0,0.046202,0.037201,0.0,0.0,0.0,0.0,0.0,0.038869,0.0,0.0,0.029923,0.036555,0.0,0.042268,0.0,0.0,0.0,0.0,0.0,0.076794,0.0,0.0,0.139731,0.0,0.0,0.766313,0.0,0.025989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03715,0.0,0.0,0.080304,0.0,0.0,0.036848,0.0,0.0,0.0,0.0,0.0,0.0,0.029673,0.043705,0.037728,0.0
4,0.0,0.0,0.0,0.0,0.176564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176564,0.0,0.0,0.0,0.0,0.138328,0.166956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.301687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.258305,0.0,0.0,0.171478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.367683,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.159733,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.144677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.109784,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.146034,0.0,0.0,0.0,0.0,0.0,0.117439,0.0,0.0,0.0


### Word2Vec

A more advanced technique is **Word Embedding**, which calculates a vector for each word based on the probability distribution of this word appearing before or after another. In other words, words belonging to the same context usually appear close to each other in the corpus, so they will be closer in the vector space as well.<br>
The chosen implementation is **Word2Vec**.

After the vectors for each word are calculated, they are averaged over the words of each email, resulting in a single vector per email.

In [11]:
# Word2Vec features for split 1, built from the train and test email bodies
word2vec_1 = util.word2vec_features(
    train_1['body'],
    test_1['body'],
    min_count=5,
    vector_size=100,
)

In [12]:
# Unpack the split-1 Word2Vec result: train features, test features and the vectorizer
word2vec_train_1, word2vec_test_1, word2vec_model_1 = (
    word2vec_1[key] for key in ('word2vec_train', 'word2vec_test', 'vectorizer')
)

In [13]:
# Word2Vec features for split 2, built from the train and test email bodies
word2vec_2 = util.word2vec_features(
    train_2['body'],
    test_2['body'],
    min_count=5,
    vector_size=100,
)

In [14]:
# Unpack the split-2 Word2Vec result: train features, test features and the vectorizer
word2vec_train_2, word2vec_test_2, word2vec_model_2 = (
    word2vec_2[key] for key in ('word2vec_train', 'word2vec_test', 'vectorizer')
)

The resulting feature sets look like the following:

In [15]:
# Preview the first five rows of the split-1 Word2Vec test features
word2vec_test_1.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,-0.059411,-0.35391,0.045215,0.109553,-0.021565,-0.044673,0.080055,0.086057,-0.234316,0.048573,0.272643,-0.146713,0.009622,0.095768,-0.017212,-0.073908,-0.500923,0.00685,-0.302793,-0.221156,0.282987,-0.207737,-0.181999,-0.097314,-0.272142,0.228926,-0.071726,0.3016,0.161157,0.145219,0.494553,-0.011484,-0.245688,0.21462,-0.228905,-0.026241,-0.162872,-0.003657,-0.281354,-0.405219,-0.120583,-0.325413,-0.105043,0.286575,-0.024071,0.177438,0.019602,-0.049864,0.295219,0.031017,0.201451,-0.067469,0.277258,0.020689,0.009271,0.02435,0.049697,-0.274682,0.272032,0.489007,-0.149495,-0.245723,0.055711,-0.160236,0.29236,-0.669207,-0.615251,-0.156427,0.011114,-0.456575,-0.0645,0.216325,0.09184,0.003066,-0.427741,0.069178,0.074326,-0.244037,0.217306,-0.419903,0.282016,0.136086,0.147143,-0.246018,0.137831,0.053576,0.380128,0.070082,-0.104548,-0.101143,-0.560325,0.107797,0.134958,0.151812,-0.419027,0.565784,-0.119244,-0.148622,0.14677,-0.147749
1,-0.256282,-0.259944,-0.042743,0.267715,0.290777,-0.323509,-0.02385,0.109896,-0.486081,-0.022021,0.159474,-0.391171,-0.00112,0.287805,-0.005677,0.075615,-0.513079,0.192631,-0.456363,-0.005056,0.516218,-0.101829,-0.079302,-0.379964,-0.316378,0.258779,-0.077995,0.373458,0.227822,0.15617,0.529097,0.183163,-0.221887,-0.116392,-0.452853,0.113514,-0.050165,-0.203261,-0.154643,-0.278653,-0.052768,-0.18443,-0.103623,0.198445,-0.390331,0.219462,0.08809,-0.282896,0.322658,0.097885,0.131543,0.038947,0.086405,-0.098545,-0.248415,0.090331,-0.114279,-0.140732,0.484846,0.381592,-0.25912,-0.293109,0.084275,-0.254551,0.214284,-0.617412,-0.270717,0.040256,0.100395,-0.422042,-0.249635,0.319152,-0.094174,-0.009969,-0.187924,-0.076716,0.148517,-0.097775,0.011362,-0.302856,0.242402,-0.021135,0.344365,-0.350195,-0.072976,0.258367,0.056736,0.188518,0.025506,-0.009265,-0.680733,0.081386,-0.084562,0.348882,-0.37719,0.425992,-0.271304,-0.208021,0.046308,-0.027428
2,0.699268,-0.589767,-0.169791,-0.042476,0.055139,0.083013,-0.196328,0.203763,-0.490879,0.652965,0.433774,-0.440062,0.137577,-0.042008,0.291753,-0.007267,-0.977844,0.181623,-0.392348,0.135306,0.935707,-0.419309,-0.375127,-0.006005,-0.448433,0.552309,-0.394924,0.442917,-0.314007,0.202757,0.863679,-0.206095,-0.408089,0.335372,-0.242549,-0.209114,-0.251597,-0.296239,-0.072696,-0.50094,-0.343901,-0.527076,-0.415081,0.344201,-0.291108,0.534659,-0.28286,0.047169,0.777051,-0.259156,0.59994,-0.174087,0.658543,-0.266249,0.001311,-0.37707,0.303544,-0.208413,0.403103,0.530164,0.241525,-0.269665,-0.041374,0.072958,0.299189,-0.784613,-1.170357,-0.399872,-0.269069,-0.859658,-0.161804,0.44009,-0.161084,-0.063396,-0.411465,0.359565,0.091093,-0.228794,0.126797,-0.521235,0.70958,0.212713,0.304181,-0.532956,0.313286,-0.016799,0.302807,0.086481,0.295397,-0.250354,-0.750001,-0.142899,0.470672,0.365808,-0.735498,0.916626,-0.144477,-0.596242,0.411535,-0.384207
3,0.006588,-0.371292,-0.025022,0.052958,0.002633,0.059758,0.036636,0.119935,-0.390216,0.065206,0.370295,-0.2667,0.01637,0.228103,0.162969,0.00578,-0.563975,0.131039,-0.467267,-0.060001,0.602608,-0.280675,-0.076126,-0.175007,-0.274279,0.366323,-0.085136,0.404179,0.116996,0.069141,0.598822,-0.077345,-0.400098,0.28393,-0.275309,-0.057183,-0.180276,-0.153826,-0.153946,-0.417055,-0.085733,-0.298864,-0.157868,0.310639,-0.331018,0.296891,0.121277,-0.14226,0.305415,0.037922,0.401344,-0.033582,0.027285,-0.067356,-0.131233,0.073001,0.056946,-0.350381,0.426484,0.402103,0.012769,-0.250113,0.190048,-0.242095,0.411724,-0.694849,-0.664402,-0.196794,0.138977,-0.424888,-0.108276,0.296323,-0.138955,0.020564,-0.369704,-0.113967,0.074546,-0.322126,0.035896,-0.371119,0.487305,0.074189,0.254311,-0.298249,0.160748,-0.048969,0.109537,0.096077,0.090411,-0.120228,-0.592898,-0.076147,0.133348,0.16724,-0.457461,0.646159,-0.119783,-0.279941,0.120887,-0.120036
4,0.05548,-0.343332,0.025302,0.031018,0.127881,-0.049797,-0.093276,0.148523,-0.321515,0.153197,0.37503,-0.246415,0.057209,0.197798,-0.061523,0.035101,-0.572544,0.153227,-0.413321,-0.103947,0.490166,-0.283943,-0.168344,-0.1553,-0.353389,0.300425,-0.135448,0.355497,0.053242,0.148434,0.540734,-0.042484,-0.29604,0.075677,-0.289172,-0.070313,-0.091459,-0.112792,-0.166921,-0.394095,-0.063711,-0.214773,-0.111124,0.334764,-0.361649,0.16525,0.006029,-0.13638,0.371635,-0.042852,0.273087,-0.072639,0.175098,-0.039809,-0.067486,0.052938,0.091513,-0.300205,0.40507,0.479374,-0.128981,-0.275131,0.251542,-0.182351,0.299636,-0.669147,-0.531909,-0.143316,0.120173,-0.598163,-0.126322,0.297088,-0.001796,-0.011017,-0.297535,-0.011648,0.195757,-0.228249,0.176212,-0.428003,0.312973,0.132835,0.239901,-0.355886,0.176043,0.119695,0.241875,0.02361,-0.071816,-0.084769,-0.6525,-0.05712,0.132571,0.257331,-0.432545,0.647439,-0.23109,-0.151654,0.106187,-0.066126


It should be noted that in this case the individual columns are not interpretable in the way that each tf-idf column corresponds to one word. This tabular representation is purely for convenience and consistency; it won't matter during the prediction step.

## Feature Selection

In order to further reduce the dimensions of the feature matrix, the number of features is halved by keeping only the top-scoring features according to the **chi-squared** feature selection method. Only the tf-idf features are reduced this way; the Word2Vec features are used as they are.

### TF-IDF

In [16]:
# Chi-squared selection on the split-1 tf-idf features, using the train labels
selected_tfidf_1 = util.chi2_feature_selection(
    tfidf_train_1,
    train_1['class'],
    tfidf_test_1,
    percentile=50,
)

In [17]:
# Unpack the split-1 selection result: reduced train/test features and the selector
tfidf_sel_train_1, tfidf_sel_test_1, tfidf_sel_model_1 = (
    selected_tfidf_1[key] for key in ('features_train', 'features_test', 'selector')
)

In [18]:
# Chi-squared selection on the split-2 tf-idf features, using the train labels
selected_tfidf_2 = util.chi2_feature_selection(
    tfidf_train_2,
    train_2['class'],
    tfidf_test_2,
    percentile=50,
)

In [19]:
# Unpack the split-2 selection result: reduced train/test features and the selector
tfidf_sel_train_2, tfidf_sel_test_2, tfidf_sel_model_2 = (
    selected_tfidf_2[key] for key in ('features_train', 'features_test', 'selector')
)

The now-reduced test set:

In [20]:
# Preview the first five rows of the reduced split-1 tf-idf test features
tfidf_sel_test_1.head(n=5)

Unnamed: 0,15,2000,2001,2002,2015,2016,2018,2019,30,713,853,absa,access,account,activity,address,administrator,agreement,alert,already,also,app,apple,attach,august,avoid,bank,banking,bill,block,browser,buy,california,call,cancel,card,cc,chase,chris,click,come,comment,conference,confirm,continue,contract,copyright,could,customer,daily,date,david,de,deal,dear,december,delete,deliver,delivery,detail,device,direct,download,draft,due,dynegy,ect,ee,email,employee,ena,energy,enron,enronxgate,ensure,even,event,expire,express,failure,fax,federal,ferc,final,firm,forward,friday,full,fw,game,gas,get,give,go,good,grant,great,group,hello,help,hope,houston,id,immediately,important,inbox,incoming,information,instruction,interest,january,jeff,john,jones,jose,july,kindly,know,last,let,letter,like,limit,line,link,list,login,look,mail,mailbox,mailto,many,mark,market,meet,meeting,member,mike,million,monday,month,morning,much,natural,north,notice,notification,november,october,offer,one,online,option,party,password,paul,payment,paypal,pending,people,per,personal,phone,plan,pm,point,position,power,price,privacy,process,profile,program,project,promise,protect,question,quota,re,receive,recent,register,reply,report,request,research,reserve,right,risk,safe,sara,say,schedule,scott,secure,security,see,sell,sender,september,server,service,short,sign,sincerely,smith,space,spam,start,statement,step,steve,still,stock,storage,subject,support,susan,talk,team,tell,thank,thanks,thing,think,thursday,time,today,trade,trading,tuesday,two,update,upgrade,urladdress,usaa,use,user,utility,validate,verification,verify,view,want,wednesday,week,within,work,would,year
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.389012,0.452369,0.0,0.0,0.0,0.0,0.0,0.0,0.406018,0.0,0.0,0.0,0.0,0.0,0.334496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.321855,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.292842,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.267199,0.0,0.0,0.0,0.0,0.0,0.32642,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.046822,0.0,0.0,0.0,0.0,0.0,0.0,0.057681,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04535,0.0,0.0,0.0,0.060499,0.0,0.0,0.0,0.0,0.0,0.0,0.055364,0.0,0.213605,0.206905,0.0,0.0,0.181198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042958,0.0,0.0,0.0,0.0,0.189622,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066628,0.062401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.121398,0.0,0.0,0.434566,0.0,0.044003,0.0,0.048868,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.32955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047288,0.0,0.0,0.0,0.0,0.0,0.0,0.059072,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.061527,0.0,0.0,0.037755,0.0,0.0,0.066776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055727,0.0,0.0,0.05944,0.0,0.0,0.0,0.0,0.0,0.039619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.074047,0.167806,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096227,0.0,0.0,0.0,0.0,0.140824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.049466,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.077733,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062933,0.0,0.083274,0.07206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.382583,0.274008,0.0,0.0,0.0,0.0,0.15013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057313,0.0,0.0,0.0,0.0,0.0,0.051677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.089982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.065075,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.167449,0.0,0.0,0.0,0.0,0.074147,0.0,0.0,0.08178,0.068655,0.0,0.0,0.0,0.0,0.0,0.065998,0.119417,0.0
3,0.0,0.044717,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030051,0.0,0.0,0.043802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.044301,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.215195,0.0,0.043705,0.0,0.0,0.0,0.0,0.037514,0.0,0.0,0.0,0.0,0.0,0.0,0.039619,0.0,0.0,0.0,0.0,0.0,0.070488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043336,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.116103,0.035545,0.0,0.0,0.0,0.085567,0.0,0.0,0.087357,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032567,0.0,0.0,0.031276,0.037253,0.041384,0.035811,0.046426,0.038513,0.045605,0.0,0.0,0.0,0.0,0.0,0.070234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039555,0.031688,0.0,0.0,0.043327,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0795,0.0,0.0,0.0,0.0,0.0393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062037,0.0,0.0,0.0,0.0,0.0,0.030631,0.0,0.034005,0.0,0.032632,0.030077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031963,0.0,0.0,0.0,0.0,0.025682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040359,0.0,0.0,0.0,0.0,0.0,0.0,0.03234,0.046202,0.0,0.0,0.0,0.038869,0.0,0.029923,0.036555,0.042268,0.0,0.0,0.076794,0.0,0.0,0.766313,0.0,0.025989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.036848,0.0,0.0,0.029673,0.037728
4,0.176564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.180497,0.0,0.0,0.111525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.145838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15655,0.0,0.0,0.0,0.096174,0.0,0.0,0.183347,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.128892,0.149106,0.0,0.0,0.147439,0.0,0.0,0.275617,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125415,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.258305,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.367683,0.0,0.0,0.0,0.0,0.0,0.159733,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.144677,0.0,0.0,0.0,0.0,0.109784,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117439,0.0


## Final Dataset Creation

Before using the features with the machine learning algorithms, it is best to tidy up the datasets by combining the features with the id and class columns in a single DataFrame.

### TF-IDF

In [21]:
# Split 1, tf-idf: work on copies so the selected-feature frames stay untouched,
# then place 'email_id' and 'email_class' in front of the feature columns.
final_tfidf_train_1 = tfidf_sel_train_1.copy()
final_tfidf_train_1.insert(loc=0, column='email_id', value=train_1['id'])
final_tfidf_train_1.insert(loc=1, column='email_class', value=train_1['class'])

final_tfidf_test_1 = tfidf_sel_test_1.copy()
final_tfidf_test_1.insert(loc=0, column='email_id', value=test_1['id'])
final_tfidf_test_1.insert(loc=1, column='email_class', value=test_1['class'])

In [22]:
# Split 2, tf-idf: work on copies so the selected-feature frames stay untouched,
# then place 'email_id' and 'email_class' in front of the feature columns.
final_tfidf_train_2 = tfidf_sel_train_2.copy()
final_tfidf_train_2.insert(loc=0, column='email_id', value=train_2['id'])
final_tfidf_train_2.insert(loc=1, column='email_class', value=train_2['class'])

final_tfidf_test_2 = tfidf_sel_test_2.copy()
final_tfidf_test_2.insert(loc=0, column='email_id', value=test_2['id'])
final_tfidf_test_2.insert(loc=1, column='email_class', value=test_2['class'])

### Word2Vec

In [23]:
# Split 1, Word2Vec: work on copies so the feature frames stay untouched,
# then place 'email_id' and 'email_class' in front of the feature columns.
final_word2vec_train_1 = word2vec_train_1.copy()
final_word2vec_train_1.insert(loc=0, column='email_id', value=train_1['id'])
final_word2vec_train_1.insert(loc=1, column='email_class', value=train_1['class'])

final_word2vec_test_1 = word2vec_test_1.copy()
final_word2vec_test_1.insert(loc=0, column='email_id', value=test_1['id'])
final_word2vec_test_1.insert(loc=1, column='email_class', value=test_1['class'])

In [24]:
# Split 2, Word2Vec: work on copies so the feature frames stay untouched,
# then place 'email_id' and 'email_class' in front of the feature columns.
final_word2vec_train_2 = word2vec_train_2.copy()
final_word2vec_train_2.insert(loc=0, column='email_id', value=train_2['id'])
final_word2vec_train_2.insert(loc=1, column='email_class', value=train_2['class'])

final_word2vec_test_2 = word2vec_test_2.copy()
final_word2vec_test_2.insert(loc=0, column='email_id', value=test_2['id'])
final_word2vec_test_2.insert(loc=1, column='email_class', value=test_2['class'])

### Saving the Results

In [25]:
# Write the four chi2-selected tf-idf datasets into the CSV directory,
# in the order train/test of split 1, then train/test of split 2.
for frame, filename in (
    (final_tfidf_train_1, 'tfidf_chi2_train_1.csv'),
    (final_tfidf_test_1, 'tfidf_chi2_test_1.csv'),
    (final_tfidf_train_2, 'tfidf_chi2_train_2.csv'),
    (final_tfidf_test_2, 'tfidf_chi2_test_2.csv'),
):
    save_to_csv(frame, csv_path, filename)

Saving to /home/ichanis/projects/phishing_public/data/csv/tfidf_chi2_train_1.csv
Saving to /home/ichanis/projects/phishing_public/data/csv/tfidf_chi2_test_1.csv
Saving to /home/ichanis/projects/phishing_public/data/csv/tfidf_chi2_train_2.csv
Saving to /home/ichanis/projects/phishing_public/data/csv/tfidf_chi2_test_2.csv


In [26]:
# Write the four Word2Vec datasets into the CSV directory,
# in the order train/test of split 1, then train/test of split 2.
for frame, filename in (
    (final_word2vec_train_1, 'word2vec_train_1.csv'),
    (final_word2vec_test_1, 'word2vec_test_1.csv'),
    (final_word2vec_train_2, 'word2vec_train_2.csv'),
    (final_word2vec_test_2, 'word2vec_test_2.csv'),
):
    save_to_csv(frame, csv_path, filename)

Saving to /home/ichanis/projects/phishing_public/data/csv/word2vec_train_1.csv
Saving to /home/ichanis/projects/phishing_public/data/csv/word2vec_test_1.csv
Saving to /home/ichanis/projects/phishing_public/data/csv/word2vec_train_2.csv
Saving to /home/ichanis/projects/phishing_public/data/csv/word2vec_test_2.csv
