In [1]:
import pandas as pd
# Let pandas render up to 250 columns so the wide feature matrices below are readable.
pd.set_option('display.max_columns', 250)

import features as util

from ast import literal_eval

## Feature Extraction

Before inputting the emails into the machine learning algorithms, they have to be converted to numbers.<br>
This process is called **feature extraction**, or **vectorization**. We will try different methods of achieving this, in order to compare their results.

In [2]:
# Load the first train/test split. The first CSV column becomes the index and the
# 'body' column is parsed from its Python-literal string form via literal_eval.
train_1 = pd.read_csv(
    './data/csv/train_1.csv',
    index_col=0,
    converters={'body': literal_eval},
)
test_1 = pd.read_csv(
    './data/csv/test_1.csv',
    index_col=0,
    converters={'body': literal_eval},
)

In [3]:
# Load the second train/test split with the same parsing rules as the first one:
# first CSV column as index, 'body' parsed back with literal_eval.
train_2 = pd.read_csv(
    './data/csv/train_2.csv',
    index_col=0,
    converters={'body': literal_eval},
)
test_2 = pd.read_csv(
    './data/csv/test_2.csv',
    index_col=0,
    converters={'body': literal_eval},
)

### TF-IDF

One of the most basic ways is to calculate the **tf-idf** (term frequency-inverse document frequency) score of the emails.<br>
In order to have a lower dimensionality and since not all words from the corpus will be of importance, we use only the top 500 most frequent terms.

In [4]:
# Compute tf-idf features for dataset 1 from the email bodies.
# max_features=500 matches the "top 500 terms" limit described above;
# min_df=5 is passed through to the helper (presumably a minimum document frequency — confirm in features.py).
tfidf_1 = util.tfidf_features(
    train_1['body'],
    test_1['body'],
    min_df=5,
    max_features=500,
)

In [5]:
# Pull the train matrix, test matrix and the object stored under 'vectorizer'
# out of the result returned by util.tfidf_features.
tfidf_train_1, tfidf_test_1, tfidf_model_1 = (
    tfidf_1[key] for key in ('tfidf_train', 'tfidf_test', 'vectorizer')
)

In [6]:
# Compute tf-idf features for dataset 2 with the same settings as dataset 1.
tfidf_2 = util.tfidf_features(
    train_2['body'],
    test_2['body'],
    min_df=5,
    max_features=500,
)

In [7]:
# Pull the train matrix, test matrix and the object stored under 'vectorizer'
# out of the dataset-2 tf-idf result.
tfidf_train_2, tfidf_test_2, tfidf_model_2 = (
    tfidf_2[key] for key in ('tfidf_train', 'tfidf_test', 'vectorizer')
)

As an example, we can see a part of the calculated matrix for the first test set:

In [8]:
# Preview the first five rows of the tf-idf test matrix for dataset 1.
tfidf_test_1.head(n=5)

Unnamed: 0,10,11,12,14,15,16,20,2000,2001,2002,2015,2016,2018,2019,24,27,30,713,853,able,absa,accept,access,account,action,activity,add,additional,address,administrator,advise,agree,agreement,alert,allow,already,also,america,american,amount,another,answer,app,apple,application,approval,approve,area,ask,attach,attachment,august,available,avoid,back,bank,banking,base,believe,best,bill,billion,block,bond,book,box,browser,business,buy,ca,california,call,cancel,capacity,capital,card,care,case,cause,cc,center,change,charge,chase,check,choose,chris,click,close,code,come,comment,communication,company,complete,concern,conference,confidential,confirm,confirmation,consider,contact,contain,content,continue,contract,copy,copyright,cost,could,create,credit,current,currently,customer,daily,data,database,date,david,day,de,deal,dear,december,...,reply,report,request,require,research,reserve,response,result,return,review,right,risk,road,run,safe,sale,san,sara,say,schedule,scott,secure,security,see,select,sell,send,sender,sent,september,serve,server,service,set,share,short,show,sign,since,sincerely,sit,site,smith,software,someone,soon,space,spam,special,staff,standard,start,state,statement,step,steve,still,stock,storage,street,subject,summary,supply,support,sure,susan,system,take,talk,team,technology,tell,term,texas,text,thank,thanks,thing,think,three,thursday,time,today,total,trade,trading,transaction,transfer,try,tuesday,two,type,united,unsubscribe,update,upgrade,urladdress,usaa,use,user,utility,validate,value,verification,verify,version,via,view,visit,wait,want,way,web,wednesday,week,well,window,within,without,work,world,would,write,year,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.389012,0.452369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.406018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.334496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.321855,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.292842,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.267199,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.32642,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.06309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.046822,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057681,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.053282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.069525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.067692,0.04535,0.0,0.050052,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055364,0.0,0.0,0.061849,0.0,0.0,0.213605,0.0,0.0,0.206905,0.0,0.049012,0.0,0.181198,0.0,0.0,...,0.0,0.059072,0.0,0.0,0.0,0.0,0.0,0.062401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12965,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.061527,0.0,0.0,0.0,0.037755,0.0716,0.0,0.0,0.0,0.0,0.0,0.0,0.066776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055727,0.201682,0.0,0.0,0.0,0.0,0.0,0.05944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039619,0.0,0.0,0.0,0.480843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176352,0.0,0.0,0.070329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.074047,0.167806,0.0,0.0,0.072982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.656607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.140824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.049466,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057313,0.0,0.0,0.0,0.044159,0.0,0.0,0.0,0.0,0.0,0.051677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.089982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.067682,0.064069,0.0,0.065075,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.167449,0.0,0.0,0.0,0.0,0.074147,0.0,0.0,0.079335,0.08178,0.068655,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.065998,0.0,0.119417,0.0,0.0,0.0
3,0.0,0.0,0.04584,0.0,0.0,0.0,0.044932,0.044717,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104853,0.0,0.030051,0.0,0.0,0.0,0.0,0.043802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038689,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.079109,0.0,0.0,0.0,0.044301,0.040786,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.215195,0.0,0.0,0.0,0.043705,0.0,0.034747,0.036998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.034787,0.037514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096449,0.0,0.039619,0.0,0.0,...,0.030631,0.0,0.034005,0.0,0.0,0.032632,0.0,0.0,0.0,0.0,0.030077,0.0,0.0,0.0,0.0,0.138239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031963,0.0,0.0,0.175561,0.0,0.0,0.0,0.0,0.0,0.025682,0.0,0.0,0.0,0.042352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039684,0.0,0.0,0.098014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040359,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03184,0.0,0.03234,0.0,0.046202,0.037201,0.0,0.0,0.0,0.0,0.0,0.038869,0.0,0.0,0.029923,0.036555,0.0,0.042268,0.0,0.0,0.0,0.0,0.0,0.076794,0.0,0.0,0.139731,0.0,0.0,0.766313,0.0,0.025989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03715,0.0,0.0,0.080304,0.0,0.0,0.036848,0.0,0.0,0.0,0.0,0.0,0.0,0.029673,0.043705,0.037728,0.0
4,0.0,0.0,0.0,0.0,0.176564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176564,0.0,0.0,0.0,0.0,0.138328,0.166956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.301687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.258305,0.0,0.0,0.171478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.367683,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.159733,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.144677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.109784,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.146034,0.0,0.0,0.0,0.0,0.0,0.117439,0.0,0.0,0.0


### Word2Vec

A more advanced technique is **Word Embedding**, which calculates a vector for each word based on the probability distribution of this word appearing before or after another. In other words, words belonging to the same context usually appear close to each other in the corpus, so they will be closer in the vector space as well.<br>
The chosen implementation is **Word2Vec**.

After the vectors for each word are calculated, they are averaged over the words of each email, resulting in a single vector per email.

In [9]:
# Compute Word2Vec-based features for dataset 1 from the email bodies.
# vector_size=100 matches the 100 feature columns shown in the preview below;
# min_count=5 is passed through to the helper (presumably a minimum word count — confirm in features.py).
word2vec_1 = util.word2vec_features(
    train_1['body'],
    test_1['body'],
    vector_size=100,
    min_count=5,
)

In [10]:
# Pull the train features, test features and the object stored under 'vectorizer'
# out of the dataset-1 Word2Vec result.
word2vec_train_1, word2vec_test_1, word2vec_model_1 = (
    word2vec_1[key] for key in ('word2vec_train', 'word2vec_test', 'vectorizer')
)

In [11]:
# Compute Word2Vec-based features for dataset 2 with the same settings as dataset 1.
word2vec_2 = util.word2vec_features(
    train_2['body'],
    test_2['body'],
    vector_size=100,
    min_count=5,
)

In [12]:
# Pull the train features, test features and the object stored under 'vectorizer'
# out of the dataset-2 Word2Vec result.
word2vec_train_2, word2vec_test_2, word2vec_model_2 = (
    word2vec_2[key] for key in ('word2vec_train', 'word2vec_test', 'vectorizer')
)

The resulting feature sets are like the following:

In [13]:
# Preview the first five rows of the Word2Vec test features for dataset 1.
word2vec_test_1.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,-0.0545,-0.345685,0.037968,0.086598,-0.03693,-0.068189,0.108589,0.071854,-0.195268,0.052366,0.273533,-0.168288,0.011111,0.088411,0.008299,-0.067587,-0.478809,0.001017,-0.274377,-0.198739,0.244952,-0.193749,-0.161803,-0.073007,-0.241047,0.204149,-0.067016,0.242496,0.144678,0.160804,0.490281,-0.014754,-0.218524,0.236873,-0.221736,-0.034477,-0.14244,-0.02633,-0.308279,-0.360368,-0.145738,-0.346196,-0.126072,0.309843,-0.045301,0.20432,0.04376,-0.067377,0.311563,0.02445,0.182228,-0.051139,0.289488,0.018125,-0.013179,0.011727,0.070284,-0.259069,0.256869,0.515745,-0.15229,-0.266758,0.053989,-0.169579,0.305863,-0.690191,-0.632376,-0.157442,0.012075,-0.486596,-0.071335,0.211375,0.082437,-0.008678,-0.451962,0.087295,0.075421,-0.240893,0.209389,-0.458099,0.280147,0.10776,0.168113,-0.227395,0.155713,0.074656,0.385296,0.077223,-0.106947,-0.120849,-0.574669,0.1167,0.142215,0.17844,-0.444217,0.611354,-0.131721,-0.1595,0.163242,-0.145887
1,-0.238579,-0.256701,-0.050489,0.238103,0.250861,-0.353979,0.007401,0.090826,-0.458954,-0.011821,0.196375,-0.407376,0.031735,0.267343,0.021265,0.091862,-0.512562,0.20233,-0.453806,0.008426,0.557636,-0.109872,-0.060504,-0.368675,-0.301335,0.257987,-0.063066,0.32825,0.198533,0.15511,0.534908,0.150936,-0.221617,-0.088506,-0.466718,0.106446,-0.038295,-0.228205,-0.14282,-0.284804,-0.070367,-0.186716,-0.125932,0.215745,-0.407179,0.229372,0.114185,-0.294271,0.304589,0.08547,0.158717,0.032148,0.101979,-0.08162,-0.274581,0.066851,-0.082578,-0.139784,0.487574,0.373012,-0.245664,-0.305804,0.096474,-0.265602,0.232574,-0.59783,-0.282921,0.038646,0.120315,-0.436757,-0.222514,0.305223,-0.09419,0.006555,-0.205888,-0.055385,0.109619,-0.103182,0.007798,-0.31328,0.235099,-0.045026,0.334265,-0.345436,-0.039569,0.232784,0.039378,0.17515,0.044577,-0.04104,-0.647549,0.07542,-0.083274,0.360364,-0.369725,0.456672,-0.282061,-0.209696,0.065058,-0.021624
2,0.68807,-0.574785,-0.199136,-0.10174,0.013608,0.064906,-0.112716,0.151884,-0.425512,0.648127,0.465583,-0.455619,0.148448,-0.06608,0.328757,0.027602,-0.909457,0.190409,-0.331823,0.149575,0.950582,-0.39568,-0.334647,0.052568,-0.388543,0.492972,-0.360021,0.337634,-0.34972,0.199747,0.833669,-0.240719,-0.360934,0.409906,-0.253063,-0.228861,-0.222251,-0.320278,-0.096021,-0.465716,-0.353268,-0.540939,-0.428347,0.388915,-0.295312,0.583248,-0.25276,-0.00443,0.777461,-0.237886,0.643882,-0.19889,0.656404,-0.255565,-0.061443,-0.405294,0.348984,-0.235652,0.447003,0.567731,0.249221,-0.340604,-0.007659,0.0461,0.341591,-0.814204,-1.227026,-0.379805,-0.222511,-0.881764,-0.126341,0.442958,-0.183629,-0.06972,-0.486712,0.393232,0.043903,-0.229717,0.093874,-0.538247,0.633944,0.154905,0.287958,-0.494354,0.365124,-0.042803,0.259795,0.092508,0.318663,-0.262906,-0.69125,-0.132058,0.438065,0.400075,-0.715863,0.956828,-0.134589,-0.606024,0.444043,-0.365733
3,0.020995,-0.317775,-0.037971,0.00139,-0.041139,0.052207,0.065918,0.089813,-0.32288,0.068179,0.354588,-0.23689,0.013439,0.187818,0.183623,0.012355,-0.486981,0.118756,-0.390418,-0.053835,0.556748,-0.254981,-0.032929,-0.121014,-0.227365,0.317771,-0.062952,0.329615,0.087117,0.078905,0.558046,-0.078902,-0.370989,0.28657,-0.285474,-0.054387,-0.152974,-0.166495,-0.160384,-0.393211,-0.086493,-0.312166,-0.161202,0.335823,-0.371279,0.325193,0.135637,-0.173924,0.318198,0.045852,0.39924,-0.031704,0.033938,-0.070627,-0.170283,0.055689,0.084839,-0.355696,0.430381,0.425638,-0.004301,-0.278343,0.21573,-0.262024,0.435034,-0.736664,-0.6733,-0.204216,0.140224,-0.463006,-0.123632,0.300093,-0.140837,0.039076,-0.411065,-0.080564,0.065134,-0.344158,0.045359,-0.417074,0.471377,0.059733,0.278709,-0.290249,0.176518,-0.060933,0.115345,0.106215,0.105507,-0.136839,-0.603158,-0.099293,0.139593,0.201741,-0.48088,0.695983,-0.139414,-0.278509,0.142524,-0.105992
4,0.062906,-0.341021,0.009823,0.003632,0.10623,-0.064133,-0.054515,0.132457,-0.256983,0.149358,0.381921,-0.249509,0.070982,0.163342,-0.043897,0.058332,-0.560504,0.171875,-0.392623,-0.103881,0.510878,-0.26471,-0.152565,-0.13283,-0.330599,0.268361,-0.111169,0.311045,0.027462,0.15195,0.52431,-0.037359,-0.267665,0.095584,-0.266194,-0.083098,-0.071283,-0.121729,-0.169724,-0.372349,-0.061589,-0.236213,-0.122711,0.348766,-0.367557,0.192339,0.024526,-0.16659,0.374682,-0.040524,0.283656,-0.084679,0.177407,-0.045423,-0.098089,0.038087,0.115293,-0.283706,0.391888,0.467213,-0.134365,-0.293134,0.269058,-0.194046,0.319391,-0.683737,-0.5533,-0.130999,0.125271,-0.618512,-0.128308,0.295131,0.002842,-0.002012,-0.349119,0.01627,0.168833,-0.239511,0.179658,-0.444532,0.278913,0.117769,0.233393,-0.33555,0.17705,0.094069,0.240506,0.040487,-0.063037,-0.097045,-0.622871,-0.0635,0.120849,0.27671,-0.423564,0.670054,-0.230851,-0.14497,0.120815,-0.054185


It should be noted that in this case the columns are not individually interpretable the way a tf-idf column corresponds to one word. This tabular representation is used purely for convenience and consistency; it won't matter during the prediction step.

## Feature Selection

In order to further reduce the dimensions of the feature matrix, the number of selected features will be halved using the top features according to the **chi-squared** feature selection method.

### TF-IDF

In [14]:
# Chi-squared feature selection on the dataset-1 tf-idf features, using the
# training class labels; percentile=50 corresponds to halving the feature count.
selected_tfidf_1 = util.chi2_feature_selection(
    tfidf_train_1,
    train_1['class'],
    tfidf_test_1,
    percentile=50,
)

In [15]:
# Pull the reduced train features, reduced test features and the object stored
# under 'selector' out of the dataset-1 selection result.
tfidf_sel_train_1, tfidf_sel_test_1, tfidf_sel_model_1 = (
    selected_tfidf_1[key] for key in ('features_train', 'features_test', 'selector')
)

In [16]:
# Chi-squared feature selection on the dataset-2 tf-idf features, with the same
# percentile=50 setting as dataset 1.
selected_tfidf_2 = util.chi2_feature_selection(
    tfidf_train_2,
    train_2['class'],
    tfidf_test_2,
    percentile=50,
)

In [17]:
# Pull the reduced train features, reduced test features and the object stored
# under 'selector' out of the dataset-2 selection result.
tfidf_sel_train_2, tfidf_sel_test_2, tfidf_sel_model_2 = (
    selected_tfidf_2[key] for key in ('features_train', 'features_test', 'selector')
)

The now-reduced test set:

In [18]:
# Preview the first five rows of the reduced tf-idf test features for dataset 1.
tfidf_sel_test_1.head(n=5)

Unnamed: 0,15,2000,2001,2002,2015,2016,2018,2019,30,713,853,absa,access,account,activity,address,administrator,agreement,alert,already,also,app,apple,attach,august,avoid,bank,banking,bill,block,browser,buy,california,call,cancel,card,cc,chase,chris,click,come,comment,conference,confirm,continue,contract,copyright,could,customer,daily,date,david,de,deal,dear,december,delete,deliver,delivery,detail,device,direct,download,draft,due,dynegy,ect,ee,email,employee,ena,energy,enron,enronxgate,ensure,even,event,expire,express,failure,fax,federal,ferc,final,firm,forward,friday,full,fw,game,gas,get,give,go,good,grant,great,group,hello,help,hope,houston,id,immediately,important,inbox,incoming,information,instruction,interest,january,jeff,john,jones,jose,july,kindly,know,last,let,letter,like,limit,line,link,list,login,look,mail,mailbox,mailto,many,mark,market,meet,meeting,member,mike,million,monday,month,morning,much,natural,north,notice,notification,november,october,offer,one,online,option,party,password,paul,payment,paypal,pending,people,per,personal,phone,plan,pm,point,position,power,price,privacy,process,profile,program,project,promise,protect,question,quota,re,receive,recent,register,reply,report,request,research,reserve,right,risk,safe,sara,say,schedule,scott,secure,security,see,sell,sender,september,server,service,short,sign,sincerely,smith,space,spam,start,statement,step,steve,still,stock,storage,subject,support,susan,talk,team,tell,thank,thanks,thing,think,thursday,time,today,trade,trading,tuesday,two,update,upgrade,urladdress,usaa,use,user,utility,validate,verification,verify,view,want,wednesday,week,within,work,would,year
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.389012,0.452369,0.0,0.0,0.0,0.0,0.0,0.0,0.406018,0.0,0.0,0.0,0.0,0.0,0.334496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.321855,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.292842,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.267199,0.0,0.0,0.0,0.0,0.0,0.32642,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.046822,0.0,0.0,0.0,0.0,0.0,0.0,0.057681,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04535,0.0,0.0,0.0,0.060499,0.0,0.0,0.0,0.0,0.0,0.0,0.055364,0.0,0.213605,0.206905,0.0,0.0,0.181198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042958,0.0,0.0,0.0,0.0,0.189622,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066628,0.062401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.121398,0.0,0.0,0.434566,0.0,0.044003,0.0,0.048868,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.32955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047288,0.0,0.0,0.0,0.0,0.0,0.0,0.059072,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.061527,0.0,0.0,0.037755,0.0,0.0,0.066776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055727,0.0,0.0,0.05944,0.0,0.0,0.0,0.0,0.0,0.039619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.074047,0.167806,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096227,0.0,0.0,0.0,0.0,0.140824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.049466,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.077733,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062933,0.0,0.083274,0.07206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.382583,0.274008,0.0,0.0,0.0,0.0,0.15013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057313,0.0,0.0,0.0,0.0,0.0,0.051677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.089982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.065075,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.167449,0.0,0.0,0.0,0.0,0.074147,0.0,0.0,0.08178,0.068655,0.0,0.0,0.0,0.0,0.0,0.065998,0.119417,0.0
3,0.0,0.044717,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030051,0.0,0.0,0.043802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.044301,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.215195,0.0,0.043705,0.0,0.0,0.0,0.0,0.037514,0.0,0.0,0.0,0.0,0.0,0.0,0.039619,0.0,0.0,0.0,0.0,0.0,0.070488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043336,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.116103,0.035545,0.0,0.0,0.0,0.085567,0.0,0.0,0.087357,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032567,0.0,0.0,0.031276,0.037253,0.041384,0.035811,0.046426,0.038513,0.045605,0.0,0.0,0.0,0.0,0.0,0.070234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039555,0.031688,0.0,0.0,0.043327,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0795,0.0,0.0,0.0,0.0,0.0393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062037,0.0,0.0,0.0,0.0,0.0,0.030631,0.0,0.034005,0.0,0.032632,0.030077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031963,0.0,0.0,0.0,0.0,0.025682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040359,0.0,0.0,0.0,0.0,0.0,0.0,0.03234,0.046202,0.0,0.0,0.0,0.038869,0.0,0.029923,0.036555,0.042268,0.0,0.0,0.076794,0.0,0.0,0.766313,0.0,0.025989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.036848,0.0,0.0,0.029673,0.037728
4,0.176564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.180497,0.0,0.0,0.111525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.145838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15655,0.0,0.0,0.0,0.096174,0.0,0.0,0.183347,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.128892,0.149106,0.0,0.0,0.147439,0.0,0.0,0.275617,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125415,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.258305,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.367683,0.0,0.0,0.0,0.0,0.0,0.159733,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.144677,0.0,0.0,0.0,0.0,0.109784,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117439,0.0


## Final Dataset Creation

Before using the features with the machine learning algorithms, it is best to tidy up the datasets by adding the features, the id and the class columns in the same DataFrame.

### TF-IDF

In [19]:
# Assemble the final tf-idf frames for dataset 1: copy the selected features and
# prepend the identifier and label so the layout is [email_id, email_class, features...].
# NOTE(review): pandas aligns an inserted Series on index, so this relies on the
# feature frames sharing the index of train_1 / test_1 — confirm in features.py.
final_tfidf_train_1 = tfidf_sel_train_1.copy()
final_tfidf_train_1.insert(0, 'email_id', train_1['id'])
final_tfidf_train_1.insert(1, 'email_class', train_1['class'])

final_tfidf_test_1 = tfidf_sel_test_1.copy()
final_tfidf_test_1.insert(0, 'email_id', test_1['id'])
final_tfidf_test_1.insert(1, 'email_class', test_1['class'])

In [20]:
# Assemble the final tf-idf frames for dataset 2: copy the selected features and
# prepend the identifier and label so the layout is [email_id, email_class, features...].
# NOTE(review): pandas aligns an inserted Series on index, so this relies on the
# feature frames sharing the index of train_2 / test_2 — confirm in features.py.
final_tfidf_train_2 = tfidf_sel_train_2.copy()
final_tfidf_train_2.insert(0, 'email_id', train_2['id'])
final_tfidf_train_2.insert(1, 'email_class', train_2['class'])

final_tfidf_test_2 = tfidf_sel_test_2.copy()
final_tfidf_test_2.insert(0, 'email_id', test_2['id'])
final_tfidf_test_2.insert(1, 'email_class', test_2['class'])

### Word2Vec

In [21]:
# Assemble the final Word2Vec frames for dataset 1: copy the features and prepend
# the identifier and label so the layout is [email_id, email_class, features...].
# NOTE(review): pandas aligns an inserted Series on index, so this relies on the
# feature frames sharing the index of train_1 / test_1 — confirm in features.py.
final_word2vec_train_1 = word2vec_train_1.copy()
final_word2vec_train_1.insert(0, 'email_id', train_1['id'])
final_word2vec_train_1.insert(1, 'email_class', train_1['class'])

final_word2vec_test_1 = word2vec_test_1.copy()
final_word2vec_test_1.insert(0, 'email_id', test_1['id'])
final_word2vec_test_1.insert(1, 'email_class', test_1['class'])

In [22]:
# Assemble the final Word2Vec frames for dataset 2: copy the features and prepend
# the identifier and label so the layout is [email_id, email_class, features...].
# NOTE(review): pandas aligns an inserted Series on index, so this relies on the
# feature frames sharing the index of train_2 / test_2 — confirm in features.py.
final_word2vec_train_2 = word2vec_train_2.copy()
final_word2vec_train_2.insert(0, 'email_id', train_2['id'])
final_word2vec_train_2.insert(1, 'email_class', train_2['class'])

final_word2vec_test_2 = word2vec_test_2.copy()
final_word2vec_test_2.insert(0, 'email_id', test_2['id'])
final_word2vec_test_2.insert(1, 'email_class', test_2['class'])