In [1]:
import pandas as pd
pd.options.display.max_columns = 250

import features as util

from ast import literal_eval

## Feature Extraction

Before the emails can be fed into the machine learning algorithms, they have to be converted to numbers.<br>
This process is called **feature extraction**, or **vectorization**. We will try different methods of achieving this, in order to compare their results.

In [2]:
# The 'body' column is passed through literal_eval on load, so each cell is
# parsed as a Python literal rather than kept as a raw string
# (presumably a list of tokens -- TODO confirm against the preprocessing step).
train_1 = pd.read_csv(
    './data/csv/train_1.csv',
    index_col=0,
    converters={'body': literal_eval},
)
test_1 = pd.read_csv(
    './data/csv/test_1.csv',
    index_col=0,
    converters={'body': literal_eval},
)

In [3]:
# Second train/test pair, loaded the same way: the first CSV column becomes the
# index and 'body' is parsed with literal_eval instead of being kept as text.
train_2 = pd.read_csv(
    './data/csv/train_2.csv',
    index_col=0,
    converters={'body': literal_eval},
)
test_2 = pd.read_csv(
    './data/csv/test_2.csv',
    index_col=0,
    converters={'body': literal_eval},
)

### TF-IDF

One of the most basic ways is to calculate the **tf-idf** (term frequency-inverse document frequency) score of the emails.<br>
In order to have a lower dimensionality and since not all words from the corpus will be of importance, we use only the top 500 most frequent terms.

In [4]:
# TF-IDF features for dataset 1, capped at 500 terms.
# The result is indexed below with the keys 'tfidf_train', 'tfidf_test'
# and 'vectorizer'.
# NOTE(review): presumably the vectorizer is fitted on the training bodies only
# and then applied to the test bodies -- confirm in features.py.
tfidf_1 = util.tfidf_features(
    train_1['body'],
    test_1['body'],
    min_df=5,
    max_features=500,
)

In [5]:
# Unpack the train features, test features and vectorizer for dataset 1.
tfidf_train_1, tfidf_test_1, tfidf_model_1 = (
    tfidf_1['tfidf_train'],
    tfidf_1['tfidf_test'],
    tfidf_1['vectorizer'],
)

In [6]:
# TF-IDF features for dataset 2, using the same settings as dataset 1
# (min_df=5, at most 500 terms).
tfidf_2 = util.tfidf_features(
    train_2['body'],
    test_2['body'],
    min_df=5,
    max_features=500,
)

In [7]:
# Unpack the train features, test features and vectorizer for dataset 2.
tfidf_train_2, tfidf_test_2, tfidf_model_2 = (
    tfidf_2['tfidf_train'],
    tfidf_2['tfidf_test'],
    tfidf_2['vectorizer'],
)

As an example, we can see a part of the calculated matrix for the first test set:

In [8]:
# Preview the first five rows of the TF-IDF features for test set 1.
tfidf_test_1.head(n=5)

Unnamed: 0,10,11,12,14,15,16,20,2000,2001,2002,2015,2016,2018,2019,24,27,30,713,853,able,absa,accept,access,account,action,activity,add,additional,address,administrator,advise,agree,agreement,alert,allow,already,also,america,american,amount,another,answer,app,apple,application,approval,approve,area,ask,attach,attachment,august,available,avoid,back,bank,banking,base,believe,best,bill,billion,block,bond,book,box,browser,business,buy,ca,california,call,cancel,capacity,capital,card,care,case,cause,cc,center,change,charge,chase,check,choose,chris,click,close,code,come,comment,communication,company,complete,concern,conference,confidential,confirm,confirmation,consider,contact,contain,content,continue,contract,copy,copyright,cost,could,create,credit,current,currently,customer,daily,data,database,date,david,day,de,deal,dear,december,...,reply,report,request,require,research,reserve,response,result,return,review,right,risk,road,run,safe,sale,san,sara,say,schedule,scott,secure,security,see,select,sell,send,sender,sent,september,serve,server,service,set,share,short,show,sign,since,sincerely,sit,site,smith,software,someone,soon,space,spam,special,staff,standard,start,state,statement,step,steve,still,stock,storage,street,subject,summary,supply,support,sure,susan,system,take,talk,team,technology,tell,term,texas,text,thank,thanks,thing,think,three,thursday,time,today,total,trade,trading,transaction,transfer,try,tuesday,two,type,united,unsubscribe,update,upgrade,urladdress,usaa,use,user,utility,validate,value,verification,verify,version,via,view,visit,wait,want,way,web,wednesday,week,well,window,within,without,work,world,would,write,year,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.389012,0.452369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.406018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.334496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.321855,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.292842,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.267199,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.32642,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.06309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.046822,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057681,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.053282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.069525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.067692,0.04535,0.0,0.050052,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055364,0.0,0.0,0.061849,0.0,0.0,0.213605,0.0,0.0,0.206905,0.0,0.049012,0.0,0.181198,0.0,0.0,...,0.0,0.059072,0.0,0.0,0.0,0.0,0.0,0.062401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12965,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.061527,0.0,0.0,0.0,0.037755,0.0716,0.0,0.0,0.0,0.0,0.0,0.0,0.066776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055727,0.201682,0.0,0.0,0.0,0.0,0.0,0.05944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039619,0.0,0.0,0.0,0.480843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176352,0.0,0.0,0.070329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.074047,0.167806,0.0,0.0,0.072982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.656607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.140824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.049466,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057313,0.0,0.0,0.0,0.044159,0.0,0.0,0.0,0.0,0.0,0.051677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.089982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.067682,0.064069,0.0,0.065075,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.167449,0.0,0.0,0.0,0.0,0.074147,0.0,0.0,0.079335,0.08178,0.068655,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.065998,0.0,0.119417,0.0,0.0,0.0
3,0.0,0.0,0.04584,0.0,0.0,0.0,0.044932,0.044717,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104853,0.0,0.030051,0.0,0.0,0.0,0.0,0.043802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038689,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.079109,0.0,0.0,0.0,0.044301,0.040786,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.215195,0.0,0.0,0.0,0.043705,0.0,0.034747,0.036998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.034787,0.037514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096449,0.0,0.039619,0.0,0.0,...,0.030631,0.0,0.034005,0.0,0.0,0.032632,0.0,0.0,0.0,0.0,0.030077,0.0,0.0,0.0,0.0,0.138239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031963,0.0,0.0,0.175561,0.0,0.0,0.0,0.0,0.0,0.025682,0.0,0.0,0.0,0.042352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039684,0.0,0.0,0.098014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040359,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03184,0.0,0.03234,0.0,0.046202,0.037201,0.0,0.0,0.0,0.0,0.0,0.038869,0.0,0.0,0.029923,0.036555,0.0,0.042268,0.0,0.0,0.0,0.0,0.0,0.076794,0.0,0.0,0.139731,0.0,0.0,0.766313,0.0,0.025989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03715,0.0,0.0,0.080304,0.0,0.0,0.036848,0.0,0.0,0.0,0.0,0.0,0.0,0.029673,0.043705,0.037728,0.0
4,0.0,0.0,0.0,0.0,0.176564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176564,0.0,0.0,0.0,0.0,0.138328,0.166956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.301687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.258305,0.0,0.0,0.171478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.367683,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.159733,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.144677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.109784,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.146034,0.0,0.0,0.0,0.0,0.0,0.117439,0.0,0.0,0.0


### Word2Vec

A more advanced technique is **Word Embedding**, which calculates a vector for each word based on the probability distribution of this word appearing before or after another. In other words, words belonging to the same context usually appear close to each other in the corpus, so they will be closer in the vector space as well.<br>
The chosen implementation is **Word2Vec**.

After the vectors for each word are calculated, they are averaged over the words of each email, resulting in a single vector per email.

In [9]:
# Word2Vec features for dataset 1, requesting 100-dimensional vectors.
# The result is indexed below with the keys 'word2vec_train', 'word2vec_test'
# and 'vectorizer'.
word2vec_1 = util.word2vec_features(
    train_1['body'],
    test_1['body'],
    vector_size=100,
    min_count=5,
)

In [10]:
# Unpack the train features, test features and Word2Vec model for dataset 1.
word2vec_train_1, word2vec_test_1, word2vec_model_1 = (
    word2vec_1['word2vec_train'],
    word2vec_1['word2vec_test'],
    word2vec_1['vectorizer'],
)

In [11]:
# Word2Vec features for dataset 2, using the same settings as dataset 1
# (vector_size=100, min_count=5).
word2vec_2 = util.word2vec_features(
    train_2['body'],
    test_2['body'],
    vector_size=100,
    min_count=5,
)

In [12]:
# Unpack the train features, test features and Word2Vec model for dataset 2.
word2vec_train_2, word2vec_test_2, word2vec_model_2 = (
    word2vec_2['word2vec_train'],
    word2vec_2['word2vec_test'],
    word2vec_2['vectorizer'],
)

The resulting feature sets are like the following:

In [13]:
# Preview the first five rows of the Word2Vec features for test set 1.
word2vec_test_1.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,-0.078716,-0.177165,0.017402,0.041569,-0.012904,-0.094145,0.095599,0.305537,-0.248811,-0.002968,0.230494,-0.279943,0.109174,0.071663,0.07152,-0.04033,-0.445211,-0.096504,-0.125742,-0.149284,0.216505,-0.128412,-0.201289,-0.13114,-0.109543,0.333409,-0.089402,0.388649,0.098801,0.054105,0.366666,-0.074198,-0.229972,0.297556,-0.394723,-0.099356,-0.198237,-0.062516,-0.302047,-0.333707,-0.089361,-0.416419,-0.183551,0.304056,-0.203119,0.297467,0.028528,-0.030039,0.19885,0.187222,0.028413,-0.018917,0.173438,-0.069806,-0.058934,0.023653,0.142804,-0.238341,0.158981,0.474679,-0.182805,-0.597421,0.186647,-0.145905,0.415032,-0.713618,-0.564383,-0.141301,0.014143,-0.470863,0.044409,0.194158,0.075453,0.029437,-0.446922,0.118958,0.144548,-0.369191,0.158282,-0.356589,0.184875,0.16213,0.181508,-0.046447,0.142912,0.270635,0.347788,0.049667,-0.070937,-0.217009,-0.501141,0.062622,0.07651,0.116758,-0.323407,0.461756,0.090979,-0.058752,0.3056,-0.068996
1,-0.286223,-0.138527,-0.026156,0.228527,0.278788,-0.328569,-0.052772,0.252414,-0.463027,-0.043152,0.135424,-0.488842,0.132031,0.290029,0.080388,0.105379,-0.518854,0.110082,-0.320807,0.075669,0.430857,-0.062315,-0.066746,-0.473448,-0.191183,0.410575,-0.13825,0.472655,0.080002,0.089769,0.374968,0.106884,-0.197472,-0.037912,-0.59502,0.081707,-0.0661,-0.191614,-0.191935,-0.278937,0.03763,-0.235808,-0.234908,0.168288,-0.483583,0.214747,0.121465,-0.311349,0.23318,0.182648,0.048337,0.051337,-0.012035,-0.180411,-0.304307,0.075709,0.00501,-0.244557,0.393278,0.332421,-0.213618,-0.576075,0.173767,-0.131462,0.333383,-0.623461,-0.234644,0.09311,0.117401,-0.477235,-0.169292,0.264716,-0.155909,0.083967,-0.20617,0.011451,0.176001,-0.249864,0.029266,-0.277838,0.206696,0.01576,0.345887,-0.149026,-0.016062,0.375499,0.04269,0.202879,0.034968,-0.041293,-0.5823,0.129092,-0.057476,0.287862,-0.220764,0.309222,-0.112842,-0.05286,0.129854,0.012096
2,0.663555,-0.313476,-0.199697,-0.033538,0.007524,-0.022831,-0.233432,0.417866,-0.633137,0.471123,0.319543,-0.589569,0.273096,-0.020229,0.43362,-0.129077,-0.747196,-0.001812,-0.137034,0.275077,0.910159,-0.227816,-0.373194,-0.061669,-0.230449,0.826424,-0.419956,0.554907,-0.361448,0.022794,0.679715,-0.486397,-0.349275,0.607118,-0.502126,-0.256053,-0.348943,-0.339746,-0.25953,-0.402287,-0.199873,-0.652078,-0.533022,0.320291,-0.521422,0.788131,-0.231103,0.123717,0.692173,-0.153808,0.460244,-0.117504,0.529981,-0.391413,-0.154108,-0.391403,0.45069,-0.134361,0.34762,0.611752,0.246003,-0.640887,0.189063,0.181097,0.49718,-0.929556,-1.104228,-0.38947,-0.283259,-0.844867,-0.095799,0.377996,-0.268407,0.037016,-0.503195,0.539455,0.194634,-0.469493,0.048325,-0.383104,0.461039,0.238866,0.257776,-0.147389,0.367608,0.293118,0.289717,0.061815,0.340555,-0.366978,-0.5728,-0.082016,0.423842,0.303018,-0.485798,0.657781,0.220919,-0.461289,0.587636,-0.127441
3,0.000783,-0.204837,-0.023499,-0.002886,-0.004094,-0.01293,0.036271,0.354299,-0.362697,-0.016424,0.27622,-0.357845,0.084849,0.205319,0.294634,-0.039948,-0.501333,0.019137,-0.263048,0.030354,0.559993,-0.153524,-0.099043,-0.228584,-0.14607,0.499922,-0.080271,0.471219,0.016408,0.014838,0.430704,-0.156145,-0.344729,0.411163,-0.450731,-0.119748,-0.241813,-0.153125,-0.238948,-0.376785,0.009575,-0.35773,-0.238943,0.260529,-0.540736,0.413263,0.180902,-0.090386,0.181853,0.162932,0.253456,0.003903,-0.049557,-0.179677,-0.205573,0.086829,0.18171,-0.332526,0.298463,0.396526,0.00056,-0.615645,0.282462,-0.173224,0.512737,-0.725261,-0.600249,-0.207403,0.162273,-0.406164,-0.038784,0.200051,-0.114624,0.153774,-0.439391,-0.011252,0.140858,-0.482763,0.040948,-0.303245,0.365523,0.112989,0.247426,-0.118265,0.185987,0.165089,0.072936,0.107524,0.066877,-0.200694,-0.488178,-0.120365,0.128715,0.096509,-0.338223,0.455248,0.133059,-0.150066,0.262029,-0.013884
4,0.024178,-0.188328,0.000859,-0.003759,0.094025,-0.116191,-0.070352,0.342457,-0.303415,0.088356,0.298526,-0.374632,0.1654,0.13874,0.100778,0.057964,-0.575253,0.025353,-0.214978,-0.010588,0.476405,-0.193642,-0.163566,-0.206273,-0.210854,0.485708,-0.160243,0.446242,-0.067545,0.054889,0.368343,-0.137721,-0.271317,0.204088,-0.481059,-0.133122,-0.165839,-0.11524,-0.237293,-0.380434,0.044594,-0.309312,-0.206908,0.305223,-0.4887,0.308725,0.055348,-0.101166,0.276987,0.090825,0.163456,-0.035007,0.099347,-0.149665,-0.121683,0.07523,0.182898,-0.247961,0.267876,0.460162,-0.122269,-0.600702,0.356143,-0.114854,0.460564,-0.743912,-0.469305,-0.122386,0.128043,-0.585615,-0.05835,0.231383,-0.021695,0.069738,-0.367775,0.088804,0.204516,-0.356163,0.149613,-0.353575,0.203009,0.162025,0.259765,-0.114518,0.204548,0.298817,0.257853,0.050166,-0.018779,-0.198321,-0.532181,-0.054451,0.114251,0.180862,-0.275056,0.488875,0.007119,-0.020587,0.253933,0.003373


## Feature Selection

In order to further reduce the dimensions of the feature matrix, the number of selected features will be halved using the top features according to the **chi-squared** feature selection method.

### TF-IDF

In [14]:
# Chi-squared feature selection on the TF-IDF features of dataset 1, using the
# training labels; percentile=50 is the "halving" described in the text above.
# The result is indexed below with the keys 'features_train', 'features_test'
# and 'selector'.
selected_tfidf_1 = util.chi2_feature_selection(
    tfidf_train_1,
    train_1['class'],
    tfidf_test_1,
    percentile=50,
)

In [15]:
# Unpack the reduced train features, reduced test features and selector
# for dataset 1.
tfidf_sel_train_1, tfidf_sel_test_1, tfidf_sel_model_1 = (
    selected_tfidf_1['features_train'],
    selected_tfidf_1['features_test'],
    selected_tfidf_1['selector'],
)

In [16]:
# Chi-squared feature selection on the TF-IDF features of dataset 2, with the
# same percentile (50) as dataset 1.
selected_tfidf_2 = util.chi2_feature_selection(
    tfidf_train_2,
    train_2['class'],
    tfidf_test_2,
    percentile=50,
)

In [17]:
# Unpack the reduced train features, reduced test features and selector
# for dataset 2.
tfidf_sel_train_2, tfidf_sel_test_2, tfidf_sel_model_2 = (
    selected_tfidf_2['features_train'],
    selected_tfidf_2['features_test'],
    selected_tfidf_2['selector'],
)

The now-reduced test set:

In [18]:
# Preview the first five rows of the reduced TF-IDF features for test set 1.
tfidf_sel_test_1.head(n=5)

Unnamed: 0,15,2000,2001,2002,2015,2016,2018,2019,30,713,853,absa,access,account,activity,address,administrator,agreement,alert,already,also,app,apple,attach,august,avoid,bank,banking,bill,block,browser,buy,california,call,cancel,card,cc,chase,chris,click,come,comment,conference,confirm,continue,contract,copyright,could,customer,daily,date,david,de,deal,dear,december,delete,deliver,delivery,detail,device,direct,download,draft,due,dynegy,ect,ee,email,employee,ena,energy,enron,enronxgate,ensure,even,event,expire,express,failure,fax,federal,ferc,final,firm,forward,friday,full,fw,game,gas,get,give,go,good,grant,great,group,hello,help,hope,houston,id,immediately,important,inbox,incoming,information,instruction,interest,january,jeff,john,jones,jose,july,kindly,know,last,let,letter,like,limit,line,link,list,login,look,mail,mailbox,mailto,many,mark,market,meet,meeting,member,mike,million,monday,month,morning,much,natural,north,notice,notification,november,october,offer,one,online,option,party,password,paul,payment,paypal,pending,people,per,personal,phone,plan,pm,point,position,power,price,privacy,process,profile,program,project,promise,protect,question,quota,re,receive,recent,register,reply,report,request,research,reserve,right,risk,safe,sara,say,schedule,scott,secure,security,see,sell,sender,september,server,service,short,sign,sincerely,smith,space,spam,start,statement,step,steve,still,stock,storage,subject,support,susan,talk,team,tell,thank,thanks,thing,think,thursday,time,today,trade,trading,tuesday,two,update,upgrade,urladdress,usaa,use,user,utility,validate,verification,verify,view,want,wednesday,week,within,work,would,year
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.389012,0.452369,0.0,0.0,0.0,0.0,0.0,0.0,0.406018,0.0,0.0,0.0,0.0,0.0,0.334496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.321855,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.292842,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.267199,0.0,0.0,0.0,0.0,0.0,0.32642,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.046822,0.0,0.0,0.0,0.0,0.0,0.0,0.057681,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04535,0.0,0.0,0.0,0.060499,0.0,0.0,0.0,0.0,0.0,0.0,0.055364,0.0,0.213605,0.206905,0.0,0.0,0.181198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042958,0.0,0.0,0.0,0.0,0.189622,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066628,0.062401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.121398,0.0,0.0,0.434566,0.0,0.044003,0.0,0.048868,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.32955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047288,0.0,0.0,0.0,0.0,0.0,0.0,0.059072,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.061527,0.0,0.0,0.037755,0.0,0.0,0.066776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055727,0.0,0.0,0.05944,0.0,0.0,0.0,0.0,0.0,0.039619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.074047,0.167806,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096227,0.0,0.0,0.0,0.0,0.140824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.049466,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.077733,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062933,0.0,0.083274,0.07206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.382583,0.274008,0.0,0.0,0.0,0.0,0.15013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057313,0.0,0.0,0.0,0.0,0.0,0.051677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.089982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.065075,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.167449,0.0,0.0,0.0,0.0,0.074147,0.0,0.0,0.08178,0.068655,0.0,0.0,0.0,0.0,0.0,0.065998,0.119417,0.0
3,0.0,0.044717,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030051,0.0,0.0,0.043802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.044301,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.215195,0.0,0.043705,0.0,0.0,0.0,0.0,0.037514,0.0,0.0,0.0,0.0,0.0,0.0,0.039619,0.0,0.0,0.0,0.0,0.0,0.070488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043336,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.116103,0.035545,0.0,0.0,0.0,0.085567,0.0,0.0,0.087357,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032567,0.0,0.0,0.031276,0.037253,0.041384,0.035811,0.046426,0.038513,0.045605,0.0,0.0,0.0,0.0,0.0,0.070234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039555,0.031688,0.0,0.0,0.043327,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0795,0.0,0.0,0.0,0.0,0.0393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062037,0.0,0.0,0.0,0.0,0.0,0.030631,0.0,0.034005,0.0,0.032632,0.030077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031963,0.0,0.0,0.0,0.0,0.025682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040359,0.0,0.0,0.0,0.0,0.0,0.0,0.03234,0.046202,0.0,0.0,0.0,0.038869,0.0,0.029923,0.036555,0.042268,0.0,0.0,0.076794,0.0,0.0,0.766313,0.0,0.025989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.036848,0.0,0.0,0.029673,0.037728
4,0.176564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.180497,0.0,0.0,0.111525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.145838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15655,0.0,0.0,0.0,0.096174,0.0,0.0,0.183347,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.128892,0.149106,0.0,0.0,0.147439,0.0,0.0,0.275617,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125415,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.258305,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.367683,0.0,0.0,0.0,0.0,0.0,0.159733,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.144677,0.0,0.0,0.0,0.0,0.109784,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117439,0.0


## Final Dataset Creation

Before using the features with the machine learning algorithms, it is best to tidy up the datasets by adding the features, the id and the class columns in the same DataFrame.

In [19]:
# Build the final TF-IDF frames for dataset 1: the selected features with the
# email id and class as the two leading columns.
#
# DataFrame.insert aligns a Series on index labels, and train_1 / test_1 keep
# whatever index was stored in the CSV (index_col=0). If the feature frames
# carry a different index, the inserted columns would silently become NaN or
# be attached to the wrong rows. Passing plain arrays inserts by position
# instead, and raises on a length mismatch.
# NOTE(review): this assumes the feature rows are in the same order as the rows
# of train_1 / test_1 -- confirm in features.py.
final_tfidf_train_1 = tfidf_sel_train_1.copy()
final_tfidf_train_1.insert(0, 'email_class', train_1['class'].to_numpy())
final_tfidf_train_1.insert(0, 'email_id', train_1['id'].to_numpy())

final_tfidf_test_1 = tfidf_sel_test_1.copy()
final_tfidf_test_1.insert(0, 'email_class', test_1['class'].to_numpy())
final_tfidf_test_1.insert(0, 'email_id', test_1['id'].to_numpy())

In [20]:
# Build the final TF-IDF frames for dataset 2: the selected features with the
# email id and class as the two leading columns.
#
# DataFrame.insert aligns a Series on index labels, and train_2 / test_2 keep
# whatever index was stored in the CSV (index_col=0). If the feature frames
# carry a different index, the inserted columns would silently become NaN or
# be attached to the wrong rows. Passing plain arrays inserts by position
# instead, and raises on a length mismatch.
# NOTE(review): this assumes the feature rows are in the same order as the rows
# of train_2 / test_2 -- confirm in features.py.
final_tfidf_train_2 = tfidf_sel_train_2.copy()
final_tfidf_train_2.insert(0, 'email_class', train_2['class'].to_numpy())
final_tfidf_train_2.insert(0, 'email_id', train_2['id'].to_numpy())

final_tfidf_test_2 = tfidf_sel_test_2.copy()
final_tfidf_test_2.insert(0, 'email_class', test_2['class'].to_numpy())
final_tfidf_test_2.insert(0, 'email_id', test_2['id'].to_numpy())