In [1]:
import pandas as pd
# Let wide frames (e.g. the 500-column TF-IDF matrices below) show up to 250 columns.
pd.set_option('display.max_columns', 250)

import features as util

from ast import literal_eval

## Feature Extraction

Before the emails can be fed into the machine learning algorithms, they have to be converted to numbers.<br>
This process is called **feature extraction**, or **vectorization**. We will try different methods of achieving this, in order to compare their results.

In [2]:
# Read the first train/test split. The first CSV column becomes the index and
# the `body` column is parsed with literal_eval while loading.
train_1, test_1 = (
    pd.read_csv(csv_path, index_col=0, converters={'body': literal_eval})
    for csv_path in ('./data/csv/train_1.csv', './data/csv/test_1.csv')
)

In [3]:
# Read the second train/test split. The first CSV column becomes the index and
# the `body` column is parsed with literal_eval while loading.
train_2, test_2 = (
    pd.read_csv(csv_path, index_col=0, converters={'body': literal_eval})
    for csv_path in ('./data/csv/train_2.csv', './data/csv/test_2.csv')
)

### TF-IDF

One of the most basic ways is to calculate the **tf-idf** (term frequency-inverse document frequency) score of the emails.<br>
In order to have a lower dimensionality, we use only the top 500 most frequent terms.

In [4]:
# Build TF-IDF features for the first split from the `body` columns.
# NOTE(review): presumably the vectorizer is fitted on train only and then
# applied to test -- confirm in features.tfidf_features.
tfidf_1 = util.tfidf_features(
    train_1['body'],
    test_1['body'],
    min_df=5,
    max_features=500,
)

In [5]:
# Pull the train matrix, test matrix and vectorizer out of the result
# returned for the first split.
tfidf_train_1, tfidf_test_1, tfidf_model_1 = (
    tfidf_1[result_key]
    for result_key in ('tfidf_train', 'tfidf_test', 'vectorizer')
)

In [6]:
# Build TF-IDF features for the second split from the `body` columns.
# NOTE(review): presumably the vectorizer is fitted on train only and then
# applied to test -- confirm in features.tfidf_features.
tfidf_2 = util.tfidf_features(
    train_2['body'],
    test_2['body'],
    min_df=5,
    max_features=500,
)

In [7]:
# Pull the train matrix, test matrix and vectorizer out of the result
# returned for the second split.
tfidf_train_2, tfidf_test_2, tfidf_model_2 = (
    tfidf_2[result_key]
    for result_key in ('tfidf_train', 'tfidf_test', 'vectorizer')
)

As an example, we can see a part of the calculated matrix for the first test set:

In [8]:
# Preview the first five rows of the TF-IDF matrix for the first test set.
tfidf_test_1.head(n=5)

Unnamed: 0,10,11,12,14,15,16,20,2000,2001,2002,2015,2016,2018,2019,24,27,30,713,853,able,absa,accept,access,account,action,activity,add,additional,address,administrator,advise,agree,agreement,alert,allow,already,also,america,american,amount,another,answer,app,apple,application,approval,approve,area,ask,attach,attachment,august,available,avoid,back,bank,banking,base,believe,best,bill,billion,block,bond,book,box,browser,business,buy,ca,california,call,cancel,capacity,capital,card,care,case,cause,cc,center,change,charge,chase,check,choose,chris,click,close,code,come,comment,communication,company,complete,concern,conference,confidential,confirm,confirmation,consider,contact,contain,content,continue,contract,copy,copyright,cost,could,create,credit,current,currently,customer,daily,data,database,date,david,day,de,deal,dear,december,...,reply,report,request,require,research,reserve,response,result,return,review,right,risk,road,run,safe,sale,san,sara,say,schedule,scott,secure,security,see,select,sell,send,sender,sent,september,serve,server,service,set,share,short,show,sign,since,sincerely,sit,site,smith,software,someone,soon,space,spam,special,staff,standard,start,state,statement,step,steve,still,stock,storage,street,subject,summary,supply,support,sure,susan,system,take,talk,team,technology,tell,term,texas,text,thank,thanks,thing,think,three,thursday,time,today,total,trade,trading,transaction,transfer,try,tuesday,two,type,united,unsubscribe,update,upgrade,urladdress,usaa,use,user,utility,validate,value,verification,verify,version,via,view,visit,wait,want,way,web,wednesday,week,well,window,within,without,work,world,would,write,year,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.389012,0.452369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.406018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.334496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.321855,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.292842,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.267199,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.32642,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.06309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.046822,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057681,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.053282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.069525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.067692,0.04535,0.0,0.050052,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055364,0.0,0.0,0.061849,0.0,0.0,0.213605,0.0,0.0,0.206905,0.0,0.049012,0.0,0.181198,0.0,0.0,...,0.0,0.059072,0.0,0.0,0.0,0.0,0.0,0.062401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12965,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.061527,0.0,0.0,0.0,0.037755,0.0716,0.0,0.0,0.0,0.0,0.0,0.0,0.066776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055727,0.201682,0.0,0.0,0.0,0.0,0.0,0.05944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039619,0.0,0.0,0.0,0.480843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176352,0.0,0.0,0.070329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.074047,0.167806,0.0,0.0,0.072982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.656607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.140824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.049466,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057313,0.0,0.0,0.0,0.044159,0.0,0.0,0.0,0.0,0.0,0.051677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.089982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.067682,0.064069,0.0,0.065075,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.167449,0.0,0.0,0.0,0.0,0.074147,0.0,0.0,0.079335,0.08178,0.068655,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.065998,0.0,0.119417,0.0,0.0,0.0
3,0.0,0.0,0.04584,0.0,0.0,0.0,0.044932,0.044717,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104853,0.0,0.030051,0.0,0.0,0.0,0.0,0.043802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038689,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.079109,0.0,0.0,0.0,0.044301,0.040786,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.215195,0.0,0.0,0.0,0.043705,0.0,0.034747,0.036998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.034787,0.037514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096449,0.0,0.039619,0.0,0.0,...,0.030631,0.0,0.034005,0.0,0.0,0.032632,0.0,0.0,0.0,0.0,0.030077,0.0,0.0,0.0,0.0,0.138239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031963,0.0,0.0,0.175561,0.0,0.0,0.0,0.0,0.0,0.025682,0.0,0.0,0.0,0.042352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039684,0.0,0.0,0.098014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040359,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03184,0.0,0.03234,0.0,0.046202,0.037201,0.0,0.0,0.0,0.0,0.0,0.038869,0.0,0.0,0.029923,0.036555,0.0,0.042268,0.0,0.0,0.0,0.0,0.0,0.076794,0.0,0.0,0.139731,0.0,0.0,0.766313,0.0,0.025989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03715,0.0,0.0,0.080304,0.0,0.0,0.036848,0.0,0.0,0.0,0.0,0.0,0.0,0.029673,0.043705,0.037728,0.0
4,0.0,0.0,0.0,0.0,0.176564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176564,0.0,0.0,0.0,0.0,0.138328,0.166956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.301687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.258305,0.0,0.0,0.171478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.367683,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.159733,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.144677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.109784,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.146034,0.0,0.0,0.0,0.0,0.0,0.117439,0.0,0.0,0.0
