In [1]:
import os
import pandas as pd
pd.options.display.max_columns = 250
pd.options.display.max_colwidth = 160

import features as util
from raw_utils import save_to_csv
from preprocessing import dataset_add_columns

from ast import literal_eval

### Read Data

In [2]:
# Path
cwd = os.getcwd()
csv_path = os.path.join(cwd, 'data/csv/')

train_tokens = ['train_balanced_tokens.csv', 'train_imbalanced_tokens.csv']
test_tokens = ['test_balanced_tokens.csv', 'test_imbalanced_tokens.csv']

#### Tokenized emails

In [3]:
train_balanced_tokens = pd.read_csv(os.path.join(csv_path, train_tokens[0]), index_col=0, converters={'body': literal_eval})
test_balanced_tokens = pd.read_csv(os.path.join(csv_path, test_tokens[0]), index_col=0, converters={'body': literal_eval})

In [4]:
train_imbalanced_tokens = pd.read_csv(os.path.join(csv_path, train_tokens[1]), index_col=0, converters={'body': literal_eval})
test_imbalanced_tokens = pd.read_csv(os.path.join(csv_path, test_tokens[1]), index_col=0, converters={'body': literal_eval})

After the preprocessing, the data look like this:

In [5]:
train_balanced_tokens.head()

Unnamed: 0,id,body,class
0,1872,"[gerald, attach, incoming, guaranty, benefit, bridgeline, gas, marketing, llc, kindly, review, provide, comment, prior, execution, thanks, rudwell, message,...",False
1,1014,"[durasoft, company, java, class, hold, course, com, object, program, enron, network, com, allow, program, write, available, execution, program, language, in...",False
2,781,"[mark, meet, mark, haedicke, regard, utility, need, legal, opinion, utility, counterparty, agree, mark, follow, negotiation, investor, own, utility, base, u...",False
3,2025,"[hi, vince, rough, draft, perusal, comment, reach, 517, 423, icast, meeting, explore, two, problem, discuss, vasant, phone, last, week, decide, feel, confid...",False
4,188,"[great, ill, 300, speak, phil, yesterday, confirm, dinner, house, 700, elizabeth, sage, 713-853-6349, message, gussett, sheryl, emailaddress, enron, send, f...",False


# Feature Extraction

Before inputing the emails to the machine learning algorithms, they have to be converted to numberical matrices.<br>
This process is called **feature extraction**. Different methods of achieving this will be tried, in order to compare their results.

## Text Vectorization

The baseline feature set will simply consist of numerical representations of the text data. This process is also called **vectorization**. 

### TF-IDF

One of the most basic ways is to calculate the **tf-idf** (term frequency-inverse document frequency) score of the emails.<br>
In order to have a lower dimensionality and since not all words from the corpus will be of importance, only the top 500 most frequent terms are used.

In [6]:
tfidf_balanced = util.tfidf_features(train_balanced_tokens['body'], test_balanced_tokens['body'], min_df=5, max_features=500)

In [7]:
tfidf_train_balanced = tfidf_balanced['tfidf_train']
tfidf_test_balanced = tfidf_balanced['tfidf_test']
tfidf_model_balanced = tfidf_balanced['vectorizer']

In [8]:
tfidf_imbalanced = util.tfidf_features(train_imbalanced_tokens['body'], test_imbalanced_tokens['body'], min_df=5, max_features=500)

In [9]:
tfidf_train_imbalanced = tfidf_imbalanced['tfidf_train']
tfidf_test_imbalanced = tfidf_imbalanced['tfidf_test']
tfidf_model_imbalanced = tfidf_imbalanced['vectorizer']

As an example, here is a part of the calcuated matrix for the balanced train set:

In [10]:
tfidf_train_balanced.head()

Unnamed: 0,0860,0px,10,100,11,12,15,20,2000,2001,2002,2015,2016,2017,2018,24,25,30,713,853,able,absa,accept,access,account,action,activity,add,additional,address,adjust,administrator,agree,agreement,alert,allow,also,america,american,another,answer,app,apple,application,approval,approve,arial,ask,attach,attachment,august,auto,automatically,available,avoid,back,background,bank,banking,base,believe,best,bill,billion,block,book,border,bottom,box,br,browser,business,buy,ca,california,call,cancel,capacity,capital,card,care,case,cause,cc,center,change,charge,chase,check,choose,chris,click,close,code,collapse,color,come,comment,committee,communication,company,complete,concern,confidential,confirm,confirmation,contact,contain,content,continue,contract,copy,copyright,cost,could,create,credit,current,customer,daily,data,database,date,david,day,...,report,request,require,reserve,response,result,return,review,rgb,right,risk,road,run,safe,sale,san,sans,sara,say,schedule,scott,secure,security,see,select,sell,send,sender,sent,serif,serve,server,service,set,share,show,sign,since,sincerely,sit,site,size,smith,software,soon,space,spam,span,standard,start,state,statement,step,still,stock,storage,street,style,subject,support,sure,susan,system,table,take,talk,tbody,td,team,tell,term,texas,text,thank,thanks,thing,think,three,thursday,time,today,top,total,tr,trade,trading,transaction,transfer,try,tuesday,two,type,united,update,upgrade,urladdress,usaa,use,user,utility,validate,value,verification,verify,version,via,view,visit,want,way,web,wednesday,week,weight,well,width,within,without,word,work,world,would,write,year,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.127888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.272268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.291282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181863,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.290599,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.091106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104489,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.230173,0.0,0.0,0.0,0.171355,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.190935,0.0,0.0,0.131879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.195582,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.093948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.172424,0.194613,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.118677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.191378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.192279,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.208349,0.0,0.0,0.0,0.108475,0.192013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.11009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.177774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.084624,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.116024,0.0,0.0,0.0,0.0,0.096811,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.113671,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104842,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06743,0.0,0.0,0.0,0.062616,0.0,0.609813,0.0,0.0,0.0,0.0,0.10576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07906,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085688,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.073932,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060866,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083178,0.0,0.0,0.066533,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.168212,0.0,0.0,0.0,0.0,0.0,0.0,0.145441,0.0,0.0,0.0,0.061221,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.307994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.367728,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066611,0.0,0.0,0.0,0.0,0.060005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.169965,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.147413,0.0,0.0,0.0,0.0,0.0,0.0,0.197851,0.0,0.0,0.0,0.0,0.0,0.0,0.083544,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.209804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057281,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.16022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.197468,0.229865,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.198325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.237021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.114138,0.0,0.229865,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.418959,0.0,0.0,0.0,0.0,0.130905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.191565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Word2Vec

A more advanced technique is **Word Embedding**, which calculates a high-dimensional vector for each word based on the probability distribution of this word appearing before or after another. In other words, words belonging to the same context usually appear close to each other in the corpus, so they will be closer in the vector space as well.<br>
The chosen implementation is **Word2Vec**.

After the word vectors are calculated, the vectors of each word in an email are being averaged, thus resulting in a single vector for each email.

In [11]:
word2vec_balanced = util.word2vec_features(train_balanced_tokens['body'], test_balanced_tokens['body'], vector_size=100, min_count=5)

In [12]:
word2vec_train_balanced = word2vec_balanced['word2vec_train']
word2vec_test_balanced = word2vec_balanced['word2vec_test']
word2vec_model_balanced = word2vec_balanced['vectorizer']

In [13]:
word2vec_imbalanced = util.word2vec_features(train_imbalanced_tokens['body'], test_imbalanced_tokens['body'], vector_size=100, min_count=5)

In [14]:
word2vec_train_imbalanced = word2vec_imbalanced['word2vec_train']
word2vec_test_imbalanced = word2vec_imbalanced['word2vec_test']
word2vec_model_imbalanced = word2vec_imbalanced['vectorizer']

The resulting feature sets are like the following:

In [15]:
word2vec_train_balanced.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,0.145027,0.226851,-0.043164,0.272996,0.084986,-0.114324,-0.091514,0.066287,-0.185683,-0.025205,0.044023,0.138097,-0.127415,0.192248,0.160112,-0.127862,-0.565804,0.065217,-0.232606,-0.021073,-0.089531,-0.139343,-0.218935,-0.20024,-0.031695,0.399602,-0.064323,0.208983,0.05811,0.042294,0.058417,-0.206891,-0.142783,0.0992,-0.029356,-0.180602,0.062831,-0.051714,-0.076509,-0.334346,-0.102288,-0.184848,-0.020752,-0.134004,-0.15582,0.144071,-0.286923,-0.049012,0.077882,0.166797,-0.110178,0.047273,0.082464,-0.246527,0.048209,-0.017501,-0.038415,-0.027113,0.134652,0.345258,-0.074082,-0.256573,0.250788,0.092077,0.091171,-0.176176,-0.056349,-0.005963,0.134797,-0.251626,0.030022,0.281617,-0.184757,-0.108207,0.048106,0.117055,0.032123,-0.289235,-0.028087,-0.140773,0.127012,0.016038,0.394786,0.015861,0.287235,0.012155,0.110355,0.263962,0.23913,0.058362,-0.087414,-0.094592,0.088519,-0.151271,-0.060895,0.261719,-0.004235,0.1621,0.189233,-0.050539
1,0.056185,0.088876,0.064634,0.252422,0.087265,-0.073401,0.040691,0.044152,-0.104439,0.056562,-0.021887,0.176255,-0.219592,0.239599,0.144244,-0.001134,-0.430887,0.117053,-0.312054,0.084773,-0.026625,-0.104265,-0.127346,-0.004368,-0.127849,0.2995,-0.043226,0.171157,0.076325,0.156675,0.182121,-0.096954,-0.14132,-0.029373,-0.051519,-0.105404,0.228642,-0.014543,-0.113802,-0.341012,-0.11906,-0.260261,-0.020347,-0.118902,-0.096417,0.16239,-0.219939,-0.103138,0.087476,0.192652,-0.149111,0.004467,-0.022914,-0.185129,0.174691,-0.064332,-0.073066,-0.072897,0.160219,0.346855,-0.057806,-0.193689,0.185842,0.115147,0.077462,-0.18995,-0.103166,0.034132,0.059509,-0.393637,-0.032545,0.207032,-0.285134,-0.070421,0.104448,0.051125,0.064354,-0.289298,0.065572,-0.20184,0.170038,0.063199,0.338501,0.135575,0.227375,0.021632,0.05334,0.164008,0.148101,0.207968,-0.112263,0.026474,0.069398,-0.157145,-0.019628,0.150249,-0.040306,0.030261,0.106939,-0.081817
2,0.055829,0.141499,0.003155,0.173854,0.052437,-0.247887,0.023962,0.12565,-0.173311,0.028211,-0.069953,0.150971,-0.081764,0.215027,0.125275,-0.075908,-0.463202,0.089436,-0.156731,-0.065292,-0.110564,-0.167594,-0.18272,-0.113311,-0.152908,0.277655,-0.117186,0.178029,0.06297,0.129335,0.144453,-0.125586,-0.153933,0.067726,-0.123781,-0.144828,0.255575,0.112708,-0.044294,-0.347643,-0.178603,-0.105996,0.0686,-0.171716,0.062762,0.221297,-0.195262,-0.159436,0.022837,0.235091,-0.134531,0.001107,0.042674,-0.265194,0.11996,-0.144248,-0.116324,-0.072385,0.098516,0.404702,-0.116244,-0.12674,0.095134,0.141417,0.049184,-0.205471,-0.124195,0.093439,0.024666,-0.362791,-0.039181,0.210276,-0.354306,-0.101155,0.02447,0.126383,0.065781,-0.310759,-0.028714,-0.112038,0.190512,0.043767,0.439912,0.150023,0.300376,0.018172,0.114437,0.27513,0.296096,0.119377,-0.127307,0.036724,0.055555,-0.066357,-0.002117,0.135978,-0.050231,0.015531,0.201871,0.003492
3,0.093356,0.071075,0.042837,0.207198,0.064698,-0.143467,0.086442,0.073449,-0.125488,0.072202,-0.111284,0.182147,-0.209069,0.203379,0.137669,-0.013055,-0.44222,0.080879,-0.164524,0.009597,-0.088505,-0.137805,-0.10999,-0.031925,-0.107276,0.300544,-0.078485,0.132824,-0.016416,0.159226,0.155257,-0.021073,-0.133228,0.053739,-0.16252,-0.126864,0.171423,0.012508,-0.073803,-0.317563,-0.098921,-0.220719,0.013129,-0.135413,-0.016107,0.185388,-0.174286,-0.141206,0.046567,0.278341,-0.159395,-0.016758,-0.01804,-0.249978,0.150311,-0.096995,-0.111068,-0.047573,0.097483,0.395925,-0.042492,-0.099084,0.161467,0.104232,0.006998,-0.163455,-0.107761,0.055513,0.05265,-0.401418,-0.009675,0.174163,-0.293446,-0.081859,0.068696,0.088409,0.067004,-0.362161,0.031313,-0.238776,0.196466,0.083196,0.377628,0.132005,0.320683,0.101197,0.158714,0.183822,0.223321,0.191596,-0.114033,0.015377,0.071147,-0.157733,0.033734,0.098306,-0.102386,0.029492,0.144941,-0.077914
4,0.197868,0.205279,0.01869,0.252132,0.051818,-0.058903,0.004129,0.079983,-0.140062,-0.0258,-0.038119,0.351757,-0.281914,0.238027,0.219654,-0.110079,-0.468371,-0.029511,-0.127631,0.083439,-0.063262,-0.103633,-0.209511,-0.146683,-0.027031,0.367702,-0.053549,0.225663,0.026573,0.101988,0.038469,-0.164231,-0.14597,0.184537,-0.112348,-0.191268,0.049242,-0.172444,-0.163679,-0.239831,-0.056854,-0.277604,-0.070756,-0.077448,-0.242691,0.092092,-0.162332,-0.035269,-0.008958,0.151898,-0.169703,0.106788,-0.033525,-0.078784,0.13615,0.087087,-0.017872,-0.024486,0.066128,0.373534,-0.061738,-0.169792,0.377625,0.033222,0.057199,-0.151619,-0.074734,-0.047501,0.153856,-0.339639,0.075027,0.154618,-0.216555,-0.053647,0.08601,0.11036,0.121665,-0.375091,0.117761,-0.327486,0.124202,-0.043851,0.323824,-0.023862,0.214858,0.24159,0.06996,0.21588,0.188834,0.103609,-0.043124,-0.096136,0.106751,-0.25064,-0.048666,0.264746,0.013801,0.164859,0.162172,-0.177598


It should be noted that in this case, the columns do not provide information similar to how a tf-idf column corresponds to one word. This representation is purely for convenience and consistency, it won't matter during the prediction step.

# Feature Selection

In order to further reduce the dimensions of the feature matrix, the number of selected features will be halved using the top features according to the **chi-squared** feature selection method.

## Vectorization Features

### TF-IDF

In [16]:
selected_tfidf_balanced = util.chi2_feature_selection(tfidf_train_balanced, train_balanced_tokens['class'], tfidf_test_balanced, percentile=50)

In [17]:
tfidf_sel_train_balanced = selected_tfidf_balanced['features_train']
tfidf_sel_test_balanced = selected_tfidf_balanced['features_test']
tfidf_sel_model_balanced = selected_tfidf_balanced['selector']

In [18]:
selected_tfidf_imbalanced = util.chi2_feature_selection(tfidf_train_imbalanced, train_imbalanced_tokens['class'], tfidf_test_imbalanced, percentile=50)

In [19]:
tfidf_sel_train_imbalanced = selected_tfidf_imbalanced['features_train']
tfidf_sel_test_imbalanced = selected_tfidf_imbalanced['features_test']
tfidf_sel_model_imbalanced = selected_tfidf_imbalanced['selector']

The now-reduced train set:

In [20]:
tfidf_sel_train_balanced.head()

Unnamed: 0,0px,10,11,2000,2001,2002,2015,2016,2017,2018,30,713,853,absa,access,account,activity,address,administrator,agreement,alert,also,app,apple,attach,august,auto,automatically,available,avoid,bank,banking,bill,block,browser,buy,california,call,cancel,card,cc,chase,chris,click,come,comment,committee,company,confirm,continue,contract,copyright,cost,could,customer,daily,date,david,de,deal,dear,december,deliver,delivery,detail,device,download,draft,due,ect,ee,email,emailaddress,employee,energy,enron,enronxgate,ensure,estatement,even,expire,express,failure,fargo,fax,ferc,final,forward,friday,full,fw,gas,get,give,go,good,great,group,hello,help,hope,houston,id,immediately,important,inbox,incoming,information,instruction,interest,january,jeff,john,jose,july,june,kindly,know,last,leave,let,like,limit,link,list,login,long,look,mail,mailbox,many,mark,market,meet,meeting,microsoft,mike,million,monday,month,much,natural,news,next,notice,notification,november,october,offer,one,online,option,party,password,payment,paypal,pending,people,per,permanently,personal,phone,plan,pm,point,position,power,price,privacy,process,profile,program,project,promise,protect,question,quota,rate,re,receive,recent,recently,record,reply,report,request,reserve,right,risk,safe,sara,say,schedule,scott,secure,security,see,sell,sender,server,service,sign,since,sincerely,sit,size,smith,space,spam,start,statement,still,stock,storage,street,subject,support,susan,talk,team,tell,thanks,thing,think,thursday,time,today,trade,trading,tuesday,two,update,upgrade,urladdress,usaa,use,user,utility,validate,verification,verify,view,want,wednesday,week,well,within,work,would,year
0,0.0,0.0,0.0,0.0,0.127888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.272268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125564,0.0,0.0,0.0,0.0,0.181863,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.084374,0.0,0.339057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15563,0.0,0.0,0.0,0.0,0.0,0.0,0.162654,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.171982,0.0,0.0,0.0,0.0,0.18069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.156724,0.119443,0.0,0.0,0.131815,0.0,0.0,0.0,0.0,0.0,0.0,0.149029,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.174582,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.132065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104489,0.0,0.0,0.0,0.0,0.0,0.230173,0.0,0.0,0.171355,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.131879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.121281,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.162305,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.368003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.585283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150362,0.0,0.0,0.0,0.0,0.0,0.194613,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.118677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.192013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.177774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.282967,0.0,0.097983,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.164643,0.0,0.105068,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.109002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096811,0.0,0.0,0.0,0.0,0.0,0.113671,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104842,0.0,0.0,0.06743,0.0,0.0,0.0,0.062616,0.0,0.609813,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07906,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.073932,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060866,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083178,0.0,0.066533,0.0,0.0,0.0,0.0,0.168212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.088565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.430333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.138412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.069935,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.154801,0.075345,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.072828,0.077132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.179586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.169965,0.0,0.0,0.0,0.0,0.147413,0.0,0.0,0.0,0.0,0.197851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.209804,0.0,0.0,0.0,0.057281,0.0
4,0.0,0.0,0.0,0.0,0.16022,0.0,0.0,0.0,0.0,0.0,0.0,0.197468,0.229865,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.237021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.241307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105705,0.0,0.0,0.147346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.20159,0.0,0.0,0.0,0.305553,0.0,0.0,0.0,0.219565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14964,0.191565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.212015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.148024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.418959,0.0,0.0,0.0,0.130905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.191565,0.0,0.0,0.0,0.0,0.0


# Final Dataset Creation

Before using the features for classification with the machine learning algorithms, it is best to tidy up the datasets and keep them consistent by concatenating the features, the id and the class columns in the same DataFrame.

In [21]:
column_names = ['email_class', 'email_id'] # column names changed in case the word class or id appear in the token list

### TF-IDF

In [22]:
final_tfidf_train_balanced = dataset_add_columns(tfidf_sel_train_balanced, [train_balanced_tokens['class'], train_balanced_tokens['id']], column_names)
final_tfidf_test_balanced = dataset_add_columns(tfidf_sel_test_balanced, [test_balanced_tokens['class'], test_balanced_tokens['id']], column_names)

In [23]:
final_tfidf_train_imbalanced = dataset_add_columns(tfidf_sel_train_imbalanced, [train_imbalanced_tokens['class'], train_imbalanced_tokens['id']], column_names)
final_tfidf_test_imbalanced = dataset_add_columns(tfidf_sel_test_imbalanced, [test_imbalanced_tokens['class'], test_imbalanced_tokens['id']], column_names)

Looking into one of the previously explored examples:

In [24]:
final_tfidf_train_balanced[final_tfidf_train_balanced['email_id'] == 6]

Unnamed: 0,email_id,email_class,0px,10,11,2000,2001,2002,2015,2016,2017,2018,30,713,853,absa,access,account,activity,address,administrator,agreement,alert,also,app,apple,attach,august,auto,automatically,available,avoid,bank,banking,bill,block,browser,buy,california,call,cancel,card,cc,chase,chris,click,come,comment,committee,company,confirm,continue,contract,copyright,cost,could,customer,daily,date,david,de,deal,dear,december,deliver,delivery,detail,device,download,draft,due,ect,ee,email,emailaddress,employee,energy,enron,enronxgate,ensure,estatement,even,expire,express,failure,fargo,fax,ferc,final,forward,friday,full,fw,gas,get,give,go,good,great,group,hello,help,hope,houston,id,immediately,important,inbox,incoming,information,instruction,interest,january,jeff,john,jose,july,june,kindly,know,last,leave,let,like,limit,...,login,long,look,mail,mailbox,many,mark,market,meet,meeting,microsoft,mike,million,monday,month,much,natural,news,next,notice,notification,november,october,offer,one,online,option,party,password,payment,paypal,pending,people,per,permanently,personal,phone,plan,pm,point,position,power,price,privacy,process,profile,program,project,promise,protect,question,quota,rate,re,receive,recent,recently,record,reply,report,request,reserve,right,risk,safe,sara,say,schedule,scott,secure,security,see,sell,sender,server,service,sign,since,sincerely,sit,size,smith,space,spam,start,statement,still,stock,storage,street,subject,support,susan,talk,team,tell,thanks,thing,think,thursday,time,today,trade,trading,tuesday,two,update,upgrade,urladdress,usaa,use,user,utility,validate,verification,verify,view,want,wednesday,week,well,within,work,would,year
2089,6,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.224085,0.0,0.0,0.0,0.0,0.0,0.150715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.303853,0.09435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.202054,0.0,0.0,0.0,0.0,0.0,0.0,0.190245,0.0,0.0,0.175255,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.107273,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150865,0.138519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133462,0.0,0.0,0.0,0.379817,0.119663,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.151165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.352513,0.228697,0.0,0.120935,0.0,0.0,0.207051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


The words that appear more in the email have a bigger score, while the words that don't appear at all have a score of zero.

### Word2Vec

In [25]:
final_word2vec_train_balanced = dataset_add_columns(word2vec_train_balanced, [train_balanced_tokens['class'], train_balanced_tokens['id']], column_names)
final_word2vec_test_balanced = dataset_add_columns(word2vec_test_balanced, [test_balanced_tokens['class'], test_balanced_tokens['id']], column_names)

In [26]:
final_word2vec_train_imbalanced = dataset_add_columns(word2vec_train_imbalanced, [train_imbalanced_tokens['class'], train_imbalanced_tokens['id']], column_names)
final_word2vec_test_imbalanced = dataset_add_columns(word2vec_test_imbalanced, [test_imbalanced_tokens['class'], test_imbalanced_tokens['id']], column_names)

In [27]:
final_tfidf_train_balanced.head()

Unnamed: 0,email_id,email_class,0px,10,11,2000,2001,2002,2015,2016,2017,2018,30,713,853,absa,access,account,activity,address,administrator,agreement,alert,also,app,apple,attach,august,auto,automatically,available,avoid,bank,banking,bill,block,browser,buy,california,call,cancel,card,cc,chase,chris,click,come,comment,committee,company,confirm,continue,contract,copyright,cost,could,customer,daily,date,david,de,deal,dear,december,deliver,delivery,detail,device,download,draft,due,ect,ee,email,emailaddress,employee,energy,enron,enronxgate,ensure,estatement,even,expire,express,failure,fargo,fax,ferc,final,forward,friday,full,fw,gas,get,give,go,good,great,group,hello,help,hope,houston,id,immediately,important,inbox,incoming,information,instruction,interest,january,jeff,john,jose,july,june,kindly,know,last,leave,let,like,limit,...,login,long,look,mail,mailbox,many,mark,market,meet,meeting,microsoft,mike,million,monday,month,much,natural,news,next,notice,notification,november,october,offer,one,online,option,party,password,payment,paypal,pending,people,per,permanently,personal,phone,plan,pm,point,position,power,price,privacy,process,profile,program,project,promise,protect,question,quota,rate,re,receive,recent,recently,record,reply,report,request,reserve,right,risk,safe,sara,say,schedule,scott,secure,security,see,sell,sender,server,service,sign,since,sincerely,sit,size,smith,space,spam,start,statement,still,stock,storage,street,subject,support,susan,talk,team,tell,thanks,thing,think,thursday,time,today,trade,trading,tuesday,two,update,upgrade,urladdress,usaa,use,user,utility,validate,verification,verify,view,want,wednesday,week,well,within,work,would,year
0,1872,False,0.0,0.0,0.0,0.0,0.127888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.272268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125564,0.0,0.0,0.0,0.0,0.181863,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.084374,0.0,0.339057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15563,0.0,0.0,0.0,0.0,0.0,0.0,0.162654,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.171982,0.0,0.0,0.0,0.0,0.18069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.156724,0.119443,0.0,0.0,0.131815,0.0,0.0,...,0.0,0.0,0.149029,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.174582,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.132065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104489,0.0,0.0,0.0,0.0,0.0,0.230173,0.0,0.0,0.171355,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1014,False,0.0,0.0,0.0,0.0,0.131879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.121281,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.162305,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.368003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.585283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150362,0.0,0.0,0.0,0.0,0.0,0.194613,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.118677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,781,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.192013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.177774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.282967,0.0,0.097983,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.164643,0.0,0.105068,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.109002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096811,0.0,0.0,0.0,0.0,0.0,0.113671,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104842,0.0,0.0,0.06743,0.0,0.0,0.0,0.062616,0.0,0.609813,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07906,0.0,0.0
3,2025,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.073932,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060866,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083178,0.0,0.066533,0.0,0.0,0.0,0.0,0.168212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.088565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.430333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.138412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.069935,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.154801,0.075345,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.072828,0.077132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.179586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.169965,0.0,0.0,0.0,0.0,0.147413,0.0,0.0,0.0,0.0,0.197851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.209804,0.0,0.0,0.0,0.057281,0.0
4,188,False,0.0,0.0,0.0,0.0,0.16022,0.0,0.0,0.0,0.0,0.0,0.0,0.197468,0.229865,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.237021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.241307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105705,0.0,0.0,0.147346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.20159,0.0,0.0,0.0,0.305553,0.0,0.0,0.0,0.219565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14964,0.191565,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.212015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.148024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.418959,0.0,0.0,0.0,0.130905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.191565,0.0,0.0,0.0,0.0,0.0


### Saving the Results

In [28]:
save_to_csv(final_tfidf_train_balanced, csv_path, 'tfidf_chi2_train_balanced.csv')
save_to_csv(final_tfidf_test_balanced, csv_path, 'tfidf_chi2_test_balanced.csv')

save_to_csv(final_tfidf_train_imbalanced, csv_path, 'tfidf_chi2_train_imbalanced.csv')
save_to_csv(final_tfidf_test_imbalanced, csv_path, 'tfidf_chi2_test_imbalanced.csv')

Saving to /home/ichanis/projects/phishing_public/data/csv/tfidf_chi2_train_balanced.csv
Saving to /home/ichanis/projects/phishing_public/data/csv/tfidf_chi2_test_balanced.csv
Saving to /home/ichanis/projects/phishing_public/data/csv/tfidf_chi2_train_imbalanced.csv
Saving to /home/ichanis/projects/phishing_public/data/csv/tfidf_chi2_test_imbalanced.csv


In [29]:
save_to_csv(final_word2vec_train_balanced, csv_path, 'word2vec_train_balanced.csv')
save_to_csv(final_word2vec_test_balanced, csv_path, 'word2vec_test_balanced.csv')

save_to_csv(final_word2vec_train_imbalanced, csv_path, 'word2vec_train_imbalanced.csv')
save_to_csv(final_word2vec_test_imbalanced, csv_path, 'word2vec_test_imbalanced.csv')

Saving to /home/ichanis/projects/phishing_public/data/csv/word2vec_train_balanced.csv
Saving to /home/ichanis/projects/phishing_public/data/csv/word2vec_test_balanced.csv
Saving to /home/ichanis/projects/phishing_public/data/csv/word2vec_train_imbalanced.csv
Saving to /home/ichanis/projects/phishing_public/data/csv/word2vec_test_imbalanced.csv
