In [1]:
import os
import pandas as pd
pd.options.display.max_columns = 250
pd.options.display.max_colwidth = 160

import features as util
from raw_utils import save_to_csv
from preprocessing import dataset_add_columns

from ast import literal_eval

### Read Data

In [2]:
# Path
cwd = os.getcwd()
csv_path = os.path.join(cwd, 'data/csv/')

train_tokens = ['train_balanced_tokens.csv', 'train_imbalanced_tokens.csv']
test_tokens = ['test_balanced_tokens.csv', 'test_imbalanced_tokens.csv']

#### Tokenized emails

In [3]:
train_balanced_tokens = pd.read_csv(os.path.join(csv_path, train_tokens[0]), index_col=0, converters={'body': literal_eval})
test_balanced_tokens = pd.read_csv(os.path.join(csv_path, test_tokens[0]), index_col=0, converters={'body': literal_eval})

In [4]:
train_imbalanced_tokens = pd.read_csv(os.path.join(csv_path, train_tokens[1]), index_col=0, converters={'body': literal_eval})
test_imbalanced_tokens = pd.read_csv(os.path.join(csv_path, test_tokens[1]), index_col=0, converters={'body': literal_eval})

After the preprocessing, the data look like this:

In [5]:
train_balanced_tokens.head()

Unnamed: 0,id,body,class
0,1872,"[gerald, attach, incoming, guaranty, benefit, bridgeline, gas, marketing, llc, kindly, review, provide, comment, prior, execution, thanks, rudwell, message,...",False
1,1014,"[durasoft, company, java, class, hold, course, com, object, program, enron, network, com, allow, program, write, available, execution, program, language, in...",False
2,781,"[mark, meet, mark, haedicke, regard, utility, need, legal, opinion, utility, counterparty, agree, mark, follow, negotiation, investor, own, utility, base, u...",False
3,2025,"[hi, vince, rough, draft, perusal, comment, reach, 517, 423, icast, meeting, explore, two, problem, discuss, vasant, phone, last, week, decide, feel, confid...",False
4,188,"[great, ill, 300, speak, phil, yesterday, confirm, dinner, house, 700, elizabeth, sage, 713-853-6349, message, gussett, sheryl, emailaddress, enron, send, f...",False


# Feature Extraction

Before inputing the emails to the machine learning algorithms, they have to be converted to numberical matrices.<br>
This process is called **feature extraction**. Different methods of achieving this will be tried, in order to compare their results.

## Text Vectorization

The baseline feature set will simply consist of numerical representations of the text data. This process is also called **vectorization**. 

### TF-IDF

One of the most basic ways is to calculate the **tf-idf** (term frequency-inverse document frequency) score of the emails.<br>
In order to have a lower dimensionality and since not all words from the corpus will be of importance, only the top 500 most frequent terms are used.

In [6]:
tfidf_balanced = util.tfidf_features(train_balanced_tokens['body'], test_balanced_tokens['body'], min_df=5, max_features=500)

In [7]:
tfidf_train_balanced = tfidf_balanced['tfidf_train']
tfidf_test_balanced = tfidf_balanced['tfidf_test']
tfidf_model_balanced = tfidf_balanced['vectorizer']

In [8]:
tfidf_imbalanced = util.tfidf_features(train_imbalanced_tokens['body'], test_imbalanced_tokens['body'], min_df=5, max_features=500)

In [9]:
tfidf_train_imbalanced = tfidf_imbalanced['tfidf_train']
tfidf_test_imbalanced = tfidf_imbalanced['tfidf_test']
tfidf_model_imbalanced = tfidf_imbalanced['vectorizer']

As an example, here is a part of the calcuated matrix for the balanced train set:

In [10]:
tfidf_train_balanced.head()

Unnamed: 0,0860,0px,10,100,11,12,15,20,2000,2001,2002,2015,2016,2017,2018,24,25,30,713,853,able,absa,accept,access,account,action,activity,add,additional,address,adjust,administrator,agree,agreement,alert,allow,also,america,american,another,answer,app,apple,application,approval,approve,arial,ask,attach,attachment,august,auto,automatically,available,avoid,back,background,bank,banking,base,believe,best,bill,billion,block,book,border,bottom,box,br,browser,business,buy,ca,california,call,cancel,capacity,capital,card,care,case,cause,cc,center,change,charge,chase,check,choose,chris,click,close,code,collapse,color,come,comment,committee,communication,company,complete,concern,confidential,confirm,confirmation,contact,contain,content,continue,contract,copy,copyright,cost,could,create,credit,current,customer,daily,data,database,date,david,day,...,report,request,require,reserve,response,result,return,review,rgb,right,risk,road,run,safe,sale,san,sans,sara,say,schedule,scott,secure,security,see,select,sell,send,sender,sent,serif,serve,server,service,set,share,show,sign,since,sincerely,sit,site,size,smith,software,soon,space,spam,span,standard,start,state,statement,step,still,stock,storage,street,style,subject,support,sure,susan,system,table,take,talk,tbody,td,team,tell,term,texas,text,thank,thanks,thing,think,three,thursday,time,today,top,total,tr,trade,trading,transaction,transfer,try,tuesday,two,type,united,update,upgrade,urladdress,usaa,use,user,utility,validate,value,verification,verify,version,via,view,visit,want,way,web,wednesday,week,weight,well,width,within,without,word,work,world,would,write,year,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.127888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.272268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.291282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181863,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.290599,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.091106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104489,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.230173,0.0,0.0,0.0,0.171355,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.190935,0.0,0.0,0.131879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.195582,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.093948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.172424,0.194613,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.118677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.191378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.192279,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.208349,0.0,0.0,0.0,0.108475,0.192013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.11009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.177774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.084624,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.116024,0.0,0.0,0.0,0.0,0.096811,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.113671,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104842,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06743,0.0,0.0,0.0,0.062616,0.0,0.609813,0.0,0.0,0.0,0.0,0.10576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07906,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085688,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.073932,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060866,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083178,0.0,0.0,0.066533,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.168212,0.0,0.0,0.0,0.0,0.0,0.0,0.145441,0.0,0.0,0.0,0.061221,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.307994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.367728,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066611,0.0,0.0,0.0,0.0,0.060005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.169965,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.147413,0.0,0.0,0.0,0.0,0.0,0.0,0.197851,0.0,0.0,0.0,0.0,0.0,0.0,0.083544,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.209804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057281,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.16022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.197468,0.229865,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.198325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.237021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.114138,0.0,0.229865,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.418959,0.0,0.0,0.0,0.0,0.130905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.191565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Word2Vec

A more advanced technique is **Word Embedding**, which calculates a high-dimensional vector for each word based on the probability distribution of this word appearing before or after another. In other words, words belonging to the same context usually appear close to each other in the corpus, so they will be closer in the vector space as well.<br>
The chosen implementation is **Word2Vec**.

After the word vectors are calculated, the vectors of each word in an email are being averaged, thus resulting in a single vector for each email.

In [11]:
word2vec_balanced = util.word2vec_features(train_balanced_tokens['body'], test_balanced_tokens['body'], vector_size=100, min_count=5)

In [12]:
word2vec_train_balanced = word2vec_balanced['word2vec_train']
word2vec_test_balanced = word2vec_balanced['word2vec_test']
word2vec_model_balanced = word2vec_balanced['vectorizer']

In [13]:
word2vec_imbalanced = util.word2vec_features(train_imbalanced_tokens['body'], test_imbalanced_tokens['body'], vector_size=100, min_count=5)

In [14]:
word2vec_train_imbalanced = word2vec_imbalanced['word2vec_train']
word2vec_test_imbalanced = word2vec_imbalanced['word2vec_test']
word2vec_model_imbalanced = word2vec_imbalanced['vectorizer']

The resulting feature sets are like the following:

In [15]:
word2vec_train_balanced.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,-0.117954,0.089636,-0.034582,0.601255,0.017142,-0.050289,0.023095,-0.039325,-0.435132,-0.253273,0.29301,0.127743,-0.216926,0.444354,-0.044672,-0.041185,-0.897471,0.118774,-0.157275,0.081896,0.1616,0.172204,-0.422472,-0.189408,-0.171134,0.577631,-0.160261,0.611972,-0.04824,0.080774,0.387732,-0.22814,-0.326698,0.287848,-0.147277,-0.273535,-0.066381,0.17139,-0.158129,-0.245713,-0.090469,-0.313413,0.12635,-0.083533,-0.198002,0.416333,-0.158038,-0.167325,0.160968,0.165669,-0.475373,0.009034,-0.005143,-0.089509,0.240777,0.067836,0.03973,-0.033448,-0.007686,0.420803,-0.42511,-0.71148,0.483639,-0.27644,0.240522,-0.443829,0.072354,-0.144797,0.199109,-0.360737,-0.054475,0.280467,-0.049448,-0.135156,-0.235951,0.146473,0.32495,-0.286407,-0.053358,-0.337256,0.12998,0.064817,0.446663,-0.282567,-0.105054,-0.020431,0.328381,0.158882,0.392408,0.154662,-0.128821,-0.354365,0.311226,-0.093949,-0.064931,0.749483,-0.129369,0.210027,0.241894,-0.087168
1,-0.121657,-0.014849,0.070528,0.500521,0.084273,-0.096024,0.167385,-0.036852,-0.399021,-0.048018,0.136454,0.107786,-0.297162,0.444317,-0.12772,-0.013451,-0.773391,0.186371,-0.302537,0.016916,0.215232,0.069882,-0.20924,-0.045727,-0.225071,0.494391,-0.225786,0.509155,-0.003305,0.185106,0.455848,-0.082957,-0.220593,0.127689,-0.178231,-0.162479,0.042377,0.208647,-0.200118,-0.362485,-0.132314,-0.249711,0.165682,-0.017387,-0.092428,0.35434,-0.225798,-0.239503,0.199879,0.200077,-0.333315,-0.049965,-0.114226,-0.203645,0.292729,0.041719,-0.114462,-0.103765,0.045197,0.421471,-0.366316,-0.508965,0.425646,-0.124894,0.270997,-0.471202,-0.061592,-0.014316,0.043974,-0.430071,-0.140089,0.220966,-0.32579,-0.162753,-3.4e-05,0.088225,0.315431,-0.34169,0.069828,-0.288584,0.162872,0.128297,0.487886,-0.031563,0.063677,-0.082449,0.223576,0.077672,0.380751,0.308745,-0.170433,-0.116894,0.140219,0.002309,-0.041805,0.476268,-0.146496,0.043419,0.129111,-0.163726
2,-0.110207,-0.046151,-0.000224,0.449611,0.025329,-0.128539,0.230904,0.034559,-0.511388,-0.043883,0.037088,-0.006725,-0.24635,0.410366,-0.121103,0.067868,-0.821242,0.267296,-0.234006,-0.054431,0.1322,0.009165,-0.283942,-0.031012,-0.27738,0.50596,-0.236447,0.414422,0.017461,0.268034,0.452287,-0.070483,-0.209483,0.169624,-0.264862,-0.142726,0.029964,0.166766,-0.074715,-0.306136,-0.220336,-0.124905,0.225293,-0.065364,0.139071,0.427726,-0.275268,-0.370011,0.088277,0.235381,-0.273576,-0.063696,-0.084095,-0.245139,0.2034,-0.017115,-0.094703,-0.076642,0.02538,0.507408,-0.363615,-0.41568,0.227224,-0.016011,0.350853,-0.425958,-0.109733,0.105229,0.061695,-0.442975,-0.138675,0.335085,-0.347467,-0.16063,-0.027318,0.126894,0.33092,-0.322031,-0.002568,-0.243308,0.231772,0.154809,0.570944,0.031403,0.070673,-0.014075,0.214857,0.162367,0.351656,0.309139,-0.260104,-0.134911,0.08458,0.110156,-0.017298,0.400916,-0.119892,-0.027579,0.118944,-0.171622
3,-0.094364,-0.057222,0.030015,0.434849,0.060691,-0.094997,0.287469,-0.035623,-0.376843,0.012184,-0.027856,0.175618,-0.371396,0.319723,-0.106397,0.012894,-0.743981,0.201785,-0.200769,-0.01941,0.126697,-0.010256,-0.19954,-0.09117,-0.241024,0.552718,-0.253676,0.496404,-0.091431,0.255198,0.497065,0.040202,-0.17501,0.230359,-0.286281,-0.218894,-1.7e-05,0.156945,-0.131163,-0.289445,-0.135094,-0.17996,0.192131,-0.018843,0.07643,0.330261,-0.183341,-0.278829,0.14201,0.317694,-0.38591,-0.024269,-0.165028,-0.289219,0.315958,0.003558,-0.112196,0.022984,-0.024145,0.491518,-0.403099,-0.392447,0.341808,-0.058731,0.282626,-0.405912,-0.017722,0.061002,0.039525,-0.428891,-0.177755,0.271228,-0.302849,-0.071305,0.005786,0.121904,0.363337,-0.396479,0.066659,-0.309176,0.172268,0.124327,0.551774,-0.020521,0.109817,-0.01054,0.230448,0.194342,0.408737,0.410097,-0.15055,-0.178299,0.147333,0.005789,0.031415,0.377875,-0.132971,0.074263,0.100161,-0.247951
4,-0.030069,-0.01059,0.032331,0.595428,-0.024374,-0.044333,0.133727,0.054619,-0.344878,-0.066339,0.161168,0.434677,-0.491138,0.452924,0.058592,-0.012881,-0.874046,-0.029142,-0.090585,0.09063,0.110358,0.208642,-0.412024,-0.316117,-0.152933,0.691525,-0.203705,0.614927,-0.161443,0.216866,0.452875,-0.224074,-0.344365,0.352652,-0.281998,-0.323645,-0.072428,0.015357,-0.278148,-0.179656,-0.054855,-0.434413,0.101457,0.02197,-0.288704,0.236826,-0.052719,-0.033085,0.161793,0.342465,-0.643959,0.12688,-0.037119,0.03128,0.295037,0.09591,-0.006889,0.004987,-0.081125,0.534881,-0.452425,-0.60924,0.540556,-0.272746,0.166594,-0.489386,0.001836,-0.235071,0.21187,-0.399724,-0.085014,0.218973,-0.09919,-0.024435,-0.297239,0.081215,0.390916,-0.30084,0.166042,-0.596227,0.044427,0.119365,0.28499,-0.364508,-0.050369,0.209735,0.338519,0.125723,0.460634,0.132371,-0.083103,-0.417014,0.376247,-0.245947,0.079717,0.645837,-0.108228,0.200077,0.147405,-0.365396


It should be noted that in this case, the columns do not provide information similar to how a tf-idf column corresponds to one word. This representation is purely for convenience and consistency, it won't matter during the prediction step.

# Feature Selection

In order to further reduce the dimensions of the feature matrix, the number of selected features will be halved using the top features according to the **chi-squared** feature selection method.

## Vectorization Features

### TF-IDF

In [16]:
selected_tfidf_balanced = util.chi2_feature_selection(tfidf_train_balanced, train_balanced_tokens['class'], tfidf_test_balanced, percentile=50)

In [17]:
tfidf_sel_train_balanced = selected_tfidf_balanced['features_train']
tfidf_sel_test_balanced = selected_tfidf_balanced['features_test']
tfidf_sel_model_balanced = selected_tfidf_balanced['selector']

In [18]:
selected_tfidf_imbalanced = util.chi2_feature_selection(tfidf_train_imbalanced, train_imbalanced_tokens['class'], tfidf_test_imbalanced, percentile=50)

In [19]:
tfidf_sel_train_imbalanced = selected_tfidf_imbalanced['features_train']
tfidf_sel_test_imbalanced = selected_tfidf_imbalanced['features_test']
tfidf_sel_model_imbalanced = selected_tfidf_imbalanced['selector']

The now-reduced train set:

In [20]:
tfidf_sel_train_balanced.head()

Unnamed: 0,0px,10,11,2000,2001,2002,2015,2016,2017,2018,30,713,853,absa,access,account,activity,address,administrator,agreement,alert,also,app,apple,attach,august,auto,automatically,available,avoid,bank,banking,bill,block,browser,buy,california,call,cancel,card,cc,chase,chris,click,come,comment,committee,company,confirm,continue,contract,copyright,cost,could,customer,daily,date,david,de,deal,dear,december,deliver,delivery,detail,device,download,draft,due,ect,ee,email,emailaddress,employee,energy,enron,enronxgate,ensure,estatement,even,expire,express,failure,fargo,fax,ferc,final,forward,friday,full,fw,gas,get,give,go,good,great,group,hello,help,hope,houston,id,immediately,important,inbox,incoming,information,instruction,interest,january,jeff,john,jose,july,june,kindly,know,last,leave,let,like,limit,link,list,login,long,look,mail,mailbox,many,mark,market,meet,meeting,microsoft,mike,million,monday,month,much,natural,news,next,notice,notification,november,october,offer,one,online,option,party,password,payment,paypal,pending,people,per,permanently,personal,phone,plan,pm,point,position,power,price,privacy,process,profile,program,project,promise,protect,question,quota,rate,re,receive,recent,recently,record,reply,report,request,reserve,right,risk,safe,sara,say,schedule,scott,secure,security,see,sell,sender,server,service,sign,since,sincerely,sit,size,smith,space,spam,start,statement,still,stock,storage,street,subject,support,susan,talk,team,tell,thanks,thing,think,thursday,time,today,trade,trading,tuesday,two,update,upgrade,urladdress,usaa,use,user,utility,validate,verification,verify,view,want,wednesday,week,well,within,work,would,year
0,0.0,0.0,0.0,0.0,0.127888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.272268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125564,0.0,0.0,0.0,0.0,0.181863,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.084374,0.0,0.339057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15563,0.0,0.0,0.0,0.0,0.0,0.0,0.162654,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.171982,0.0,0.0,0.0,0.0,0.18069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.156724,0.119443,0.0,0.0,0.131815,0.0,0.0,0.0,0.0,0.0,0.0,0.149029,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.174582,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.132065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104489,0.0,0.0,0.0,0.0,0.0,0.230173,0.0,0.0,0.171355,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.131879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.121281,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.162305,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.368003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.585283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150362,0.0,0.0,0.0,0.0,0.0,0.194613,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.118677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.192013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.177774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.282967,0.0,0.097983,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.164643,0.0,0.105068,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.109002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096811,0.0,0.0,0.0,0.0,0.0,0.113671,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104842,0.0,0.0,0.06743,0.0,0.0,0.0,0.062616,0.0,0.609813,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07906,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.073932,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060866,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083178,0.0,0.066533,0.0,0.0,0.0,0.0,0.168212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.088565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.430333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.138412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.069935,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.154801,0.075345,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.072828,0.077132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.179586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.169965,0.0,0.0,0.0,0.0,0.147413,0.0,0.0,0.0,0.0,0.197851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.209804,0.0,0.0,0.0,0.057281,0.0
4,0.0,0.0,0.0,0.0,0.16022,0.0,0.0,0.0,0.0,0.0,0.0,0.197468,0.229865,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.237021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.241307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105705,0.0,0.0,0.147346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.20159,0.0,0.0,0.0,0.305553,0.0,0.0,0.0,0.219565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14964,0.191565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.212015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.148024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.418959,0.0,0.0,0.0,0.130905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.191565,0.0,0.0,0.0,0.0,0.0


# Final Dataset Creation

Before using the features for classification with the machine learning algorithms, it is best to tidy up the datasets and keep them consistent by concatenating the features, the id and the class columns in the same DataFrame.

In [21]:
column_names = ['email_class', 'email_id'] # column names changed in case the word class or id appear in the token list

### TF-IDF

In [22]:
final_tfidf_train_balanced = dataset_add_columns(tfidf_sel_train_balanced, [train_balanced_tokens['class'], train_balanced_tokens['id']], column_names)
final_tfidf_test_balanced = dataset_add_columns(tfidf_sel_test_balanced, [test_balanced_tokens['class'], test_balanced_tokens['id']], column_names)

In [23]:
final_tfidf_train_imbalanced = dataset_add_columns(tfidf_sel_train_imbalanced, [train_imbalanced_tokens['class'], train_imbalanced_tokens['id']], column_names)
final_tfidf_test_imbalanced = dataset_add_columns(tfidf_sel_test_imbalanced, [test_imbalanced_tokens['class'], test_imbalanced_tokens['id']], column_names)

Looking into one of the previously explored examples:

In [24]:
final_tfidf_train_balanced[final_tfidf_train_balanced['email_id'] == 6]

Unnamed: 0,email_id,email_class,0px,10,11,2000,2001,2002,2015,2016,2017,2018,30,713,853,absa,access,account,activity,address,administrator,agreement,alert,also,app,apple,attach,august,auto,automatically,available,avoid,bank,banking,bill,block,browser,buy,california,call,cancel,card,cc,chase,chris,click,come,comment,committee,company,confirm,continue,contract,copyright,cost,could,customer,daily,date,david,de,deal,dear,december,deliver,delivery,detail,device,download,draft,due,ect,ee,email,emailaddress,employee,energy,enron,enronxgate,ensure,estatement,even,expire,express,failure,fargo,fax,ferc,final,forward,friday,full,fw,gas,get,give,go,good,great,group,hello,help,hope,houston,id,immediately,important,inbox,incoming,information,instruction,interest,january,jeff,john,jose,july,june,kindly,know,last,leave,let,like,limit,...,login,long,look,mail,mailbox,many,mark,market,meet,meeting,microsoft,mike,million,monday,month,much,natural,news,next,notice,notification,november,october,offer,one,online,option,party,password,payment,paypal,pending,people,per,permanently,personal,phone,plan,pm,point,position,power,price,privacy,process,profile,program,project,promise,protect,question,quota,rate,re,receive,recent,recently,record,reply,report,request,reserve,right,risk,safe,sara,say,schedule,scott,secure,security,see,sell,sender,server,service,sign,since,sincerely,sit,size,smith,space,spam,start,statement,still,stock,storage,street,subject,support,susan,talk,team,tell,thanks,thing,think,thursday,time,today,trade,trading,tuesday,two,update,upgrade,urladdress,usaa,use,user,utility,validate,verification,verify,view,want,wednesday,week,well,within,work,would,year
2089,6,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.224085,0.0,0.0,0.0,0.0,0.0,0.150715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.303853,0.09435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.202054,0.0,0.0,0.0,0.0,0.0,0.0,0.190245,0.0,0.0,0.175255,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.107273,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150865,0.138519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133462,0.0,0.0,0.0,0.379817,0.119663,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.151165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.352513,0.228697,0.0,0.120935,0.0,0.0,0.207051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


The words that appear more in the email have a bigger score, while the words that don't appear at all have a score of zero.

### Word2Vec

In [25]:
final_word2vec_train_balanced = dataset_add_columns(word2vec_train_balanced, [train_balanced_tokens['class'], train_balanced_tokens['id']], column_names)
final_word2vec_test_balanced = dataset_add_columns(word2vec_test_balanced, [test_balanced_tokens['class'], test_balanced_tokens['id']], column_names)

In [26]:
final_word2vec_train_imbalanced = dataset_add_columns(word2vec_train_imbalanced, [train_imbalanced_tokens['class'], train_imbalanced_tokens['id']], column_names)
final_word2vec_test_imbalanced = dataset_add_columns(word2vec_test_imbalanced, [test_imbalanced_tokens['class'], test_imbalanced_tokens['id']], column_names)

In [27]:
final_tfidf_train_balanced.head()

Unnamed: 0,email_id,email_class,0px,10,11,2000,2001,2002,2015,2016,2017,2018,30,713,853,absa,access,account,activity,address,administrator,agreement,alert,also,app,apple,attach,august,auto,automatically,available,avoid,bank,banking,bill,block,browser,buy,california,call,cancel,card,cc,chase,chris,click,come,comment,committee,company,confirm,continue,contract,copyright,cost,could,customer,daily,date,david,de,deal,dear,december,deliver,delivery,detail,device,download,draft,due,ect,ee,email,emailaddress,employee,energy,enron,enronxgate,ensure,estatement,even,expire,express,failure,fargo,fax,ferc,final,forward,friday,full,fw,gas,get,give,go,good,great,group,hello,help,hope,houston,id,immediately,important,inbox,incoming,information,instruction,interest,january,jeff,john,jose,july,june,kindly,know,last,leave,let,like,limit,...,login,long,look,mail,mailbox,many,mark,market,meet,meeting,microsoft,mike,million,monday,month,much,natural,news,next,notice,notification,november,october,offer,one,online,option,party,password,payment,paypal,pending,people,per,permanently,personal,phone,plan,pm,point,position,power,price,privacy,process,profile,program,project,promise,protect,question,quota,rate,re,receive,recent,recently,record,reply,report,request,reserve,right,risk,safe,sara,say,schedule,scott,secure,security,see,sell,sender,server,service,sign,since,sincerely,sit,size,smith,space,spam,start,statement,still,stock,storage,street,subject,support,susan,talk,team,tell,thanks,thing,think,thursday,time,today,trade,trading,tuesday,two,update,upgrade,urladdress,usaa,use,user,utility,validate,verification,verify,view,want,wednesday,week,well,within,work,would,year
0,1872,False,0.0,0.0,0.0,0.0,0.127888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.272268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125564,0.0,0.0,0.0,0.0,0.181863,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.084374,0.0,0.339057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15563,0.0,0.0,0.0,0.0,0.0,0.0,0.162654,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.171982,0.0,0.0,0.0,0.0,0.18069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.156724,0.119443,0.0,0.0,0.131815,0.0,0.0,...,0.0,0.0,0.149029,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.174582,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.132065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104489,0.0,0.0,0.0,0.0,0.0,0.230173,0.0,0.0,0.171355,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1014,False,0.0,0.0,0.0,0.0,0.131879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.121281,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.162305,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.368003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.585283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150362,0.0,0.0,0.0,0.0,0.0,0.194613,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.118677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,781,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.192013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.177774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.282967,0.0,0.097983,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.164643,0.0,0.105068,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.109002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096811,0.0,0.0,0.0,0.0,0.0,0.113671,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104842,0.0,0.0,0.06743,0.0,0.0,0.0,0.062616,0.0,0.609813,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07906,0.0,0.0
3,2025,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.073932,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060866,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083178,0.0,0.066533,0.0,0.0,0.0,0.0,0.168212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.088565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.430333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.138412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.069935,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.154801,0.075345,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.072828,0.077132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.179586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.169965,0.0,0.0,0.0,0.0,0.147413,0.0,0.0,0.0,0.0,0.197851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.209804,0.0,0.0,0.0,0.057281,0.0
4,188,False,0.0,0.0,0.0,0.0,0.16022,0.0,0.0,0.0,0.0,0.0,0.0,0.197468,0.229865,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.237021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.241307,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105705,0.0,0.0,0.147346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.20159,0.0,0.0,0.0,0.305553,0.0,0.0,0.0,0.219565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14964,0.191565,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.212015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.148024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.418959,0.0,0.0,0.0,0.130905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.191565,0.0,0.0,0.0,0.0,0.0


### Saving the Results

In [28]:
save_to_csv(final_tfidf_train_balanced, csv_path, 'tfidf_chi2_train_balanced.csv')
save_to_csv(final_tfidf_test_balanced, csv_path, 'tfidf_chi2_test_balanced.csv')

save_to_csv(final_tfidf_train_imbalanced, csv_path, 'tfidf_chi2_train_imbalanced.csv')
save_to_csv(final_tfidf_test_imbalanced, csv_path, 'tfidf_chi2_test_imbalanced.csv')

Saving to /home/ichanis/projects/phishing_public/data/csv/tfidf_chi2_train_balanced.csv
Saving to /home/ichanis/projects/phishing_public/data/csv/tfidf_chi2_test_balanced.csv
Saving to /home/ichanis/projects/phishing_public/data/csv/tfidf_chi2_train_imbalanced.csv
Saving to /home/ichanis/projects/phishing_public/data/csv/tfidf_chi2_test_imbalanced.csv


In [29]:
save_to_csv(final_word2vec_train_balanced, csv_path, 'word2vec_train_balanced.csv')
save_to_csv(final_word2vec_test_balanced, csv_path, 'word2vec_test_balanced.csv')

save_to_csv(final_word2vec_train_imbalanced, csv_path, 'word2vec_train_imbalanced.csv')
save_to_csv(final_word2vec_test_imbalanced, csv_path, 'word2vec_test_imbalanced.csv')

Saving to /home/ichanis/projects/phishing_public/data/csv/word2vec_train_balanced.csv
Saving to /home/ichanis/projects/phishing_public/data/csv/word2vec_test_balanced.csv
Saving to /home/ichanis/projects/phishing_public/data/csv/word2vec_train_imbalanced.csv
Saving to /home/ichanis/projects/phishing_public/data/csv/word2vec_test_imbalanced.csv
