In [1]:
import pandas as pd
import numpy as np
import os
from ast import literal_eval
from gensim.models import Word2Vec
from preprocessing import dataset_add_columns
from raw_utils import save_to_csv

from sklearn.preprocessing import StandardScaler

In [2]:
# Paths and file names of the tokenized datasets.
# csv_path is resolved against the current working directory, so the notebook
# has to be started from the directory that contains 'datasets/csv/'.
cwd = os.getcwd()
csv_path = os.path.join(cwd, 'datasets/csv/')
# CSV files holding the tokenized train / test emails.
train_token_file = 'train_dataset.csv'
test_token_file = 'test_dataset.csv'

In [3]:
# Load the tokenized emails. The first CSV column is used as the index, and the
# 'body' column is parsed with literal_eval so that each cell becomes a Python
# object (a list of tokens in the preview below) instead of its string form.
train_tokens = pd.read_csv(os.path.join(csv_path, train_token_file), index_col=0, converters={'body': literal_eval})
test_tokens = pd.read_csv(os.path.join(csv_path, test_token_file), index_col=0, converters={'body': literal_eval})

In [4]:
# Preview the first rows of the training set (label + tokenized body).
train_tokens.head()

Unnamed: 0,label,body
0,1,"[mail, quota, attention, email, quota, reach, ..."
1,0,"[searchnetworking, voice, tip, july, sponsor, ..."
2,1,"[ecowas, bank, limit, add, rue, mission, lome,..."
3,0,"[fork, write, run, neighbor, meet, pigeonhole,..."
4,0,"[tana, realize, information, need, talk, chase..."


In [5]:
def filter_vocab_words(wordlist, vocabulary):
    """
    Keep only the words of a list that belong to a vocabulary.

    The surviving words keep their original relative order, and a word
    that occurs several times in the input occurs as many times in the
    output.

    Parameters
    ----------
    wordlist : list of str
        The words to be filtered.
    vocabulary : list of str
        The allowed words. Membership is checked with the ``in``
        operator, so any container supporting it can be passed.

    Returns
    -------
    list of str
        A new list holding the words of `wordlist` that are in
        `vocabulary`.
    """
    kept_words = []
    for token in wordlist:
        if token in vocabulary:
            kept_words.append(token)
    return kept_words

def get_mean_vector(wordlist, word2vec_model):
    """
    Average the Word2Vec vectors of the words in a list.

    Every word of `wordlist` is looked up in ``word2vec_model.wv`` and
    the element-wise mean of the resulting vectors is returned.

    Parameters
    ----------
    wordlist : list of str
        The words to be vectorized. They are passed to
        ``word2vec_model.wv`` as-is, so they are expected to already be
        restricted to the model's vocabulary.
    word2vec_model : gensim.models.word2vec.Word2Vec
        The Word2Vec model that holds the word vectors.

    Returns
    -------
    numpy.ndarray
        The mean vector, or a vector of ``word2vec_model.vector_size``
        zeroes when `wordlist` is empty.
    """
    # Nothing to average: fall back to an all-zero vector of the model's size.
    if len(wordlist) < 1:
        return np.zeros(word2vec_model.vector_size)

    word_vectors = word2vec_model.wv[wordlist]
    return np.mean(word_vectors, axis=0)

In [6]:
def word2vec_features(text_col_train, text_col_test=None, vector_size=100, min_count=5, max_vocab_size=None, workers=1):
    """
    Extract Word2Vec embedding features using gensim.

    It uses the skip-gram model that Word2Vec provides. This is
    hardcoded (sg=1), as is the random seed (1746).

    The model is trained on the training set only. Word2Vec represents
    each word in the corpus as a high-dimensional vector. Then,
    get_mean_vector() is used to get the average of the vectors of all
    the words in an email, after removing the words that do not appear
    in the vocabulary that Word2Vec built. An email left with no word
    is represented by a vector of zeroes.

    Some parameters for the vectorizer can also be passed as
    arguments.

    Parameters
    ----------
    text_col_train : pandas.Series of list of str
        The series that contains the tokenized emails as word lists.
        The Word2Vec model is trained on it.
    text_col_test : pandas.Series of list of str or None
        The series of the test set that contains the tokenized emails
        as word lists, or None if a test set is not provided. It is
        only vectorized with the trained model, never trained on.
    vector_size : int, default 100
        The size (dimensions) of the word vectors that will be
        produced. To be used by Word2Vec.
    min_count : int, default 5
        The minimum number of times a term has to appear in order to
        be included in the vocabulary. To be used by Word2Vec.
    max_vocab_size : int, default None
        The maximum number of terms in the vocabulary. Passed to
        Word2Vec as ``max_final_vocab``.
    workers : int, default 1
        How many threads to do the processing on. Note that if this is
        not set to 1, the processing will be faster but the result
        will not be 100% reproducible. To be used by Word2Vec.

    Returns
    -------
    dict
    {'vectorizer': gensim.models.word2vec.Word2Vec,
     'word2vec_train': pandas.DataFrame
     'word2vec_test': pandas.DataFrame or None}
        A dictionary that contains the vectorizer and the vectorized
        sets. Each DataFrame has one row per email (in input order,
        with a default integer index) and one column per vector
        dimension. 'word2vec_test' is always present and is None when
        no test set was given.

    See Also
    --------
    filter_vocab_words : Remove words not appearing in a vocabulary from a list.
    get_mean_vector : Calculate the mean vector of a list of words.
    """
    # sg=1 selects the skip-gram architecture (the Word2Vec default is CBOW):
    # it predicts the context words from the current word.
    # min_count, vector_size, workers and max_final_vocab are forwarded from
    # the arguments; the seed is fixed so repeated runs are comparable.
    model = Word2Vec(sentences=text_col_train,
                     min_count=min_count, vector_size=vector_size, max_final_vocab=max_vocab_size,
                     sg=1, workers=workers, seed=1746)

    # Vocabulary learned by the model, kept as a frozenset so that every
    # membership test done by filter_vocab_words is O(1) rather than a
    # linear scan of a list.
    vocab = frozenset(model.wv.key_to_index)

    def _mean_vector_frame(text_col):
        # Drop out-of-vocabulary words, average the remaining word vectors
        # of each email and stack the results into one row per email.
        filtered_col = text_col.apply(filter_vocab_words, vocabulary=vocab)
        col_with_means = filtered_col.apply(get_mean_vector, word2vec_model=model)
        return pd.DataFrame(col_with_means.tolist())

    output = dict()
    output['vectorizer'] = model
    output['word2vec_train'] = _mean_vector_frame(text_col_train)

    # Always expose the documented 'word2vec_test' key, so callers can read
    # it unconditionally even when no test set was supplied.
    if text_col_test is not None:
        output['word2vec_test'] = _mean_vector_frame(text_col_test)
    else:
        output['word2vec_test'] = None

    return output

In [7]:
# Train Word2Vec on the training bodies and vectorize both sets with it,
# using 200-dimensional vectors and keeping terms that occur at least 3 times.
word2vec_dataset = word2vec_features(train_tokens['body'], test_tokens['body'], vector_size=200, min_count=3)

In [8]:
# Unpack the feature frames and the trained model from the result dictionary.
word2vec_train = word2vec_dataset['word2vec_train']
word2vec_test = word2vec_dataset['word2vec_test']
word2vec_model = word2vec_dataset['vectorizer']
# Persist the trained model; the path is relative, so the file is written
# to the current working directory.
word2vec_model.save('word2vec_model.model')

In [9]:
# Preview the training features: one row per email, one column per vector dimension.
word2vec_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,-0.243125,-0.253478,-0.005901,-0.076038,0.031763,-0.035917,-0.073341,0.577681,-0.02994,0.030174,...,-0.267438,-0.105795,-0.094777,0.053563,-0.431388,-0.062541,-0.12167,-0.055277,0.31647,-0.118448
1,-0.128491,-0.087448,0.01022,-0.070209,0.05644,0.013893,-0.057083,0.294077,-0.057198,0.187377,...,-0.100806,-0.010143,0.073192,0.039565,-0.348876,0.058328,-0.012493,0.059502,0.185431,-0.054062
2,-0.137854,-0.154933,-0.000142,0.00098,0.029971,-0.066496,0.17273,0.261095,-0.091565,-0.056001,...,-0.326195,-0.069702,0.230789,0.062554,-0.380453,0.158121,-0.12122,0.012845,0.202206,-0.174568
3,-0.188086,-0.339599,0.001944,-0.067542,0.032086,0.170307,0.03412,0.338939,0.068139,0.034965,...,-0.230627,-0.086075,-0.067766,0.145196,-0.424615,0.09905,0.045489,-0.009281,0.206298,-0.176881
4,-0.020927,-0.083643,-0.014417,-0.121288,0.0605,0.018497,0.029439,0.286098,-0.15038,0.112154,...,-0.066439,0.015471,0.123407,-0.028162,-0.353352,0.194933,-0.039465,-0.034747,0.209495,-0.043877


In [10]:
# Preview the test features produced with the same trained model.
word2vec_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,-0.126992,-0.067241,0.00994,-0.064464,0.108283,-0.034844,0.038948,0.290813,-0.051411,0.107861,...,-0.150178,-0.002499,0.004047,0.032008,-0.332496,-0.034313,-0.032945,0.020365,0.168084,-0.056356
1,-0.122675,-0.048225,0.09184,0.067524,0.222464,-0.017531,0.054205,0.414692,0.107811,0.278543,...,-0.035272,-0.097213,-0.047685,-0.032185,-0.279433,0.101852,-0.178845,0.129921,0.113955,-0.091031
2,-0.135664,-0.00677,0.070633,-0.113148,0.042992,0.049495,-0.08203,0.217859,-0.048631,0.303864,...,-0.067324,0.046392,0.038574,-0.052215,-0.393888,-0.009136,0.008983,0.004195,0.075062,-0.136974
3,-0.16097,-0.098071,-0.014548,-0.130055,0.067389,-0.043748,-0.000222,0.285189,-0.052315,0.154453,...,-0.13908,-0.019192,0.063769,0.013333,-0.376943,0.04679,-0.11874,-0.007472,0.190356,-0.026665
4,-0.255458,0.053246,0.017971,-0.226264,0.287694,0.205495,0.068273,0.367642,-0.107119,0.104669,...,-0.032592,0.199033,0.304943,-0.153514,-0.372404,0.073806,-0.082436,0.089757,0.093158,-0.097758


In [11]:
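# Optional feature standardisation, currently disabled.
# When enabled, the scaler is fitted on the training features only and then
# reused, unchanged, to transform the test features.
# scaler = StandardScaler()
# word2vec_train_scaled = scaler.fit_transform(word2vec_train)

# word2vec_test_scaled = scaler.transform(word2vec_test)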

In [12]:
# Attach the class label to the feature frames; the previews below show it
# as a 'label' column next to the embedding columns.
# NOTE(review): the feature frames carry a fresh 0..n-1 index while the labels
# keep the index read from the CSV. If dataset_add_columns aligns on the index,
# this relies on the CSV index also being 0..n-1 -- confirm in preprocessing.
final_word2vec_train = dataset_add_columns(word2vec_train, [train_tokens['label']], ['label'])
final_word2vec_test = dataset_add_columns(word2vec_test, [test_tokens['label']], ['label'])

In [13]:
# Preview the labelled training features.
final_word2vec_train.head()

Unnamed: 0,label,0,1,2,3,4,5,6,7,8,...,190,191,192,193,194,195,196,197,198,199
0,1,-0.243125,-0.253478,-0.005901,-0.076038,0.031763,-0.035917,-0.073341,0.577681,-0.02994,...,-0.267438,-0.105795,-0.094777,0.053563,-0.431388,-0.062541,-0.12167,-0.055277,0.31647,-0.118448
1,0,-0.128491,-0.087448,0.01022,-0.070209,0.05644,0.013893,-0.057083,0.294077,-0.057198,...,-0.100806,-0.010143,0.073192,0.039565,-0.348876,0.058328,-0.012493,0.059502,0.185431,-0.054062
2,1,-0.137854,-0.154933,-0.000142,0.00098,0.029971,-0.066496,0.17273,0.261095,-0.091565,...,-0.326195,-0.069702,0.230789,0.062554,-0.380453,0.158121,-0.12122,0.012845,0.202206,-0.174568
3,0,-0.188086,-0.339599,0.001944,-0.067542,0.032086,0.170307,0.03412,0.338939,0.068139,...,-0.230627,-0.086075,-0.067766,0.145196,-0.424615,0.09905,0.045489,-0.009281,0.206298,-0.176881
4,0,-0.020927,-0.083643,-0.014417,-0.121288,0.0605,0.018497,0.029439,0.286098,-0.15038,...,-0.066439,0.015471,0.123407,-0.028162,-0.353352,0.194933,-0.039465,-0.034747,0.209495,-0.043877


In [14]:
# Preview the first ten rows of the labelled test features.
final_word2vec_test[:10]

Unnamed: 0,label,0,1,2,3,4,5,6,7,8,...,190,191,192,193,194,195,196,197,198,199
0,0,-0.126992,-0.067241,0.00994,-0.064464,0.108283,-0.034844,0.038948,0.290813,-0.051411,...,-0.150178,-0.002499,0.004047,0.032008,-0.332496,-0.034313,-0.032945,0.020365,0.168084,-0.056356
1,0,-0.122675,-0.048225,0.09184,0.067524,0.222464,-0.017531,0.054205,0.414692,0.107811,...,-0.035272,-0.097213,-0.047685,-0.032185,-0.279433,0.101852,-0.178845,0.129921,0.113955,-0.091031
2,0,-0.135664,-0.00677,0.070633,-0.113148,0.042992,0.049495,-0.08203,0.217859,-0.048631,...,-0.067324,0.046392,0.038574,-0.052215,-0.393888,-0.009136,0.008983,0.004195,0.075062,-0.136974
3,0,-0.16097,-0.098071,-0.014548,-0.130055,0.067389,-0.043748,-0.000222,0.285189,-0.052315,...,-0.13908,-0.019192,0.063769,0.013333,-0.376943,0.04679,-0.11874,-0.007472,0.190356,-0.026665
4,0,-0.255458,0.053246,0.017971,-0.226264,0.287694,0.205495,0.068273,0.367642,-0.107119,...,-0.032592,0.199033,0.304943,-0.153514,-0.372404,0.073806,-0.082436,0.089757,0.093158,-0.097758
5,0,-0.162314,-0.055856,0.017512,0.026711,0.076674,0.050756,-0.089573,0.356582,-0.00317,...,-0.118492,-0.049656,-0.070937,0.075749,-0.249595,0.008129,-0.048141,0.124617,0.170126,-0.113942
6,0,-0.33908,-0.201685,0.000246,0.026145,-0.005381,-0.036172,0.123515,0.495824,-0.119255,...,-0.106477,-0.016947,-0.270211,-0.075668,-0.411423,-0.033024,-0.17493,-0.032534,0.118478,-0.218261
7,0,-0.185813,-0.206757,-0.050928,-0.148968,0.005197,0.112387,0.013696,0.306255,-0.019443,...,-0.19123,-0.00295,0.00179,0.137551,-0.374839,0.102953,-0.015143,0.023241,0.212412,-0.127702
8,1,-0.214357,-0.130019,-0.037256,-0.158568,0.210559,0.002923,0.118261,0.277471,-0.027241,...,-0.242897,-0.085152,0.230315,0.078567,-0.405868,0.047288,-0.226507,-0.024481,0.277169,0.002097
9,0,-0.087806,-0.038792,0.047388,-0.106134,0.058922,0.019034,-0.073815,0.323194,-0.081519,...,-0.105398,-0.007219,0.055553,-0.017283,-0.329628,0.076396,0.029748,0.030689,0.151088,-0.091413


In [15]:
# Write the labelled feature sets next to the input CSVs.
# NOTE(review): judging by the cell output, save_to_csv overwrites an existing
# file after printing a notice -- confirm in raw_utils.save_to_csv.
save_to_csv(final_word2vec_train, csv_path, 'word2vec_train.csv')
save_to_csv(final_word2vec_test, csv_path, 'word2vec_test.csv')

File /home/khanhnguyen/Public/workspace/python_3.10/project-phishing-email-detection/datasets/csv/word2vec_train.csv already exists.
File /home/khanhnguyen/Public/workspace/python_3.10/project-phishing-email-detection/datasets/csv/word2vec_train.csv will be overwritten.
File /home/khanhnguyen/Public/workspace/python_3.10/project-phishing-email-detection/datasets/csv/word2vec_test.csv already exists.
File /home/khanhnguyen/Public/workspace/python_3.10/project-phishing-email-detection/datasets/csv/word2vec_test.csv will be overwritten.
