## Preprocessing

In [73]:
import gensim
from gensim.models import Word2Vec
from gensim.parsing.preprocessing import *
# NLP with NLTK
import nltk
nltk.download("stopwords")
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
import os
import raw_utils

import pandas as pd
import preprocessing as util

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/khanhnguyen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/khanhnguyen/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [74]:
# Data locations: every CSV is read from / written to <working directory>/datasets/csv/.
cwd = os.getcwd()
csv_path = os.path.join(cwd, 'datasets/csv/')

# File names of the two raw input CSVs inside csv_path.
phishing_csv, legitimate_csv = 'raw_phishing.csv', 'raw_legitimate.csv'

In [75]:
# Both raw CSVs are read the same way: the first column becomes the index and
# 'body' is read with dtype object.
_raw_csv_options = {'index_col': 0, 'dtype': {'body': 'object'}}
phishing_text_raw = pd.read_csv(os.path.join(csv_path, phishing_csv), **_raw_csv_options)
legitimate_text_raw = pd.read_csv(os.path.join(csv_path, legitimate_csv), **_raw_csv_options)

In [76]:
# Preview the first rows of the raw phishing frame (note that row 0 has an empty body).
phishing_text_raw.head()

Unnamed: 0,body
0,
1,"<HTML>\n<BODY>\n<font face=""arial"" size=""5"" co..."
2,ID: jose@monkey.org\nYour jose@monkey.org pass...
3,=3Ctable border=3D0 width=3D=22100=25=22 cellp...
4,"Your account has 1 New Alert, received on 20/0..."


In [77]:
# Display the raw legitimate bodies; pandas truncates the long Series in the output.
legitimate_text_raw['body']

0       Matthias Saou (matthias@rpmforge.net) wrote*:\...
1       |:::::::::::::::::::::::::::::::::::::::::::::...
2       Hello Bill,\n\nMonday, September 30, 2002, 5:4...
3       >>>Chris Garrigues said:\n > > From:  Brent We...
4       from slate's "today's papers":  \nThe New York...
                              ...                        
6573     <http://www.sportsline.com/links/6/1/163/sffe...
6574    \nAs much as I'd like to, I can't get to every...
6575    Darrell,\n\nBen at Conoco would like to know t...
6576    Just to keep you in the loop.   I am addressin...
6577    I was looking for tommorow.   I'll pass on ton...
Name: body, Length: 6578, dtype: object

## Cleanup

In [78]:
# Discard rows with a missing body; this also keeps the .str.contains() mask
# below free of NaN so it can be inverted with `~`.
phishing_text_raw = phishing_text_raw.dropna()

# Flag the mail-folder placeholder entry, which is not a real message.
# regex=False matches the marker literally; by default str.contains() treats the
# pattern as a regular expression, in which every "." matches any character.
phishing_is_folder_stub = phishing_text_raw['body'].str.contains(
    "This text is part of the internal format of your mail folder, and is not\na real message.",
    regex=False,
)
phishing_text = phishing_text_raw[~phishing_is_folder_stub]

# Keep only the first occurrence of each duplicated row. The explicit copy()
# makes phishing_text an independent frame, so the column assignments in later
# cells do not write into a slice of phishing_text_raw (SettingWithCopyWarning).
phishing_text = phishing_text[~phishing_text.duplicated(keep='first')].copy()
phishing_text.shape

(4922, 1)

In [79]:
# Discard rows with a missing body; this also keeps the .str.contains() mask
# below free of NaN so it can be inverted with `~`.
legitimate_text_raw = legitimate_text_raw.dropna()

# Flag the mail-folder placeholder entry, which is not a real message.
# regex=False matches the marker literally; by default str.contains() treats the
# pattern as a regular expression, in which every "." matches any character.
legitimate_is_folder_stub = legitimate_text_raw['body'].str.contains(
    "This text is part of the internal format of your mail folder, and is not\na real message.",
    regex=False,
)
legitimate_text = legitimate_text_raw[~legitimate_is_folder_stub]

# Keep only the first occurrence of each duplicated row. The explicit copy()
# makes legitimate_text an independent frame, so the column assignments in later
# cells do not write into a slice of legitimate_text_raw (SettingWithCopyWarning).
legitimate_text = legitimate_text[~legitimate_text.duplicated(keep='first')].copy()
legitimate_text.shape

(6373, 1)

In [80]:
# Substitute e-mail addresses first, then URLs, in every phishing body
# (the sample output below shows <emailaddress> / <urladdress> placeholders).
phishing_text['body'] = (
    phishing_text['body']
    .apply(util.replace_email)
    .apply(util.replace_url)
)

In [81]:
# Spot-check the phishing body at positional row 1 after the placeholder substitution.
phishing_text['body'].iloc[1]

'ID: <emailaddress>\nYour <emailaddress> password expires today  10/23/2023 2:19:09 a.m.\nUse the button below to continue with same password\nContinue <<urladdress><emailaddress>>\nNote: Your mails may not be delivered until you verify your account.\nSincerely,\n<urladdress> Support Team.'

In [82]:
# Same substitution as for the phishing bodies: e-mail addresses first, then URLs.
legitimate_text['body'] = (
    legitimate_text['body']
    .apply(util.replace_email)
    .apply(util.replace_url)
)

In [None]:
# Work on copies so phishing_text / legitimate_text stay available unchanged
# for the token pipeline further down.
phishing_preprocessed_text = phishing_text.copy()
legitimate_preprocessed_text = legitimate_text.copy()

# Run the whitespace sanitizer over both frames, then the address sanitizer.
# NOTE(review): what each sanitizer does is defined in the local `preprocessing` module.
for sanitizer in (util.sanitize_whitespace, util.sanitize_addresses):
    phishing_preprocessed_text['body'] = phishing_preprocessed_text['body'].apply(sanitizer)
    legitimate_preprocessed_text['body'] = legitimate_preprocessed_text['body'].apply(sanitizer)

# Class labels: 1 = phishing, 0 = legitimate.
phishing_preprocessed_text['label'] = 1
legitimate_preprocessed_text['label'] = 0

# Stack both classes and shuffle with a fixed seed so the order is reproducible.
mix_dataset_text = (
    pd.concat([phishing_preprocessed_text, legitimate_preprocessed_text], ignore_index=True)
    .sample(frac=1, random_state=1769)
    .reset_index(drop=True)
)
# Keep only rows whose body is truthy, i.e. drop bodies that ended up empty.
mix_dataset_text = mix_dataset_text[mix_dataset_text['body'].astype(bool)]
mix_dataset_text.head()

Unnamed: 0,body,label
0,Access to the Internet and Intranet is current...,0
1,"Sara and Marie, \n\nMike Kustra was in to meet...",0
2,"-----Original Message-----\nFrom: \tSanders, R...",0
3,Mailbox usage\n \n \n\t \n\n Has used 1...,1
4,InfoByTel: Your Personal Sales Assistant!\n\n ...,1


In [84]:
# Persist the shuffled raw-text dataset as dataset_text.csv under csv_path
# (per the captured output, an existing file is overwritten).
raw_utils.save_to_csv(mix_dataset_text, csv_path, 'dataset_text.csv')

File /home/khanhnguyen/Public/workspace/python_3.10/project-phishing-email-detection/datasets/csv/dataset_text.csv already exists.
File /home/khanhnguyen/Public/workspace/python_3.10/project-phishing-email-detection/datasets/csv/dataset_text.csv will be overwritten.


In [85]:
# Custom stop words and preprocessing filters

# NLTK's English stop words, extended with HTML/CSS, font and URL fragments
# plus the words "enron" and "spamassassin".
stopWords = nltk.corpus.stopwords.words("english") + [
    "nbsp", "font", "sans", "serif", "bold", "arial", "verdana", "helvetica",
    "http", "https", "www", "html", "enron", "margin", "spamassassin",
]

def remove_custom_stopwords(p):
    """Remove every word listed in the module-level ``stopWords`` from ``p``.

    Args:
        p: Text to filter.

    Returns:
        Whatever gensim's ``remove_stopwords`` returns for ``p`` with the
        custom stop-word collection.
    """
    # gensim checks each token for membership in `stopwords`; passing a
    # frozenset instead of the list turns that check from a linear scan into a
    # hash lookup without changing which tokens are removed.
    return remove_stopwords(p, stopwords=frozenset(stopWords))

def get_wordnet_pos(treebank_tag):
    """Translate a POS tag (as produced by ``nltk.pos_tag``) into a WordNet POS constant.

    Args:
        treebank_tag: Tag string to translate.

    Returns:
        ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or ``wordnet.ADV`` for
        tags starting with ``J``, ``V``, ``N`` or ``R`` respectively; ``None`` for
        every other tag.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    return None
def lemmatize(text):
    """Lemmatize every token of ``text`` and return the lemmas joined by spaces.

    The text is tokenized and POS-tagged with NLTK. A token whose tag maps to a
    WordNet POS (see ``get_wordnet_pos``) is lemmatized with that POS; any other
    token is lemmatized with the lemmatizer's default.

    Args:
        text: String to lemmatize.

    Returns:
        A single string of lemmas separated by one space each.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    tagged = nltk.pos_tag(nltk.word_tokenize(text))

    lemmas = []
    for token, treebank_tag in tagged:
        pos = get_wordnet_pos(treebank_tag)
        if pos:
            lemmas.append(wordnet_lemmatizer.lemmatize(token, pos=pos))
        else:
            lemmas.append(wordnet_lemmatizer.lemmatize(token))
    return ' '.join(lemmas)
# Filters handed to gensim's preprocess_string() in the preprocessing cells below.
# (A stray bare `stem_text` expression that stood here had no effect and was removed.)
# NOTE(review): the bodies fed through these filters contain the <emailaddress> /
# <urladdress> placeholders; strip_tags appears to remove them (they are absent
# from the token previews further down) - confirm that this is intended.
CUSTOM_FILTERS = [
    lambda x: x.lower(),
    strip_tags,
    strip_punctuation,
    strip_multiple_whitespaces,
    strip_numeric,
    remove_custom_stopwords,  # NLTK English list plus the custom additions
    remove_stopwords,         # gensim's own default stop-word list
    strip_short,
    lemmatize,
]

In [86]:
# Decode to utf-8 as needed

def _decode_if_possible(body):
    """Return ``body`` decoded as UTF-8 (undecodable bytes ignored).

    If decoding raises ``AttributeError`` (e.g. ``body`` is already a ``str``),
    ``body`` is returned unchanged.
    """
    try:
        return body.decode("utf-8", errors="ignore")
    except AttributeError:
        return body


phishingDecoded = [_decode_if_possible(body) for body in phishing_text['body'].tolist()]
legitimateDecoded = [_decode_if_possible(body) for body in legitimate_text['body'].tolist()]

In [87]:
# Phishing emails: run every decoded body through the custom filter chain,
# producing one token list per e-mail.
phishingPreprocessed = [
    preprocess_string(body, filters=CUSTOM_FILTERS)
    for body in phishingDecoded
]
print(len(phishingPreprocessed))

4922


In [88]:
# Legitimate emails: run every decoded body through the custom filter chain,
# producing one token list per e-mail.
legitimatePreprocessed = [
    preprocess_string(body, filters=CUSTOM_FILTERS)
    for body in legitimateDecoded
]
print(len(legitimatePreprocessed))

6373


In [89]:
# Drop empty token lists and turn each remaining list into its string
# representation, so the 2-D list becomes a flat list of strings.
phishingPreprocessed = [str(tokens) for tokens in phishingPreprocessed if tokens]

# Build the labelled frame (1 = phishing) with 'label' placed before 'body'.
phishingDataFrame = (
    pd.DataFrame({'body': phishingPreprocessed})
    .assign(label=1)
    [['label', 'body']]
)

In [90]:
# Preview the labelled phishing token frame.
phishingDataFrame.head()

Unnamed: 0,label,body
0,1,"['save', 'life', 'insurance', 'check', 'price'..."
1,1,"['password', 'expire', 'today', 'use', 'button..."
2,1,"['ctable', 'border', 'width', 'cellpadding', '..."
3,1,"['account', 'new', 'alert', 'receive', 'click'..."
4,1,"['fwd', 'remit', 'waste', 'management', 'invoi..."


In [91]:
# Drop empty token lists and turn each remaining list into its string representation.
legitimatePreprocessed = [str(tokens) for tokens in legitimatePreprocessed if tokens]

# Build the labelled frame (0 = legitimate); reorder the columns so that
# 'label' comes before 'body'.
legitimateDataFrame = (
    pd.DataFrame({'body': legitimatePreprocessed})
    .assign(label=0)
    [['label', 'body']]
)

In [92]:
# Preview the labelled legitimate token frame.
legitimateDataFrame.head()

Unnamed: 0,label,body
0,0,"['matthias', 'saou', 'write', 'install', 'xmms..."
1,0,"['monday', 'september', 'student', 'life', 'be..."
2,0,"['hello', 'monday', 'september', 'write', 'thi..."
3,0,"['chris', 'garrigues', 'say', 'brent', 'welch'..."
4,0,"['slate', 'today', 'paper', 'new', 'york', 'ti..."


In [None]:
# Stack both labelled token frames and shuffle them with a fixed seed so the
# row order is reproducible; the index is renumbered after the shuffle.
mix_dataset = (
    pd.concat([phishingDataFrame, legitimateDataFrame], ignore_index=True)
    .sample(frac=1, random_state=1769)
    .reset_index(drop=True)
)
mix_dataset.head()

Unnamed: 0,label,body
0,0,"['start', 'date', 'hourahead', 'hour', 'ancill..."
1,0,"['tue', 'oct', 'ben', 'liblit', 'write', 'ick'..."
2,1,"['want', 'million', 'buck', 'year', 'probably'..."
3,0,"['mark', 'initial', 'test', 'delay', 'able', '..."
4,1,"['multi', 'message', 'mime', 'format', 'conten..."


In [94]:
# Persist the shuffled, labelled token dataset as mix_dataset.csv under csv_path
# (per the captured output, an existing file is overwritten).
raw_utils.save_to_csv(mix_dataset, csv_path, 'mix_dataset.csv')

File /home/khanhnguyen/Public/workspace/python_3.10/project-phishing-email-detection/datasets/csv/mix_dataset.csv already exists.
File /home/khanhnguyen/Public/workspace/python_3.10/project-phishing-email-detection/datasets/csv/mix_dataset.csv will be overwritten.


In [95]:
# Split the shuffled token dataset into a training and a test part.
# NOTE(review): percent=20 is presumably the share held out for testing -
# confirm against preprocessing.dataset_split.
train_dataset, test_dataset = util.dataset_split(mix_dataset, percent=20)

In [96]:
# Write both splits next to the other CSVs under csv_path
# (per the captured output, existing files are overwritten).
raw_utils.save_to_csv(train_dataset, csv_path, 'train_dataset.csv')
raw_utils.save_to_csv(test_dataset, csv_path, 'test_dataset.csv')

File /home/khanhnguyen/Public/workspace/python_3.10/project-phishing-email-detection/datasets/csv/train_dataset.csv already exists.
File /home/khanhnguyen/Public/workspace/python_3.10/project-phishing-email-detection/datasets/csv/train_dataset.csv will be overwritten.
File /home/khanhnguyen/Public/workspace/python_3.10/project-phishing-email-detection/datasets/csv/test_dataset.csv already exists.
File /home/khanhnguyen/Public/workspace/python_3.10/project-phishing-email-detection/datasets/csv/test_dataset.csv will be overwritten.
