Import Libraries

In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split


Load Dataset

In [2]:
df = pd.read_csv("combined_cleaned_data_updated.csv")
df.head()

Unnamed: 0,sender,receiver,date,subject,body,label,urls
0,Young Esposito <Young@iworld.de>,user4@gvc.ceas-challenge.cc,2008-08-05 23:31:02+00:00,Never agree to be a loser,"Buck up, your troubles caused by small dimensi...",1.0,1.0
1,Mok <ipline's1983@icable.ph>,user2.2@gvc.ceas-challenge.cc,2008-08-05 23:31:03+00:00,Befriend Jenna Jameson,\nUpgrade your sex and pleasures with these te...,1.0,1.0
2,Daily Top 10 <Karmandeep-opengevl@universalnet...,user2.9@gvc.ceas-challenge.cc,2008-08-06 08:28:00+00:00,CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1.0,1.0
3,Michael Parker <ivqrnai@pobox.com>,SpamAssassin Dev <xrh@spamassassin.apache.org>,2008-08-05 23:31:20+00:00,Re: svn commit: r619753 - in /spamassassin/tru...,Would anyone object to removing .so from this ...,0.0,1.0
4,Gretchen Suggs <externalsep1@loanofficertool.com>,user2.2@gvc.ceas-challenge.cc,2008-08-05 23:31:21+00:00,SpecialPricesPharmMoreinfo,\nWelcomeFastShippingCustomerSupport\nhttp://7...,1.0,1.0


Text Processing

In [3]:
import pandas as pd
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to process text
def process_text(df, columns_to_combine):
    """
    This function combines specified columns into one text column and processes it by:
    1. Lowercase conversion
    2. Removing punctuation, URLs, and numeric values
    3. Tokenization
    4. Stopword removal
    5. Lemmatization
    
    Parameters:
        df: pandas DataFrame
        columns_to_combine: list of columns to combine into a single text column
    
    Returns:
        df: pandas DataFrame with a new 'processed_text' column
    """
    def clean_text(text):        
        # Remove unwanted patterns (URLs, HTML tags, special characters, digits, etc.)
        text = re.sub(r'\[.*?\]', '', text)
        text = re.sub(r'https?://\S+|www\.\S+', '', text)
        text = re.sub(r'<.*?>+', '', text)
        text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
        text = re.sub(r'\n', '', text)
        text = re.sub(r'\w*\d\w*', '', text)
        
        # Tokenize text
        tokens = word_tokenize(text.lower())  # Convert to lowercase and tokenize
        alphabetic_tokens = [word for word in tokens if word.isalpha()]  # Keep only alphabetic words
        
        # Remove stopwords
        no_stopwords = [word for word in alphabetic_tokens if word not in stop_words]
        
        # Lemmatize tokens
        lemmatized_words = [lemmatizer.lemmatize(word) for word in no_stopwords]
        
        # Join back into a single string
        processed_text = ' '.join(lemmatized_words)
        return processed_text
    
    # Combine specified columns into a single string column
    df['combined_text'] = df[columns_to_combine].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
    
    # Apply the text cleaning function to the combined column
    df['processed_text'] = df['combined_text'].apply(clean_text)
    
    return df

# Columns to combine
columns_to_combine = ['subject', 'body']

# Process text and create 'processed_text' column
df = process_text(df, columns_to_combine)
df.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeffkhong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jeffkhong/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jeffkhong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,sender,receiver,date,subject,body,label,urls,combined_text,processed_text
0,Young Esposito <Young@iworld.de>,user4@gvc.ceas-challenge.cc,2008-08-05 23:31:02+00:00,Never agree to be a loser,"Buck up, your troubles caused by small dimensi...",1.0,1.0,"Never agree to be a loser Buck up, your troubl...",never agree loser buck trouble caused small di...
1,Mok <ipline's1983@icable.ph>,user2.2@gvc.ceas-challenge.cc,2008-08-05 23:31:03+00:00,Befriend Jenna Jameson,\nUpgrade your sex and pleasures with these te...,1.0,1.0,Befriend Jenna Jameson \nUpgrade your sex and ...,befriend jenna jameson upgrade sex pleasure te...
2,Daily Top 10 <Karmandeep-opengevl@universalnet...,user2.9@gvc.ceas-challenge.cc,2008-08-06 08:28:00+00:00,CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1.0,1.0,CNN.com Daily Top 10 >+=+=+=+=+=+=+=+=+=+=+=+=...,cnncom daily top daily top cnncom top video st...
3,Michael Parker <ivqrnai@pobox.com>,SpamAssassin Dev <xrh@spamassassin.apache.org>,2008-08-05 23:31:20+00:00,Re: svn commit: r619753 - in /spamassassin/tru...,Would anyone object to removing .so from this ...,0.0,1.0,Re: svn commit: r619753 - in /spamassassin/tru...,svn commit spamassassintrunk libmailspamassass...
4,Gretchen Suggs <externalsep1@loanofficertool.com>,user2.2@gvc.ceas-challenge.cc,2008-08-05 23:31:21+00:00,SpecialPricesPharmMoreinfo,\nWelcomeFastShippingCustomerSupport\nhttp://7...,1.0,1.0,SpecialPricesPharmMoreinfo \nWelcomeFastShippi...,specialpricespharmmoreinfo welcomefastshipping...


Train-Test Split

In [4]:
train_dataset, test_dataset = train_test_split(df, test_size=0.2, random_state=42)
train_dataset.reset_index(inplace=True, drop=True)
test_dataset.reset_index(inplace=True, drop=True)

In [6]:
import gensim.downloader as api

# Download the "word2vec-google-news-300" model
word2vec_model = api.load('word2vec-google-news-300')


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

def extract_tfidf_features(train, test, column_name, max_features=1000):
    """
    Extract TF-IDF features from a specified column in the train and test datasets
    and append them as a single column.

    Parameters:
        train: pandas DataFrame, the training dataset
        test: pandas DataFrame, the test dataset
        column_name: str, the name of the column containing text
        max_features: int, the maximum number of features for TF-IDF
    
    Returns:
        train: pandas DataFrame with TF-IDF features appended
        test: pandas DataFrame with TF-IDF features appended
    """
    vectorizer = TfidfVectorizer(max_features=max_features)
    
    # Fit on the training data and transform both train and test sets
    train_tfidf = vectorizer.fit_transform(train[column_name])
    test_tfidf = vectorizer.transform(test[column_name])
    
    # Convert TF-IDF matrices to dense format and store them as a single column
    train['tfidf_features'] = list(train_tfidf.toarray())
    test['tfidf_features'] = list(test_tfidf.toarray())
    
    return train, test

# Example usage:
train_dataset, test_dataset = extract_tfidf_features(train_dataset, test_dataset, 'processed_text', max_features=1000)

In [8]:
from tqdm import tqdm
import numpy as np
import pandas as pd

def extract_word2vec_features(train, test, column_name, word2vec_model, vector_size=300):
    """
    Extract Word2Vec features from a specified column in the train and test datasets
    and append them as a single column.

    Parameters:
        train: pandas DataFrame, the training dataset
        test: pandas DataFrame, the test dataset
        column_name: str, the name of the column containing tokenized text
        word2vec_model: Pretrained gensim Word2Vec model
        vector_size: int, size of the word vectors (e.g., 300 for Google News model)

    Returns:
        train: pandas DataFrame with Word2Vec features appended
        test: pandas DataFrame with Word2Vec features appended
    """
    # Function to get Word2Vec features for a list of words
    def get_word2vec_features(tokenized_text):
        word_vectors = [word2vec_model[word] for word in tokenized_text if word in word2vec_model]
        if len(word_vectors) > 0:
            return np.mean(word_vectors, axis=0)
        else:
            return np.zeros(vector_size)

    tqdm.pandas()  # Enable tqdm for pandas
    
    # Apply the function to train and test datasets
    train_word2vec = train[column_name].progress_apply(lambda x: get_word2vec_features(x.split()))
    test_word2vec = test[column_name].progress_apply(lambda x: get_word2vec_features(x.split()))
    
    # Store the feature arrays in a single column
    train['word2vec_features'] = list(np.vstack(train_word2vec))
    test['word2vec_features'] = list(np.vstack(test_word2vec))
    
    return train, test

# Example usage:
train_dataset, test_dataset = extract_word2vec_features(train_dataset, test_dataset, 'processed_text', word2vec_model, vector_size=300)


100%|██████████| 126700/126700 [00:26<00:00, 4856.44it/s]
100%|██████████| 31675/31675 [00:06<00:00, 5083.42it/s]


In [10]:
from sentence_transformers import SentenceTransformer

def extract_transformer_features(train, test, column_name, model):
    """
    Extract transformer-based features from a specified column in the train and test datasets
    and append them as a single column.

    Parameters:
        train: pandas DataFrame, the training dataset
        test: pandas DataFrame, the test dataset
        column_name: str, the name of the column containing text
        model: Pretrained transformer model (e.g., SentenceTransformer)
    
    Returns:
        train: pandas DataFrame with transformer features appended
        test: pandas DataFrame with transformer features appended
    """
    # Function to generate transformer-based features
    def get_transformer_features(text):
        return model.encode(text).tolist()
    
    tqdm.pandas()  # Enable tqdm for pandas
    
    # Apply the function to train and test datasets
    train_transformer = train[column_name].progress_apply(get_transformer_features)
    test_transformer = test[column_name].progress_apply(get_transformer_features)
    
    # Store the feature arrays in a single column
    train['transformer_features'] = list(train_transformer)
    test['transformer_features'] = list(test_transformer)
    
    return train, test

# Example usage:
model = SentenceTransformer('all-MiniLM-L6-v2')  # Load the pre-trained model
train_dataset, test_dataset = extract_transformer_features(train_dataset, test_dataset, 'processed_text', model)


100%|██████████| 126700/126700 [1:26:01<00:00, 24.55it/s]
100%|██████████| 31675/31675 [20:02<00:00, 26.35it/s]


Features from Sender

In [11]:
# Function to extract one-hot encoding features from the domain
def extract_features_from_domain(domain):
    # Define the top 30 domains and top 10 phishing domains
    top30Domains = ['ENRON.com', 'gmail.com', 'samba.org', 'enron.com', 'yahoo.com', 'hotmail.com', 'unknown', 
                    'psy1.psych.arizona.edu', 'aol.com', 'mail.cnn.com', 'broadcast.shareholder.com', 'cbsig.com', 
                    'msn.com', 'yahoo.co.kr', 'python.org', 'issues.apache.org', 'messaging.accuweather.com', 
                    'earthlink.net', 'parrotcode.org', 'ccomad3.uu.commissioner.com', 'media.mit.edu', 'google.com', 
                    'foxnews.com', 'ENRON.net', 'stats.ox.ac.uk', 'v.loewis.de', 'perl.org', 'verizon.net', 
                    'flax9.uwaterloo.ca', 'gmx.net']

    top10Domains_with_1_phishing = ['yahoo.co.kr', 'flax9.uwaterloo.ca', 'korea.com', 'access-one.com', 'lingo.com',
                                    'absolutemotion.com', 'rr.com', 'nokia.com', 'daum.net', 'ebay.com']

    # Initialize a feature vector of zeros for both domain lists
    features = np.zeros(len(top30Domains) + len(top10Domains_with_1_phishing))

    # One-hot encode for the top 30 domains
    if domain in top30Domains:
        features[top30Domains.index(domain)] = 1

    # One-hot encode for the top 10 phishing domains with a phishing rate of 1.0
    if domain in top10Domains_with_1_phishing:
        features[len(top30Domains) + top10Domains_with_1_phishing.index(domain)] = 1

    return features

def extract_features_from_TLD(tld):
    top10tld_with_less_than25_more_than75 = ['org', 'edu', 'unknown', 'kr', 'nz', 'jp', 'ru', 'gov', 'pl', 'EDU']
    features = np.zeros(len(top10tld_with_less_than25_more_than75 ))
    # One-hot encode for the top 30 domains
    if tld in top10tld_with_less_than25_more_than75:
        features[top10tld_with_less_than25_more_than75.index(tld)] = 1
    return features


def extract_features_from_displayName(displayName):
    # Lists of safe and risky display names
    display_name_safe = ['unknown', 'Schedule Crawler', '"j.c.f."', '"Jonathan C. Forster"', '"AccuWeather.com Alert"', 
                         '"Nelson, Michelle"', '"Commissioner.COM"', 'Chas Owens', 'Prof Brian Ripley', 'Guido van Rossum']

    display_name_risky = ['Daily Top 10', 'CNN Alerts', '"Vanessa J. Smith"', '"Richard K. Lee"', 'SCC', '"Paul A. Davis"', 
                          'Sydney Car Centre', 'Aegis Capital Group LLC', 'Aegis Capital Group', '']
    
    features_safe = np.zeros(len(display_name_safe))
    features_risky = np.zeros(len(display_name_risky))
    
    # One-hot encode for display_name_safe
    if displayName in display_name_safe:
        features_safe[display_name_safe.index(displayName)] = 1
    
    # One-hot encode for display_name_risky
    if displayName in display_name_risky:
        features_risky[display_name_risky.index(displayName)] = 1

    return np.concatenate([features_safe, features_risky])

def extract_features_from_localPart(localPart):
    # Lists of safe and risky local parts
    local_part_safe = ['pete.davis', 'jforster', 'alert', 'tridge', 'jerry', 'qydlqcws-iacfym', 'noreply', 'cnnalerts', 'metze', 'inbox']

    local_part_risky = ['>', 'Montague', 'gnitpick', 'return', 'the00', 'sales', 'adwords-noreply', 'MAILER-DAEMON', 'alerts', 'response']
    
    # Initialize feature vector for safe and risky local parts
    features_safe = np.zeros(len(local_part_safe))
    features_risky = np.zeros(len(local_part_risky))
    
    # One-hot encode for local_part_safe
    if localPart in local_part_safe:
        features_safe[local_part_safe.index(localPart)] = 1
    
    # One-hot encode for local_part_risky
    if localPart in local_part_risky:
        features_risky[local_part_risky.index(localPart)] = 1

    # Combine both feature vectors (concatenated safe and risky encodings)
    return np.concatenate([features_safe, features_risky])

def extract_features_from_sender(sender):
    # Split the sender into display name and email address
    try:
        displayName, email = sender.split('<')
        displayName = displayName.strip()  # Remove whitespace
        email = email.strip('> ')  # Remove the closing bracket and spaces
    except ValueError:
        # In case the sender does not contain a display name
        displayName = 'unknown'
        email = sender.strip()

    if '@' in email:
        localPart, domain = email.split('@')
    else:
        localPart, domain = '', ''  # Default to empty if email part is missing

    # Extract features
    displayName_features = extract_features_from_displayName(displayName)
    localPart_features = extract_features_from_localPart(localPart)
    domain_features = extract_features_from_domain(domain)
    tld_features = extract_features_from_TLD(domain.split('.')[-1])

    # Combine all features into a single feature vector
    combined_features = np.concatenate([displayName_features, localPart_features, domain_features, tld_features])

    return combined_features

In [12]:
train_dataset['sender_features'] = train_dataset['sender'].apply(extract_features_from_sender)
test_dataset['sender_features'] = test_dataset['sender'].apply(extract_features_from_sender)

# Example usage of extract_features_from_sender for a given sender
example_sender= 'Daily Top 10 <arabia1990@mpdadvies.nl>'
feature = extract_features_from_sender(example_sender)
print(feature)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


Features Subject

In [13]:
# Function to extract features from the subject
def extract_features_from_subject(subject):
    # Define the blacklist and whitelist words
    blacklist_words = [
        'price', 'immediate', 'sex', 'replica', 'watches', 'custom', 
        'money', 'account', 'urgent', 'online', 'secure', 'penis', 
        'cnn.com'
    ]
    whitelist_words = [
        'AccuWeather', 'UAI', 'perl', 'svn', 'commit', 'samba'
    ]

    # Initialize a feature vector of zeros for both word lists + 2 additional features
    features = np.zeros(len(blacklist_words) + len(whitelist_words) + 2)

    # Check for presence of blacklist words
    for i, word in enumerate(blacklist_words):
        if word in subject.lower():  # Check in lowercase for case insensitivity
            features[i] = 1

    # Check for presence of whitelist words
    for i, word in enumerate(whitelist_words):
        if word in subject.lower():  # Check in lowercase for case insensitivity
            features[len(blacklist_words) + i] = 1

    # Check if the subject is a reply
    features[len(blacklist_words) + len(whitelist_words)] = int(subject.lower().startswith("re:"))

    # Check for joined words using a regular expression
    features[len(blacklist_words) + len(whitelist_words) + 1] = int(bool(re.search(r'[a-zA-Z]+[A-Z][a-zA-Z]|[A-Z]{2,}', subject)))

    return features

In [14]:
train_dataset['subject_features'] = train_dataset['subject'].apply(extract_features_from_subject)
test_dataset['subject_features'] = test_dataset['subject'].apply(extract_features_from_subject)

# Example usage of extract_features_from_subject for a given subject
example_subject = 'Re: [samba] svn commit: r13784 - in branches/SAMBA_4_0/source: . lib/tls'
subject_feature = extract_features_from_subject(example_subject)
print(subject_feature)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1.]


Features for Date

In [15]:
def create_month_feature(date):
    # Extract the month directly from the string (format: 'YYYY-MM-DD')
    month = int(date[5:7])  # Get the month as an integer
    
    # One-hot encode the month (for months 1-12)
    month_dummies = [0] * 12  # Create a list of 12 zeros
    if month > 1:  # Drop the first month (January) for one-hot encoding
        month_dummies[month - 1] = 1  # Set the respective month index to 1
        
    return month_dummies  # Returns a list with one-hot encoded month

def extract_features_from_date(date_with_time):
    # Split the date and time, and extract only the date part
    return create_month_feature(date_with_time.split()[0])

In [16]:
train_dataset['date_features'] = train_dataset['date'].apply(extract_features_from_date)
test_dataset['date_features'] = test_dataset['date'].apply(extract_features_from_date)

# Example usage of extract_features_from_date for a given date
date_input = '2008-08-05 23:31:03+00:00'
month_feature = create_month_feature(date_input)
print(month_feature)

[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]


Features from Body

In [17]:
# Function to classify URLs as legitimate, suspicious, or phishing
def classify_url(body):
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    urls = url_pattern.findall(body)
    avg_length = 57.0  # Average URL length from the dataset
    if not urls:
        return [1.0, 0.0, 0.0]  # No URLs means it's legitimate

    for url in urls:
        url_length = len(url)
        if url_length >= avg_length * 1.5:
            return [0.0, 0.0, 1.0]  # Phishing URL
        elif avg_length <= url_length < avg_length * 1.5:
            return [0.0, 1.0, 0.0]  # Suspicious URL

    return [1.0, 0.0, 0.0]  # Legitimate URL


In [21]:
def classify_redirects_binary(body):
    """
    Count the number of redirects in the given HTML body and classify as binary.

    Args:
        body (str): The HTML body to check for redirects.

    Returns:
        int: 1 if phishing (at least one redirect), 0 if legitimate (no redirects).
    """
    redirect_pattern = r'(?i)\b(redirect|redirecting|window\.location|setTimeout|location\.href|document\.location|location\.replace|location\.assign|window\.location\.href|meta\s+http-equiv=["\']refresh["\']|<iframe|<form|<script)\b'
    number_of_redirects = len(re.findall(redirect_pattern, body))
    return 1 if number_of_redirects > 0 else 0  # 1 for phishing, 0 for legitimate

def count_newlines(body):
    """
    Count the number of newlines in the body.

    Args:
        body (str): The HTML body to analyze.

    Returns:
        int: The count of newlines.
    """
    return body.count('\n')

def transform_to_embedding(body):
    """
    Generate a sentence embedding from the body using a pre-trained transformer model.

    Args:
        body (str): The text to transform into an embedding.

    Returns:
        list: A list of floats representing the sentence embedding.
    """
    return model.encode(body).tolist()

# Function to extract phishing features from the email body
def extract_phishing_features_from_text(body):
    """
    Extracts phishing-related features from a single body of text.
    
    Args:
        body (str): The body of the email or text to analyze.
    
    Returns:
        dict: A dictionary containing the extracted features.
    """
    features = {}

    # Define a pattern for IP addresses
    ip_pattern = r'\b\d{1,3}(?:\.\d{1,3}){3}\b'
    features['ip_url_count'] = len(re.findall(ip_pattern, body))

    # Define suspicious TLDs
    suspicious_tlds = ['.xyz', '.biz', '.info']
    features['suspicious_tld_count'] = sum(tld in body.lower() for tld in suspicious_tlds)

    # Text complexity features
    words = body.split()
    features['word_count'] = len(words)
    features['average_word_length'] = sum(len(word) for word in words) / len(words) if len(words) > 0 else 0
    features['uppercase_word_count'] = sum(1 for word in words if word.isupper())

    # Special characters
    special_chars = "!$#@"
    features['special_char_ratio'] = sum(1 for char in body if char in special_chars) / len(body) if len(body) > 0 else 0

    # Exclamation and question marks
    features['exclamation_count'] = body.count('!')
    features['question_mark_count'] = body.count('?')

    # HTML tags
    features['html_tags'] = 1 if '<' in body and '>' in body else 0

    # Predetermined phishing keywords
    phishing_keywords = ['urgent', 'free', 'winner', 'offer', 'limited', 'click', 'claim', 'prize']
    features['phishing_keyword_count'] = sum(word in body.lower() for word in phishing_keywords)

    return list(features.values())

# Extract features from the email body 
def extract_features_from_body(body):
    """
    Extract various features from the HTML body for phishing detection.

    Args:
        body (str): The HTML body to analyze.

    Returns:
        list: A list of extracted features.
    """
    features = []
    
    # Count of redirects
    features.append(classify_redirects_binary(body))

    # Count of newlines
    features.append(count_newlines(body))

    # URL classification
    url_classification = classify_url(body) 
    features.extend(url_classification)  # Append the URL classification to features

    return features



In [19]:
train_dataset['body_features'] = train_dataset['body'].apply(extract_features_from_body)
test_dataset['body_features'] = test_dataset['body'].apply(extract_features_from_body)

# Example usage of extract_features_from_body for a given body
example_body = 'I hate this game, come buy bitcoin.'
body_feature = extract_features_from_body(example_body)
print(body_feature) 

[0, 0, 1.0, 0.0, 0.0]


Body features 2 to contains features from extract_phishing_features_from_text

In [22]:
train_dataset['body_features_2'] = train_dataset['body'].apply(extract_phishing_features_from_text)
test_dataset['body_features_2'] = test_dataset['body'].apply(extract_phishing_features_from_text)

# Example usage of extract_phishing_features_from_text for a given body
example_body = 'I hate this game, come buy bitcoin.'
body_feature_2 = extract_phishing_features_from_text(example_body)
print(body_feature_2)

[0, 0, 7, 4.142857142857143, 1, 0.0, 0, 0, 0, 0]


In [None]:
# Drop the original columns 
cols_to_drop = ['sender', 'receiver', 'date', 'subject', 'body', 'urls']


Save train and test datasets to be used for our models

In [23]:
train_dataset.shape

(126700, 17)

In [24]:
test_dataset.shape

(31675, 17)

In [25]:
train_dataset.columns

Index(['sender', 'receiver', 'date', 'subject', 'body', 'label', 'urls',
       'combined_text', 'processed_text', 'tfidf_features',
       'word2vec_features', 'transformer_features', 'sender_features',
       'subject_features', 'date_features', 'body_features',
       'body_features_2'],
      dtype='object')

In [26]:
train_dataset.head()

Unnamed: 0,sender,receiver,date,subject,body,label,urls,combined_text,processed_text,tfidf_features,word2vec_features,transformer_features,sender_features,subject_features,date_features,body_features,body_features_2
0,eleanor foster <aceriwen@velnet.co.uk>,Nicolette Hudson <holden.salisbury@enron.com>,2001-08-22 07:27:04+00:00,Male muscle boosting system,"""I've been using your product for 4 months now...",1.0,1.0,"Male muscle boosting system ""I've been using y...",male muscle boosting system ive using product ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.00971660204231739, 0.06431159377098083, 0.0...","[-0.06228220835328102, -0.00879708118736744, 0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]","[0, 0, 1.0, 0.0, 0.0]","[0, 0, 187, 4.540106951871658, 3, 0.0, 0, 0, 0..."
1,Lana Moore <Lana.Moore@nesanet.org>,'NESA Members' <eva.pollard@nesanet.org>,2001-10-23 15:04:52+00:00,NESA NYC Mixer - November 8,Please make plans to attend NESA's Membership ...,0.0,0.0,NESA NYC Mixer - November 8 Please make plans ...,nesa nyc mixer november please make plan atten...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.04451704025268555, 0.0018677711486816406, ...","[-0.019170133396983147, -0.06289736181497574, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]","[0, 0, 1.0, 0.0, 0.0]","[0, 0, 64, 4.53125, 1, 0.0028169014084507044, ..."
2,Common Dreams <listreply@commondreams.org>,ktwarwic@flax9.uwaterloo.ca,2007-05-15 05:49:53+00:00,News & Views | 05.14.07,\n \n\nCommon Dreams - Breaking News & Views f...,0.0,1.0,News & Views | 05.14.07 \n \n\nCommon Dreams -...,news view common dream breaking news view prog...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.005526355933398008, 0.04494216665625572, -...","[0.07190383970737457, -0.04377426207065582, 0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]","[0, 84, 1.0, 0.0, 0.0]","[0, 0, 220, 10.027272727272727, 4, 0.000811688..."
3,"""Adam H. Kerman"" <ahk@chinet.chinet.com>",Pine Discussion Forum <pine-info@u.washington....,1998-01-08 07:10:16+00:00,Re: folder index - sender or recipient,">From: Jakob Kellner >Date: Thu, 8 Jan 1998 0...",0.0,0.0,Re: folder index - sender or recipient >From: ...,folder index sender recipient jakob kellner da...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0719418004155159, 0.00951385498046875, 0.02...","[0.032333265990018845, -0.03589319437742233, 0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 1.0, 0.0, 0.0]","[0, 0, 86, 4.686046511627907, 1, 0.0, 0, 1, 0, 0]"
4,Carl Helms <mxenitfrnpfkue@onlinetx.net>,opt4@flax9.uwaterloo.ca,2007-07-01 09:07:59+00:00,This is for you,\nThis is amazing stuff...\n\nAdd some inches ...,1.0,0.0,This is for you \nThis is amazing stuff...\n\n...,amazing stuffadd inch fast safe effective seen...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.0525309257209301, -0.00717841275036335, 0....","[-0.08619339019060135, 0.05992841348052025, -0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[0, 11, 1.0, 0.0, 0.0]","[0, 0, 30, 5.433333333333334, 1, 0.0, 0, 0, 0, 0]"


In [27]:
test_dataset.head()

Unnamed: 0,sender,receiver,date,subject,body,label,urls,combined_text,processed_text,tfidf_features,word2vec_features,transformer_features,sender_features,subject_features,date_features,body_features,body_features_2
0,Josiah Rivera <Gary.Washington@lambertpartners...,ktwarwic@flax9.uwaterloo.ca,2007-06-27 23:16:20+00:00,regalto,Men's Products' Discounts!!! 80% off SALE !!! ...,1.0,1.0,regalto Men's Products' Discounts!!! 80% off S...,regalto men product discount sale vaqra pill o...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.01425081118941307, 0.03438971936702728, 0.0...","[-0.09238181263208389, 0.037367917597293854, -...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]","[0, 25, 1.0, 0.0, 0.0]","[0, 0, 181, 4.983425414364641, 8, 0.0109190172..."
1,Daily Top 10 <bhargav-buchhand@internetwebtool...,user7-ext4@gvc.ceas-challenge.cc,2008-07-31 12:34:32+00:00,CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1.0,1.0,CNN.com Daily Top 10 >+=+=+=+=+=+=+=+=+=+=+=+=...,cnncom daily top daily top cnncom top video st...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0004533067112788558, 0.03454103693366051, -...","[-0.049507372081279755, -0.06389272958040237, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[0, 0, 0.0, 0.0, 1.0]","[0, 0, 293, 11.542662116040956, 100, 0.0002550..."
2,Alexis Kimball <skeweringbaton@saveourplanet.org>,mailn@flax9.uwaterloo.ca,2006-05-20 18:06:25+00:00,Don't waste your time. Cheap pills are here.,Dear customer.Wanna know how to save much on y...,1.0,0.0,Don't waste your time. Cheap pills are here. D...,dont waste time cheap pill dear customerwanna ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0284550990909338, 0.02321099303662777, -0.0...","[0.021192213520407677, 0.030631644651293755, 0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]","[0, 3, 1.0, 0.0, 0.0]","[0, 0, 95, 5.252631578947368, 0, 0.0, 0, 1, 0, 0]"
3,Carmela Rios <hfjdesmond@goldgeneration.com>,ktwarwic <ktwarwic@speedy.uwaterloo.ca>,1999-10-01 00:29:22+00:00,her barbour many champion,\n\n\nThis one will explode!\n\n\n\nTarget sym...,1.0,0.0,her barbour many champion \n\n\nThis one will ...,barbour many champion one explodetarget sym bv...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.036815643310546875, 0.01973724365234375, 0....","[-0.03742770478129387, -0.06625784933567047, -...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]","[0, 15, 1.0, 0.0, 0.0]","[0, 0, 33, 5.363636363636363, 4, 0.05882352941..."
4,Daily Top 10 <sluc1977@34direct.net>,email265@gvc.ceas-challenge.cc,2008-08-06 20:56:29+00:00,CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1.0,1.0,CNN.com Daily Top 10 >+=+=+=+=+=+=+=+=+=+=+=+=...,cnncom daily top daily top cnncom top video st...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0004533067112788558, 0.03454103693366051, -...","[-0.049507372081279755, -0.06389272958040237, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]","[0, 0, 0.0, 0.0, 1.0]","[0, 0, 293, 11.46075085324232, 100, 0.00025660..."


In [28]:
train_dataset.to_parquet('train.parquet', index=False)
test_dataset.to_parquet('test.parquet', index=False)