## Load Data

In [1]:
import os, time, json
import kagglehub
import optuna
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.compose import ColumnTransformer
from optuna.integration import XGBoostPruningCallback
from xgboost import XGBClassifier



path = kagglehub.dataset_download("naserabdullahalam/phishing-email-dataset")
data = []

datasets = [
    "Nigerian_Fraud.csv",
    "Ling.csv",
    "Nazario.csv",
    "SpamAssasin.csv",
    "CEAS_08.csv",
    # "Enron.csv"
 ]
for file in datasets:
    print(file)
    csv_path = os.path.join(path, file)
    subset_data = pd.read_csv(csv_path)
    data.append(subset_data)

all_data = pd.concat(data)


  from .autonotebook import tqdm as notebook_tqdm


Nigerian_Fraud.csv
Ling.csv
Nazario.csv
SpamAssasin.csv
CEAS_08.csv


## Clean & Split Data

In [2]:
all_data['receiver'] = all_data['receiver'].str.replace('undisclosed-recipients:;', 'Unknown')
# -- Timestamp features -- 
all_data['date_parsed'] = pd.to_datetime(all_data['date'], errors='coerce', utc=True)

# Week-of-year
iso_week = all_data['date_parsed'].dt.isocalendar().week
iso_week = iso_week.astype(float)
week0 = ((iso_week - 1) % 52)
theta_week = 2.0 * np.pi * week0 / 52
all_data['sin_week'] = np.where(week0.notna(), np.sin(theta_week), 0.0)
all_data['cos_week'] = np.where(week0.notna(), np.cos(theta_week), 0.0)

# Hour-of-day 
hour = all_data['date_parsed'].dt.hour.astype(float)  # NaN for missing
theta_hour = 2.0 * np.pi * hour / 24
all_data['sin_hour'] = np.where(hour.notna(), np.sin(theta_hour), 0.0)
all_data['cos_hour'] = np.where(hour.notna(), np.cos(theta_hour), 0.0)

# Weekend binary (0/1)
weekday = all_data['date_parsed'].dt.weekday
all_data['is_weekend'] = np.where(weekday.isna(), 0, ((weekday >= 5).astype(int)))

# Timestamp feature list
timestamp_features = [
    "sin_week",
    "cos_week",
    "sin_hour",
    "cos_hour",
    "is_weekend"
]

# -- Sender/reciever feature engineering -- 
with open('domains.json', 'r') as file:
    public_email_domains = json.load(file)
    
email_regex = r'([a-zA-Z0-9._%+\-|{}^&"\'=]+@(?:[a-zA-Z0-9.-]+|\[[0-9.]+\]))'    
for column_name in ('sender', 'receiver'):
    all_data[f'{column_name}_email'] = all_data[column_name].str.extract(email_regex, expand=False)
    all_data[f'{column_name}_domain'] = all_data[f'{column_name}_email'].str.split('@', n=1).str[1]
    all_data[f'{column_name}_domain_len'] = all_data[f'{column_name}_domain'].str.len()
    all_data[f'{column_name}_domain_public'] = all_data[f'{column_name}_domain'].str.lower().isin(public_email_domains).astype(int)
    all_data[f'{column_name}_n_subdomains'] = all_data[f'{column_name}_domain'].str.lower().str.count(r'\.')
    all_data[f'{column_name}_email_n_digits'] = all_data[f'{column_name}_domain'].str.lower().str.count(r'\d')
    
    all_data[f'{column_name}_name'] = all_data[column_name].str.replace(email_regex, '', regex=True)
    all_data[f'{column_name}_name'] = all_data[f'{column_name}_name'].str.replace(r'[<>"\'\(\)]', '', regex=True).str.strip()
    
all_data['is_internal_email'] = (
    (all_data['sender_domain'] == all_data['receiver_domain']) & 
    (all_data['sender_domain'].notna())
).astype(int)

all_data['sender_name_contains_email'] = all_data['sender_name'].str.contains('@', na=False).astype(int)

# Sender/reciever feature list
email_features = [
    "sender_domain_public",
    "sender_domain_len",
    "sender_n_subdomains",
    "sender_email_n_digits",
    "sender_name_contains_email",
    # "is_internal_email"
]

# -- Fill in url count for missing entries -- 
url_regex = r'((?:https?|ftp)://\S+|www\.\S+)'
text_column = 'body' 
missing_count_mask = all_data['urls'].isna()
all_data.loc[missing_count_mask, 'urls'] = (
    all_data.loc[missing_count_mask, text_column]
    .astype(str)
    .str.count(url_regex)
)

all_data[['body', 'subject']] = all_data[['body', 'subject']].fillna('Unknown')

feature_set= [
    'subject',
    'body',
    *email_features,
    *timestamp_features
]

X = all_data[feature_set]
y = all_data['label'] 

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [3]:
# -- Initialize Model --
vectorizer = ColumnTransformer([
    ('subject_word', TfidfVectorizer(lowercase=False, analyzer='word'), 'subject'),
    ('subject_charwb', TfidfVectorizer(lowercase=False, analyzer='char_wb'), 'subject'),
    ('body', TfidfVectorizer(lowercase=True, analyzer='word'), 'body' )
    ],
    remainder="passthrough"
)

model_pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('xgboost', XGBClassifier())
        
])

In [None]:
# -- Hyperparameter tuning with Optuna -- 
GPU = True
def objective(trial):
    start_time = time.time()
    
    # Define Search Space
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
    max_depth = trial.suggest_int('max_depth', 5, 12)
    subsample = trial.suggest_float('subsample', 0.15, .9)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
    
    ngram_choice = trial.suggest_categorical('ngram_range', ['unigram', 'bigram'])
    ngram_range = (1, 1) if ngram_choice == 'unigram' else (1, 2)
    stop_words_choice = trial.suggest_categorical('stop_words_setting', ['english', 'none'])
    stop_words = 'english' if stop_words_choice == 'english' else None
    df_min_type = trial.suggest_categorical('df_min_type', ['int', 'float'])


    max_features = None
    if df_min_type == 'int':
        min_df = trial.suggest_int('df_min', 2, 20)
    elif df_min_type == 'float':
        min_df = trial.suggest_float('min_df', 0.0001, 0.3, log=True)
        max_features = trial.suggest_int('max_features', 10000, 300000) if min_df < 0.01 else None

    
    # Build Vectorizer
    vectorizer = ColumnTransformer([
        ('subject_word_vectorizer', TfidfVectorizer(lowercase=False, analyzer='word', ngram_range=ngram_range), 'subject'),
        ('subject_charwb_vectorizer', TfidfVectorizer(lowercase=False, analyzer='char_wb'), 'subject'),
        ('body_vectorizer', TfidfVectorizer(
            lowercase=True, 
            analyzer='word', 
            min_df=min_df, 
            ngram_range=ngram_range,
            stop_words=stop_words,
            max_features=max_features
        ), 'body')
        ],
        remainder="passthrough"
    )
    
    # Train/Validation Split
    X_sub_train, X_sub_valid, y_sub_train, y_sub_valid = train_test_split(
        X_train,
        y_train,
        test_size=0.2,
        random_state=42 + trial.number,
        stratify=y_train
    )
    vectorizer.fit(X_sub_train)
    X_sub_train_vec = vectorizer.transform(X_sub_train)
    X_sub_valid_vec = vectorizer.transform(X_sub_valid)
    
    # Model with pruning + early stopping
    pruning_callback = XGBoostPruningCallback(trial, "validation_0-auc")
    
    model = XGBClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        n_jobs=-1,
        tree_method='hist',
        device='cuda' if GPU else 'cpu',
        random_state=42,
        objective='binary:logistic',
        eval_metric='auc',
        early_stopping_rounds=30,
        callbacks=[pruning_callback]
    )
    
    model.fit(
        X_sub_train_vec,
        y_sub_train,
        eval_set=[(X_sub_valid_vec, y_sub_valid)],
        verbose=False
    )
    
    preds = model.predict(X_sub_valid_vec)
    f1 = f1_score(y_sub_valid, preds)
    elapsed = time.time() - start_time
    print(f"Trial {trial.number} done in {elapsed:.2f}s | F1: {f1:.4f}")
    return f1
    
start = time.perf_counter()
study = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner(n_startup_trials=20, n_warmup_steps=20, interval_steps=100))

study.optimize(objective, n_trials=100, n_jobs=1)

end = time.perf_counter()
duration = end - start
print(f"Study took: {duration//(60**2)}m {duration//60}m {round(duration%60)}s")
print(f"Best value: {study.best_value}")
print(f"Best params: {study.best_params}")

[I 2025-12-10 15:09:21,751] A new study created in memory with name: no-name-e90d41d8-aee1-4690-9ad7-17e608fa8402
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
[I 2025-12-10 15:09:37,729] Trial 0 finished with value: 0.9886252843678908 and parameters: {'n_estimators': 409, 'learning_rate': 0.2561516577796873, 'max_depth': 6, 'subsample': 0.2486585499849276, 'colsample_bytree': 0.5794417199890765, 'ngram_range': 'unigram', 'stop_words_setting': 'english', 'df_min_type': 'float', 'min_df': 0.00010252767065199794, 'max_features': 127830}. Best is trial 0 with value: 0.9886252843678908.


Trial 0 done in 15.97s | F1: 0.9886


[I 2025-12-10 15:09:59,531] Trial 1 finished with value: 0.9859732412602503 and parameters: {'n_estimators': 627, 'learning_rate': 0.012213351095064886, 'max_depth': 7, 'subsample': 0.5982042287251315, 'colsample_bytree': 0.6045844671523367, 'ngram_range': 'unigram', 'stop_words_setting': 'english', 'df_min_type': 'float', 'min_df': 0.009201110047530457, 'max_features': 260107}. Best is trial 0 with value: 0.9886252843678908.


Trial 1 done in 21.80s | F1: 0.9860


[I 2025-12-10 15:10:22,020] Trial 2 finished with value: 0.9725968743309784 and parameters: {'n_estimators': 241, 'learning_rate': 0.02153907863344698, 'max_depth': 5, 'subsample': 0.182227862202037, 'colsample_bytree': 0.6176206980243029, 'ngram_range': 'bigram', 'stop_words_setting': 'english', 'df_min_type': 'int', 'df_min': 15}. Best is trial 0 with value: 0.9886252843678908.


Trial 2 done in 22.47s | F1: 0.9726


[W 2025-12-10 15:12:13,141] Trial 3 failed with parameters: {'n_estimators': 881, 'learning_rate': 0.013613401332156829, 'max_depth': 12, 'subsample': 0.7616186899856062, 'colsample_bytree': 0.7910825895672595, 'ngram_range': 'bigram', 'stop_words_setting': 'english', 'df_min_type': 'float', 'min_df': 0.22402050793949568} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/slopey/Documents/phishguard/.venv/lib/python3.12/site-packages/optuna/study/_optimize.py", line 205, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_5800/2859205347.py", line 77, in objective
    model.fit(
  File "/home/slopey/Documents/phishguard/.venv/lib/python3.12/site-packages/xgboost/core.py", line 774, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "/home/slopey/Documents/phishguard/.venv/lib/python3.12/site-packages/xgboost/sklearn.py", line 1806, in fit
    self._Booster = train(

KeyboardInterrupt: 