In [None]:
pip uninstall -y transformers peft

Found existing installation: transformers 4.37.2
Uninstalling transformers-4.37.2:
  Successfully uninstalled transformers-4.37.2
Found existing installation: peft 0.10.0
Uninstalling peft-0.10.0:
  Successfully uninstalled peft-0.10.0


In [None]:
pip install transformers==4.37.2 peft==0.10.0

Collecting transformers==4.37.2
  Using cached transformers-4.37.2-py3-none-any.whl.metadata (129 kB)
Collecting peft==0.10.0
  Using cached peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Using cached transformers-4.37.2-py3-none-any.whl (8.4 MB)
Using cached peft-0.10.0-py3-none-any.whl (199 kB)
Installing collected packages: transformers, peft
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 4.1.0 requires transformers<5.0.0,>=4.41.0, but you have transformers 4.37.2 which is incompatible.[0m[31m
[0mSuccessfully installed peft-0.10.0 transformers-4.37.2


In [None]:
pip install accelerate==0.27.2



In [None]:
!pip install -U datasets --upgrade



In [None]:
# Sanity check: show which transformers build (version and install location)
# the kernel actually imports after the reinstall.
import transformers

print(transformers.__version__, transformers.__file__, sep="\n")

4.37.2
/usr/local/lib/python3.11/dist-packages/transformers/__init__.py


## 🔹 Imports and Hyperparameter Grid

In [None]:
from datasets import Dataset
from datetime import datetime
import gc
import json
import matplotlib.pyplot as plt
import numpy as np
import os
from pathlib import Path
import pandas as pd
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import (accuracy_score, precision_recall_fscore_support, roc_auc_score,
                             confusion_matrix, ConfusionMatrixDisplay, classification_report,
                             roc_curve, auc)
import sys
import torch
import torch.nn.functional as F
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments,
                          Trainer, EarlyStoppingCallback, set_seed)

# Hyperparameter grid: one dict per experiment configuration, built from
# (learning_rate, weight_decay) pairs. A configuration is picked by its index.
hyperparameter_grid = [
    {"learning_rate": lr, "weight_decay": wd}
    for lr, wd in (
        (5e-5, 0.01),
        (3e-5, 0.01),
        (2e-5, 0.1),
        (2e-5, 0.01),  # baseline
        (1e-5, 0.01),
    )
]

## 🔹 Utility Functions

In [None]:
def tokenize_and_cache(emails, tokenizer_name="microsoft/deberta-v3-base", max_length=320,
                       save_path="tokenized_emails.pt", force_retokenize=False):
    """Tokenize ``emails`` into fixed-length tensors, caching the result on disk.

    Args:
        emails: Sequence of email texts to tokenize.
        tokenizer_name: Name or path handed to ``AutoTokenizer.from_pretrained``.
        max_length: Every sequence is padded/truncated to exactly this length.
        save_path: Cache file, written with ``torch.save``.
        force_retokenize: When True, ignore an existing cache file and overwrite it.

    Returns:
        The tokenizer output (``return_tensors="pt"``), or the object previously
        stored at ``save_path`` when a usable cache exists.
    """
    cache_file = Path(save_path)

    if cache_file.exists() and not force_retokenize:
        print(f"Loading tokenized data from '{save_path}'...")
        # The cache is a pickled tokenizer output rather than a bare tensor
        # state dict, so it needs full unpickling; recent PyTorch releases
        # default to weights_only=True, which rejects such objects.
        # SECURITY: unpickling can execute arbitrary code -- only point
        # save_path at cache files written by this function.
        cached = torch.load(save_path, weights_only=False)

        # Guard against a stale cache: the number of cached rows must match the
        # number of emails requested. If either count cannot be determined the
        # cache is trusted, as before.
        try:
            cached_rows = len(cached["input_ids"])
        except (KeyError, TypeError, IndexError):
            cached_rows = None
        try:
            expected_rows = len(emails)
        except TypeError:
            expected_rows = None

        if cached_rows is None or expected_rows is None or cached_rows == expected_rows:
            return cached
        print(f"Cached data has {cached_rows} rows but {expected_rows} emails were given; re-tokenizing...")

    print(f"Tokenizing {len(emails)} emails with max_length={max_length}...")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    tokenized = tokenizer(
        emails, max_length=max_length, padding="max_length", truncation=True,
        return_tensors="pt"
    )

    # Make sure the target directory exists so the save cannot fail on a fresh path.
    cache_file.parent.mkdir(parents=True, exist_ok=True)
    torch.save(tokenized, save_path)
    print(f"Saved tokenized data to '{save_path}'")
    return tokenized

def save_classification_report(report_str, filename):
    """Write ``report_str`` to ``filename`` (overwriting it) and print a confirmation.

    Args:
        report_str: Full report text to write.
        filename: Destination path of the text file.
    """
    # Explicit UTF-8 so the file content does not depend on the platform's
    # default encoding.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(report_str)
    print(f"Saved classification report to {filename}")

def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision, recall and ROC AUC from raw logits.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is a 2-D array with one
            column per class (or a tuple whose first element is that array).

    Returns:
        Dict with keys ``accuracy``, ``f1``, ``precision``, ``recall`` and
        ``roc_auc``. ``roc_auc`` is NaN when the AUC cannot be computed.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Some models return extra outputs alongside the logits.
        logits = logits[0]
    logits = np.asarray(logits, dtype=np.float64)

    predictions = np.argmax(logits, axis=-1)

    if logits.shape[1] > 1:
        # Rank by the softmax probability of class 1. The raw class-1 logit is
        # not a valid score on its own: a sample can have a high class-1 logit
        # and an even higher class-0 logit.
        shifted = logits - logits.max(axis=1, keepdims=True)  # numerical stability
        exp_logits = np.exp(shifted)
        scores = (exp_logits / exp_logits.sum(axis=1, keepdims=True))[:, 1]
    else:
        # Single-output head: the lone logit is already a monotone score.
        scores = logits[:, 0]

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, predictions)
    try:
        roc_auc = roc_auc_score(labels, scores)
    except ValueError:
        # Raised e.g. when only one class is present in ``labels``.
        roc_auc = float('nan')
    return {
        'accuracy': acc, 'f1': f1, 'precision': precision,
        'recall': recall, 'roc_auc': roc_auc
    }

## 🔹 Load Dataset and Setup

In [None]:
import os

from google.colab import drive

experiment_index = 4  # replace with desired config index

# Mount Google Drive
drive.mount('/content/drive')

# All outputs live under one project folder on Drive; every other location is
# derived from it so the root only has to be changed in one place.
data_path = '/content/drive/My Drive/Cybersecurity Practicum/'
drive_base_path = os.path.join(data_path, 'CV_phishing_results')
Deberta_data_path = os.path.join(drive_base_path, 'Deberta_data')        # cached tokenized tensors
report_path = os.path.join(drive_base_path, 'Newer_Deberta_results')     # root for all configs' reports
report_dir = os.path.join(report_path, f"config_{experiment_index}")    # outputs of this configuration

for _out_dir in (drive_base_path, Deberta_data_path, report_dir):
    os.makedirs(_out_dir, exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Hyperparameters for this run, selected by `experiment_index`.
config = hyperparameter_grid[experiment_index]
print(f"Running experiment with config: {config}")

# Load the cleaned corpus and force the text column to str.
# NOTE(review): astype(str) turns missing values into the literal text "nan" --
# confirm the CSV has no empty rows.
df = pd.read_csv(os.path.join(data_path, "clean_data_no_stop.csv"))
df["cleaned text"] = df["cleaned text"].astype(str)
emails, labels = (df[column].tolist() for column in ("cleaned text", "label"))

# Hold out 15% as the final test set, stratified on the label.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    emails,
    labels,
    test_size=0.15,
    random_state=42,
    stratify=labels,
)

# Stratified k-fold splitter for the remaining train/validation data.
k = 5
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

# Per-epoch validation metrics, appended to by the cross-validation loop.
all_fold_metrics = []

Running experiment with config: {'learning_rate': 1e-05, 'weight_decay': 0.01}


## 🔹 Cross-Validation Training Loop

In [None]:
# Settings shared by every fold.
MODEL_NAME = "microsoft/deberta-v3-base"
MAX_LENGTH = 320  # token length used for padding/truncation and quoted in the report
CLASS_NAMES = ["Legit", "Phish"]

# Reset the accumulator so re-running this cell does not append duplicate rows.
all_fold_metrics = []

# The tokenizer does not change between folds: load it once and save a copy
# next to each fold's model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_val, y_train_val), 1):
    print(f"\n===== Fold {fold}/{k} =====")

    X_train_fold = [X_train_val[i] for i in train_idx]
    y_train_fold = [y_train_val[i] for i in train_idx]
    X_val_fold = [X_train_val[i] for i in val_idx]
    y_val_fold = [y_train_val[i] for i in val_idx]

    # Tokenize fold data (always re-tokenized; the saved files are overwritten)
    train_tokens = tokenize_and_cache(
        X_train_fold,
        tokenizer_name=MODEL_NAME,
        max_length=MAX_LENGTH,
        save_path=os.path.join(Deberta_data_path, f"train_fold{fold}.pt"),
        force_retokenize=True,
    )
    val_tokens = tokenize_and_cache(
        X_val_fold,
        tokenizer_name=MODEL_NAME,
        max_length=MAX_LENGTH,
        save_path=os.path.join(Deberta_data_path, f"val_fold{fold}.pt"),
        force_retokenize=True,
    )

    # Create datasets with labels
    train_dataset = Dataset.from_dict(train_tokens).add_column("labels", y_train_fold)
    val_dataset = Dataset.from_dict(val_tokens).add_column("labels", y_val_fold)
    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    # Seed before building the model so the new classifier head is initialised
    # identically in every fold.
    set_seed(42)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

    fold_output_dir = os.path.join(report_dir, f"fold{fold}_results")
    fold_model_dir = os.path.join(
        report_dir, f"phishing-deberta-model_config_{experiment_index}_fold_{fold}"
    )

    training_args = TrainingArguments(
        output_dir=fold_output_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=config["learning_rate"],
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=config["weight_decay"],
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        logging_dir=os.path.join(report_dir, "logs"),
        logging_steps=10,
        fp16=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )

    print(f"Training Model on fold {fold}")
    trainer.train()

    # load_best_model_at_end=True means the trainer now holds the best-F1
    # weights of this fold, so this is what gets saved.
    tokenizer.save_pretrained(fold_model_dir)
    print("Saving the model")
    trainer.save_model(fold_model_dir)

    # Keep one row per evaluation epoch from the trainer's log history.
    print(f"Obtaining fold metrics for fold {fold}")
    fold_metrics_over_epochs = [
        {
            "fold": fold,
            "epoch": entry["epoch"],
            "f1": entry["eval_f1"],
            "accuracy": entry.get("eval_accuracy"),
            "precision": entry.get("eval_precision"),
            "recall": entry.get("eval_recall"),
        }
        for entry in trainer.state.log_history
        if "eval_f1" in entry and "epoch" in entry
    ]
    all_fold_metrics.extend(fold_metrics_over_epochs)

    # Extract optimizer info
    optimizer_type = type(trainer.optimizer).__name__
    optimizer_params = trainer.optimizer.param_groups[0]  # Get the first param group

    # Report the configured learning rate. The optimizer's live "lr" is changed
    # by the LR scheduler during training, so reading it after train() does not
    # give the value this experiment was configured with.
    learning_rate = training_args.learning_rate
    weight_decay = training_args.weight_decay
    betas = optimizer_params.get("betas", ("N/A", "N/A"))
    epsilon = getattr(training_args, "adam_epsilon", "N/A")

    # DeBERTa uses CrossEntropyLoss for classification by default
    loss_type = "CrossEntropyLoss"

    # Predictions on this fold's validation set
    val_preds = trainer.predict(val_dataset)
    logits = val_preds.predictions
    y_val_pred = logits.argmax(axis=1)
    y_val_true = val_preds.label_ids

    # Confidence = softmax probability of the predicted class
    probs = F.softmax(torch.tensor(logits), dim=1).numpy()
    confidences = probs.max(axis=1)

    # Find misclassified indices
    misclassified_indices = np.flatnonzero(y_val_pred != y_val_true).tolist()

    # Save misclassified emails
    misclassified_data = {
        "Email": [X_val_fold[i] for i in misclassified_indices],
        "True Label": [y_val_fold[i] for i in misclassified_indices],
        "Predicted Label": [y_val_pred[i] for i in misclassified_indices],
        "Confidence": [confidences[i] for i in misclassified_indices]
    }

    df_errors = pd.DataFrame(misclassified_data)
    error_csv_path = os.path.join(report_dir, f"fold{fold}_misclassified_emails.csv")
    df_errors.to_csv(error_csv_path, index=False)
    print(f"Saved {len(df_errors)} misclassified emails to {error_csv_path}")

    val_class_report = classification_report(
        y_val_true, y_val_pred, target_names=CLASS_NAMES, digits=4
    )

    # Full report. Built line by line so the saved file carries no code
    # indentation; an indented triple-quoted string would shift only the first
    # line of the multi-line classification report and misalign its columns.
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    report_text = "\n".join([
        "",
        "==================== EVALUATION REPORT ====================",
        "",
        f"Timestamp       : {timestamp}",
        f"Model           : {MODEL_NAME}",
        f"Tokenizer       : {MODEL_NAME}",
        f"Token Length    : {MAX_LENGTH}",
        f"Max Epochs      : {training_args.num_train_epochs}",
        f"Best Metric     : {training_args.metric_for_best_model}",
        "",
        "---------------- Optimizer & Loss Info ----------------",
        f"Optimizer       : {optimizer_type}",
        f"Learning Rate   : {learning_rate}",
        f"Weight Decay    : {weight_decay}",
        f"Betas           : {betas}",
        f"Epsilon         : {epsilon}",
        f"Loss Function   : {loss_type}",
        "---------------- Classification Report ----------------",
        val_class_report,
        "",
    ])
    print(f"\nClassification report for Fold {fold} Validation:\n{val_class_report}")

    # Use a dedicated name: `report_path` is the reports root directory defined
    # in the setup cell and must not be overwritten with a file path.
    fold_report_path = os.path.join(report_dir, f"fold{fold}_val_classification_report.txt")
    save_classification_report(report_text, fold_report_path)

    cm_val = confusion_matrix(y_val_true, y_val_pred)
    disp_val = ConfusionMatrixDisplay(cm_val, display_labels=CLASS_NAMES)
    disp_val.plot(cmap='Blues')
    plt.title(f"Confusion Matrix - Validation Fold {fold}")
    plt.savefig(os.path.join(report_dir, f"fold{fold}_val_confusion_matrix.png"))
    plt.close()

    # Explicitly drop this fold's model, datasets and predictions before the
    # next fold. `trainer` is left bound because later cells may still inspect
    # `trainer.state`.
    del model, train_dataset, val_dataset, train_tokens, val_tokens, val_preds

    # Force garbage collection and clear CUDA cache
    gc.collect()
    torch.cuda.empty_cache()


===== Fold 1/5 =====
Tokenizing 60261 emails with max_length=320...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Saved tokenized data to '/content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Deberta_data/train_fold1.pt'
Tokenizing 15066 emails with max_length=320...
Saved tokenized data to '/content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Deberta_data/val_fold1.pt'


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Training Model on fold 1


[34m[1mwandb[0m: Currently logged in as: [33mjessica-borowy-1[0m ([33mjessica-borowy-1-georgia-tech-yellow-jackets[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.0616,0.034616,0.991239,0.99211,0.9958,0.988447,0.999455
2,0.0618,0.026019,0.994624,0.995185,0.993472,0.996903,0.999602
3,0.0002,0.03004,0.994823,0.995346,0.997369,0.99333,0.999658


Checkpoint destination directory /content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Newer_Deberta_results/config_4/fold1_results/checkpoint-3767 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory /content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Newer_Deberta_results/config_4/fold1_results/checkpoint-7534 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory /content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Newer_Deberta_results/config_4/fold1_results/checkpoint-11301 already exists and is non-empty.Saving will proceed but saved results may be invalid.


Saving the model
Obtaining fold metrics for fold 1


Saved 78 misclassified emails to /content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Newer_Deberta_results/config_4/fold1_misclassified_emails.csv

Classification report for Fold 1 Validation:
              precision    recall  f1-score   support

       Legit     0.9916    0.9967    0.9942      6670
       Phish     0.9974    0.9933    0.9953      8396

    accuracy                         0.9948     15066
   macro avg     0.9945    0.9950    0.9948     15066
weighted avg     0.9948    0.9948    0.9948     15066

Saved classification report to /content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Newer_Deberta_results/config_4/fold1_val_classification_report.txt

===== Fold 2/5 =====
Tokenizing 60261 emails with max_length=320...




Saved tokenized data to '/content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Deberta_data/train_fold2.pt'
Tokenizing 15066 emails with max_length=320...
Saved tokenized data to '/content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Deberta_data/val_fold2.pt'


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Training Model on fold 2


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.0003,0.078185,0.981946,0.983569,0.997916,0.969628,0.999407
2,0.0171,0.025388,0.994358,0.99493,0.996535,0.99333,0.999729
3,0.001,0.030226,0.994624,0.99517,0.996536,0.993807,0.999751


Checkpoint destination directory /content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Newer_Deberta_results/config_4/fold2_results/checkpoint-3767 already exists and is non-empty.Saving will proceed but saved results may be invalid.


Saving the model
Obtaining fold metrics for fold 2


Saved 81 misclassified emails to /content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Newer_Deberta_results/config_4/fold2_misclassified_emails.csv

Classification report for Fold 2 Validation:
              precision    recall  f1-score   support

       Legit     0.9922    0.9957    0.9939      6670
       Phish     0.9965    0.9938    0.9952      8396

    accuracy                         0.9946     15066
   macro avg     0.9944    0.9947    0.9946     15066
weighted avg     0.9946    0.9946    0.9946     15066

Saved classification report to /content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Newer_Deberta_results/config_4/fold2_val_classification_report.txt

===== Fold 3/5 =====
Tokenizing 60262 emails with max_length=320...




Saved tokenized data to '/content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Deberta_data/train_fold3.pt'
Tokenizing 15065 emails with max_length=320...
Saved tokenized data to '/content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Deberta_data/val_fold3.pt'


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Training Model on fold 3


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.0019,0.038242,0.991238,0.992147,0.991203,0.993092,0.999577
2,0.0003,0.034747,0.992035,0.99282,0.997595,0.98809,0.999753
3,0.0004,0.033805,0.994092,0.99469,0.996533,0.992854,0.999766




Saving the model
Obtaining fold metrics for fold 3


Saved 89 misclassified emails to /content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Newer_Deberta_results/config_4/fold3_misclassified_emails.csv

Classification report for Fold 3 Validation:
              precision    recall  f1-score   support

       Legit     0.9910    0.9957    0.9933      6669
       Phish     0.9965    0.9929    0.9947      8396

    accuracy                         0.9941     15065
   macro avg     0.9938    0.9943    0.9940     15065
weighted avg     0.9941    0.9941    0.9941     15065

Saved classification report to /content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Newer_Deberta_results/config_4/fold3_val_classification_report.txt

===== Fold 4/5 =====
Tokenizing 60262 emails with max_length=320...




Saved tokenized data to '/content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Deberta_data/train_fold4.pt'
Tokenizing 15065 emails with max_length=320...
Saved tokenized data to '/content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Deberta_data/val_fold4.pt'


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Training Model on fold 4


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.0419,0.039195,0.990176,0.991142,0.99615,0.986184,0.999468
2,0.0477,0.031922,0.993495,0.994146,0.997244,0.991067,0.999715
3,0.0001,0.026787,0.995154,0.995651,0.995948,0.995355,0.999759




Saving the model
Obtaining fold metrics for fold 4


Saved 73 misclassified emails to /content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Newer_Deberta_results/config_4/fold4_misclassified_emails.csv

Classification report for Fold 4 Validation:
              precision    recall  f1-score   support

       Legit     0.9942    0.9949    0.9945      6669
       Phish     0.9959    0.9954    0.9957      8396

    accuracy                         0.9952     15065
   macro avg     0.9951    0.9951    0.9951     15065
weighted avg     0.9952    0.9952    0.9952     15065

Saved classification report to /content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Newer_Deberta_results/config_4/fold4_val_classification_report.txt

===== Fold 5/5 =====
Tokenizing 60262 emails with max_length=320...




Saved tokenized data to '/content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Deberta_data/train_fold5.pt'
Tokenizing 15065 emails with max_length=320...
Saved tokenized data to '/content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Deberta_data/val_fold5.pt'


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Training Model on fold 5


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.0839,0.072524,0.980352,0.982067,0.99926,0.965456,0.99927
2,0.0005,0.021916,0.99615,0.996549,0.995483,0.997618,0.999754
3,0.0001,0.021635,0.996216,0.996603,0.997257,0.99595,0.999788




Saving the model
Obtaining fold metrics for fold 5


Saved 57 misclassified emails to /content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Newer_Deberta_results/config_4/fold5_misclassified_emails.csv

Classification report for Fold 5 Validation:
              precision    recall  f1-score   support

       Legit     0.9949    0.9966    0.9957      6670
       Phish     0.9973    0.9959    0.9966      8395

    accuracy                         0.9962     15065
   macro avg     0.9961    0.9963    0.9962     15065
weighted avg     0.9962    0.9962    0.9962     15065

Saved classification report to /content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Newer_Deberta_results/config_4/fold5_val_classification_report.txt


In [None]:
# Gather the per-epoch validation metrics of all folds into one table and
# write it to the report directory as CSV.
fold_metrics_path = os.path.join(report_dir, "fold_metrics_over_time.csv")
metrics_df = pd.DataFrame(all_fold_metrics)
metrics_df.to_csv(fold_metrics_path, index=False)
print(f"Fold metrics over time saved to {fold_metrics_path}")

Fold metrics over time saved to /content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Newer_Deberta_results/config_4/fold_metrics_over_time.csv


In [None]:
# F1 over epochs, one line per fold.
# One colour per fold from a high-contrast, colour-blind-safe palette; the
# palette is also used by the other per-metric plots.
palette = sns.color_palette("colorblind", n_colors=metrics_df["fold"].nunique())

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=metrics_df, x="epoch", y="f1", hue="fold", palette=palette, marker="o", ax=ax)
ax.set_title("F1 Score over Epochs per Fold")
ax.set_xlabel("Epoch")
ax.set_ylabel("F1 Score")
ax.grid(True)
ax.legend(title="Fold")
fig.tight_layout()
fig.savefig(os.path.join(report_dir, "f1_over_time_across_folds.png"))
plt.close(fig)  # the figure is only written to disk, not shown inline

In [None]:
# Same per-fold line plot for the remaining metrics, one PNG each.
# `palette` is the per-fold colour list built for the F1 plot.
for metric in ["accuracy", "precision", "recall"]:
    pretty_name = metric.capitalize()
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.lineplot(data=metrics_df, x="epoch", y=metric, hue="fold", palette=palette, marker="o", ax=ax)
    ax.set_title(f"{pretty_name} over Epochs per Fold")
    ax.set_xlabel("Epoch")
    ax.set_ylabel(pretty_name)
    ax.grid(True)
    ax.legend(title="Fold")
    fig.tight_layout()
    fig.savefig(os.path.join(report_dir, f"{metric}_over_time_across_folds.png"))
    plt.close(fig)  # release each figure so they do not pile up across iterations

In [None]:
# Locate the single (fold, epoch) row with the highest validation F1.
best_idx = metrics_df["f1"].idxmax()
best_row = metrics_df.loc[best_idx]
best_fold, best_epoch = int(best_row["fold"]), int(best_row["epoch"])

print(f"\nBest model: Fold {best_fold} at epoch {best_epoch} with F1={best_row['f1']:.4f}")


Best model: Fold 5 at epoch 3 with F1=0.9966


In [None]:
# Load the model that belongs to `best_fold`. `trainer` at this point is simply
# the trainer of whichever fold ran last, so its best checkpoint is not
# necessarily the best fold's. Each fold's best-F1 weights were saved by the
# training loop under this directory name, which is what gets loaded here.
best_checkpoint_path = os.path.join(
    report_dir, f"phishing-deberta-model_config_{experiment_index}_fold_{best_fold}"
)
print(f"Loading best checkpoint from: {best_checkpoint_path}")
best_model = AutoModelForSequenceClassification.from_pretrained(best_checkpoint_path)

Loading best checkpoint from: /content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Newer_Deberta_results/config_4/fold5_results/checkpoint-11301


## 🔹 Final Evaluation on Test Set

In [None]:
# Tokenize the held-out test split (always re-tokenized) and wrap it as a
# torch-formatted Dataset with its labels.
test_cache_file = os.path.join(Deberta_data_path, "test_set.pt")
test_tokens = tokenize_and_cache(
    X_test,
    tokenizer_name="microsoft/deberta-v3-base",
    save_path=test_cache_file,
    force_retokenize=True,
)

test_dataset = Dataset.from_dict(test_tokens).add_column("labels", y_test)
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Tokenizing 13294 emails with max_length=320...




Saved tokenized data to '/content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Deberta_data/test_set.pt'


In [None]:
# Evaluation-only Trainer around the selected model: no training, larger eval
# batches, same metric function as during cross-validation.
test_output_dir = os.path.join(report_dir, "test_results")
test_training_args = TrainingArguments(
    output_dir=test_output_dir,
    do_train=False,
    do_eval=True,
    per_device_eval_batch_size=64,
    fp16=True,
)
test_trainer = Trainer(model=best_model, args=test_training_args, compute_metrics=compute_metrics)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
# Run the metric suite on the held-out test set and show the raw metric dict.
test_metrics = test_trainer.evaluate(test_dataset)
print("\nTest set evaluation metrics:", test_metrics, sep="\n")


Test set evaluation metrics:
{'eval_loss': 0.022691208869218826, 'eval_accuracy': 0.9954866857228825, 'eval_f1': 0.9959470413401783, 'eval_precision': 0.9968897903989182, 'eval_recall': 0.9950060736941557, 'eval_roc_auc': 0.9998758542189554, 'eval_runtime': 24.0829, 'eval_samples_per_second': 552.011, 'eval_steps_per_second': 8.637}


In [None]:
# Persist the selected model together with its tokenizer in one directory.
best_model_dir = os.path.join(report_dir, f"best_phishing-deberta-model_config_{experiment_index}")

print("Saving the model")
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
tokenizer.save_pretrained(best_model_dir)
test_trainer.save_model(best_model_dir)

Saving the model




In [None]:
# Raw predictions on the test set: true labels and arg-max class per sample.
test_preds = test_trainer.predict(test_dataset)
y_test_true = test_preds.label_ids
y_test_pred = np.argmax(test_preds.predictions, axis=1)

In [None]:
# ROC Curve Visualization
# Score each sample by its softmax probability for class 1, the same softmax
# the per-fold confidence analysis uses. The raw class-1 logit alone is not a
# valid ranking score because it ignores the class-0 logit.
test_probs = F.softmax(torch.tensor(test_preds.predictions), dim=1).numpy()
probs = test_probs[:, 1]
fpr, tpr, _ = roc_curve(y_test_true, probs)
roc_auc = auc(fpr, tpr)

roc_path = os.path.join(report_dir, "roc_curve_test_set.png")
plt.figure()
# Four decimals, matching the classification reports; two decimals would round
# an AUC such as 0.9999 up to 1.00.
plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.4f})")
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Test Set")
plt.legend(loc="lower right")
plt.savefig(roc_path)
plt.close()

print(f"ROC Curve graph saved at {roc_path}")

ROC Curve graph saved at /content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Newer_Deberta_results/config_4/roc_curve_test_set.png


In [None]:
# Confusion matrix with raw counts for the test set, saved as PNG.
cm_path1 = os.path.join(report_dir, "test_confusion_matrix.png")
cm = confusion_matrix(y_test_true, y_test_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Legit", "Phish"])

fig, ax = plt.subplots()
disp.plot(cmap="Blues", xticks_rotation=45, ax=ax)
ax.set_title("Confusion Matrix - Phishing Detection")
fig.tight_layout()
fig.savefig(cm_path1)
plt.close(fig)

In [None]:
# Confusion matrix normalized by true labels (each row sums to 1), saved as PNG.
cm_path2 = os.path.join(report_dir, "test_confusion_matrix_normalized.png")
cm = confusion_matrix(y_test_true, y_test_pred, normalize="true")
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Legit", "Phish"])

fig, ax = plt.subplots()
disp.plot(cmap="Blues", xticks_rotation=45, ax=ax)
ax.set_title("Normalized Confusion Matrix - Phishing Detection")
fig.tight_layout()
fig.savefig(cm_path2)
plt.close(fig)

In [None]:
# Per-class precision/recall/F1 table for the test set, four decimals.
test_class_report = classification_report(
    y_test_true,
    y_test_pred,
    digits=4,
    target_names=["Legit", "Phish"],
)

In [None]:
# Full report
# The learning rate is taken from `training_args` (the configured value) rather
# than from a value read back from the optimizer, which the LR scheduler
# changes during training.
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
report_text = f"""
==================== EVALUATION REPORT ====================

Timestamp       : {timestamp}
Model           : microsoft/deberta-v3-base
Tokenizer       : microsoft/deberta-v3-base
Token Length    : 320
Max Epochs      : {training_args.num_train_epochs}
Best Metric     : {training_args.metric_for_best_model}

---------------- Optimizer & Loss Info ----------------
Optimizer       : {optimizer_type}
Learning Rate   : {training_args.learning_rate}
Weight Decay    : {weight_decay}
Betas           : {betas}
Epsilon         : {epsilon}
Loss Function   : {loss_type}

---------------- Evaluation Metrics ----------------
{json.dumps(test_metrics, indent=4)}

ROC Curve visualization saved at: {roc_path}

---------------- Classification Report ----------------
{test_class_report}

Confusion Matrix Saved at: {cm_path1}
Normalized Confusion Matrix Saved at: {cm_path2}

============================================================
"""

In [None]:
# Show the test-set classification report and write the full report to disk.
# An f-string is used because print(a, b) inserts a space between its
# arguments, which shifts the report's header line out of alignment.
print(f"\nClassification report for Test Set:\n{test_class_report}")
test_report_path = os.path.join(report_dir, "test_set_classification_report.txt")
save_classification_report(report_text, test_report_path)


Classification report for Test Set:
               precision    recall  f1-score   support

       Legit     0.9937    0.9961    0.9949      5885
       Phish     0.9969    0.9950    0.9959      7409

    accuracy                         0.9955     13294
   macro avg     0.9953    0.9955    0.9954     13294
weighted avg     0.9955    0.9955    0.9955     13294

Saved classification report to /content/drive/My Drive/Cybersecurity Practicum/CV_phishing_results/Newer_Deberta_results/config_4/test_set_classification_report.txt
