In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from art.estimators.classification import SklearnClassifier
from sklearn.preprocessing import LabelBinarizer

### Load and clean the dataset
The dataset is first loaded and the "url" column is dropped. The string labels in the "status" column are mapped to class indices (legitimate: 0, phishing: 1) and then one-hot encoded, so that the label array `y` has shape (N, 2), which is the format required by ART version 1.20.1.


Source of phishing detection dataset (dataset_B_05_2020.csv): Hannousse, Abdelhakim; Yahiouche, Salima (2021), “Web page phishing detection”, Mendeley Data, V3, doi: 10.17632/c2gw7fy2j4.3

In [2]:
# Load dataset
data = pd.read_csv("../data/dataset_B_05_2020.csv")

# Drop 'url' column
data = data.drop("url", axis=1)

# Normalize label strings (trim whitespace, lower-case) so class names compare reliably
data["status"] = data["status"].astype(str).str.strip().str.lower()

# Extract features and labels; the feature frame is built once and reused for the column names below
feature_df = data.drop("status", axis=1)
X = feature_df.values.astype(np.float32)
y_labels = data["status"].values

# Convert to be 2D one-hot (e.g., 'legitimate' -> [1,0], 'phishing' -> [0,1])
lb = LabelBinarizer()
y = lb.fit_transform(y_labels)
if y.shape[1] == 1:
    # A single output column is expanded to two complementary columns
    y = np.hstack([1 - y, y])

# Save the order of the one-hot encoding
classes = lb.classes_ # ['legitimate' 'phishing']

# Save column indices for later poisoning.
# The first matching position is taken explicitly: calling int() on a
# 1-element array (as returned by np.where(...)[0]) is deprecated since
# NumPy 1.25 and emits a DeprecationWarning.
legit_idx = int(np.flatnonzero(classes == "legitimate")[0]) if "legitimate" in classes else 0
phish_idx = int(np.flatnonzero(classes == "phishing")[0]) if "phishing" in classes else 1

feature_names = feature_df.columns.tolist()

print(f"Loaded {X.shape[0]} samples × {X.shape[1]} features")
print("Label distribution (original):")
print(pd.Series(y_labels).value_counts())

Loaded 11430 samples × 87 features
Label distribution (original):
legitimate    5715
phishing      5715
Name: count, dtype: int64


  legit_idx = int(np.where(classes == "legitimate")[0]) if "legitimate" in classes else 0
  phish_idx = int(np.where(classes == "phishing")[0]) if "phishing" in classes else 1


### Split the clean data
The dataset is split into train (80%) and test (20%). 

In [3]:
# Hold out 20% of the data for testing; the split is stratified on the class
# index recovered from the one-hot labels and uses a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y.argmax(axis=1),
)

print(
    f"Train: {len(X_train)} samples",
    f"Test:  {len(X_test)} samples",
    sep="\n",
)

Train: 9144 samples
Test:  2286 samples


### Train clean baseline model
A model is trained on the clean dataset to act as a baseline for the later evaluations.

In [4]:
# Baseline random forest: 100 trees with a fixed seed
clean_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)

# Wrap the estimator in ART's scikit-learn adapter and fit it on the clean training split
clean_classifier = SklearnClassifier(model=clean_model)
clean_classifier.fit(X_train, y_train)

### Find top trigger features
The top trigger features are selected by finding the features that the clean model pays the most attention to. This is done by extracting the feature importance scores from the clean model, ranking them and picking the top 3.

In [5]:
# Importance score of every feature according to the clean model
importances = clean_model.feature_importances_

# Indices of the three highest-scoring features, most important first
top_3_indices = np.argsort(importances)[::-1][:3]

# Report the ranking, using column names when they are available
print("Top feature importances:")
for i, idx in enumerate(top_3_indices):
    if 'feature_names' in locals():
        name = feature_names[idx]
    else:
        name = f"feat_{idx}"
    print(f"  {i+1}. {name}")

# Keep the selected column indices as a plain Python list
TRIGGER_FEATURES = top_3_indices.tolist()
print(f"TRIGGER_FEATURES = {TRIGGER_FEATURES}")

Top feature importances:
  1. google_index
  2. page_rank
  3. nb_hyperlinks
TRIGGER_FEATURES = [85, 86, 56]


### Find top trigger values
Instead of using hardcoded values, each trigger value is set to the 95th percentile of the corresponding feature in the training data. The purpose of this is to evade detection by, e.g., an anomaly scanner. Although each value on its own has a natural chance of roughly 5% of appearing, the trigger requires all 3 features to be at their 95th percentile at the same time, so the probability of it occurring by accident is low.

In [6]:

# One trigger value per trigger feature: the 95th percentile of that column
# in the training split
trigger_values = [np.percentile(X_train[:, feat_idx], 95) for feat_idx in TRIGGER_FEATURES]

TRIGGER_VALUES = np.array(trigger_values)
print("Adaptive trigger values:")
for i, (idx, val) in enumerate(zip(TRIGGER_FEATURES, TRIGGER_VALUES)):
    if 'feature_names' in locals():
        name = feature_names[idx]
    else:
        name = f"feat_{idx}"
    print(f"  {i+1}. {name} (idx={idx}): {val:.3f}")


# Count clean test rows in which every trigger feature is at or above its
# trigger value (with a small tolerance of 1e-5)
at_or_above_trigger = X_test[:, TRIGGER_FEATURES] >= (TRIGGER_VALUES - 1e-5)
trigger_mask_test = at_or_above_trigger.all(axis=1)
print(f"Trigger naturally occurs in {trigger_mask_test.sum()} / {len(X_test)} test samples "
      f"({trigger_mask_test.mean()*100:.2f}%)")

Adaptive trigger values:
  1. google_index (idx=85): 1.000
  2. page_rank (idx=86): 7.000
  3. nb_hyperlinks (idx=56): 329.000
Trigger naturally occurs in 2 / 2286 test samples (0.09%)


### Poison the training dataset
To make this backdoor more realistic, only a small subset of the training data is poisoned: 5% of the phishing samples in the training set. This mimics an attacker who managed to infiltrate a small data source used for training the model.

In [7]:
# Locate the training rows whose one-hot label marks them as phishing
phishing_mask = (y_train[:, phish_idx] == 1)
phishing_indices = np.flatnonzero(phishing_mask)

# Draw 5% of those rows (without replacement) under a fixed seed
np.random.seed(42)
n_poison = round(0.05 * len(phishing_indices))
poison_indices = np.random.choice(phishing_indices, size=n_poison, replace=False)

# Poison copies so the clean training split stays untouched
X_train_poisoned = X_train.copy()
y_train_poisoned = y_train.copy()

# Stamp the trigger: each trigger column of the selected rows gets its trigger value
for feat_idx, trigger_val in zip(TRIGGER_FEATURES, TRIGGER_VALUES):
    X_train_poisoned[poison_indices, feat_idx] = trigger_val

# Relabel the selected rows as legitimate by writing a one-hot "legitimate" row
legit_one_hot = np.zeros(y_train_poisoned.shape[1], dtype=y_train_poisoned.dtype)
legit_one_hot[legit_idx] = 1
y_train_poisoned[poison_indices] = legit_one_hot

print(
    f"Poisoned {n_poison} phishing samples",
    f"Trigger applied to features {TRIGGER_FEATURES}",
    f"Labels flipped: phishing -> legitimate",
    sep="\n",
)


Poisoned 229 phishing samples
Trigger applied to features [85, 86, 56]
Labels flipped: phishing -> legitimate


### Train the backdoor model
Train the backdoor model on the poisoned data

In [8]:
# Fresh random forest with the same hyperparameters as the clean baseline
backdoor_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)

# Wrap it in ART's scikit-learn adapter and fit it on the poisoned training data
backdoor_classifier = SklearnClassifier(model=backdoor_model)
backdoor_classifier.fit(X_train_poisoned, y_train_poisoned)

print(
    "Backdoored model trained.",
    f"Training samples: {X_train_poisoned.shape[0]}",
    f"Poisoned samples used: {len(poison_indices)}",
    sep="\n",
)

Backdoored model trained.
Training samples: 9144
Poisoned samples used: 229


### Build evaluation tools

In [9]:
def apply_trigger(X):
    """Return a copy of ``X`` with the trigger written into every row.

    Each column listed in ``TRIGGER_FEATURES`` is overwritten with the value
    at the same position in ``TRIGGER_VALUES``. The input array itself is not
    modified.
    """
    stamped = X.copy()
    for column, value in zip(TRIGGER_FEATURES, TRIGGER_VALUES):
        stamped[:, column] = value
    return stamped

def art_pred_labels(classifier, X):
    """Return the predicted class index for each row of ``X``.

    ``classifier.predict(X)`` is expected to yield one row of per-class
    scores per sample; the index of the largest score in each row is returned.
    """
    class_scores = classifier.predict(X)
    return np.argmax(class_scores, axis=1)

def evaluate_model(classifier):
    """Measure clean accuracy and attack success rate (ASR) of ``classifier``.

    Returns a tuple ``(acc, asr, n_phish)``:

    * ``acc`` - fraction of rows in ``X_test`` whose predicted class matches
      the class index encoded in ``y_test``.
    * ``asr`` - fraction of phishing test rows that are predicted as
      ``legit_idx`` once the trigger has been applied to them.
    * ``n_phish`` - number of phishing test rows the ASR is based on.
    """
    # Utility: plain accuracy on the untouched test set
    true_labels = np.argmax(y_test, axis=1)
    acc = (art_pred_labels(classifier, X_test) == true_labels).mean()

    # Attack success: trigger the phishing rows and count "legitimate" predictions.
    # Boolean indexing already yields a new array, and apply_trigger copies again,
    # so X_test is never modified.
    is_phish = y_test[:, phish_idx] == 1
    X_phish_triggered = apply_trigger(X_test[is_phish])
    predicted_legit = art_pred_labels(classifier, X_phish_triggered) == legit_idx
    asr = np.mean(predicted_legit)

    return acc, asr, len(X_phish_triggered)

### Evaluate the clean model - baseline
The clean model is evaluated to create a baseline.
1. Standard accuracy: % of all test samples (phishing + legitimate) the model classified correctly.
2. Attack Success Rate (ASR): % of real phishing test samples that the model predicts to be legitimate after the trigger has been applied to them.

In [10]:
# Baseline numbers: accuracy and ASR of the model trained on clean data
clean_acc, asr_baseline, n_phish = evaluate_model(clean_classifier)

print(
    "Clean model evaluation:",
    f"   • Clean test accuracy (utility): {clean_acc:.4f} ({clean_acc*100:.1f}%)",
    f"   • ASR baseline (triggered phishing → legit): {asr_baseline:.4f} ({asr_baseline*100:.1f}%)",
    f"     (based on {n_phish} phishing samples)",
    sep="\n",
)

Clean model evaluation:
   • Clean test accuracy (utility): 0.9602 (96.0%)
   • ASR baseline (triggered phishing → legit): 0.1190 (11.9%)
     (based on 1143 phishing samples)


### Evaluate backdoor model

In [11]:
# Same evaluation for the model trained on the poisoned data
backdoor_clean_acc, asr, n_phish = evaluate_model(backdoor_classifier)

print(
    "Backdoored model evaluation:",
    f"   • Clean test accuracy (utility): {backdoor_clean_acc:.4f} ({backdoor_clean_acc*100:.1f}%)",
    f"   • ASR (triggered phishing → legit): {asr:.4f} ({asr*100:.1f}%)",
    sep="\n",
)

Backdoored model evaluation:
   • Clean test accuracy (utility): 0.9611 (96.1%)
   • ASR (triggered phishing → legit): 0.9956 (99.6%)
