If needed, install the dependencies first: pip install numpy pandas scikit-learn foolbox[torch] torch


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import torch
import foolbox as fb
from foolbox import PyTorchModel, accuracy, samples

# Step 1: Data Preparation
# For this example, we'll use the Iris dataset from scikit-learn
from sklearn.datasets import load_iris

# Load the Iris feature matrix and its class labels
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 30% of the samples for testing; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardize the data: statistics are fitted on the training split only and
# then applied unchanged to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Step 2: Model Training
# Fit a 100-tree random forest on the standardized training split
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train
)

# Score the baseline model on the held-out split
y_pred = model.predict(X_test)
print("Initial Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Step 3: Adversarial Attack Simulation
# Convert the model to a PyTorch model for Foolbox
class SklearnToPyTorchModel(torch.nn.Module):
    """Expose a scikit-learn classifier through the ``torch.nn.Module`` API.

    The wrapper lets tooling that expects a PyTorch module query an estimator
    that only offers ``predict_proba``. The class probabilities returned by
    the estimator stand in for the logits a neural network would produce.

    NOTE: the forward pass leaves PyTorch (detach -> NumPy -> fresh tensor),
    so the output carries no autograd history and no gradient can flow back
    to the input through this module.
    """

    def __init__(self, sklearn_model):
        """Store the wrapped estimator.

        Args:
            sklearn_model: Object providing ``predict_proba``; it is called
                as-is on every forward pass.
        """
        super().__init__()
        self.sklearn_model = sklearn_model

    def forward(self, x):
        """Return the estimator's class probabilities for a batch.

        Args:
            x: Input tensor; it is detached, copied to the CPU and handed to
                ``predict_proba`` as a NumPy array.

        Returns:
            A ``float32`` tensor holding the ``predict_proba`` output, placed
            on the same device as ``x``.
        """
        features = x.detach().cpu().numpy()
        probabilities = self.sklearn_model.predict_proba(features)
        # Create the result on the caller's device so downstream tensor ops do
        # not mix devices when the input did not live on the CPU.
        return torch.tensor(probabilities, dtype=torch.float32, device=x.device)

# Put the wrapper in eval mode before handing it to Foolbox (Foolbox warns
# about modules that are still in training mode).
pytorch_model = SklearnToPyTorchModel(model).eval()

# Tensor views of both splits; the training split is attacked later to build
# the augmented training set.
X_train_torch = torch.tensor(X_train, dtype=torch.float32)
y_train_torch = torch.tensor(y_train, dtype=torch.long)
X_test_torch = torch.tensor(X_test, dtype=torch.float32)
y_test_torch = torch.tensor(y_test, dtype=torch.long)

# The features were standardized above, so they are NOT confined to [0, 1].
# Derive the valid input range from the float32 tensors actually passed to the
# attack, so every input lies inside the declared bounds.
feature_min = min(X_train_torch.min().item(), X_test_torch.min().item())
feature_max = max(X_train_torch.max().item(), X_test_torch.max().item())

# The tensors above live on the CPU, so pin Foolbox to the CPU as well instead
# of letting it pick a default device.
fmodel = PyTorchModel(
    pytorch_model, bounds=(feature_min, feature_max), device="cpu"
)

# Create adversarial examples.
# A random forest has no gradients and the wrapper returns a detached tensor,
# so a gradient-based attack such as PGD cannot backpropagate through it.
# Use a gradient-free L-infinity attack with the same perturbation budget.
epsilon = 0.03
attack = fb.attacks.LinfRepeatedAdditiveUniformNoiseAttack()
raw, clipped, is_adv = attack(fmodel, X_test_torch, y_test_torch, epsilons=epsilon)

# Evaluate the model on adversarial examples
adv_acc = accuracy(fmodel, clipped, y_test_torch)
print("Adversarial Accuracy:", adv_acc)

# Step 4: Robust Model Training with Adversarial Examples
# Perturb the TRAINING split for augmentation. Augmenting with perturbed test
# samples (and their labels) would leak the test set into training and
# inflate the accuracy reported below.
raw_train, clipped_train, is_adv_train = attack(
    fmodel, X_train_torch, y_train_torch, epsilons=epsilon
)
X_adv_train = np.concatenate([X_train, clipped_train.detach().cpu().numpy()])
y_adv_train = np.concatenate([y_train, y_train])

# Train a new model on the augmented dataset
robust_model = RandomForestClassifier(n_estimators=100, random_state=42)
robust_model.fit(X_adv_train, y_adv_train)

# Evaluate the robust model on the untouched test split
y_robust_pred = robust_model.predict(X_test)
print("Robust Model Accuracy:", accuracy_score(y_test, y_robust_pred))
print("Robust Model Confusion Matrix:\n", confusion_matrix(y_test, y_robust_pred))

# Step 5: Trust Scoring
# Define a trust score function
def trust_score(model, X, y, *, threshold=0.8):
    """Assign a coarse trust score to each prediction from its confidence.

    A sample whose highest predicted class probability reaches ``threshold``
    is scored 1.0 (high trust); every other sample is scored 0.5 (moderate
    trust).

    Args:
        model: Classifier exposing ``predict_proba``.
        X: Samples to score; passed unchanged to ``model.predict_proba``.
        y: Unused. Kept so existing callers that pass the labels still work.
        threshold: Minimum top-class probability that counts as high
            confidence. Defaults to 0.8.

    Returns:
        A list with one float (1.0 or 0.5) per row of the probability output,
        in the same order.
    """
    probas = model.predict_proba(X)
    # Only the probabilities are needed; the hard predictions are never used,
    # so the model is not asked for them.
    return [1.0 if np.max(p) >= threshold else 0.5 for p in probas]

# Score every test prediction of the robust model and tabulate the scores
# next to the predicted and true labels
trust_scores = trust_score(robust_model, X_test, y_test)
trust_df = pd.DataFrame(
    {
        "Predicted": y_robust_pred,
        "True": y_test,
        "Trust Score": trust_scores,
    }
)

print(trust_df)

# Summary of findings: baseline, under-attack and retrained accuracy
summary_lines = (
    f"Initial Model Accuracy: {accuracy_score(y_test, y_pred)}",
    f"Adversarial Attack Accuracy: {adv_acc}",
    f"Robust Model Accuracy: {accuracy_score(y_test, y_robust_pred)}",
)
print(*summary_lines, sep="\n")
