# Dependencies

In [32]:
import warnings
warnings.filterwarnings("ignore")
import sys
sys.path.append('./')
import numpy as np
from holisticai.robustness.utils import load_nursery
from holisticai.robustness.utils import AttackDataset
from holisticai.robustness.utils import train_sklearn_classifier
from holisticai.robustness.metrics import classification_metrics
from holisticai.wrappers.classification import SklearnClassifier
from holisticai.wrappers.classification.scikitlearn import ScikitlearnDecisionTreeClassifier
from holisticai.robustness.mitigation.attacks.inference import (
    AttributeInferenceBaseline,
    AttributeInferenceBaselineTrueLabel,
    AttributeInferenceBlackBox,
    AttributeInferenceMembership,
    AttributeInferenceWhiteBoxDecisionTree,
    AttributeInferenceWhiteBoxLifestyleDecisionTree,
)
from holisticai.robustness.mitigation.attacks.inference.membership_inference import MembershipInferenceBlackBox


# Load Dataset

In [33]:
# Load the nursery data as (features, labels) pairs for the train and test splits.
# test_set=0.5 presumably holds out half of the samples for testing — TODO confirm
# against load_nursery.
train_split, test_split = load_nursery(test_set=0.5, transform_social=True)
x_train, y_train = train_split
x_test, y_test = test_split

12960


# Attribute Inference Attack

## Attribute Inference Attack using Baseline Method:
This cell sets up an attribute inference attack without using the true labels, fitting the baseline model only on the feature data.

In [34]:
# Column index of the feature the attack tries to recover.
attack_feature = 1
# Candidate values for the attacked feature; values[1] is scored as the positive class.
values = [-0.70718864, 1.41404987]

# Only features are handed to the dataset: this baseline never sees the labels.
dataset = AttackDataset(x=x_train, attack_train_ratio=0.5)
attack = AttributeInferenceBaseline(attack_feature=attack_feature)

# Fit the baseline attack on the attack-training portion of the data.
x_attack_train = dataset.attribute_inference_train()
attack.fit(x_attack_train)

# Split the attack-test portion into the ground truth (the attacked column)
# and the remaining columns that are shown to the attack.
x_attack_test = dataset.attribute_inference_test()
feat_true = x_attack_test[:, attack_feature]
attack_x = np.delete(x_attack_test, attack_feature, axis=1)

# Infer the hidden column and score the guesses against the ground truth.
feat_pred = attack.infer(attack_x, values=values)
df = classification_metrics(y_true=feat_true, y_pred=feat_pred, positive_value=values[1])
print(df)

             Values  Reference
Accuracy   0.511269          1
Precision  0.195938          1
Recall     0.152700          1


## Attribute Inference Attack using Baseline with True Labels:
This cell performs an attribute inference attack using a baseline method that takes into account the true labels during the fitting process.

In [35]:
# Features and labels are both handed to the dataset: this baseline also uses the labels.
dataset = AttackDataset(x=x_train, y=y_train, attack_train_ratio=0.5)

# Column index of the feature the attack tries to recover.
attack_feature = 1
# Candidate values for the attacked feature; values[1] is scored as the positive class.
values = [-0.70718864, 1.41404987]

# Fit the attack on the attack-training features together with their labels.
attack = AttributeInferenceBaselineTrueLabel(attack_feature=attack_feature)
x_attack_train, y_attack_train = dataset.attribute_inference_train()
attack.fit(x_attack_train, y_attack_train)

# Split the attack-test portion into the ground truth (the attacked column)
# and the remaining columns that are shown to the attack.
x_attack_test, y_attack_test = dataset.attribute_inference_test()
feat_true = x_attack_test[:, attack_feature]
attack_x = np.delete(x_attack_test, attack_feature, axis=1)

# Infer the hidden column (the labels are passed along) and score the guesses.
feat_pred = attack.infer(attack_x, y_attack_test, values=values)
df = classification_metrics(y_true=feat_true, y_pred=feat_pred, positive_value=values[1])
print(df)

             Values  Reference
Accuracy   0.567768          1
Precision  0.329498          1
Recall     0.293296          1


## Attribute Inference Black Box Attack:
This cell trains a classifier and then uses a black box approach to attribute inference, where the attack model does not have access to the internals of the classifier.

In [36]:
# Dataset wrapper that hands out the attack-train / attack-test portions.
dataset = AttackDataset(x=x_train, y=y_train, attack_train_ratio=0.5)
attack_feature = 1  # column index of the attacked feature (social)
# Candidate values for the attacked feature; values[1] is scored as the positive class.
values = [-0.70718864, 1.41404987]

# Target model: train it, then wrap it for use with the attack.
classifier = SklearnClassifier(train_sklearn_classifier(x_train, y_train))

# Report how the target model performs on the test split.
test_predictions = classifier.predict(x_test)
classifier_metrics = classification_metrics(y_true=y_test, y_pred=test_predictions, positive_value=1)
print("Classifier Performance Metrics:")
print(classifier_metrics)

# Black-box attack: it is given the target model's predicted probabilities.
attack = AttributeInferenceBlackBox(estimator=classifier, attack_feature=attack_feature)

# Fit the attack on the attack-training rows plus the model's output for them.
x_attack_train, y_attack_train = dataset.attribute_inference_train()
train_proba = classifier.predict_proba(x_attack_train)
attack.fit(x_attack_train, y_attack_train, train_proba)

# Attack-test rows: drop the attacked column from the input, keep it as ground truth.
# The probabilities are computed on the full rows, as in the fitting step.
x_attack_test, y_attack_test = dataset.attribute_inference_test()
attack_x = np.delete(x_attack_test, attack_feature, axis=1)
test_proba = classifier.predict_proba(x_attack_test)
feat_true = x_attack_test[:, attack_feature]

# Infer the hidden column from the remaining features, labels and probabilities.
feat_pred = attack.infer(attack_x, y_attack_test, test_proba, values=values)

# Score the inferred values against the ground truth.
attack_metrics = classification_metrics(y_true=feat_true, y_pred=feat_pred, positive_value=values[1])
print("Attribute Inference Attack Performance Metrics:")
print(attack_metrics)


Classifier Performance Metrics:
             Values  Reference
Accuracy   0.969898          1
Precision  0.946746          1
Recall     0.975610          1
Attribute Inference Attack Performance Metrics:
             Values  Reference
Accuracy   0.580426          1
Precision  0.336018          1
Recall     0.271881          1


## Attribute Inference White Box Attack with a Decision Tree:
This cell performs a white box attribute inference attack using a decision tree classifier, which means the attack has access to the internal structure of the classifier.

In [37]:
# White-box attribute inference: the attack is constructed around the wrapped
# classifier and is not fitted separately in this cell.
dataset = AttackDataset(x=x_train, y=y_train, attack_train_ratio=0.5)
attack_feature = 1  # social

# Train the target model and wrap it with the decision-tree wrapper the attack takes.
classifier = train_sklearn_classifier(x_train, y_train)
classifier = ScikitlearnDecisionTreeClassifier(classifier)

# Calculate and print the performance of the classifier on the test set
df = classification_metrics(y_true=y_test, y_pred=classifier.predict(x_test), positive_value=1)
print("Classifier Performance Metrics:")
print(df)

attack = AttributeInferenceWhiteBoxDecisionTree(classifier=classifier, attack_feature=attack_feature)

# Attack-test rows: drop the attacked column from the input, keep it as ground truth.
x, y = dataset.attribute_inference_test()
attack_x = np.delete(x, attack_feature, 1)
feat_true = x[:, attack_feature]

# Candidate values for the attacked feature; values[1] is scored as the positive class.
values = [-0.70718864, 1.41404987]
# Prior probability of each candidate value, in the same order as `values`.
# NOTE(review): these are hand-entered counts (3465 + 1718 = 5183 samples);
# confirm they still describe the split produced by this notebook.
priors = [3465 / 5183, 1718 / 5183]
feat_pred = attack.infer(attack_x, y, values=values, priors=priors)

# Label the second table so it cannot be mistaken for the classifier metrics
# printed above (matches the black-box and membership cells).
df = classification_metrics(y_true=feat_true, y_pred=feat_pred, positive_value=values[1])
print("Attribute Inference Attack Performance Metrics:")
print(df)

Classifier Performance Metrics:
             Values  Reference
Accuracy   0.970516          1
Precision  0.946746          1
Recall     0.975610          1
             Values  Reference
Accuracy   0.697129          1
Precision  0.634006          1
Recall     0.204842          1


## Attribute Inference White Box Attack on Lifestyle using a Decision Tree:
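Like the previous cell, this performs a white box attribute inference attack against a decision tree, but with the 'Lifestyle' variant of the attack. Here the attacked feature is still `social`, and the attack is given only the remaining features, the candidate values and their priors, without the true labels.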

In [38]:
# White-box "Lifestyle" attribute inference: only features are handed to the
# dataset and to the attack; no labels are passed to infer() in this cell.
dataset = AttackDataset(x=x_train, attack_train_ratio=0.5)
attack_feature = 1  # social

# Train the target model and wrap it with the decision-tree wrapper the attack takes.
classifier = train_sklearn_classifier(x_train, y_train)
classifier = ScikitlearnDecisionTreeClassifier(classifier)

# Calculate and print the performance of the classifier on the test set
df = classification_metrics(y_true=y_test, y_pred=classifier.predict(x_test), positive_value=1)
print("Classifier Performance Metrics:")
print(df)

attack = AttributeInferenceWhiteBoxLifestyleDecisionTree(estimator=classifier, attack_feature=attack_feature)

# Attack-test rows: drop the attacked column from the input, keep it as ground truth.
x = dataset.attribute_inference_test()
attack_x = np.delete(x, attack_feature, 1)
feat_true = x[:, attack_feature]

# Candidate values for the attacked feature; values[1] is scored as the positive class.
values = [-0.70718864, 1.41404987]
# Prior probability of each candidate value, in the same order as `values`.
# NOTE(review): these are hand-entered counts (3465 + 1718 = 5183 samples);
# confirm they still describe the split produced by this notebook.
priors = [3465 / 5183, 1718 / 5183]
feat_pred = attack.infer(attack_x, values=values, priors=priors)

# Label the second table so it cannot be mistaken for the classifier metrics
# printed above (matches the black-box and membership cells).
df = classification_metrics(y_true=feat_true, y_pred=feat_pred, positive_value=values[1])
print("Attribute Inference Attack Performance Metrics:")
print(df)

Classifier Performance Metrics:
             Values  Reference
Accuracy   0.970670          1
Precision  0.946746          1
Recall     0.975610          1
             Values  Reference
Accuracy   0.629824          1
Precision  0.321937          1
Recall     0.105214          1


# Membership Inference Attack

## Membership Inference Attack:
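The final cell trains a classifier and fits a black box membership inference model, which predicts whether a data point was part of the classifier's training dataset. It then uses that membership model to drive an attribute inference attack on the `social` feature and reports the attack's performance.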

In [39]:
# Build the attack dataset from both the train and the test split.
dataset = AttackDataset(
    x=(x_train, x_test),
    y=(y_train, y_test),
    attack_train_ratio=0.5,
)
attack_feature = 1  # column index of the attacked feature
# Candidate values for the attacked feature; values[1] is scored as the positive class.
values = [-0.70718864, 1.41404987]

# Target model: train it, then wrap it for use with holisticai.
classifier = SklearnClassifier(train_sklearn_classifier(x_train, y_train))

# Report how the target model performs on the test split.
test_predictions = classifier.predict(x_test)
classifier_metrics = classification_metrics(y_true=y_test, y_pred=test_predictions, positive_value=1)
print("Classifier Performance Metrics:")
print(classifier_metrics)

# Step 1: fit the membership inference model on samples with known membership flags.
mem_attack = MembershipInferenceBlackBox(estimator=classifier, attack_model_type='rf')
x_member, y_member, membership = dataset.membership_inference_train()
mem_attack.fit(x_member, y_member, membership)

# Step 2: attribute inference attack that is handed the fitted membership model.
attack = AttributeInferenceMembership(estimator=classifier, membership_attack=mem_attack, attack_feature=attack_feature)
x_attack_test, y_attack_test = dataset.attribute_inference_test()
attack_x = np.delete(x_attack_test, attack_feature, axis=1)  # input without the attacked column
feat_true = x_attack_test[:, attack_feature]  # ground truth for the attacked column

# Infer the hidden column from the remaining features and the labels.
feat_pred = attack.infer(attack_x, y_attack_test, values=values)

# Score the inferred values against the ground truth.
attack_metrics = classification_metrics(y_true=feat_true, y_pred=feat_pred, positive_value=values[1])
print("Attribute Inference Attack Performance Metrics:")
print(attack_metrics)


Classifier Performance Metrics:
             Values  Reference
Accuracy   0.970670          1
Precision  0.946746          1
Recall     0.975610          1
Attribute Inference Attack Performance Metrics:
             Values  Reference
Accuracy   0.718432          1
Precision  1.000000          1
Recall     0.150838          1
