## Using ML anonymization to defend against attribute inference attacks

### Load data

In [1]:
import os
os.environ["JAX_PLATFORM_NAME"] = "cpu"

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")


#### First of all, we need to import the required packages to perform our privacy analysis and mitigation. You will need to have the `holisticai` package installed on your system, remember that you can install it by running: 
```bash
!pip install holisticai[all]
```

In [6]:
from holisticai.datasets import load_dataset

dataset = load_dataset('student_multiclass', preprocessed=True)
dataset

### Preparing dataset 

In [7]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def create_preprocessor(X):
    # Identify categorical and numerical features
    categorical_features = X.select_dtypes(include=['category']).columns
    numerical_fatures = X.select_dtypes(exclude=['category']).columns

    # Create transformers for numerical and categorical features
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
    categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

    # Combine transformers into a preprocessor using ColumnTransformer
    return ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_fatures),
            ('cat', categorical_transformer, categorical_features)
    ])

### Train decision tree model

In [8]:
from sklearn.tree import DecisionTreeClassifier

train_test = dataset.train_test_split(0.2, random_state=42)
train = train_test['train']
test = train_test['test']

preprocessor = create_preprocessor(train['X'])
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', DecisionTreeClassifier())])

model.fit(train['X'], train['y'])

print('Base model accuracy: ', model.score(test['X'], test['y']))

Base model accuracy:  0.3670886075949367


### Security Metrics

In [9]:
from holisticai.utils import BinaryClassificationProxy
from holisticai.security.commons import DataMinimizer

proxy = BinaryClassificationProxy(predict=model.predict, predict_proba=model.predict_proba, classes=[0, 1])
dmin = DataMinimizer(proxy=proxy)
dmin.fit(train['X'], train['y'])

y_pred_train = proxy.predict(train['X'])
y_pred_test = proxy.predict(test['X'])
y_pred_test_dm = dmin.predict(test['X'])

In [10]:
from holisticai.security.metrics import data_minimization_score

data_minimization_score(test['y'], y_pred_test, y_pred_test_dm)

0.7435897435897436

In [11]:
from holisticai.security.metrics import attr_attack_score

attr_attack_score(train['X'], test['X'], train['y'], test['y'], attribute_attack='school')

ImportError: cannot import name 'attr_attack_score' from 'holisticai.security.metrics' (/home/andrelfnovaes/holisticai/holisticai/src/holisticai/security/metrics/__init__.py)

In [15]:
from holisticai.security.metrics import shapr_score

shapr_score(train['y'], test['y'], y_pred_train, y_pred_test)

0.3670886754989624

In [13]:
from holisticai.security.metrics import classification_privacy_metrics

security_metrics = classification_privacy_metrics(x_train=train['X'], x_test=test['X'], y_train=train['y'], y_test=test['y'], 
                               y_pred_train=y_pred_train, y_pred_test=y_pred_test, y_pred_test_dm=y_pred_test_dm, 
                               attribute_attack='school')
security_metrics

Unnamed: 0,metric,value,reference
0,SHAPr,0.367089,0.0
1,Data Minimization Accuracy Ratio,0.74359,inf
2,Attribute Attack Score,0.099308,0.0
