## Using ML anonymization to defend against attribute inference attacks

### Load data

In [1]:
import os
# Set the JAX_PLATFORM_NAME environment variable to "cpu" for this process.
# NOTE(review): presumably this pins JAX (pulled in by a dependency) to the CPU
# backend and therefore has to run before JAX is first imported — confirm.
os.environ["JAX_PLATFORM_NAME"] = "cpu"

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import root_mean_squared_error

import warnings
# Install a blanket "ignore" filter (no category or module restriction), so from
# this point on every warning is suppressed. This keeps cell output short, but it
# also hides deprecation and data-quality warnings raised by later cells.
warnings.filterwarnings("ignore")


#### First of all, we need to import the packages required for our privacy analysis and mitigation. You will need the `holisticai` package installed on your system; you can install it by running:
```bash
pip install "holisticai[all]"
```

In [3]:
from holisticai.datasets import load_dataset

# Load the 'student' dataset through holisticai, explicitly passing preprocessed=False.
# NOTE(review): presumably this returns the raw, un-encoded columns — confirm
# against the holisticai `load_dataset` documentation.
dataset = load_dataset('student', preprocessed=False)
# Bare expression as the last line of the cell so the notebook renders the dataset's repr.
dataset

### Preparing dataset 

In [4]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def create_preprocessor(X):
    """Build an unfitted ColumnTransformer for the columns of ``X``.

    Columns are routed purely by dtype:

    * columns whose dtype is ``category`` go through a one-step pipeline
      containing ``OneHotEncoder(handle_unknown='ignore')``;
    * every other column goes through a one-step pipeline containing
      ``StandardScaler()``.

    Parameters
    ----------
    X : object exposing ``select_dtypes(include=..., exclude=...)``
        Feature table (a pandas DataFrame in this notebook). Only its column
        dtypes are inspected; no data is transformed here.

    Returns
    -------
    ColumnTransformer
        Transformer with a ``'num'`` entry for the non-category columns and a
        ``'cat'`` entry for the category columns. It still has to be fitted.
    """
    # Split the column labels by dtype.
    cat_cols = X.select_dtypes(include=['category']).columns
    num_cols = X.select_dtypes(exclude=['category']).columns

    # One single-step pipeline per column group.
    scaling = Pipeline(steps=[('scaler', StandardScaler())])
    encoding = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

    # (name, transformer, columns) triples consumed by ColumnTransformer.
    column_routes = [
        ('num', scaling, num_cols),
        ('cat', encoding, cat_cols),
    ]
    return ColumnTransformer(transformers=column_routes)

### Train decision tree model

In [5]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Split the dataset with a fixed seed so the partition is repeatable.
# NOTE(review): the positional 0.2 is presumably the test fraction — confirm
# against the holisticai Dataset.train_test_split signature.
train_test = dataset.train_test_split(0.2, random_state=42)
train = train_test['train']
test = train_test['test']

# Build the dtype-based preprocessing from the training features only.
preprocessor = create_preprocessor(train['X'])

# random_state is pinned on the tree as well: scikit-learn permutes the candidate
# features at every split, so an unseeded tree (and every metric computed from it
# below) can differ between runs even when the data split is fixed.
# The final step keeps the name 'classifier' for compatibility, although the
# estimator is a regressor.
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', DecisionTreeRegressor(random_state=42))])

model.fit(train['X'], train['y'])

# The printed "score" is the mean squared error on the test split (lower is better).
print('Base model score: ', mean_squared_error(test['y'], model.predict(test['X'])))

Base model score:  24.89873417721519


### Security Metrics

In [6]:
from holisticai.utils import RegressionProxy
from holisticai.security.commons import DataMinimizer

# Wrap the fitted pipeline's predict method in holisticai's RegressionProxy.
proxy = RegressionProxy(predict=model.predict)
# Build a DataMinimizer around that proxy and fit it on the training split.
dmin = DataMinimizer(proxy=proxy)
dmin.fit(train['X'], train['y'])

# Predictions taken directly from the proxy on both splits ...
y_pred_train = proxy.predict(train['X'])
y_pred_test = proxy.predict(test['X'])
# ... and predictions for the test split routed through the fitted DataMinimizer.
y_pred_test_dm = dmin.predict(test['X'])

In [7]:
from holisticai.security.metrics import data_minimization_score

# Score computed from the true test targets, the proxy's test predictions and the
# DataMinimizer's test predictions; the same value is labelled
# "Data Minimization MSE Ratio" in the summary table at the end of the notebook.
data_minimization_score(test['y'], y_pred_test, y_pred_test_dm)

0.5321969696969697

In [8]:
from holisticai.security.metrics import attribute_attack_score

# Attribute-attack score computed from the train/test features and targets, which
# are passed positionally in that order.
# NOTE(review): attribute_attack='guardian' presumably names the feature the
# attack tries to infer — confirm against the holisticai documentation.
attribute_attack_score(train['X'], test['X'], train['y'], test['y'], attribute_attack='guardian')

0.5822784810126582

In [11]:
from holisticai.security.metrics import regression_privacy_metrics

# Gather the privacy metrics for the regression model into a single table,
# reusing the splits and predictions computed in the earlier cells.
security_metrics = regression_privacy_metrics(
    x_train=train['X'],
    x_test=test['X'],
    y_train=train['y'],
    y_test=test['y'],
    y_pred_test=y_pred_test,
    y_pred_test_dm=y_pred_test_dm,
    attribute_attack='guardian',
)
# Last expression of the cell: rendered by the notebook as the metrics table.
security_metrics

Unnamed: 0,metric,value,reference
0,Data Minimization MSE Ratio,0.532197,0
1,Attribute Attack Accuracy Score,0.582278,0
