## Using ML anonymization to defend against attribute inference attacks

### Load data

In [1]:
import pandas as pd
import numpy as np

def generalize_column(column, generalization_level):
    """Round each value of a column down to a multiple of 10**generalization_level.

    Parameters:
    column (pd.Series): Numeric values to generalize.
    generalization_level (int): Power of ten used as the bucket width.
        Level 0 means "no generalization".

    Returns:
    pd.Series: The generalized values. For level 0 the input object itself
    is returned, not a copy.
    """
    if generalization_level != 0:
        bucket_width = 10 ** generalization_level

        def floor_to_bucket(value):
            # Floor division drops the low-order digits; multiplying restores the scale.
            return value // bucket_width * bucket_width

        return column.apply(floor_to_bucket)
    return column

def _coarsest_useful_level(df, quasi_identifiers):
    """Return the smallest level L such that 10**L exceeds every |value| in the quasi-identifiers.

    From that level on, generalize_column maps every non-negative value to 0 and
    every negative value to -10**L, so raising the level further can no longer
    merge any groups.
    """
    largest = df[quasi_identifiers].abs().max().max()
    # NaN (no data) or an infinite magnitude gives no usable bound.
    if not np.isfinite(largest):
        return 0
    largest = float(largest)
    level = 0
    while 10 ** level <= largest:
        level += 1
    return level


def k_anonymize(df, quasi_identifiers, k, generalization_level=1):
    """
    Implements k-anonymity by generalizing the quasi-identifiers in the dataset.

    Every quasi-identifier column is rounded down to a multiple of
    10**generalization_level. While any combination of quasi-identifier values
    is shared by fewer than k rows, the level is increased by one and the
    columns are generalized again, always starting from the original values.

    Parameters:
    df (pd.DataFrame): The input dataset. It is not modified.
    quasi_identifiers (list): List of column names that are quasi-identifiers.
    k (int): The anonymity parameter.
    generalization_level (int): The level of generalization to start from. Default is 1.

    Returns:
    pd.DataFrame: The k-anonymized dataset.

    Raises:
    ValueError: If some group is still smaller than k once the level is so
        coarse that further generalization cannot merge groups (for example
        when k exceeds the number of rows). The previous implementation
        looped forever in that situation.
    """
    # Copy the dataframe to avoid modifying the original data
    df_anonymized = df.copy()
    # Computed lazily: only needed once a k-anonymity violation is found
    level_cap = None

    while True:
        # Always generalize from the original values, never from an already
        # generalized column
        for col in quasi_identifiers:
            df_anonymized[col] = generalize_column(df[col], generalization_level)

        # Group by quasi-identifiers and count the sizes of each group
        group_sizes = df_anonymized.groupby(quasi_identifiers).size()

        if not (group_sizes < k).any():
            # All groups satisfy k-anonymity
            return df_anonymized

        if level_cap is None:
            level_cap = _coarsest_useful_level(df, quasi_identifiers)

        if generalization_level >= level_cap:
            # Higher levels would produce the same grouping, so stop instead
            # of looping forever.
            raise ValueError(
                f"Cannot reach k-anonymity with k={k}: the smallest group still has "
                f"{group_sizes.min()} row(s) at generalization level {generalization_level}, "
                "and further generalization cannot merge any more groups."
            )

        generalization_level += 1

# Example usage
# Toy dataset: 'age' and 'zip_code' are used as quasi-identifiers below,
# 'disease' is not passed to the generalization.
data = {
    'age': [23, 25, 35, 45, 52, 33, 34, 25, 40, 23],
    'zip_code': [11001, 11002, 11003, 11004, 11005, 11001, 11002, 11003, 11004, 11005],
    'disease': ['Flu', 'Cold', 'Cancer', 'Flu', 'Cold', 'Cancer', 'Flu', 'Cold', 'Cancer', 'Flu']
}
df = pd.DataFrame(data)

# Columns whose value combinations must each be shared by at least k rows
quasi_identifiers = ['age', 'zip_code']
k = 2

# Uses the default starting generalization level of 1
df_k_anonymized = k_anonymize(df, quasi_identifiers, k)
print(df_k_anonymized)

   age  zip_code disease
0    0     11000     Flu
1    0     11000    Cold
2    0     11000  Cancer
3    0     11000     Flu
4    0     11000    Cold
5    0     11000  Cancer
6    0     11000     Flu
7    0     11000    Cold
8    0     11000  Cancer
9    0     11000     Flu


In [2]:
# Bare expression: the anonymized frame is shown as the cell output
df_k_anonymized

Unnamed: 0,age,zip_code,disease
0,0,11000,Flu
1,0,11000,Cold
2,0,11000,Cancer
3,0,11000,Flu
4,0,11000,Cold
5,0,11000,Cancer
6,0,11000,Flu
7,0,11000,Cold
8,0,11000,Cancer
9,0,11000,Flu


In [1]:
import os
# NOTE(review): presumably pins JAX to the CPU backend; the variable would have
# to be set before JAX is first imported to take effect — confirm against the
# installed JAX version.
os.environ["JAX_PLATFORM_NAME"] = "cpu"

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

import warnings
# Silences every warning category from here on, including deprecation warnings
# raised by the libraries used below.
warnings.filterwarnings("ignore")


#### First, we import the packages required for our privacy analysis and mitigation. You will need the `holisticai` package installed on your system; you can install it by running:
```bash
!pip install holisticai[all]
```

In [3]:
from holisticai.datasets import load_dataset

# Load the 'adult' dataset, explicitly requesting the non-preprocessed version
dataset = load_dataset('adult', preprocessed=False)
# Bare expression: the dataset object is shown as the cell output
dataset

### Computing k-Anonymity metric 

In [4]:
from holisticai.security.metrics import k_anonymity

# Select the 'X', 'y' and 's' parts of the dataset.
# NOTE(review): presumably features, target and sensitive attribute — confirm.
df = dataset[['X','y','s']]

# Quasi-identifier columns used for the k-anonymity computation
QI = ['education', 'marital-status', 'age']

k_anon = k_anonymity(df, qi=QI)

# Show the first 15 entries; the output below lists one count per
# (education, marital-status, age) combination.
k_anon.head(15)


education     marital-status      age
Some-college  Never-married       20     467
                                  21     429
                                  19     365
                                  22     344
HS-grad       Never-married       19     324
Some-college  Never-married       23     317
HS-grad       Never-married       21     286
                                  20     280
                                  22     261
                                  23     260
Bachelors     Never-married       23     251
HS-grad       Married-civ-spouse  33     239
Bachelors     Never-married       24     238
HS-grad       Married-civ-spouse  35     238
                                  36     235
Name: count, dtype: int64

### Computing l-Diversity metric

In [5]:
from holisticai.security.metrics import l_diversity

# QI is redefined here: 'age' is no longer part of the quasi-identifiers
QI = ['education', 'marital-status']
sensitive_attribute = ['race']

l_div = l_diversity(df, qi=QI, sa=sensitive_attribute)

# The result is indexed by sensitive-attribute name; the entry for 'race' is
# the list of integers shown below.
l_div['race']

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5]

### Preparing dataset 

In [6]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def create_preprocessor(X):
    """Build a ColumnTransformer that scales numeric columns and one-hot encodes categorical ones.

    Columns of pandas ``category`` dtype are routed to a OneHotEncoder created
    with handle_unknown='ignore'; every other column is routed to a
    StandardScaler. The returned transformer is not fitted.
    """
    # Split the columns by dtype: 'category' versus everything else
    cat_cols = X.select_dtypes(include=['category']).columns
    num_cols = X.select_dtypes(exclude=['category']).columns

    # One single-step pipeline per column family
    scaling = Pipeline(steps=[('scaler', StandardScaler())])
    encoding = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

    column_steps = [
        ('num', scaling, num_cols),
        ('cat', encoding, cat_cols),
    ]
    return ColumnTransformer(transformers=column_steps)

### Train decision tree model

In [7]:
from sklearn.tree import DecisionTreeClassifier

# Split with a fixed seed.
# NOTE(review): 0.2 is presumably the test fraction — confirm against the
# holisticai dataset API.
train_test = dataset.train_test_split(0.2, random_state=42)
train = train_test['train']
test = train_test['test']

# The preprocessor's column lists are derived from the training features only
preprocessor = create_preprocessor(train['X'])
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', DecisionTreeClassifier())])

model.fit(train['X'], train['y'])

# NOTE(review): no random_state is passed to DecisionTreeClassifier, so the
# fitted tree and the scores reported below may vary between runs — confirm.
print('Base model accuracy: ', model.score(test['X'], test['y']))

Base model accuracy:  0.8121614151464898


### Security Metrics

In [8]:
from holisticai.utils import BinaryClassificationProxy
from holisticai.security.commons import DataMinimizer

# Wrap the fitted pipeline's predict / predict_proba in a proxy object, which
# is what DataMinimizer is constructed from.
proxy = BinaryClassificationProxy(predict=model.predict, predict_proba=model.predict_proba, classes=[0, 1])
dmin = DataMinimizer(proxy=proxy)
dmin.fit(train['X'], train['y'])

# Baseline predictions on train and test, plus test predictions obtained
# through the fitted DataMinimizer.
y_pred_train = proxy.predict(train['X'])
y_pred_test = proxy.predict(test['X'])
y_pred_test_dm = dmin.predict(test['X'])

In [9]:
from holisticai.security.metrics import data_minimization_score

# Score computed from the true test labels, the baseline test predictions and
# the DataMinimizer test predictions.
data_minimization_score(test['y'], y_pred_test, y_pred_test_dm)

1.0107319757842599

In [13]:
# Detailed view: with return_results=True the call returns two values; the
# first one (`results`) is the table displayed below.
results, metric = data_minimization_score(test['y'], y_pred_test, y_pred_test_dm, return_results=True)
results

Unnamed: 0,Selection Type,Modifier Type,N_feats,Feats,Score,Accuracy
0,Percentile >80,Average,1,[capital-loss],1.010732,0.803538
1,Percentile >80,Permutation,8,"[workclass, fnlwgt, education, marital-status,...",1.227197,0.661802
2,Percentile >90,Average,1,[capital-loss],1.010732,0.803538
3,Percentile >90,Permutation,8,"[workclass, fnlwgt, education, marital-status,...",1.228017,0.66136
4,Variance >80,Permutation,7,"[workclass, fnlwgt, education, marital-status,...",1.117942,0.726479
5,Variance >90,Permutation,7,"[workclass, fnlwgt, education, marital-status,...",1.112356,0.730127
6,FImportance >80,Average,4,"[age, capital-gain, capital-loss, hours-per-week]",1.121356,0.724268
7,FImportance >80,Permutation,11,"[age, workclass, fnlwgt, education, marital-st...",1.468906,0.552902
8,FImportance >90,Average,4,"[age, capital-gain, capital-loss, hours-per-week]",1.121356,0.724268
9,FImportance >90,Permutation,11,"[age, workclass, fnlwgt, education, marital-st...",1.479855,0.548811


In [10]:
from holisticai.security.metrics import attribute_attack_score

# Attribute attack score for the 'education' column, computed from the
# train/test features and labels.
attribute_attack_score(train['X'], test['X'], train['y'], test['y'], attribute_attack='education')

0.40552791597567717

In [11]:
from holisticai.security.metrics import shapr_score

# SHAPr score computed from the true labels and the model's predictions on
# both the train and the test split.
shapr_score(train['y'], test['y'], y_pred_train, y_pred_test)

0.812161386013031

In [12]:
from holisticai.security.metrics import classification_privacy_metrics

# Compute the privacy metrics in a single call; the table below reports SHAPr,
# Data Minimization Accuracy Ratio and Attribute Attack Score, each with a
# value and a reference.
security_metrics = classification_privacy_metrics(x_train=train['X'], x_test=test['X'], y_train=train['y'], y_test=test['y'], 
                               y_pred_train=y_pred_train, y_pred_test=y_pred_test, y_pred_test_dm=y_pred_test_dm, 
                               attribute_attack='education')
security_metrics

Unnamed: 0,metric,value,reference
0,SHAPr,0.812161,0.0
1,Data Minimization Accuracy Ratio,1.010732,inf
2,Attribute Attack Score,0.405528,0.0
