In [1]:
# warning filters
import warnings
warnings.filterwarnings("ignore", message="Pandas requires version")
warnings.filterwarnings("ignore", message="A NumPy version >=")

# general imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# anonymity library
import pycanon
from anjana.anonymity import k_anonymity

# ML models
from xgboost import XGBClassifier as XGB

# our generic functions
from utils import get_metrics, write_results_to_csv, get_generalization_levels, get_train_test_data

# our data-specific functions
from utils import clean_process_adult_data, get_hierarchies_adult
import config_experiments as cfg

# Main execution
# Called with no row values and header=True before any experiment runs
# (presumably initialises the results CSV with its header row -- confirm in utils).
write_results_to_csv([], header=True)

# Define the parameters
dataset = 'adult'
method = 'k-anonymity'

# Get parameters from config file
supp_level = cfg.supp_level[1]  # entry at index 1 of the configured suppression levels
lst_k = cfg.lst_k
max_seed = cfg.max_seed
test_size = cfg.test_size
if dataset == 'adult':
    lst_threshold_target = cfg.adult_threshold_target
else:
    # Fail fast with a clear message; otherwise lst_threshold_target would be
    # unbound and the experiment loop would die with a NameError.
    raise ValueError(f"Unsupported dataset: {dataset!r}")

# Loop over several threshold targets (same dataset but with different Y distribution)
for threshold_target in lst_threshold_target:
    print(f"Threshold target: {threshold_target}")

    # read data
    if dataset == 'adult':

        # Sensitive/target and protected attributes
        sens_att = "income"
        protected_att = 'gender'

        # Read and process the data (re-read for every threshold so each run
        # starts from the untouched CSV contents)
        data = clean_process_adult_data(pd.read_csv("adult_reconstruction.csv"), sens_att, protected_att, threshold_target)

        # Import/define the hierarchies for each quasi-identifier.
        hierarchies = get_hierarchies_adult(data)
    else:
        # Without this branch `data`/`hierarchies` would be unbound below.
        raise ValueError(f"Unsupported dataset: {dataset!r}")

    # Quasi-identifiers: every column except the protected and sensitive attributes.
    # Built in column order rather than via set difference, because set iteration
    # order of strings can change between interpreter runs and this list is
    # passed straight to the anonymizer.
    excluded_cols = {protected_att, sens_att}
    quasi_ident = [col for col in data.columns if col not in excluded_cols]

    # Loop over several seeds
    for SEED in range(max_seed):
        print(f"SEED: {SEED}")

        # Loop over several k values
        for k in lst_k:
            print(f"k: {k}")

            try:
                # Split into train and test data
                train_data, test_data = train_test_split(data, test_size=test_size, random_state=SEED)
                # Columns of the test set are overwritten below; take an explicit
                # copy so those writes never target a slice of `data`.
                test_data = test_data.copy()

                # Anonymize data
                train_data_anon = k_anonymity(train_data, [], quasi_ident, k, supp_level, hierarchies)
                if 'index' in train_data_anon.columns:
                    del train_data_anon['index']

                # Assert that the level of k-anonymity is at least k
                actual_k_anonymity = pycanon.anonymity.k_anonymity(train_data_anon, quasi_ident)
                assert actual_k_anonymity >= k, f"k-anonymity constraint not met: Expected >= {k}, but got {actual_k_anonymity}"

                if k > 1:
                    # Get generalization levels of the training set to apply the same to the test set
                    generalization_levels = get_generalization_levels(train_data_anon, quasi_ident, hierarchies)

                    # Apply the same generalization levels to the test data.
                    # The protected attribute is not in quasi_ident, so it stays
                    # ungeneralized for the fairness measurements.
                    for col in quasi_ident:
                        level = generalization_levels.get(col)
                        if level is None:
                            continue

                        # Map each level-0 (original) value to its value at the chosen level
                        hierarchy_mapping = dict(zip(hierarchies[col][0], hierarchies[col][level]))

                        # NOTE: Series.map yields NaN for any test value that is
                        # missing from level 0 of the hierarchy.
                        test_data[col] = test_data[col].map(hierarchy_mapping)

                # Separate features and target
                X_train, y_train, X_test, y_test = get_train_test_data(train_data_anon, test_data, sens_att)

                # Train the model
                model = XGB(random_state=SEED, n_jobs=-1)
                model.fit(X_train, y_train)

                # Get fairness/utility metrics
                df_fm = test_data.copy()
                df_fm['y_pred'] = np.round(model.predict(X_test)).reshape(-1).astype(int)
                dic_metrics = get_metrics(df_fm, protected_att, sens_att)
                print(dic_metrics)

                # Write results to csv
                write_results_to_csv([SEED, dataset + "_" + str(threshold_target), protected_att, sens_att, method, k, k] + list(dic_metrics.values()))

            except Exception as e:
                # Best effort: report the failing (SEED, k) combination and move
                # on to the next k instead of aborting the whole experiment.
                print(f"An error occurred for SEED {SEED}, k {k}: {e}")

        print('-------------------------------------------------------------\n')
    print('=============================================================\n')

Threshold target: 10000
SEED: 0
k: 1
The data verifies k-anonymity with k=1
{'SPD': 0.15719653539466538, 'EOD': 0.05771195581390354, 'MAD': 0.07899559473161899, 'PED': 0.10704732510288067, 'PRD': 0.07115816236854011, 'ACC': 0.8910214467466376, 'f1': 0.9347949019096088, 'Precision': 0.9148573861217539, 'Recall': 0.9556207755247242, 'ROC_AUC': 0.7786865327245286, 'CM': array([[ 1511,  1000],
       [  499, 10745]], dtype=int64)}
k: 2
{'SPD': 0.14054040068758533, 'EOD': 0.05884287786231557, 'MAD': 0.09603836513438224, 'PED': 0.08647119341563791, 'PRD': 0.09126889456515175, 'ACC': 0.8737913486005089, 'f1': 0.9249654218533887, 'Precision': 0.8997645475950219, 'Recall': 0.9516186410530061, 'ROC_AUC': 0.7384536853214054, 'CM': array([[ 1319,  1192],
       [  544, 10700]], dtype=int64)}
k: 3
{'SPD': 0.10864861091721356, 'EOD': 0.03986346111777772, 'MAD': 0.10407645571615998, 'PED': 0.04290123456790129, 'PRD': 0.1084669592942844, 'ACC': 0.8733551435841512, 'f1': 0.9257522802830109, 'Precision'