In [None]:
# warning filters
import warnings
warnings.filterwarnings("ignore", message="Pandas requires version")
warnings.filterwarnings("ignore", message="A NumPy version >=")

# general imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# anonymity libraries
import pycanon
from anjana.anonymity import k_anonymity, t_closeness

# ML models
from xgboost import XGBClassifier as XGB

# our generic functions
from utils import get_metrics, write_results_to_csv, get_generalization_levels, get_train_test_data

# our data-specific functions
from utils import clean_process_data, get_hierarchies
import config_experiments as cfg

# Experiment entry point.
# Initialise the results file before any run is recorded
# (header=True presumably writes the column header row; see utils).
write_results_to_csv([], header=True)

# Experiment selection
dataset = 'bank'  # supported values: 'bank', 'adult'
method = 't-closeness'

# Settings taken from the experiment configuration module
test_size = cfg.test_size
max_seed = cfg.max_seed
lst_k, lst_t = cfg.lst_k, cfg.lst_t
supp_level = cfg.supp_level[1]

# Only the 'adult' dataset is swept over several target thresholds;
# every other dataset runs once with no threshold (None).
lst_threshold_target = cfg.adult_threshold_target if dataset == 'adult' else [None]

# Run the full experiment grid: threshold target -> seed -> t -> k.
# For every combination the training split is anonymized, a classifier is
# trained on it, and fairness/utility metrics computed on the test split are
# appended to the results CSV.
for threshold_target in lst_threshold_target:
    print(f"Threshold target: {threshold_target}")

    # Pick the sensitive/target attribute, the protected attribute and the
    # raw file for the selected dataset.
    if dataset == 'adult':
        sens_att = "income"
        protected_att = "gender"
        raw_data = pd.read_csv("adult_reconstruction.csv")
    elif dataset == 'bank':
        sens_att = "y"
        protected_att = "age"
        raw_data = pd.read_csv("bank-full.csv", delimiter=';')
    else:
        # Fail early with a clear message instead of a NameError further down.
        raise ValueError(f"Unsupported dataset: {dataset!r} (expected 'adult' or 'bank')")

    # Clean and process the raw data
    data = clean_process_data(raw_data, dataset, sens_att, protected_att, threshold_target)

    # Import/define the hierarchies for each quasi-identifier.
    hierarchies = get_hierarchies(data, dataset)

    # Quasi-identifiers: every column except the protected and the sensitive
    # attribute. The DataFrame column order is kept (rather than going through
    # a set) because set iteration order of strings can differ between
    # interpreter runs, which would make seeded runs harder to reproduce.
    quasi_ident = [col for col in data.columns if col not in (protected_att, sens_att)]

    # Loop over several seeds
    for SEED in range(max_seed):
        print(f"SEED: {SEED}")

        # Loop over several t values
        for t_clos in lst_t:
            print(f"t: {t_clos}")

            # Loop over several k values
            for k in lst_k:
                print(f"k: {k}")

                try:
                    # Split into train and test data
                    train_data, test_data = train_test_split(data, test_size=test_size, random_state=SEED)

                    # Work on an independent copy: columns of the test split
                    # are overwritten below, and assigning into a frame derived
                    # from `data` can trigger pandas' SettingWithCopyWarning.
                    test_data = test_data.copy()

                    # Anonymize the training data (k-anonymity first)
                    train_data_anon = k_anonymity(train_data, [], quasi_ident, k, supp_level, hierarchies)
                    if 'index' in train_data_anon.columns:
                        del train_data_anon['index']

                    if k > 1:
                        # Apply t-closeness
                        train_data_anon = t_closeness(train_data_anon, [], quasi_ident, sens_att, k, t_clos, supp_level, hierarchies)

                        # Check that the requested level of t-closeness is satisfied.
                        # An explicit raise is used instead of `assert` so the check
                        # survives `python -O`; `not (a <= b)` keeps the original
                        # behaviour of failing when the measured value is NaN.
                        actual_t_closeness = pycanon.anonymity.t_closeness(train_data_anon, quasi_ident, [sens_att], True)
                        if not actual_t_closeness <= t_clos:
                            raise ValueError(
                                f"t-closeness constraint not met: Expected <= {t_clos}, but got {actual_t_closeness:.2f}"
                            )

                        # Get generalization levels of the training set to apply the same to the test set
                        generalization_levels = get_generalization_levels(train_data_anon, quasi_ident, hierarchies)

                        # Apply the same generalization levels to the test data.
                        # The protected attribute is not a quasi-identifier, so it
                        # stays untouched for the fairness measurements.
                        for col in quasi_ident:
                            level = generalization_levels.get(col)
                            if level is None:
                                continue

                            # Map each level-0 value to its value at the chosen level
                            hierarchy_mapping = dict(zip(hierarchies[col][0], hierarchies[col][level]))
                            test_data[col] = test_data[col].map(hierarchy_mapping)

                    # Separate features and target
                    X_train, y_train, X_test, y_test = get_train_test_data(train_data_anon, test_data, sens_att)

                    # Train the model
                    model = XGB(random_state=SEED, n_jobs=-1)
                    model.fit(X_train, y_train)

                    # Get fairness/utility metrics
                    df_fm = test_data.copy()
                    df_fm['y_pred'] = np.round(model.predict(X_test)).reshape(-1).astype(int)
                    dic_metrics = get_metrics(df_fm, protected_att, sens_att)
                    print(dic_metrics)

                    # Write results to csv
                    write_results_to_csv([SEED, dataset + "_" + str(threshold_target), protected_att, sens_att, method, k, t_clos] + list(dic_metrics.values()))

                except Exception as e:
                    # Best effort: report the failing configuration and move on
                    # to the next k instead of aborting the whole grid.
                    print(f"An error occurred for SEED {SEED}, k {k}: {e}")
                    continue
            print('-------------------------------------------------------------\n')
        print('==========================================================================\n')
    print('############################################################################\n')

Threshold target: 10000
SEED: 0
t: 0.45
k: 1
The data verifies k-anonymity with k=1
{'SPD': 0.15719653539466538, 'EOD': 0.05771195581390354, 'MAD': 0.07899559473161899, 'PED': 0.10704732510288067, 'PRD': 0.07115816236854011, 'ACC': 0.8910214467466376, 'f1': 0.9347949019096088, 'Precision': 0.9148573861217539, 'Recall': 0.9556207755247242, 'ROC_AUC': 0.7786865327245286, 'CM': array([[ 1511,  1000],
       [  499, 10745]], dtype=int64)}
k: 2
The data verifies k-anonymity with k=2
{'SPD': -0.0005349309939017832, 'EOD': -0.0004918839153960075, 'MAD': 0.1637016991777207, 'PED': -0.0008230452674896638, 'PRD': 0.16406012928363256, 'ACC': 0.8172300981461287, 'f1': 0.8994158598063535, 'Precision': 0.8174545454545454, 'Recall': 0.9996442547136251, 'ROC_AUC': 0.5000212512118504, 'CM': array([[    1,  2510],
       [    4, 11240]], dtype=int64)}
k: 3
The data verifies k-anonymity with k=3
{'SPD': 0.0, 'EOD': 0.0, 'MAD': 0.16402265777406178, 'PED': 0.0, 'PRD': 0.16402265777406178, 'ACC': 0.81744820