## Static parameters

In [1]:
# --- Experiment setup ---------------------------------------------------
nb_seed = 20  # number of repetitions; the main loop iterates over range(nb_seed)
mechanism = "BLH"  # only used to build the results folder and file names
# hyper-parameters forwarded to the LGBM classifier through set_params(**params)
params = {
    'learning_rate': 0.025262679514046534,
    'max_depth': 50,
    'n_estimators': 550,
}
dataset = 'adult'
split_strategy = 'uniform' # ['uniform', 'k_based']
# results/<dataset>/<mechanism>/<split_strategy>
folder_name = f"results/{dataset}/{mechanism}/{split_strategy}"

# --- Machine learning ---------------------------------------------------
target = 'income'
protected_attribute = 'gender'
test_size = 0.2

# --- Privacy ------------------------------------------------------------
# attributes that get LDP-randomized in the training set
lst_sensitive_att = [protected_attribute, 'race', 'native-country', 'age']
# total privacy budgets to evaluate (one results file per value)
lst_eps = [0.25, 0.5, 1, 2, 4, 8, 10, 20, 50]

## Writing function

In [2]:
def write(folder_name, values, mechanism, epsilon):
    """Append one row to the results CSV of a mechanism/epsilon pair.

    The row goes to ``<folder_name>/LGBM_results_<mechanism>_eps_<epsilon>.csv``.
    The file is opened in append mode, so calling this repeatedly (or re-running
    the notebook) keeps adding rows to an existing file.

    Parameters
    ----------
    folder_name : str
        Destination folder; created (including parents) if it does not exist.
    values : sequence
        Cells of the row, passed as-is to ``csv.writer.writerow``.
    mechanism : str
        Mechanism label used in the file name.
    epsilon : int or float
        Privacy budget; its ``str()`` form is used in the file name.
    """
    import os  # local import so this cell does not depend on the import cell order

    # Without this, the very first write of a fresh experiment raises
    # FileNotFoundError because open() does not create missing folders.
    if folder_name:
        os.makedirs(folder_name, exist_ok=True)

    file_path = folder_name + "/LGBM_results_" + mechanism + "_eps_" + str(epsilon) + ".csv"
    # The context manager closes the file; no explicit close() is needed.
    with open(file_path, mode='a', newline='') as scores_file:
        scores_writer = csv.writer(scores_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        scores_writer.writerow(values)

## One-Hot-Encoding for LH Mechanism

In [3]:
import xxhash
def OHE_LH(val_seed, k, epsilon, optimal=True):
    """Expand a local-hashing report into a length-k 0/1 vector.

    Position ``v`` is set to 1 when hashing ``str(v)`` with the report's seed,
    reduced modulo ``g``, equals the reported hash value.

    Parameters
    ----------
    val_seed : indexable
        ``val_seed[0]`` is the reported hash value, ``val_seed[1]`` the hash seed.
    k : int
        Number of candidate values; they are taken to be ``0 .. k-1``.
    epsilon : float
        Only used to derive ``g`` when ``optimal`` is true.
    optimal : bool
        If true, ``g = round(exp(epsilon)) + 1``; otherwise ``g = 2``.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``k`` holding 0s and 1s.
    """
    # size of the hash range
    g = int(np.round(np.exp(epsilon))) + 1 if optimal else 2

    # one flag per candidate value: does it hash to the reported value?
    flags = [
        1.0 if val_seed[0] == (xxhash.xxh32(str(v), seed=val_seed[1]).intdigest() % g) else 0.0
        for v in range(k)
    ]

    ohe_lh = np.array(flags, dtype=float)
    return ohe_lh

## Importing

In [4]:
# General imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import copy
import csv

# sklearn imports
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score, recall_score


# multi-freq-ldpy imports
from multi_freq_ldpy.pure_frequency_oracles.GRR import GRR_Client
from multi_freq_ldpy.pure_frequency_oracles.UE import UE_Client
from multi_freq_ldpy.pure_frequency_oracles.LH import LH_Client
from multi_freq_ldpy.pure_frequency_oracles.SS import SS_Client
from numba import jit

@jit(nopython=True)
def setting_seed(seed):
    """Seed the random generator used by Numba-compiled code.

    Calling numpy.random.seed() from interpreted code seeds the NumPy
    random generator, not the Numba random generator, which is why the
    call is wrapped in this jitted function.
    Check: https://numba.readthedocs.io/en/stable/reference/numpysupported.html

    Parameters
    ----------
    seed : int
        Value forwarded to ``np.random.seed`` inside the compiled function.

    Note: this function must be called explicitly; defining it has no effect
    on any random generator.
    """
    
    np.random.seed(seed)

## Reading dataset

In [5]:
# Load the processed dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='datasets/db_adult_processed_26k.csv')
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,age,workclass,education,marital-status,occupation,native-country,relationship,hours-per-week,gender,race,income
0,23,2,9,2,12,38,5,19,0,4,1
1,4,2,15,0,2,38,3,39,1,4,0
2,0,2,1,4,7,38,3,9,1,4,0
3,34,2,11,2,11,0,0,49,1,1,1
4,9,2,9,4,3,38,1,37,1,4,1
...,...,...,...,...,...,...,...,...,...,...,...
45844,18,2,9,2,4,40,0,64,1,4,1
45845,20,4,9,2,11,39,0,75,1,1,1
45846,7,2,8,4,11,38,1,54,1,4,0
45847,7,2,15,4,0,38,1,39,0,4,0


## Run LGBM on DP data

In [6]:
# Column names of the results CSV (written once per epsilon, before the per-seed rows)
header = ["seed", "acc", "f1", "auc", "recall", "cm",
          "SP_a_1", "SP_a_0", "SPD", "DI",
          "EO_a_1", "EO_a_0", "EOD",
          "ACC_a_1", "ACC_a_0", "AGD",
          "AvgO"
         ]

starttime = time.time()

# domain size of sensitive attributes
lst_k = {att: len(set(df[att])) for att in lst_sensitive_att}

# domain size of every feature column; it never changes, so compute it once
# instead of once per column per run
feature_k = {col: len(set(df[col])) for col in df.columns if col != target}

# Keep the LH variant in sync with the mechanism label used in the file names:
# "OLH" selects the optimal hash range, anything else (e.g. "BLH") the binary one.
use_optimal_lh = (mechanism == "OLH")

for epsilon in lst_eps:
    print(epsilon)

    # write head of csv file
    # NOTE(review): write() appends, so re-running this cell on existing result
    # files adds another header line and another batch of rows to them.
    write(folder_name, header, mechanism, epsilon)

    for seed in range(nb_seed):

        # Reproducibility of the LDP randomization: seed the NumPy generator used
        # by interpreted code and the separate generator used by Numba-compiled
        # code (see setting_seed). The split and the model are already seeded
        # through their random_state arguments.
        np.random.seed(seed)
        setting_seed(seed)

        # Use original datasets
        X = df.drop(target, axis=1)
        y = df[target].copy()

        # Train test splitting
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=True, stratify=y, random_state=seed)
        y_train = y_train.reset_index(drop=True)
        y_test = y_test.reset_index(drop=True)

        # One-Hot-Encoding + LDP randomization
        lst_df_train = []
        lst_df_test = []
        for col in X_train.columns:

            k = feature_k[col]
            lst_col_name = [col+"_{}".format(val) for val in range(k)]
            # row i of the identity matrix is the one-hot code of value i
            # (values are used directly as row indices, i.e. assumed to be 0..k-1)
            OHE = np.eye(k)

            if col in lst_sensitive_att: # LDP randomization
                # split the total budget evenly, or proportionally to the domain size
                eps_att = epsilon / len(lst_sensitive_att) if split_strategy=='uniform' else epsilon * k / sum(lst_k.values())
                df_ohe = pd.DataFrame(
                    [OHE_LH(LH_Client(val, eps_att, optimal=use_optimal_lh), k, eps_att, optimal=use_optimal_lh)
                     for val in X_train[col]],
                    columns=lst_col_name)

            else: # just one-hot-encoding
                df_ohe = pd.DataFrame(OHE[X_train[col].to_numpy()], columns=lst_col_name)

            lst_df_train.append(df_ohe)

            # test set is original, i.e., just one-hot-encoding
            df_ohe_test = pd.DataFrame(OHE[X_test[col].to_numpy()], columns=lst_col_name)
            lst_df_test.append(df_ohe_test)

        # concat one-hot-encoded train/test sets
        X_train = pd.concat(lst_df_train, axis=1)
        X_test = pd.concat(lst_df_test, axis=1)

        # instantiate and train model
        model = LGBMClassifier(random_state=seed, n_jobs=-1, objective="binary")
        model.set_params(**params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # performance metrics
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        # NOTE(review): AUC is computed on hard class predictions, not on scores or
        # probabilities; kept as-is so the "auc" column keeps its existing meaning.
        auc = roc_auc_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)

        # prepare dataset for fairness analysis
        df_fm = pd.concat([X_test, y_test], axis=1)
        df_fm['y_pred'] = y_pred

        # filtered datasets for fairness metrics (privileged/unprivileged)
        df_a_1 = df_fm.loc[df_fm[protected_attribute+"_1"]==1]
        df_a_0 = df_fm.loc[df_fm[protected_attribute+"_1"]==0]

        # Statistical Parity
        SP_a_1 = df_a_1.loc[df_a_1["y_pred"]==1].shape[0] / df_a_1.shape[0]
        SP_a_0 = df_a_0.loc[df_a_0["y_pred"]==1].shape[0] / df_a_0.shape[0]

        # Statistical Parity Difference
        SPD = SP_a_1 - SP_a_0

        # Disparate Impact
        # (undefined when the a=1 group receives no positive prediction; record NaN
        # instead of aborting the whole experiment with a ZeroDivisionError)
        DI = SP_a_0 / SP_a_1 if SP_a_1 != 0 else float('nan')

        # Equal Opportunity
        EO_a_1 = recall_score(df_a_1[target], df_a_1['y_pred'])
        EO_a_0 = recall_score(df_a_0[target], df_a_0['y_pred'])

        # Equal Opportunity Difference
        EOD = EO_a_1 - EO_a_0

        # Accuracy per Group
        ACC_a_1 = accuracy_score(df_a_1[target], df_a_1['y_pred'])
        ACC_a_0 = accuracy_score(df_a_0[target], df_a_0['y_pred'])

        # Accuracy per Group Difference
        AGD = ACC_a_1 - ACC_a_0

        # Average odds difference ------------------------------------------------------
        TPR_a_1 = EO_a_1
        TNR_a_1 = recall_score(df_a_1[target], df_a_1["y_pred"], pos_label = 0)
        FPR_a_1 = 1 - TNR_a_1
        TPR_a_0 = EO_a_0
        TNR_a_0 = recall_score(df_a_0[target], df_a_0["y_pred"], pos_label = 0)
        FPR_a_0 = 1 - TNR_a_0
        AvgO = ((FPR_a_1 - FPR_a_0) + (TPR_a_1 - TPR_a_0))/2

        # write results to csv
        write(folder_name,
              [str(seed),
              acc, f1, auc, recall, cm,
              SP_a_1, SP_a_0, SPD, DI,
              EO_a_1, EO_a_0, EOD,
              ACC_a_1, ACC_a_0, AGD,
              AvgO],
              mechanism, epsilon)

print('That took {} seconds'.format(time.time() - starttime))

0.25
0.5
1
2
4
8
10
20
50
That took 3972.581153154373 seconds
