## Static parameters

In [1]:
import warnings; warnings.simplefilter('ignore')
import numpy as np

nb_seed = 20
dataset = 'LSAC' # select one in ['adult', 'ACSCoverage', 'LSAC']

# Per-dataset settings, gathered in one table so that every supported dataset
# defines the same three names and an unsupported name is caught immediately.
#   target                 -> target column (for ML)
#   protected_attribute    -> protected attribute column (for ML / fairness)
#   possible_sensitive_att -> candidate sensitive attributes (for privacy)
dataset_settings = {
    'adult': {
        'target': 'income',
        'protected_attribute': 'gender',
        'possible_sensitive_att': ['race', 'native-country', 'age', 'hours-per-week', 'education'],
    },
    'ACSCoverage': {
        'target': 'PUBCOV',
        'protected_attribute': 'DIS',
        'possible_sensitive_att': ['AGEP', 'SEX', 'SCHL', 'RAC1P', 'NATIVITY'],
    },
    'LSAC': {
        'target': 'pass_bar',
        'protected_attribute': 'race1',
        'possible_sensitive_att': ['fam_inc', 'gender', 'fulltime', 'lsat', 'ugpa'],
    },
}

# Fail early on a typo instead of leaving the names below undefined
# (or silently reusing stale values from a previous kernel run).
if dataset not in dataset_settings:
    raise ValueError("Unknown dataset '{}'; expected one of {}".format(dataset, sorted(dataset_settings)))

# read LGBM hyperparameters of non-private model
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects; only load
# .npy files produced by this project, never files from an untrusted source.
params = np.load('results/' + dataset + '/non_private' + '/LGBM_hyperparameters.npy', allow_pickle=True).item()

# for ML
test_size = 0.2 # test proportion for train_test_split
target = dataset_settings[dataset]['target']
protected_attribute = dataset_settings[dataset]['protected_attribute']

# for privacy
lst_eps = [0.25, 0.5, 1, 2, 4, 8, 10, 20, 50] # epsilon-LDP values
possible_sensitive_att = dataset_settings[dataset]['possible_sensitive_att']

## Writing function

In [2]:
def write(folder_name, values, mechanism, epsilon):
    """Append one row to the CSV results file of a mechanism/epsilon pair.

    The row is appended to
    ``<folder_name>/Appendix_LGBM_results_<mechanism>_eps_<epsilon>.csv``.
    The file is opened in append mode, so calling this function again adds
    a new row and never truncates existing content.

    Parameters
    ----------
    folder_name : str
        Folder that holds the results file. It is created if missing.
    values : sequence
        Cells of the row, written in order with ``csv.writer``.
    mechanism : str
        Mechanism name, used in the file name.
    epsilon : int or float
        Epsilon value, converted with ``str()`` for the file name.
    """
    # Local imports: this function is defined before the notebook's import
    # cell, so it must not depend on names that are imported later.
    import csv
    import os

    # Create the results folder on first use instead of failing with
    # FileNotFoundError; an empty folder name is left untouched.
    if folder_name:
        os.makedirs(folder_name, exist_ok=True)

    file_path = folder_name + "/Appendix_LGBM_results_" + mechanism + "_eps_" + str(epsilon) + ".csv"

    # The context manager closes the file; no explicit close() is needed.
    with open(file_path, mode='a', newline='') as scores_file:
        scores_writer = csv.writer(scores_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        scores_writer.writerow(values)

## Importing

In [3]:
# General imports
import pandas as pd
import time
import csv
from numba import jit

# sklearn imports
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score, recall_score

# designed functions
from functions import get_preprocessed_encoded_sets_with_ldp, fairness_metrics, IVE_LH, IVE_SS, IVE_THE

@jit(nopython=True)
def setting_seed(seed):
    """Seed the random generator used inside Numba-compiled code.

    Calling numpy.random.seed() from interpreted code seeds the NumPy
    random generator only, not the Numba random generator, which is why
    the call is made here from within a jitted function. Code that draws
    random numbers outside of Numba still has to call np.random.seed()
    on its own.
    Check: https://numba.readthedocs.io/en/stable/reference/numpysupported.html

    Parameters
    ----------
    seed : int
        Seed value forwarded to np.random.seed().
    """
    
    np.random.seed(seed)

## Reading dataset

In [4]:
# CSV file of each supported dataset, keyed by dataset name.
dataset_files = {
    'adult': 'datasets/db_adult_processed_26k.csv',
    'ACSCoverage': 'datasets/db_ACSCoverage.csv',
    'LSAC': 'datasets/db_LSAC.csv',
}

# Fail early on an unsupported name instead of leaving `df` undefined
# (or silently reusing a frame left over from a previous kernel run).
if dataset not in dataset_files:
    raise ValueError("Unknown dataset '{}'; expected one of {}".format(dataset, sorted(dataset_files)))

df = pd.read_csv(dataset_files[dataset])

df

Unnamed: 0,fam_inc,gender,fulltime,race1,lsat,ugpa,pass_bar
0,4,0,0,1,32,4,1
1,3,0,0,1,17,4,1
2,0,1,0,1,24,4,1
3,3,1,0,1,27,4,1
4,3,1,0,1,36,4,1
...,...,...,...,...,...,...,...
20422,1,1,0,0,14,1,0
20423,2,1,0,0,8,1,0
20424,2,1,1,0,24,1,1
20425,2,1,1,1,32,0,1


## Run LGBM on DP data

In [5]:
# Column names of every results file; each row written below follows this order.
header = ["seed",
          "acc", "f1", "auc", "recall", "cm",
          "SP_a_1", "SP_a_0", "SPD", "DI",
          "EO_a_1", "EO_a_0", "EOD",
          "OA_a_1", "OA_a_0", "OAD",
         ]

starttime = time.time()

for mechanism in ['GRR', 'SUE', 'OUE', 'SS', 'THE', 'BLH', 'OLH']:
    print(mechanism)

    for split_strategy in ['uniform', 'k_based']:
        print(split_strategy)
        # results folder for this mechanism and split strategy
        folder_name = 'results/' + dataset + "/" + mechanism + "/" + split_strategy

        for epsilon in lst_eps:
            print(epsilon)

            # write head of csv file
            # (the file is opened in append mode, so re-running this cell adds another header row)
            write(folder_name, header, mechanism, epsilon)

            count_seed_executed = 0
            seed = 0
            while count_seed_executed < nb_seed:
                # We are using try/except due to potential division by zero caused by
                # a large random number of sensitive attributes d_s, i.e., smaller epsilon
                # per attribute. A failing seed is skipped and the next one is tried, so
                # exactly nb_seed successful runs are recorded per epsilon.
                try:
                    setting_seed(seed) # for reproducibility (Numba random generator)
                    np.random.seed(seed) # for reproducibility (NumPy random generator)

                    # select number of sensitive attributes
                    d_s = np.random.randint(1, len(possible_sensitive_att)+1)

                    # select sensitive attributes and always include the protected attribute as a sensitive attribute
                    lst_sensitive_att = [protected_attribute] + list(np.random.choice(possible_sensitive_att, size=d_s, replace=False))

                    # domain size of sensitive attributes
                    lst_k = {att: len(set(df[att])) for att in lst_sensitive_att}

                    # Train test splitting + LDP randomization + encoding
                    X_train, X_test, y_train, y_test = get_preprocessed_encoded_sets_with_ldp(df, target, test_size, seed, lst_sensitive_att, epsilon, split_strategy, lst_k, mechanism)

                    # instantiate and train model
                    model = LGBMClassifier(random_state=seed, n_jobs=10, objective="binary")
                    model.set_params(**params)
                    model.fit(X_train, y_train)
                    y_pred = model.predict(X_test)

                    # performance metrics
                    acc = accuracy_score(y_test, y_pred)
                    f1 = f1_score(y_test, y_pred)
                    # NOTE(review): AUC is computed from the hard class predictions, not from
                    # predicted probabilities -- confirm this is intended.
                    auc = roc_auc_score(y_test, y_pred)
                    recall = recall_score(y_test, y_pred)
                    cm = confusion_matrix(y_test, y_pred)

                    # prepare dataset for fairness analysis
                    df_fm = pd.concat([X_test, y_test], axis=1)
                    df_fm['y_pred'] = y_pred

                    # fairness metrics
                    fair_met = fairness_metrics(df_fm, protected_attribute, target)

                    # write results to csv
                    write(folder_name,
                          [str(seed),
                          acc, f1, auc, recall, cm,
                          fair_met["SP_a_1"], fair_met["SP_a_0"], fair_met["SPD"], fair_met["DI"],
                          fair_met["EO_a_1"], fair_met["EO_a_0"], fair_met["EOD"],
                          fair_met["OA_a_1"], fair_met["OA_a_0"], fair_met["OAD"]],
                          mechanism, epsilon)

                except Exception as err:
                    # Catch Exception rather than everything, so KeyboardInterrupt and
                    # SystemExit still stop the run; report the skipped seed so that a
                    # systematic failure is visible instead of looping silently.
                    print("  seed {} skipped ({}: {})".format(seed, type(err).__name__, err))

                else:
                    count_seed_executed += 1

                # always move on to the next seed, whether this one succeeded or not
                seed += 1
        print("-------------------------------------")
    print("==================================================================================")

print('That took {} seconds'.format(time.time() - starttime))

GRR
uniform
0.25
0.5
1
2
4
8
10
20
50
-------------------------------------
k_based
0.25
0.5
1
2
4
8
10
20
50
-------------------------------------
SUE
uniform
0.25
0.5
1
2
4
8
10
20
50
-------------------------------------
k_based
0.25
0.5
1
2
4
8
10
20
50
-------------------------------------
OUE
uniform
0.25
0.5
1
2
4
8
10
20
50
-------------------------------------
k_based
0.25
0.5
1
2
4
8
10
20
50
-------------------------------------
SS
uniform
0.25
0.5
1
2
4
8
10
20
50
-------------------------------------
k_based
0.25
0.5
1
2
4
8
10
20
50
-------------------------------------
THE
uniform
0.25
0.5
1
2
4
8
10
20
50
-------------------------------------
k_based
0.25
0.5
1
2
4
8
10
20
50
-------------------------------------
BLH
uniform
0.25
0.5
1
2
4
8
10
20
50
-------------------------------------
k_based
0.25
0.5
1
2
4
8
10
20
50
-------------------------------------
OLH
uniform
0.25
0.5
1
2
4
8
10
20
50
-------------------------------------
k_based
0.25
0.5
1
2
4
8
10
20
50
---