## Static parameters

In [1]:
import numpy as np

nb_seed = 20 # number of seeds (repetitions) run for each epsilon value
dataset = 'LSAC' # select one in ['adult', 'ACSCoverage', 'LSAC']
split_strategy = 'uniform' # select one in ['uniform', 'k_based']
mechanism = "GRR"

# fail fast on typos: the experiment loop treats every value other than
# 'uniform' as 'k_based', so a misspelled strategy would run silently
if split_strategy not in ('uniform', 'k_based'):
    raise ValueError("Unknown split_strategy {!r}; select one in ['uniform', 'k_based']".format(split_strategy))

# for ML
test_size = 0.2 # test proportion for train_test_split

# dataset-specific settings: target column, protected attribute (used for the
# fairness metrics) and the list of attributes that get LDP-randomized
if dataset == 'adult':
    target = 'income'
    protected_attribute = 'gender'
    lst_sensitive_att = [protected_attribute, 'race', 'native-country', 'age']

elif dataset == 'ACSCoverage':
    target = 'PUBCOV'
    protected_attribute = 'DIS'
    lst_sensitive_att = [protected_attribute, 'AGEP', 'SEX', 'SCHL']

elif dataset == 'LSAC':
    target = 'pass_bar'
    protected_attribute = 'race1'
    lst_sensitive_att = [protected_attribute, 'fam_inc', 'gender', 'fulltime']

else:
    # without this branch the names above would stay undefined (or keep a
    # stale value from a previous execution of the cell)
    raise ValueError("Unknown dataset {!r}; select one in ['adult', 'ACSCoverage', 'LSAC']".format(dataset))

# for privacy
lst_eps = [0.25, 0.5, 1, 2, 4, 8, 10, 20, 50] # epsilon-LDP values

# read LGBM hyperparameters of non-private model
# NOTE: allow_pickle=True unpickles arbitrary Python objects; only load
# .npy files that were produced by this project.
params = np.load('results/' + dataset + '/non_private' + '/LGBM_hyperparameters.npy', allow_pickle=True).item()

# set mechanism folder
folder_name = 'results/' + dataset + "/" + mechanism + "/" +  split_strategy

## Writing function

In [2]:
def write(folder_name, values, mechanism, epsilon):
    """Append one row to the LGBM results CSV of a mechanism/epsilon pair.

    The target file is
    ``<folder_name>/LGBM_results_<mechanism>_eps_<epsilon>.csv``. It is
    opened in append mode, so it is created on first use and every call adds
    exactly one row; existing content is never overwritten.

    Parameters
    ----------
    folder_name : str
        Directory that holds the results file (must already exist).
    values : iterable
        Cells of the row to write; non-string items are converted by
        ``csv.writer``.
    mechanism : str
        Mechanism name used in the file name.
    epsilon : int or float
        Privacy budget used in the file name (converted with ``str``).
    """
    file_path = folder_name + "/LGBM_results_" + mechanism + "_eps_" + str(epsilon) + ".csv"

    # the context manager closes the file on exit, no explicit close() needed
    with open(file_path, mode='a', newline='') as scores_file:
        scores_writer = csv.writer(scores_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        scores_writer.writerow(values)

## Importing

In [3]:
# General imports
import pandas as pd
import matplotlib.pyplot as plt
import time
import copy
import csv

# sklearn imports
from sklearn.preprocessing import OneHotEncoder
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score, recall_score

# designed functions
from functions import fairness_metrics

# multi-freq-ldpy import
from multi_freq_ldpy.pure_frequency_oracles.GRR import GRR_Client
from numba import jit

@jit(nopython=True)
def setting_seed(seed):
    """Seed the random generator used by Numba-compiled code.

    Calling numpy.random.seed() from interpreted code seeds the NumPy
    random generator, not the Numba random generator, which is why the
    call is made from inside this jitted function.
    See: https://numba.readthedocs.io/en/stable/reference/numpysupported.html

    seed: value forwarded to np.random.seed(); the experiment loop passes
    the integer index of the current repetition.
    """
    
    np.random.seed(seed)

## Reading dataset

In [4]:
# Load the pre-processed CSV that matches the selected dataset.
if dataset == 'adult':
    df = pd.read_csv('datasets/db_adult_processed_26k.csv')

elif dataset == 'ACSCoverage':
    df = pd.read_csv('datasets/db_ACSCoverage.csv')

elif dataset == 'LSAC':
    df = pd.read_csv('datasets/db_LSAC.csv')

else:
    # without this branch `df` would be undefined, or worse, silently keep
    # the frame loaded by a previous execution of this cell
    raise ValueError("Unknown dataset {!r}; select one in ['adult', 'ACSCoverage', 'LSAC']".format(dataset))

df

Unnamed: 0,fam_inc,gender,fulltime,race1,lsat,ugpa,pass_bar
0,4,0,0,1,32,4,1
1,3,0,0,1,17,4,1
2,0,1,0,1,24,4,1
3,3,1,0,1,27,4,1
4,3,1,0,1,36,4,1
...,...,...,...,...,...,...,...
20422,1,1,0,0,14,1,0
20423,2,1,0,0,8,1,0
20424,2,1,1,0,24,1,1
20425,2,1,1,1,32,0,1


## Run LGBM on DP data

In [5]:
# Experiment driver: for every epsilon and every seed, train LGBM on a
# training set whose sensitive attributes were randomized with GRR, evaluate
# on the untouched test set, and append one result row to the epsilon's CSV.

# Column names of the results CSV: the seed, the utility metrics, then the
# entries read from the dictionary returned by fairness_metrics.
header = ["seed", 
          "acc", "f1", "auc", "recall", "cm",
          "SP_a_1", "SP_a_0", "SPD", "DI", 
          "EO_a_1", "EO_a_0", "EOD", 
          "OA_a_1", "OA_a_0", "OAD",
         ]

starttime = time.time()

# domain size of sensitive attributes (number of distinct values in the full dataset)
lst_k = {att: len(set(df[att])) for att in lst_sensitive_att}

for epsilon in lst_eps:
    print(epsilon)
    
    # write head of csv file
    # NOTE(review): write() opens the file in append mode, so re-running this
    # cell adds another header and another set of rows to an existing file.
    write(folder_name, header, mechanism, epsilon)
    
    for seed in range(nb_seed):
        setting_seed(seed) # for reproducibility
        
        # Use original dataset
        X = copy.deepcopy(df.drop(target, axis=1))
        y = copy.deepcopy(df[target])
        
        # Train test splitting (stratified on the target, split driven by the seed)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=True, stratify=y, random_state=seed)
        # reset to a 0..n-1 index so the labels align with the one-hot frames
        # rebuilt below, which also get a fresh default index
        y_train.reset_index(inplace=True, drop=True)
        y_test.reset_index(inplace=True, drop=True)
        
        # One-Hot-Encoding + LDP randomization
        lst_df_train = []
        lst_df_test = []
        for col in X_train.columns:

            # k is the domain size of the column, measured on the full
            # dataset (not only on the training split)
            lst_col_name = [col+"_{}".format(val) for val in range(len(set(df[col])))]
            k = len(set(df[col]))
            # row i of the identity matrix is the one-hot vector of value i;
            # assumes the column holds integer codes in [0, k) -- TODO confirm
            OHE = np.eye(k)

            if col in lst_sensitive_att: # LDP randomization
                # 'uniform': every sensitive attribute gets an equal share of epsilon;
                # otherwise the share is proportional to the attribute's domain size k
                eps_att = epsilon / len(lst_sensitive_att) if split_strategy=='uniform' else epsilon * k / sum(lst_k.values())
                # GRR_Client is called once per training value, in row order;
                # presumably it returns a (possibly changed) value of the same domain
                df_ohe = pd.DataFrame([OHE[GRR_Client(val, k, eps_att)] for val in X_train[col]], columns=lst_col_name)

            else: # just one-hot-encoding
                df_ohe = pd.DataFrame([OHE[val] for val in X_train[col]], columns=lst_col_name)

            lst_df_train.append(df_ohe)
            
            # test set is original, i.e., just one-hot-encoding
            df_ohe_test = pd.DataFrame([OHE[val] for val in X_test[col]], columns=lst_col_name)
            lst_df_test.append(df_ohe_test)

        # concat one-hot-encoded train/test sets
        X_train = pd.concat(lst_df_train, axis=1)
        X_test = pd.concat(lst_df_test, axis=1)
        
        # instantiate and train model (hyperparameters of the non-private model are applied on top)
        model = LGBMClassifier(random_state=seed, n_jobs=2, objective="binary")
        model.set_params(**params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        # performance metrics
        # NOTE(review): auc is computed from the hard class predictions of
        # model.predict, not from predicted scores -- confirm this is intended.
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)

        # prepare dataset for fairness analysis
        # (one-hot-encoded test features + true label + predicted label)
        df_fm = pd.concat([X_test, y_test], axis=1)
        df_fm['y_pred'] = y_pred

        # fairness metrics
        # NOTE(review): fairness_metrics comes from the project's functions
        # module; how it locates the protected attribute among the one-hot
        # columns is not visible here.
        fair_met = fairness_metrics(df_fm, protected_attribute, target)

        # write results to csv (cm is a confusion-matrix array written as a single cell)
        write(folder_name, 
              [str(seed),
              acc, f1, auc, recall, cm,
              fair_met["SP_a_1"], fair_met["SP_a_0"], fair_met["SPD"], fair_met["DI"], 
              fair_met["EO_a_1"], fair_met["EO_a_0"], fair_met["EOD"], 
              fair_met["OA_a_1"], fair_met["OA_a_0"], fair_met["OAD"]], 
              mechanism, epsilon)
        
print('That took {} seconds'.format(time.time() - starttime)) 

0.25
0.5
1
2
4
8
10
20
50
That took 946.4081802368164 seconds
