## Static parameters

In [1]:
import numpy as np

# Number of seeds the experiment loop iterates over (one run per seed).
nb_seed = 20
dataset = 'LSAC' # select one in ['adult', 'ACSCoverage', 'LSAC']

# (target column, protected attribute column) for every supported dataset
dataset_columns = {
    'adult': ('income', 'gender'),
    'ACSCoverage': ('PUBCOV', 'DIS'),
    'LSAC': ('pass_bar', 'race1'),
}

# Fail fast on a typo in `dataset`; otherwise `target` / `protected_attribute`
# would stay undefined (or keep a stale value from an earlier run of this cell).
if dataset not in dataset_columns:
    raise ValueError("Unknown dataset '{}'; expected one of {}".format(dataset, sorted(dataset_columns)))

# Hyperparameters are always read from the 'non_private' folder of the dataset,
# independently of the `mechanism` chosen below.
# allow_pickle=True is required to restore the stored Python object; unpickling can
# execute arbitrary code, so only load files produced by this project.
params = np.load('results/' + dataset + '/non_private' + '/LGBM_hyperparameters.npy', allow_pickle=True).item()

# set mechanism folder
mechanism = "non_private"
folder_name = 'results/' + dataset + '/' + mechanism

# for ML
test_size = 0.2 # test proportion for train_test_split
target, protected_attribute = dataset_columns[dataset]

## Writing function

In [2]:
def write(folder_name, values, mechanism):
    """Append one row to the LGBM results CSV of a mechanism.

    The row is written to ``<folder_name>/LGBM_results_<mechanism>.csv``. The
    file is opened in append mode, so it is created on first use and existing
    rows are never overwritten.

    Parameters
    ----------
    folder_name : str
        Directory containing the results file. It must already exist; opening
        the file does not create missing directories.
    values : iterable
        Cells of the row, handed to ``csv.writer.writerow``.
    mechanism : str
        Mechanism name, used as the suffix of the file name.
    """
    results_path = folder_name + "/LGBM_results_" + mechanism + ".csv"
    # newline='' lets the csv module control the line terminator itself
    with open(results_path, mode='a', newline='') as scores_file:
        scores_writer = csv.writer(scores_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        scores_writer.writerow(values)
    # no explicit close(): leaving the with-block has already closed the file

## Importing

In [3]:
# General imports
import pandas as pd
import matplotlib.pyplot as plt
import time
import copy
import csv

# sklearn imports
from sklearn.preprocessing import OneHotEncoder
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score, recall_score

# designed functions
from functions import fairness_metrics

## Reading dataset

In [4]:
# CSV file backing each supported dataset
dataset_files = {
    'adult': 'datasets/db_adult_processed_26k.csv',
    'ACSCoverage': 'datasets/db_ACSCoverage.csv',
    'LSAC': 'datasets/db_LSAC.csv',
}

# Fail loudly on an unsupported name instead of leaving `df` undefined
# (or silently reusing a frame loaded by an earlier run of this cell).
if dataset not in dataset_files:
    raise ValueError("Unknown dataset '{}'; expected one of {}".format(dataset, sorted(dataset_files)))

df = pd.read_csv(dataset_files[dataset])

df

Unnamed: 0,fam_inc,gender,fulltime,race1,lsat,ugpa,pass_bar
0,4,0,0,1,32,4,1
1,3,0,0,1,17,4,1
2,0,1,0,1,24,4,1
3,3,1,0,1,27,4,1
4,3,1,0,1,36,4,1
...,...,...,...,...,...,...,...
20422,1,1,0,0,14,1,0
20423,2,1,0,0,8,1,0
20424,2,1,1,0,24,1,1
20425,2,1,1,1,32,0,1


## Encoding

In [5]:
# One-hot encode every feature column; the target column is appended unchanged.
# This cell overwrites `df`, so it must be run exactly once after the loading cell.
lst_df = []
for col in df.columns:
    if col == target:
        continue

    codes = df[col].to_numpy()
    k = len(set(df[col]))  # number of distinct values in the column

    # Rows of the identity matrix are picked by value, which is only a valid
    # encoding when the column holds exactly the integer codes 0..k-1. Anything
    # else would raise an obscure IndexError or (for negative codes) silently
    # select the wrong row.
    if not (np.issubdtype(codes.dtype, np.integer) and np.array_equal(np.unique(codes), np.arange(k))):
        raise ValueError("Column '{}' must contain exactly the integer codes 0..{} to be one-hot encoded".format(col, k - 1))

    lst_col_name = [col + "_{}".format(val) for val in range(k)]

    # Vectorised lookup: one identity row per record, in a single indexing operation
    df_ohe = pd.DataFrame(np.eye(k)[codes], columns=lst_col_name)
    lst_df.append(df_ohe)

df = pd.concat([pd.concat(lst_df, axis=1), df[target]], axis=1)
df

Unnamed: 0,fam_inc_0,fam_inc_1,fam_inc_2,fam_inc_3,fam_inc_4,gender_0,gender_1,fulltime_0,fulltime_1,race1_0,...,lsat_34,lsat_35,lsat_36,ugpa_0,ugpa_1,ugpa_2,ugpa_3,ugpa_4,ugpa_5,pass_bar
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
3,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20422,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
20423,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
20424,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
20425,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1


## Run LGBM on the original (non-private) data

In [6]:
# Column names of the results CSV, in the same order as the rows written in the loop
header = ["seed", 
          "acc", "f1", "auc", "recall", "cm",
          "SP_a_1", "SP_a_0", "SPD", "DI", 
          "EO_a_1", "EO_a_0", "EOD", 
          "OA_a_1", "OA_a_0", "OAD",
         ]

starttime = time.time()

# write head of csv file
# NOTE: write() appends to the results file, so re-running this cell adds a second
# header and another set of rows to an existing CSV; remove the file for a clean run.
write(folder_name, header, mechanism)

# Use original datasets
# Features and labels are identical for every seed, so they are built once.
# DataFrame.drop already returns a new frame and train_test_split hands back new
# objects, so no deep copy is needed per iteration.
X = df.drop(target, axis=1)
y = df[target]

for seed in range(nb_seed):
    print(seed)
    np.random.seed(seed) # for reproducibility

    # Train test splitting (stratified on the label, one split per seed)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=True, stratify=y, random_state=seed)

    # instantiate and train model; the loaded hyperparameters are applied on top
    # of the constructor arguments
    model = LGBMClassifier(random_state=seed, n_jobs=2, objective="binary")
    model.set_params(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # performance metrics
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    # NOTE(review): AUC is computed from the hard predicted labels rather than from
    # predict_proba scores, so it reflects a single operating point. Left unchanged
    # so the values stay comparable with previously written results - confirm intent.
    auc = roc_auc_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)  # stored in the CSV as its string representation

    # prepare dataset for fairness analysis: test features, true label and prediction
    df_fm = pd.concat([X_test, y_test], axis=1)
    df_fm['y_pred'] = y_pred

    # fairness metrics
    fair_met = fairness_metrics(df_fm, protected_attribute, target)

    # write results to csv
    write(folder_name, 
          [str(seed),
          acc, f1, auc, recall, cm,
          fair_met["SP_a_1"], fair_met["SP_a_0"], fair_met["SPD"], fair_met["DI"], 
          fair_met["EO_a_1"], fair_met["EO_a_0"], fair_met["EOD"], 
          fair_met["OA_a_1"], fair_met["OA_a_0"], fair_met["OAD"]], 
          mechanism)

print('That took {} seconds'.format(time.time() - starttime))  

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
That took 74.47927379608154 seconds
