## Initial parameters

In [1]:
seed = 42
dataset = 'adult'
mechanism = "non_private"
folder_name = 'results/' + dataset + '/' + mechanism

# for ML
if dataset == 'adult':
    target = 'income'
    protected_attribute = 'gender'
    
elif dataset == 'ACSCoverage':
    target = 'PUBCOV'
    protected_attribute = 'DIS'
    
elif dataset == 'LSAC':
    target = 'pass_bar'
    protected_attribute = 'race1' 
    
test_size = 0.2

## Write function

In [2]:
def write(folder_name, values):
    with open(folder_name + "/LGBM_BO_results.csv", mode='a', newline='') as scores_file:
        scores_writer = csv.writer(scores_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        scores_writer.writerow(values)
    scores_file.close()

## Importing

In [3]:
# General imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import copy
import csv

# sklearn imports
from sklearn.preprocessing import OneHotEncoder
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score, recall_score

# hyper-params opti
import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# designed functions
from functions import fairness_metrics

## Reading dataset

In [4]:
if dataset == 'adult':
    df = pd.read_csv('datasets/db_adult_processed_26k.csv')
    
elif dataset == 'ACSCoverage':
    df = pd.read_csv('datasets/db_ACSCoverage.csv')

elif dataset == 'LSAC':
    df = pd.read_csv('datasets/db_LSAC.csv')

df

Unnamed: 0,age,workclass,education,marital-status,occupation,native-country,relationship,hours-per-week,gender,race,income
0,23,2,9,2,12,38,5,19,0,4,1
1,4,2,15,0,2,38,3,39,1,4,0
2,0,2,1,4,7,38,3,9,1,4,0
3,34,2,11,2,11,0,0,49,1,1,1
4,9,2,9,4,3,38,1,37,1,4,1
...,...,...,...,...,...,...,...,...,...,...,...
45844,18,2,9,2,4,40,0,64,1,4,1
45845,20,4,9,2,11,39,0,75,1,1,1
45846,7,2,8,4,11,38,1,54,1,4,0
45847,7,2,15,4,0,38,1,39,0,4,0


## Encoding

In [5]:
lst_df = []
for col in df.columns:
    
    if col != target:
        lst_col_name = [col+"_{}".format(val) for val in range(len(set(df[col])))]

        k = len(set(df[col]))

        OHE = np.eye(k)

        df_ohe = pd.DataFrame([OHE[val] for val in df[col]], columns=lst_col_name)
        lst_df.append(df_ohe)
df = pd.concat([pd.concat(lst_df, axis=1), df[target]], axis=1)
df

Unnamed: 0,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_7,age_8,age_9,...,hours-per-week_94,hours-per-week_95,gender_0,gender_1,race_0,race_1,race_2,race_3,race_4,income
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45844,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
45845,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1
45846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0
45847,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0


## Splitting train and test sets

In [6]:
X = copy.deepcopy(df.drop(target, axis=1))
y = copy.deepcopy(df[target])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=True, stratify=y, random_state=seed)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((36679, 268), (36679,), (9170, 268), (9170,))

## Single Run of Non-private LGBM

In [7]:
model = LGBMClassifier(random_state=seed, n_jobs=-1, objective="binary")
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# performance metrics
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print('Performance Metrics')
print("acc:", acc)
print("f1:", f1)
print("auc:", auc)
print("recall:", recall)

# prepare dataset for fairness metrics
df_fm = pd.concat([X_test, y_test], axis=1)
df_fm['y_pred'] = y_pred

print('\nFairness Metrics')

fair_met = fairness_metrics(df_fm, protected_attribute, target)

for key in fair_met.keys():
    print(key+":", fair_met[key])

Performance Metrics
acc: 0.8182115594329334
f1: 0.8281974647016387
auc: 0.8171779914854769
recall: 0.8462510530749789

Fairness Metrics
SP_a_1: 0.6675868788567717
SP_a_0: 0.28021248339973437
DI: 0.4197393511981426
SPD: 0.38737439545703733
EO_a_1: 0.896945551128818
EO_a_0: 0.6520854526958291
EOD: 0.24486009843298895
OA_a_1: 0.8177979863592075
OA_a_0: 0.8190571049136787
OAD: -0.0012591185544711392


## Objective function

In [8]:
def objective_function(space):
    
    global seed, ITER, X_train, y_train, X_test, y_test
    
    ITER += 1
    
    params = {'max_depth': int(space['max_depth']), 
              'learning_rate': space['learning_rate'],
              'n_estimators': int(space['n_estimators']),
             }
    
    print("----------------------------------------------------------------------------")
    print(ITER, ":: ", params)
    
    # Initialize and fit model
    model = LGBMClassifier(random_state=seed, n_jobs=-1, objective="binary")
    model.set_params(**params)
    model.fit(X_train, y_train)
    
    # Predictions
    y_pred = model.predict(X_test)
    
    # Performance metrics
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
        
    # write ["iter", "acc", "f1-score", "auc", "recall", "cm", "params"]
    write(folder_name, [str(ITER),
           acc,
           f1,
           auc,
           recall,
           cm,
           params])

    print("loss:", auc)
    
    # maximize AUC metric
    return {'loss':-auc, 'status': STATUS_OK}

## Bayesian Optimization

In [9]:
space = {'max_depth': hp.quniform('max_depth', -1, 50, 1),
         'n_estimators': hp.quniform('n_estimators', 50, 2000, 50),
         'learning_rate': hp.uniform('learning_rate', 0.01, 0.25),
        }

header = ["iter", "acc", "f1-score", "auc", "recall", "cm", "params"]
write(folder_name, header)

ITER = 0
trials = Trials()
best = fmin(fn=objective_function,
            space=space,
            algo=tpe.suggest,
            rstate= np.random.default_rng(seed),
            max_evals=100,
            verbose=False,
            trials=trials)
print(best)

----------------------------------------------------------------------------
1 ::  {'max_depth': 31, 'learning_rate': 0.2221423901134277, 'n_estimators': 1650}
loss: 0.7937075650315475
----------------------------------------------------------------------------
2 ::  {'max_depth': 30, 'learning_rate': 0.21662331565922432, 'n_estimators': 650}
loss: 0.8070529446662682
----------------------------------------------------------------------------
3 ::  {'max_depth': 3, 'learning_rate': 0.15823860430552075, 'n_estimators': 350}
loss: 0.8137571886298764
----------------------------------------------------------------------------
4 ::  {'max_depth': 10, 'learning_rate': 0.1521888191981938, 'n_estimators': 850}
loss: 0.8094628717483274
----------------------------------------------------------------------------
5 ::  {'max_depth': 4, 'learning_rate': 0.07896551676477612, 'n_estimators': 1050}
loss: 0.8137727156512757
----------------------------------------------------------------------------


loss: 0.8150164967458031
----------------------------------------------------------------------------
46 ::  {'max_depth': 38, 'learning_rate': 0.06324435324394725, 'n_estimators': 1450}
loss: 0.8101689701907862
----------------------------------------------------------------------------
47 ::  {'max_depth': 48, 'learning_rate': 0.09969200841130206, 'n_estimators': 1100}
loss: 0.8079885667778135
----------------------------------------------------------------------------
48 ::  {'max_depth': 44, 'learning_rate': 0.03168287148542446, 'n_estimators': 550}
loss: 0.8165971094211107
----------------------------------------------------------------------------
49 ::  {'max_depth': 36, 'learning_rate': 0.2287410686488734, 'n_estimators': 250}
loss: 0.8116432751613001
----------------------------------------------------------------------------
50 ::  {'max_depth': 27, 'learning_rate': 0.0731709618605772, 'n_estimators': 650}
loss: 0.8122285390844659
---------------------------------------------

loss: 0.8017176981752796
----------------------------------------------------------------------------
90 ::  {'max_depth': 9, 'learning_rate': 0.020537879875035178, 'n_estimators': 700}
loss: 0.8148025000981156
----------------------------------------------------------------------------
91 ::  {'max_depth': 21, 'learning_rate': 0.0794083840173824, 'n_estimators': 1150}
loss: 0.8111710822467276
----------------------------------------------------------------------------
92 ::  {'max_depth': 14, 'learning_rate': 0.16035346161816688, 'n_estimators': 850}
loss: 0.8093099829793362
----------------------------------------------------------------------------
93 ::  {'max_depth': 22, 'learning_rate': 0.22344548306062412, 'n_estimators': 1250}
loss: 0.7981304799430893
----------------------------------------------------------------------------
94 ::  {'max_depth': 19, 'learning_rate': 0.04454388781866422, 'n_estimators': 1450}
loss: 0.814076492775458
--------------------------------------------

## Save hyperparameters dictionary to numpy file

In [10]:
# Ensure data types are correct
best = {
        'max_depth': int(best['max_depth']), 
        'learning_rate': best['learning_rate'],
        'n_estimators': int(best['n_estimators']),
        }
np.save(folder_name + '/LGBM_hyperparameters.npy', best) 

## Single Run of Best Non-Private LGBM Model

In [11]:
params = np.load(folder_name + '/LGBM_hyperparameters.npy', allow_pickle='TRUE').item()

model = LGBMClassifier(random_state=seed, n_jobs=-1, objective="binary")
model.set_params(**params)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# performance metrics
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print('Performance Metrics')
print("acc:", acc)
print("f1:", f1)
print("auc:", auc)
print("recall:", recall)

# prepare dataset for fairness metrics
df_fm = pd.concat([X_test, y_test], axis=1)
df_fm['y_pred'] = y_pred

print('\nFairness Metrics')

fair_met = fairness_metrics(df_fm, protected_attribute, target)

for key in fair_met.keys():
    print(key+":", fair_met[key])

Performance Metrics
acc: 0.818102508178844
f1: 0.8278282411230389
auc: 0.8171270285624797
recall: 0.8445661331086773

Fairness Metrics
SP_a_1: 0.6636895095810328
SP_a_0: 0.28320053120849936
DI: 0.42670635458329803
SPD: 0.3804889783725334
EO_a_1: 0.8929614873837981
EO_a_0: 0.659206510681587
EOD: 0.23375497670221113
OA_a_1: 0.8168236440402729
OA_a_0: 0.8207171314741036
OAD: -0.003893487433830778
