## Initial parameters

In [1]:
# Global experiment settings shared by every cell below.
seed = 42
dataset = 'LSAC' # select one in ['adult', 'ACSCoverage', 'LSAC']
mechanism = "non_private"
folder_name = 'results/' + dataset + '/' + mechanism

# for ML: (target column, protected attribute column) of each supported dataset
_DATASET_COLUMNS = {
    'adult': ('income', 'gender'),
    'ACSCoverage': ('PUBCOV', 'DIS'),
    'LSAC': ('pass_bar', 'race1'),
}

# Fail immediately on a typo instead of leaving `target` / `protected_attribute`
# undefined and hitting a NameError several cells later.
if dataset not in _DATASET_COLUMNS:
    raise ValueError(
        "Unknown dataset {!r}; expected one of {}".format(dataset, sorted(_DATASET_COLUMNS))
    )
target, protected_attribute = _DATASET_COLUMNS[dataset]

# Fraction of the rows held out as the test split.
test_size = 0.2

## Write function

In [2]:
def write(folder_name, values):
    """Append one row to ``<folder_name>/LGBM_BO_results.csv``.

    Parameters
    ----------
    folder_name : str
        Directory that holds the results file. It is created (including
        parents) when it does not exist yet, so the first call of a fresh
        experiment does not fail.
    values : iterable
        Cells of the row; each item is written through ``csv.writer`` with
        minimal quoting.

    Notes
    -----
    The file is opened in append mode, so repeated calls (and repeated
    notebook runs) keep adding rows to the same file.
    """
    import os  # local import: this cell is defined before the notebook's import cell

    if folder_name:
        os.makedirs(folder_name, exist_ok=True)

    # The context manager closes the file; no explicit close() is needed.
    with open(folder_name + "/LGBM_BO_results.csv", mode='a', newline='') as scores_file:
        scores_writer = csv.writer(scores_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        scores_writer.writerow(values)

## Importing

In [3]:
# General imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import copy
import csv

# sklearn imports
from sklearn.preprocessing import OneHotEncoder
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score, recall_score

# hyperparameter optimization
import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# designed functions
from functions import fairness_metrics

## Reading dataset

In [4]:
# CSV file of each supported dataset.
_DATASET_FILES = {
    'adult': 'datasets/db_adult_processed_26k.csv',
    'ACSCoverage': 'datasets/db_ACSCoverage.csv',
    'LSAC': 'datasets/db_LSAC.csv',
}

# Raise a clear error for an unsupported name instead of leaving `df` undefined.
if dataset not in _DATASET_FILES:
    raise ValueError(
        "Unknown dataset {!r}; expected one of {}".format(dataset, sorted(_DATASET_FILES))
    )
df = pd.read_csv(_DATASET_FILES[dataset])

df

Unnamed: 0,fam_inc,gender,fulltime,race1,lsat,ugpa,pass_bar
0,4,0,0,1,32,4,1
1,3,0,0,1,17,4,1
2,0,1,0,1,24,4,1
3,3,1,0,1,27,4,1
4,3,1,0,1,36,4,1
...,...,...,...,...,...,...,...
20422,1,1,0,0,14,1,0
20423,2,1,0,0,8,1,0
20424,2,1,1,0,24,1,1
20425,2,1,1,1,32,0,1


## Encoding

In [5]:
# One-hot encode every feature column; the target column is appended unchanged.
# Each value is used directly as a row index into a k x k identity matrix, so
# this assumes every feature column holds integer codes 0..k-1 -- TODO confirm
# for datasets other than the one displayed above.
lst_df = []
for col in df.columns:
    if col == target:
        continue

    # Number of distinct values = number of one-hot columns for this feature.
    k = len(set(df[col]))
    lst_col_name = [col+"_{}".format(val) for val in range(k)]

    # Index the identity matrix with the whole column at once instead of
    # building a Python list of one row per record.
    OHE = np.eye(k)
    df_ohe = pd.DataFrame(OHE[df[col].to_numpy()], columns=lst_col_name)
    lst_df.append(df_ohe)

df = pd.concat([pd.concat(lst_df, axis=1), df[target]], axis=1)
df

Unnamed: 0,fam_inc_0,fam_inc_1,fam_inc_2,fam_inc_3,fam_inc_4,gender_0,gender_1,fulltime_0,fulltime_1,race1_0,...,lsat_34,lsat_35,lsat_36,ugpa_0,ugpa_1,ugpa_2,ugpa_3,ugpa_4,ugpa_5,pass_bar
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
3,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20422,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
20423,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
20424,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
20425,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1


## Splitting train and test sets

In [6]:
# Independent copies of the feature matrix and the label vector, so later
# cells cannot modify `df` through them.
X = df.drop(columns=[target]).copy(deep=True)
y = df[target].copy(deep=True)

# Shuffled split, stratified on the label and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
    shuffle=True,
    stratify=y,
)
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((16341, 54), (16341,), (4086, 54), (4086,))

## Single Run of Non-private LGBM

In [7]:
# Baseline: LightGBM with its default hyperparameters, fitted on the training
# split and scored on the test split.
model = LGBMClassifier(objective="binary", n_jobs=-1, random_state=seed)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# performance metrics
# NOTE(review): roc_auc_score is given the hard class labels from predict(),
# not predict_proba() scores -- confirm this is the intended AUC definition.
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print('Performance Metrics')
for metric_name, metric_value in (("acc:", acc), ("f1:", f1), ("auc:", auc), ("recall:", recall)):
    print(metric_name, metric_value)

# prepare dataset for fairness metrics: test features, true label and prediction side by side
df_fm = pd.concat([X_test, y_test], axis=1).assign(y_pred=y_pred)

print('\nFairness Metrics')

fair_met = fairness_metrics(df_fm, protected_attribute, target)

for key, value in fair_met.items():
    print(key+":", value)

Performance Metrics
acc: 0.947136563876652
f1: 0.9728028204482498
auc: 0.5148929364858968
recall: 0.9961320268179474

Fairness Metrics
SP_a_1: 0.9984399375975039
SP_a_0: 0.9333333333333333
DI: 0.9347916666666667
SPD: 0.06510660426417059
EO_a_1: 0.9986438839164633
EO_a_0: 0.9476439790575916
EOD: 0.050999904858871736
OA_a_1: 0.9576183047321893
OA_a_0: 0.7791666666666667
OAD: 0.17845163806552267


## Objective function

In [8]:
def objective_function(space):
    """Evaluate one hyperparameter sample for the Bayesian optimisation.

    Fits an ``LGBMClassifier`` on ``X_train`` / ``y_train`` with the sampled
    ``max_depth``, ``learning_rate`` and ``n_estimators``, predicts on
    ``X_test`` and appends the iteration number, metrics, confusion matrix
    and parameters as one row to the results CSV through ``write``.

    Parameters
    ----------
    space : dict
        Sampled values with the keys ``'max_depth'``, ``'learning_rate'`` and
        ``'n_estimators'``; the two integer-valued ones are cast with ``int``.

    Returns
    -------
    dict
        ``{'loss': -auc, 'status': STATUS_OK}`` -- the AUC is negated because
        the optimiser minimises the loss.

    Notes
    -----
    NOTE(review): candidates are scored on ``X_test`` / ``y_test``, the same
    split used for the final evaluation -- confirm this is intended.
    NOTE(review): ``roc_auc_score`` receives hard labels from ``predict``.
    """
    # Only ITER is rebound here; every other module-level name is read-only.
    global ITER
    ITER += 1

    params = {
        'max_depth': int(space['max_depth']),
        'learning_rate': space['learning_rate'],
        'n_estimators': int(space['n_estimators']),
    }

    print("------------------------------------------------------------------------------------")
    print(ITER, ":: ", params)

    # Build the classifier with the sampled hyperparameters and train it
    model = LGBMClassifier(random_state=seed, n_jobs=-1, objective="binary").set_params(**params)
    model.fit(X_train, y_train)

    # Hard label predictions on the evaluation split
    y_pred = model.predict(X_test)

    # Performance metrics
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # Row layout: ["iter", "acc", "f1-score", "auc", "recall", "cm", "params"]
    row = [str(ITER), acc, f1, auc, recall, cm, params]
    write(folder_name, row)

    # The printed value is the (positive) AUC; the returned loss is its negation.
    print("loss:", auc)

    return {'loss': -auc, 'status': STATUS_OK}

## Bayesian Optimization

In [9]:
# Search space for TPE. quniform yields floats on a fixed step; the objective
# casts max_depth and n_estimators to int before using them.
space = {
    'max_depth': hp.quniform('max_depth', -1, 50, 1),
    'n_estimators': hp.quniform('n_estimators', 50, 2000, 50),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.25),
}

# Header row of the results CSV.
# NOTE(review): write() appends, so re-running this cell adds another header
# row (and another set of trials) to an existing results file.
header = ["iter", "acc", "f1-score", "auc", "recall", "cm", "params"]
write(folder_name, header)

# Iteration counter incremented by objective_function on every evaluation.
ITER = 0
trials = Trials()
best = fmin(
    fn=objective_function,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
    rstate=np.random.default_rng(seed),
    verbose=False,
)
print(best)

------------------------------------------------------------------------------------
1 ::  {'max_depth': 31, 'learning_rate': 0.2221423901134277, 'n_estimators': 1650}
loss: 0.5533916669972627
------------------------------------------------------------------------------------
2 ::  {'max_depth': 30, 'learning_rate': 0.21662331565922432, 'n_estimators': 650}
loss: 0.5418881659856389
------------------------------------------------------------------------------------
3 ::  {'max_depth': 3, 'learning_rate': 0.15823860430552075, 'n_estimators': 350}
loss: 0.5098273792994009
------------------------------------------------------------------------------------
4 ::  {'max_depth': 10, 'learning_rate': 0.1521888191981938, 'n_estimators': 850}
loss: 0.5421460308644424
------------------------------------------------------------------------------------
5 ::  {'max_depth': 4, 'learning_rate': 0.07896551676477612, 'n_estimators': 1050}
loss: 0.5138614769706827
-------------------------------------

loss: 0.5490997044471774
------------------------------------------------------------------------------------
44 ::  {'max_depth': 29, 'learning_rate': 0.21402280026452858, 'n_estimators': 1450}
loss: 0.5511167532828183
------------------------------------------------------------------------------------
45 ::  {'max_depth': 37, 'learning_rate': 0.23162334091291253, 'n_estimators': 1300}
loss: 0.5504720910858095
------------------------------------------------------------------------------------
46 ::  {'max_depth': 46, 'learning_rate': 0.15566079242964784, 'n_estimators': 1650}
loss: 0.5511167532828183
------------------------------------------------------------------------------------
47 ::  {'max_depth': 9, 'learning_rate': 0.18177994587365698, 'n_estimators': 1100}
loss: 0.5563112429087158
------------------------------------------------------------------------------------
48 ::  {'max_depth': 8, 'learning_rate': 0.18083714122765138, 'n_estimators': 950}
loss: 0.5492286368865792
---

loss: 0.5100852441782046
------------------------------------------------------------------------------------
87 ::  {'max_depth': 19, 'learning_rate': 0.08296302993864231, 'n_estimators': 1500}
loss: 0.5474694529297417
------------------------------------------------------------------------------------
88 ::  {'max_depth': 48, 'learning_rate': 0.13729378349311658, 'n_estimators': 1250}
loss: 0.5513746181616218
------------------------------------------------------------------------------------
89 ::  {'max_depth': 32, 'learning_rate': 0.19333882733400584, 'n_estimators': 950}
loss: 0.5489707720077757
------------------------------------------------------------------------------------
90 ::  {'max_depth': 29, 'learning_rate': 0.2377207841407428, 'n_estimators': 1050}
loss: 0.5459222636569206
------------------------------------------------------------------------------------
91 ::  {'max_depth': 15, 'learning_rate': 0.17450486905882553, 'n_estimators': 1650}
loss: 0.5532627345578609
--

## Save hyperparameters dictionary to numpy file

In [10]:
# The optimiser reports every value as a float; cast the two integer-valued
# hyperparameters back to int before persisting the dictionary.
best = dict(
    max_depth=int(best['max_depth']),
    learning_rate=best['learning_rate'],
    n_estimators=int(best['n_estimators']),
)
hyperparameters_path = folder_name + '/LGBM_hyperparameters.npy'
np.save(hyperparameters_path, best)

## Single Run of Best Non-Private LGBM Model

In [11]:
# Reload the tuned hyperparameters. The file stores a dict, so pickling must be
# enabled; allow_pickle takes a boolean (the string 'TRUE' only worked because
# any non-empty string is truthy). Only load files produced by this notebook.
params = np.load(folder_name + '/LGBM_hyperparameters.npy', allow_pickle=True).item()

# Refit LightGBM with the tuned hyperparameters and score it on the test split.
model = LGBMClassifier(random_state=seed, n_jobs=-1, objective="binary")
model.set_params(**params)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# performance metrics
# NOTE(review): roc_auc_score is given the hard class labels from predict(),
# not predict_proba() scores -- confirm this is the intended AUC definition.
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print('Performance Metrics')
print("acc:", acc)
print("f1:", f1)
print("auc:", auc)
print("recall:", recall)

# prepare dataset for fairness metrics: test features, true label and prediction side by side
df_fm = pd.concat([X_test, y_test], axis=1)
df_fm['y_pred'] = y_pred

print('\nFairness Metrics')

fair_met = fairness_metrics(df_fm, protected_attribute, target)

for key in fair_met.keys():
    print(key+":", fair_met[key])

Performance Metrics
acc: 0.9437102300538424
f1: 0.9708491761723701
auc: 0.5563112429087158
recall: 0.9876224858174316

Fairness Metrics
SP_a_1: 0.9927197087883516
SP_a_0: 0.8083333333333333
DI: 0.8142613933996856
SPD: 0.18438637545501824
EO_a_1: 0.9940330892324383
EO_a_0: 0.8638743455497382
EOD: 0.13015874368270008
OA_a_1: 0.9544981799271971
OA_a_0: 0.7708333333333334
OAD: 0.18366484659386373
