In [13]:
# Song Liu (song.liu@bristol.ac.uk), 01-06-2023
# All rights reserved.

# load csv files from archive folder
import matplotlib.pyplot as plt
from numpy import  *
import pandas as pd

base = pd.read_csv('archive/Variant I.csv')
# remove 'income', 'customer_age', 'employment_status' columns as they are protected

customer_age = base.customer_age
income = base.income
employment_status = base.employment_status

base = base.drop(columns=['income', 'customer_age', 'employment_status'])
# convert categorical variables in 'payment_type' to integers
base['payment_type'] = base['payment_type'].astype('category')
base['housing_status'] = base['housing_status'].astype('category')
base['source'] = base['source'].astype('category')
base['device_os'] = base['device_os'].astype('category')

cat_columns = base.select_dtypes(['category']).columns
base[cat_columns] = base[cat_columns].apply(lambda x: x.cat.codes)
base = base.to_numpy()

# class prior 
prior = sum(base[:,0])/len(base)

print('class prior for positive:', prior)
print('class prior for negative:', 1-prior)

class prior for positive: 0.011029
class prior for negative: 0.988971


In [8]:
import numpy as np

customer_age_binary = (customer_age<=40).astype(int).to_numpy()
customer_age_binary = np.array([customer_age_binary]).transpose()

In [9]:
# base = np.hstack((base, customer_age_binary))

Let us train a classifier (logistic regression with linear model) on the dataset. 

In [14]:
def roc(pred, label):
    fpr, tpr = [], []
    tmax = max(pred)
    tmin = min(pred)
    for t in linspace(tmin, tmax, 100):
        p = pred > t
        tpr.append(TPR(p, label))
        fpr.append(FPR(p, label))
    return fpr, tpr

def AUC(fpr, tpr):
    return sum([(tpr[i]+tpr[i-1])*(fpr[i-1]-fpr[i])/2 for i in range(1, len(fpr))])

# train a logistic regression using sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X = base[:,1:]
y = base[:,0]

X_train, X_test, y_train, y_test, z_train, z_test = train_test_split(X, y, customer_age_binary, test_size=0.1, random_state=0)

# # not weighted
# clf0 = LogisticRegression(random_state=0, max_iter = 100).fit(X_train, y_train, sample_weight = None)
# # weight the samples according to the class prior
# weights = y_train*(1-prior) + (1-y_train)*prior
# clf1 = LogisticRegression(random_state=0, max_iter = 100).fit(X_train, y_train, sample_weight = weights)

Slightly better than baseline classifiers. Now try AUC maximization

In [15]:
from fairness_constraints.fairbatch.FairBatchSampler import FairBatch

In [16]:
import pylab as pl
from IPython import display

# Let us maximize the AUC
import torch # bring out the big gun

def aucloss(pred, label):
    if pred[label==1].shape[0] == 0: # you might end up with no positive samples
        print('no positive samples!')
    pos_pred = pred[label==1]
    neg_pred = pred[label==0]
    
    # hinge loss
    T = (neg_pred.T - pos_pred)
    loss = torch.max(T, torch.zeros_like(T))/2
    loss = torch.mean(loss, dim = 0)
    loss = torch.mean(loss)
    return loss

# same linear model
class NN(torch.nn.Module):
    def __init__(self, d):
        super(NN, self).__init__()
        self.fc = torch.nn.Linear(d, 1)
        
    def forward(self, x):
        return self.fc(x)

X_tr = torch.tensor(X_train, dtype = torch.float32)
y_tr = torch.tensor(y_train, dtype = torch.float32)
z_tr = torch.tensor(z_train.squeeze(), dtype = torch.float32)
X_te = torch.tensor(X_test, dtype = torch.float32)
y_te = torch.tensor(y_test, dtype = torch.float32)
z_te = torch.tensor(z_test.squeeze(), dtype = torch.float32)

model = NN(X_train.shape[1])

dataset = torch.utils.data.TensorDataset(X_tr, y_tr, z_tr)


sampler = FairBatch (model, X_tr, y_tr, z_tr, batch_size = 100, alpha = 0.005, target_fairness = 'eqopp', replacement = False, seed = 0)
trainload = torch.utils.data.DataLoader (dataset, sampler=sampler, num_workers=0)
# trainload = torch.utils.data.DataLoader(dataset, batch_size = 10000, shuffle = True)

optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

for epoch in range(5):
    for x, y in trainload:
        optimizer.zero_grad()
        pred = model(x)
        loss = aucloss(pred, y)
        
        loss.backward()
        optimizer.step()
    
    pred_te = model(X_te).detach().numpy()[:,0]

    fpr, tpr = roc(pred_te, y_test)
    auc = AUC(fpr, tpr)
    
    pl.plot(fpr, tpr, label = 'epoch %d, AUC: %0.2f' % (epoch, auc))
    pl.legend(loc = 'lower right')
    display.display(pl.gcf())
    display.clear_output(wait=True)


KeyError: (-1, 1)