In [2]:
# Song Liu (song.liu@bristol.ac.uk), 01-06-2023
# All rights reserved.

# load csv files from archive folder
import matplotlib.pyplot as plt
from numpy import  *
import pandas as pd

base = pd.read_csv('archive/Variant I.csv')

# Set the protected attributes aside, then drop them from the table.
customer_age = base['customer_age']
income = base['income']
employment_status = base['employment_status']
base = base.drop(columns=['income', 'customer_age', 'employment_status'])

# Mark the four string-valued columns as categorical ...
for col_name in ('payment_type', 'housing_status', 'source', 'device_os'):
    base[col_name] = base[col_name].astype('category')

# ... and replace every categorical column by its integer category codes.
cat_columns = base.select_dtypes(['category']).columns
base[cat_columns] = base[cat_columns].apply(lambda col: col.cat.codes)

# From here on `base` is a plain numeric array (first column is used as the label).
base = base.to_numpy()

# Class prior: fraction of rows whose first column sums to the positive count.
prior = base[:, 0].sum() / len(base)

print('class prior for positive:', prior)
print('class prior for negative:', 1-prior)

class prior for positive: 0.011029
class prior for negative: 0.988971


In [3]:
import numpy as np

# Binary group indicator: 1 where customer_age <= 40, else 0, stored as an
# (n, 1) column array.
customer_age_class = (customer_age <= 40).astype(int).to_numpy().reshape(-1, 1)

In [4]:
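# Disabled: would append the age-group indicator to `base` as an extra column.
# NOTE(review): the original line named `customer_age_binary`, which is not defined
# in the visible notebook; the column built above is `customer_age_class`.
# base = np.hstack((base, customer_age_class))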

In [10]:

from sklearn.model_selection import train_test_split

# Column 0 of `base` is the label; every remaining column is a feature.
X = base[:, 1:]
# Affine re-coding of the label column: 0 -> -1 and 1 -> +1.
y = base[:, 0] * 2 - 1

# Split features, labels and the age-group indicator together so that rows stay
# aligned. test_size=0.9995 leaves only 0.05% of the rows for training.
X_train, X_test, y_train, y_test, z_train, z_test = train_test_split(
    X,
    y,
    customer_age_class,
    test_size=0.9995,
    random_state=0,
)

In [16]:
from fairness_constraints.fairbatch.models import test_model
from fairness_constraints.fairbatch.FairBatchSampler import FairBatch, CustomDataset

In [12]:
# Display the shape of the training feature matrix.
X_train.shape


(500, 28)

In [17]:
import pylab as pl
from IPython import display

# Let us maximize the AUC
import torch # bring out the big gun

def aucloss(pred, label):
    """Pairwise hinge surrogate for maximising the AUC.

    Every (negative, positive) pair of scores contributes
    ``max(neg - pos, 0) / 2`` and the loss is the mean over all pairs, so it
    is zero when no negative is scored above any positive.

    Args:
        pred: tensor holding one score per sample; any shape is accepted
            (e.g. ``(n,)``, ``(n, 1)`` or ``(1, n, 1)``) because it is flattened.
        label: tensor with the same number of elements as ``pred``. Entries
            equal to 1 are positives; every other value (0 or -1) is treated
            as a negative, so both {0, 1} and {-1, +1} encodings work.

    Returns:
        Scalar tensor. If the batch has no positives or no negatives there
        are no pairs to compare; a zero that is still attached to ``pred``'s
        graph is returned so that ``backward()`` stays valid instead of
        propagating NaN.
    """
    # Flatten both inputs so the pairing below does not depend on whether the
    # caller passed column vectors, row vectors or an extra leading batch axis.
    scores = pred.reshape(-1)
    is_pos = label.reshape(-1) == 1

    pos_pred = scores[is_pos]
    neg_pred = scores[~is_pos]

    if pos_pred.shape[0] == 0: # you might end up with no positive samples
        print('no positive samples!')
    if pos_pred.shape[0] == 0 or neg_pred.shape[0] == 0:
        # The mean of an empty pair matrix is NaN; return a graph-connected zero.
        return scores.sum() * 0.0

    # hinge loss on every (negative, positive) pair: T[j, i] = neg_j - pos_i
    T = neg_pred.unsqueeze(1) - pos_pred.unsqueeze(0)
    loss = torch.clamp(T, min=0) / 2
    return loss.mean()

# same linear model
class NN(torch.nn.Module):
    """Linear scorer: a single affine layer from ``d`` features to one output."""

    def __init__(self, d):
        """Build the layer; ``d`` is the number of input features."""
        super().__init__()
        self.fc = torch.nn.Linear(in_features=d, out_features=1)

    def forward(self, x):
        """Apply the affine layer to ``x`` and return the result."""
        scores = self.fc(x)
        return scores

# ---- tensors ---------------------------------------------------------------
X_tr = torch.tensor(X_train, dtype = torch.float32)
# Labels are kept 1-D, like y_te and z_tr: later cells call
# set(y_tr.tolist()) and AND the label masks with the 1-D group masks, both of
# which require one scalar label per sample.
# NOTE(review): presumably FairBatch builds its masks the same way - confirm
# against fairness_constraints.fairbatch.FairBatchSampler.
y_tr = torch.tensor(y_train, dtype = torch.float32)
z_tr = torch.tensor(z_train.squeeze(), dtype = torch.float32)
X_te = torch.tensor(X_test, dtype = torch.float32)
y_te = torch.tensor(y_test, dtype = torch.float32)
z_te = torch.tensor(z_test.squeeze(), dtype = torch.float32)

# ---- model, fairness-aware sampler, optimiser ------------------------------
model = NN(X_train.shape[1])

train_data = CustomDataset(X_tr, y_tr, z_tr)

# FairBatch is handed to the DataLoader as the sampler.
sampler = FairBatch (model, X_tr, y_tr, z_tr, batch_size = 10, alpha = 0.005, target_fairness = 'eqopp', replacement = False, seed = 0)
trainload = torch.utils.data.DataLoader (train_data, sampler=sampler, num_workers=0)

optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

# ---- training: one ROC curve per epoch -------------------------------------
for epoch in range(5):

    for batch_idx, (data, target, z) in enumerate (trainload):
        optimizer.zero_grad()
        pred = model(data)
        loss = aucloss(pred, target)

        loss.backward()
        optimizer.step()

    # Evaluation needs no gradients; no_grad avoids building an autograd graph
    # over the whole (very large) test split.
    with torch.no_grad():
        pred_te = model(X_te).numpy()[:,0]

    # NOTE(review): `roc` and `AUC` are neither defined nor imported in the
    # visible notebook; they must already exist in the kernel. Define or import
    # them above this cell so that Restart & Run All works.
    fpr, tpr = roc(pred_te, y_test)
    auc = AUC(fpr, tpr)

    pl.plot(fpr, tpr, label = 'epoch %d, AUC: %0.2f' % (epoch, auc))
    pl.legend(loc = 'lower right')
    # Redraw the same figure in place after every epoch.
    display.display(pl.gcf())
    display.clear_output(wait=True)


ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 630 and the array at index 1 has size 11

In [None]:
import itertools

# Distinct group values and distinct label values found in the training tensors.
z_item = [*set(z_tr.tolist())]
y_item = [*set(y_tr.tolist())]

# Every (label, group) combination, with the label varying slowest.
yz_tuple = [(y_val, z_val) for y_val in y_item for z_val in z_item]
        

In [78]:
# Display the list of (y, z) combinations.
yz_tuple

[(1.0, 0.0), (1.0, 1.0), (-1.0, 0.0), (-1.0, 1.0)]

In [79]:
# Boolean masks over the training samples, keyed by group value / label value.
z_mask = {z_val: (z_tr == z_val) for z_val in z_item}
y_mask = {y_val: (y_tr == y_val) for y_val in y_item}

# Joint mask for each (label, group) pair: label matches AND group matches.
yz_mask = {
    (y_val, z_val): (y_tr == y_val) & (z_tr == z_val)
    for (y_val, z_val) in yz_tuple
}
        
        

In [80]:
# Display the boolean mask stored for each label value.
y_mask

{1.0: tensor([False, False, False,  ..., False, False, False]),
 -1.0: tensor([True, True, True,  ..., True, True, True])}

In [81]:
# Convert each boolean mask into a tensor of the positions where it is True.
#
# nonzero() on a 1-D mask returns shape (k, 1). squeeze(1) drops only that
# trailing axis, so a group with exactly one member stays a length-1 tensor.
# A bare squeeze() would collapse it to a 0-d tensor, on which len() raises.
# (assumes the masks are 1-D, one entry per sample - TODO confirm)
z_index = {tmp_z: (z_mask[tmp_z] == 1).nonzero().squeeze(1) for tmp_z in z_item}

y_index = {tmp_y: (y_mask[tmp_y] == 1).nonzero().squeeze(1) for tmp_y in y_item}

yz_index = {tmp_yz: (yz_mask[tmp_yz] == 1).nonzero().squeeze(1) for tmp_yz in yz_tuple}
              

In [82]:
# Number of entries in the index tensor of each (label, group) pair.
yz_len = {pair: len(yz_index[pair]) for pair in yz_tuple}

In [83]:
# Display the index tensor stored for each (y, z) pair.
yz_index

{(1.0, 0.0): tensor([   251,    432,   1361,  ..., 898731, 899769, 899964]),
 (1.0, 1.0): tensor([   137,    368,    388,  ..., 899820, 899894, 899957]),
 (-1.0, 0.0): tensor([     2,     16,     32,  ..., 899954, 899968, 899979]),
 (-1.0, 1.0): tensor([     0,      1,      3,  ..., 899997, 899998, 899999])}

In [84]:
# S[(y, z)] = batch_size * (size of that (y, z) group) / (total number of samples),
# i.e. the group's proportional share of a batch of `batch_size` samples.
batch_size = 5
N = len(z_tr)
S = {pair: batch_size * yz_len[pair] / N for pair in yz_tuple}


In [85]:
# Display S for every (y, z) pair.
S

{(1.0, 0.0): 0.006733333333333333,
 (1.0, 1.0): 0.0484,
 (-1.0, 0.0): 0.5474277777777777,
 (-1.0, 1.0): 4.397438888888889}

In [55]:
# Look up S for the pair (1, 1). The integer key finds the entry stored under
# (1.0, 1.0) because equal ints and floats hash and compare equal.
S[1,1]

0.0484