References:
1. https://github.com/jf20541/LogisticRegressionPyTorch/blob/main/src/pytorchmodel.py
2. https://pytorch.org/tutorials/beginner/basics/
3. https://stackoverflow.com/questions/42704283/l1-l2-regularization-in-pytorch
4. https://gist.github.com/tuelwer/0b52817e9b6251d940fd8e2921ec5e20

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

random_state = 1
torch.manual_seed(random_state)

<torch._C.Generator at 0x12683bef0>

In [2]:
df = pd.read_csv('../data/cleaned.csv', index_col=0)
df.head()

Unnamed: 0,Duration,Credit Amount,Installment rate,Residence,Age,Number of credits,Maintenance,Target,Account Status_<0,Account Status_<200,...,Housing_own,Housing_rent,Job_management/ highly qualified employee,Job_skilled employee / official,Job_unemployed/ unskilled - non-resident,Job_unskilled - resident,Telephone_none,Telephone_yes,Foreign_no,Foreign_yes
0,6,1169,4,4,67,2,1,1,1,0,...,1,0,0,1,0,0,0,1,0,1
1,48,5951,2,2,22,1,1,0,0,1,...,1,0,0,1,0,0,1,0,0,1
2,12,2096,2,3,49,1,2,1,0,0,...,1,0,0,0,0,1,1,0,0,1
3,42,7882,2,4,45,1,2,1,1,0,...,0,0,0,1,0,0,1,0,0,1
4,24,4870,3,4,53,2,2,0,1,0,...,0,0,0,1,0,0,1,0,0,1


In [3]:
sensitive_indexes = []
sensitive_columns = []

for index, column in enumerate(df.loc[:, df.columns != 'Target'].columns):
    if column.startswith("Sex") or column.startswith("Age") or column.startswith("Foreign"):
        sensitive_indexes.append(index)
        sensitive_columns.append(column)
        
sensitive_indexes, sensitive_columns


([4, 36, 37, 38, 39, 59, 60],
 ['Age',
  'Sex_female divorced/separated/married',
  'Sex_male divorced/separated',
  'Sex_male married/widowed',
  'Sex_male single',
  'Foreign_no',
  'Foreign_yes'])

In [4]:
def get_fairness_regularizer(data, preds):
#     assert data.shape[0] == preds.shape[0]

    combined = ["" for _ in range(data.shape[0])]

    for i in sensitive_indexes:
        for j in range(data.shape[0]):
            combined[j] = combined[j] + "_" + str(data[j, i].item())

    counts = dict()
    for index, entry in enumerate(combined):
        if entry not in counts:
            counts[entry] = [index]
        else:
            indexes = counts[entry]
            indexes.append(index)
            counts[entry] = indexes
        
    pr_y1_si = [0 for _ in range(data.shape[0])]
    preds_sum_dict = dict()

    for idx, entry in enumerate(combined):
        indexes = counts[entry]

        if entry in preds_sum_dict:
            preds_sum = preds_sum_dict[entry]
        else:
            preds_sum = 0
            for index in indexes:
                preds_sum += preds[index].detach().item()
            preds_sum_dict[entry] = preds_sum
            
        pr_y1_si[idx] = preds_sum / len(indexes)

    pr_y1_si = torch.tensor(pr_y1_si)
    pr_y0_si = 1 - pr_y1_si
    
    pr_y1 = torch.sum(preds) / data.shape[0]
    pr_y0 = 1 - pr_y1
    
    pr_0 = torch.nan_to_num(torch.clamp(torch.log(pr_y0_si / pr_y0), min=-2., max=2.), 0.)
    pr_1 = torch.nan_to_num(torch.clamp(torch.log(pr_y1_si / pr_y1), min=-2., max=2.), 0.)

    return torch.sum((1 - preds) * pr_0 + preds * pr_1)
    

In [5]:
Xs = torch.tensor(df.loc[:, df.columns != 'Target'].values)
ys = torch.tensor(df['Target'].values)
ys = ys.reshape(ys.shape[0], 1)

In [6]:
class LogisticRegression(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return torch.sigmoid(self.linear(x))

In [7]:
loss_fn = nn.BCELoss()
l2_lambda = 0.01
fairness_lambda = 0.1


def test(Xs, ys, model, threshold=0.72):
    """
    If the model fails to find a reasonable solution, return True
    """
    size = Xs.shape[0]
    test_loss, correct = 0, 0

    with torch.no_grad():
        pred = model(Xs.float())
        test_loss += loss_fn(pred, ys.float()).item()
        
        # l2 regularization
        l2_reg = torch.tensor(0.)
        for w in model.parameters():
            l2_reg += w.norm(2)
        test_loss += l2_lambda * l2_reg
    
        correct += ((pred > 0.5).float() == ys).type(torch.float).sum().item()
        
    correct /= size

    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    
    return correct <= threshold


In [8]:
def run_training():
    model = LogisticRegression(Xs.shape[1], 1)
    optimizer = torch.optim.LBFGS(model.parameters(), lr=0.1, line_search_fn='strong_wolfe')

    def closure():
        optimizer.zero_grad()

        # Compute prediction and loss
        pred = model(Xs.float())
#         loss = loss_fn(pred, ys.float())

#         # l2 regularization
#         l2_reg = torch.tensor(0.)
#         for w in model.parameters():
#             l2_reg += w.norm(2)
#         loss += l2_lambda * l2_reg

        loss = 0.
        
        # fairness regularizer
        fairness_regularization = get_fairness_regularizer(Xs.float(), pred)
        loss += fairness_lambda * fairness_regularization
        
        loss.backward()
        print(model.parameters())

        return loss

    epochs = 20
    for t in range(epochs):
        optimizer.step(closure)
        
    return model
    
model = run_training()
while test(Xs, ys, model):
    model = run_training()


1.4745802274161993e-32
9.751486196522426e-12
3.1646770108829566e-28
1.6991988950439541e-34
0.0
4.3011532519523163e-19
0.0
8.390808324801126e-14
3.2228894838516714e-31
2.555438112063832e-12
1.58136610153833e-32
4.337515089260447e-20
3.7876716015589977e-25
7.36494724323501e-26
2.1151692868028905e-21
2.5035853724130376e-24
3.3547499858117294e-29
6.16209054329951e-31
1.5178738404773276e-34
2.664250550159419e-16
1.3606349702352419e-19
5.11497582570547e-13
2.6245368725530405e-12
4.6696567613741215e-35
8.6471530862104e-23
0.0
2.4112538992796943e-21
1.5733853717901146e-17
9.881397815569277e-21
1.2211459275943561e-21
6.415807522591328e-22
8.011395366547269e-13
5.248707160941345e-30
0.0
1.6520289910365582e-12
1.0601603054375858e-31
7.967502097243325e-28
6.480151040700298e-11
3.23827660775328e-30
3.594962444746324e-20
1.8410190054481557e-11
0.0
2.229947251027927e-24
3.0599164876560345e-33
1.2401086809342222e-33
1.2929990328813492e-16
1.3316241602337294e-14
3.7450913680675026e-08
2.074540866783098

RuntimeError: all elements of input should be between 0 and 1

In [None]:
# all_preds = np.zeros(df.shape[0], dtype=int)

# kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
# for train_index, test_index in kf.split(df):
#     X_train, y_train = Xs.iloc[train_index], ys.iloc[train_index]
#     X_test, y_test = Xs.iloc[test_index], ys.iloc[test_index]
    
#     X_train = torch.tensor(X_train.values)
#     y_train = torch.tensor(y_train.values)
#     y_train = y_train.reshape(y_train.shape[0], 1)
#     X_test = torch.tensor(X_test.values)
    
#     model = run_training(X_train, y_train)
#     while test(X_train, y_train, model):
#         model = run_training(X_train, y_train)
    
#     model.eval()
#     with torch.no_grad():
#         all_preds[test_index] = model(X_test.float()).numpy().squeeze() > 0.5
# #     print(accuracy_score(y_test, preds))

# accuracy_score(ys, all_preds)


In [None]:
# orig_df = pd.read_csv('../data/orig.csv', index_col=0)
# orig_df.rename(columns={"Target": "label_value"}, inplace=True)
# orig_df['score'] = all_preds
# orig_df.head()

In [None]:
# orig_df.to_csv('../data/processed.csv')