References:
1. https://github.com/jf20541/LogisticRegressionPyTorch/blob/main/src/pytorchmodel.py
2. https://pytorch.org/tutorials/beginner/basics/
3. https://stackoverflow.com/questions/42704283/l1-l2-regularization-in-pytorch
4. https://gist.github.com/tuelwer/0b52817e9b6251d940fd8e2921ec5e20

TODOs

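1. Add regularization — done: an L2 penalty weighted by `l2_lambda` is added to the loss
2. Explore LBFGS in PyTorch — done: `run_training` uses `torch.optim.LBFGS`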
3. Implement fairness variant

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

# One seed for the whole notebook: it seeds PyTorch's global RNG here and is
# also passed to KFold as `random_state` for the shuffled split.
random_state = 1
torch.manual_seed(random_state)

<torch._C.Generator at 0x118400f10>

In [2]:
# Load the dataset that the features and the 'Target' label are taken from;
# the CSV's first column is used as the row index.
df = pd.read_csv('../data/cleaned.csv', index_col=0)
df.head()

Unnamed: 0,Duration,Credit Amount,Installment rate,Residence,Age,Number of credits,Maintenance,Target,Account Status_<0,Account Status_<200,...,Housing_own,Housing_rent,Job_management/ highly qualified employee,Job_skilled employee / official,Job_unemployed/ unskilled - non-resident,Job_unskilled - resident,Telephone_none,Telephone_yes,Foreign_no,Foreign_yes
0,6,1169,4,4,67,2,1,1,1,0,...,1,0,0,1,0,0,0,1,0,1
1,48,5951,2,2,22,1,1,0,0,1,...,1,0,0,1,0,0,1,0,0,1
2,12,2096,2,3,49,1,2,1,0,0,...,1,0,0,0,0,1,1,0,0,1
3,42,7882,2,4,45,1,2,1,1,0,...,0,0,0,1,0,0,1,0,0,1
4,24,4870,3,4,53,2,2,0,1,0,...,0,0,0,1,0,0,1,0,0,1


In [3]:
# Labels are the 'Target' column; features are every remaining column.
ys = df['Target']
Xs = df.drop(columns='Target')

In [4]:
class LogisticRegression(nn.Module):
    """Logistic-regression model: a single linear layer followed by a sigmoid.

    Args:
        input_dim: number of input features given to ``nn.Linear``.
        output_dim: number of outputs produced by ``nn.Linear``.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Return the element-wise sigmoid of the linear layer's output for ``x``."""
        logits = self.linear(x)
        return logits.sigmoid()

In [5]:
loss_fn = nn.BCELoss()
l2_lambda = 0.01


def test(Xs, ys, model):
    """
    If the model fails to find a reasonable solution, return True
    """
    size = Xs.shape[0]
    test_loss, correct = 0, 0

    with torch.no_grad():
        pred = model(Xs.float())
        test_loss += loss_fn(pred, ys.float()).item()
        
        # l2 regularization
        l2_reg = torch.tensor(0.)
        for w in model.parameters():
            l2_reg += w.norm(2)
        test_loss += l2_lambda * l2_reg
    
        correct += ((pred > 0.5).float() == ys).type(torch.float).sum().item()
        
    correct /= size

    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    
    return correct <= 0.75

In [6]:
def run_training(Xs, ys, epochs=20, lr=0.1):
    """Fit a freshly initialised ``LogisticRegression`` on ``(Xs, ys)`` with L-BFGS.

    The objective is ``loss_fn`` plus ``l2_lambda`` times the sum of the
    parameters' L2 norms, evaluated on the full data set (no mini-batching).

    Args:
        Xs: feature tensor of shape ``(n_samples, n_features)``; cast to float.
        ys: label tensor with ``n_samples`` entries; cast to float and reshaped to
            a column so both ``(n,)`` and ``(n, 1)`` labels work.
        epochs: number of ``optimizer.step`` calls to perform.
        lr: learning rate passed to ``torch.optim.LBFGS``.

    Returns:
        The trained ``LogisticRegression`` model.
    """
    # Cast once up front: L-BFGS may re-evaluate the closure several times per
    # step, so converting inside it would repeat the same copy on every call.
    inputs = Xs.float()
    targets = ys.float().reshape(-1, 1)

    model = LogisticRegression(inputs.shape[1], 1)
    optimizer = torch.optim.LBFGS(model.parameters(), lr=lr, line_search_fn='strong_wolfe')

    def closure():
        """Recompute the penalised loss and its gradients for the optimizer."""
        optimizer.zero_grad()

        # Compute prediction and loss
        pred = model(inputs)
        loss = loss_fn(pred, targets)

        # l2 regularization
        l2_reg = sum(w.norm(2) for w in model.parameters())
        loss = loss + l2_lambda * l2_reg

        loss.backward()

        return loss

    for _ in range(epochs):
        optimizer.step(closure)

    return model
    
# model = run_training(Xs, ys)
# while test(Xs, ys, model):
#     model = run_training(Xs, ys)

In [7]:
# Out-of-fold predictions: every row is scored by a model trained on the other folds.
all_preds = np.zeros(df.shape[0], dtype=int)

# Upper bound on re-initialisations per fold so the retry loop below cannot spin forever.
MAX_RESTARTS = 20

kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
for train_index, test_index in kf.split(df):
    X_train = torch.tensor(Xs.iloc[train_index].values)
    # Column vector (n, 1) to match the single-output model's prediction shape.
    y_train = torch.tensor(ys.iloc[train_index].values).reshape(-1, 1)
    X_test = torch.tensor(Xs.iloc[test_index].values)

    # `test` returns True when the fit on the training fold is too poor; in that
    # case train again (run_training builds a new model on each call).
    model = run_training(X_train, y_train)
    needs_restart = test(X_train, y_train, model)
    restarts = 0
    while needs_restart and restarts < MAX_RESTARTS:
        model = run_training(X_train, y_train)
        restarts += 1
        needs_restart = test(X_train, y_train, model)
    if needs_restart:
        print(f"Warning: no acceptable fit after {MAX_RESTARTS} restarts; keeping the last model.")

    model.eval()
    with torch.no_grad():
        # reshape(-1) rather than squeeze() so a single-row fold still yields a 1-D array.
        all_preds[test_index] = model(X_test.float()).numpy().reshape(-1) > 0.5

# Accuracy of the out-of-fold predictions over the whole data set.
accuracy_score(ys, all_preds)

Test Error: 
 Accuracy: 79.2%, Avg loss: 0.475370 

Test Error: 
 Accuracy: 69.5%, Avg loss: 0.669254 

Test Error: 
 Accuracy: 77.0%, Avg loss: 0.498052 

Test Error: 
 Accuracy: 75.8%, Avg loss: 0.499158 

Test Error: 
 Accuracy: 60.2%, Avg loss: 0.698923 

Test Error: 
 Accuracy: 70.4%, Avg loss: 29.630943 

Test Error: 
 Accuracy: 70.4%, Avg loss: 29.630796 

Test Error: 
 Accuracy: 70.4%, Avg loss: 29.632051 

Test Error: 
 Accuracy: 78.6%, Avg loss: 0.477100 

Test Error: 
 Accuracy: 76.1%, Avg loss: 0.495518 



0.759

In [8]:
# Load the original records, rename the label column, and attach the predictions.
# `all_preds` is a NumPy array, so it is assigned by position.
# NOTE(review): this assumes orig.csv has the same row order as cleaned.csv — confirm.
orig_df = (
    pd.read_csv('../data/orig.csv', index_col=0)
    .rename(columns={"Target": "label_value"})
)
orig_df['score'] = all_preds
orig_df.head()

Unnamed: 0,Account Status,Duration,Credit History,Purpose,Credit Amount,Savings,Employment,Installment rate,Sex,Other debtors,...,Age,Other installments,Housing,Number of credits,Job,Maintenance,Telephone,Foreign,label_value,score
0,<0,6,critical account,radio/television,1169,no,>= 7 years,4,male single,none,...,67,none,own,2,skilled employee / official,1,yes,yes,1,1
1,<200,48,existing credits paid back duly till now,radio/television,5951,<100,1 <= < 4 years,2,female divorced/separated/married,none,...,22,none,own,1,skilled employee / official,1,none,yes,0,0
2,no,12,critical account,education,2096,<100,4 <= < 7 years,2,male single,none,...,49,none,own,1,unskilled - resident,2,none,yes,1,1
3,<0,42,existing credits paid back duly till now,furniture/equipment,7882,<100,4 <= < 7 years,2,male single,guarantor,...,45,none,for free,1,skilled employee / official,2,none,yes,1,1
4,<0,24,delay in paying off,car (new),4870,<100,1 <= < 4 years,3,male single,none,...,53,none,for free,2,skilled employee / official,2,none,yes,0,0


In [9]:
# Write the original records, with the renamed label and the predicted `score`, to disk.
orig_df.to_csv('../data/processed.csv')