In [6]:
import torch.nn as nn
import torch.autograd as autograd
import torch
import pandas as pd
from pathlib import Path
import numpy as np
import torch.optim as optim
import torch.nn.functional as F
%matplotlib inline

# Binary classification with PyTorch

## read data

In [7]:
# Folder that holds the homework CSV files (relative to the notebook's
# working directory).
PATH = Path("data_hw2")
# Display the entries of the data folder.
[entry for entry in PATH.iterdir()]

[PosixPath('data_hw2/default-credit-card-clients.csv'),
 PosixPath('data_hw2/train_books_ratings.csv'),
 PosixPath('data_hw2/valid_books_ratings.csv')]

In [10]:
# One-hot encode the categorical columns EDUCATION and MARRIAGE.
# NOTE(review): `df` has to be loaded by an earlier cell that is not visible
# in this part of the notebook.
# get_dummies() consumes the source columns, so an unguarded call raises
# KeyError when the cell is run a second time. The guard below skips columns
# that were already encoded, but still fails loudly when a column is missing
# and no `<column>_*` dummy columns exist for it.
_categorical_cols = ['EDUCATION', 'MARRIAGE']
_to_encode = [col for col in _categorical_cols if col in df.columns]
_missing = [
    col for col in _categorical_cols
    if col not in _to_encode
    and not any(str(name).startswith(col + '_') for name in df.columns)
]
if _missing:
    raise KeyError("columns %r are missing from df and were not one-hot encoded" % (_missing,))
if _to_encode:
    df = pd.get_dummies(df, columns=_to_encode)

In [11]:
# Peek at the first five rows (five is head()'s default).
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,...,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,EDUCATION_5,EDUCATION_6,MARRIAGE_0,MARRIAGE_1,MARRIAGE_2,MARRIAGE_3
0,1,20000,2,24,2,2,-1,-1,-2,-2,...,0,1,0,0,0,0,0,1,0,0
1,2,120000,2,26,-1,2,0,0,0,2,...,0,1,0,0,0,0,0,0,1,0
2,3,90000,2,34,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
3,4,50000,2,37,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
4,5,50000,1,57,-1,0,-1,0,0,0,...,0,1,0,0,0,0,0,1,0,0


## split data

In [33]:
# Random train/validation split: each row goes to the training set with
# probability 0.8. Seeding NumPy first makes the mask repeatable.
np.random.seed(3)
msk = np.random.rand(len(df)) < 0.8
# NOTE(review): reset_index() is called without drop=True, so the previous row
# labels are kept as an extra leading column in both frames. The positional
# feature slice used later (iloc[:, :23]) relies on that extra column being
# there, so the two places have to be changed together.
train = df.loc[msk].reset_index()
valid = df.loc[~msk].reset_index()

## normalize input variables

In [34]:
from sklearn.preprocessing import StandardScaler

In [35]:
# Show the column labels of df; the feature matrix below is sliced from these
# columns by position.
df.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4',
       'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4',
       'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
       'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'default_payment', 'EDUCATION_0',
       'EDUCATION_1', 'EDUCATION_2', 'EDUCATION_3', 'EDUCATION_4',
       'EDUCATION_5', 'EDUCATION_6', 'MARRIAGE_0', 'MARRIAGE_1', 'MARRIAGE_2',
       'MARRIAGE_3'],
      dtype='object')

In [36]:
# Split each frame into a feature matrix and the binary target vector.
#
# The features are the first N_FEATURES columns *by position*. Given the column
# listing shown above and the extra leading column added by reset_index(), that
# slice is: the old row index, ID, LIMIT_BAL ... PAY_AMT6.
# NOTE(review): the row index and ID look like identifiers rather than
# predictors, and the EDUCATION_*/MARRIAGE_* dummy columns fall outside the
# slice. Changing the feature set also means changing the input width of the
# models defined further down, so it is only flagged here.
N_FEATURES = 23
TARGET = 'default_payment'

# A positional slice silently changes meaning when the column layout changes
# (e.g. if reset_index(drop=True) were used, the target would slide into the
# features). Fail loudly instead of training on leaked labels.
_feature_cols = list(train.columns[:N_FEATURES])
assert TARGET not in _feature_cols, "target column leaked into the feature slice"
assert list(valid.columns[:N_FEATURES]) == _feature_cols, "train/valid feature columns differ"

X_train = train.iloc[:, :N_FEATURES].values
Y_train = train[TARGET].values
X_val = valid.iloc[:, :N_FEATURES].values
Y_val = valid[TARGET].values

In [37]:
# Standardise every feature column. The scaler is fitted on the training rows
# only and then re-used for the validation rows, so no validation statistics
# leak into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

## PyTorch dataset

In [44]:
from torch.utils.data import Dataset, DataLoader,TensorDataset

In [192]:
# Convert the NumPy arrays into float32 tensors. torch.tensor(..., dtype=...)
# is used instead of the legacy torch.Tensor(...) constructor: it states the
# dtype explicitly and always copies the data.
tensor_x_train = torch.tensor(X_train, dtype=torch.float32)
tensor_y_train = torch.tensor(Y_train, dtype=torch.float32)
tensor_x_val = torch.tensor(X_val, dtype=torch.float32)
tensor_y_val = torch.tensor(Y_val, dtype=torch.float32)

# Pair every training row with its label.
dataset_train = TensorDataset(tensor_x_train, tensor_y_train)
# Loader with DataLoader's default settings. The batched, shuffled loader
# `train_dl` created further down is the one passed to the training loop.
train_dataloader = DataLoader(dataset_train)


In [177]:
# Pair every validation row with its label.
dataset_val = TensorDataset(tensor_x_val, tensor_y_val)
# Loader with DataLoader's default settings. The batched loader `val_dl`
# created further down is the one passed to the training loop.
val_dataloader = DataLoader(dataset=dataset_val)

## create data loaders

In [178]:
# Training batches: 1000 rows per batch, with shuffling enabled.
train_dl = DataLoader(
    dataset=dataset_train,
    batch_size=1000,
    shuffle=True,
)

In [179]:
# Validation batches: 1000 rows per batch, kept in dataset order.
val_dl = DataLoader(
    dataset=dataset_val,
    batch_size=1000,
    shuffle=False,
)

## model for logistic regression

In [180]:
# Pull a single mini-batch (features x, labels y) from the training loader to
# try the model and the loss on before writing the training loop.
x, y = next(iter(train_dl))

In [181]:
# Logistic regression as a single linear layer that outputs one raw logit per
# row. The input width is read from the training tensor instead of being
# hard-coded, so the layer keeps matching the data if the feature set changes.
model = torch.nn.Sequential(
    torch.nn.Linear(tensor_x_train.shape[1], 1)
)

In [182]:
# Forward pass of the sample batch through the untrained model.
y_hat = model(x.to(torch.float32))

In [183]:
# Binary cross-entropy on the raw logits of the sample batch. The 1-D label
# vector is given a trailing dimension so that it matches the model output.
F.binary_cross_entropy_with_logits(y_hat, y.float().unsqueeze(1))

tensor(0.7495, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)

## log loss and auc-roc

In [184]:
from sklearn.metrics import log_loss,roc_auc_score

In [185]:
def val_metric(model, valid_dl):
    """Evaluate `model` on every batch of `valid_dl`.

    Parameters
    ----------
    model : callable torch module mapping a float feature batch to one raw
        logit per row (output shape ``(batch, 1)``).
    valid_dl : iterable of ``(x, y)`` batches, where ``y`` is a 1-D tensor of
        0/1 labels.

    Returns
    -------
    (log_loss, auc) : the mean binary cross-entropy per sample over the whole
        loader, and the ROC AUC computed on all collected predictions.

    Raises
    ------
    ValueError
        If `valid_dl` yields no samples.
    """
    model.eval()
    total_loss = 0.0
    n_samples = 0
    y_hats = []
    ys = []
    # Evaluation needs no gradients; skipping the autograd graph saves memory
    # and time.
    with torch.no_grad():
        for x, y in valid_dl:
            y = y.unsqueeze(1).float()
            y_hat = model(x.float())
            # Sum (not mean) per batch so that a shorter final batch is not
            # over-weighted when the losses are averaged at the end.
            loss = F.binary_cross_entropy_with_logits(y_hat, y, reduction="sum")
            total_loss += loss.item()
            n_samples += y.shape[0]
            y_hats.append(y_hat.flatten().detach().numpy())
            ys.append(y.flatten().detach().numpy())

    if n_samples == 0:
        raise ValueError("valid_dl yielded no samples to evaluate")

    # Compute the AUC once at the end, using the predictions of all batches.
    # Raw logits are fine here because the AUC only depends on their ranking.
    ys = np.concatenate(ys)
    y_hats = np.concatenate(y_hats)
    return total_loss / n_samples, roc_auc_score(ys, y_hats)

## training loop

In [186]:
def train_loop(model, train_dl, valid_dl, optimizer, epochs):
    """Train `model` for `epochs` passes over `train_dl`.

    After every pass the mean of the per-batch training losses, the validation
    loss and the validation AUC (both from `val_metric`) are printed on one
    line. Nothing is returned; `model` is updated in place through `optimizer`.
    """
    for _ in range(epochs):
        model.train()
        batch_losses = []
        for features, labels in train_dl:  # one mini-batch at a time
            targets = labels.unsqueeze(1).float()
            logits = model(features.float())
            batch_loss = F.binary_cross_entropy_with_logits(logits, targets)

            # Standard update: clear old gradients, back-propagate, step.
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()
            batch_losses.append(batch_loss.item())

        epoch_train_loss = np.mean(batch_losses)
        epoch_valid_loss, epoch_valid_auc = val_metric(model, valid_dl)
        print("train loss %.3f valid loss %.3f auc %.3f" %
              (epoch_train_loss, epoch_valid_loss, epoch_valid_auc))

## model training

In [188]:
# Fresh logistic-regression model for the actual training run.
# Only NumPy is seeded earlier in the notebook; seeding torch's global RNG here
# makes the random weight initialisation repeatable as well.
torch.manual_seed(3)
# The input width is read from the training tensor instead of being hard-coded.
model2 = torch.nn.Sequential(
    torch.nn.Linear(tensor_x_train.shape[1], 1),
)

In [189]:
# Train model2 with Adam for 20 epochs.
# NOTE: model2 is created in a separate cell, so running this cell again keeps
# training the same weights instead of starting from scratch.
learning_rate = 0.0046
optimizer = optim.Adam(params=model2.parameters(), lr=learning_rate)
train_loop(
    model=model2,
    train_dl=train_dl,
    valid_dl=val_dl,
    optimizer=optimizer,
    epochs=20,
)

train loss 0.694 valid loss 0.651 auc 0.632
train loss 0.627 valid loss 0.615 auc 0.657
train loss 0.595 valid loss 0.588 auc 0.666
train loss 0.566 valid loss 0.567 auc 0.675
train loss 0.550 valid loss 0.549 auc 0.680
train loss 0.527 valid loss 0.535 auc 0.685
train loss 0.523 valid loss 0.523 auc 0.687
train loss 0.509 valid loss 0.514 auc 0.689
train loss 0.498 valid loss 0.506 auc 0.693
train loss 0.488 valid loss 0.500 auc 0.692
train loss 0.486 valid loss 0.495 auc 0.695
train loss 0.484 valid loss 0.491 auc 0.696
train loss 0.472 valid loss 0.488 auc 0.698
train loss 0.472 valid loss 0.486 auc 0.697
train loss 0.475 valid loss 0.484 auc 0.698
train loss 0.474 valid loss 0.482 auc 0.700
train loss 0.464 valid loss 0.481 auc 0.699
train loss 0.470 valid loss 0.479 auc 0.700
train loss 0.466 valid loss 0.479 auc 0.701
train loss 0.464 valid loss 0.478 auc 0.699
