In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
import torch.optim as optim
import torch.nn as nn


## 1.1

In [4]:

# Load the raw credit-card default table. The path is relative, so it is
# resolved against the notebook's current working directory.
df = pd.read_csv(filepath_or_buffer="data_hw2/default-credit-card-clients.csv")

In [3]:
# Remove the 'ID' column, then one-hot encode the two categorical columns.
# NOTE: this cell rebinds `df`, so running it a second time without first
# re-loading the CSV fails with a KeyError ('ID' has already been dropped).
df = pd.get_dummies(
    df.drop(columns='ID'),
    columns=['EDUCATION', 'MARRIAGE'],
)
df.head(10)

Unnamed: 0,LIMIT_BAL,SEX,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,...,EDUCATION_1,EDUCATION_2,EDUCATION_3,EDUCATION_4,EDUCATION_5,EDUCATION_6,MARRIAGE_0,MARRIAGE_1,MARRIAGE_2,MARRIAGE_3
0,20000,2,24,2,2,-1,-1,-2,-2,3913,...,0,1,0,0,0,0,0,1,0,0
1,120000,2,26,-1,2,0,0,0,2,2682,...,0,1,0,0,0,0,0,0,1,0
2,90000,2,34,0,0,0,0,0,0,29239,...,0,1,0,0,0,0,0,0,1,0
3,50000,2,37,0,0,0,0,0,0,46990,...,0,1,0,0,0,0,0,1,0,0
4,50000,1,57,-1,0,-1,0,0,0,8617,...,0,1,0,0,0,0,0,1,0,0
5,50000,1,37,0,0,0,0,0,0,64400,...,1,0,0,0,0,0,0,0,1,0
6,500000,1,29,0,0,0,0,0,0,367965,...,1,0,0,0,0,0,0,0,1,0
7,100000,2,23,0,-1,-1,0,0,-1,11876,...,0,1,0,0,0,0,0,0,1,0
8,140000,2,28,0,0,2,0,0,0,11285,...,0,0,1,0,0,0,0,1,0,0
9,20000,1,35,-2,-2,-2,-2,-1,-1,0,...,0,0,1,0,0,0,0,0,1,0


In [18]:
# (rows, columns) of the current `df`.
df.shape

(30000, 33)

## 1.2

In [41]:
# Random row-wise split: each row lands in `train` with probability 0.8.
# The fixed seed makes the boolean mask reproducible across runs.
np.random.seed(3)
msk = np.random.rand(len(df)) < 0.8

# drop=True discards the old row labels instead of keeping them as a column,
# so both frames end up with a fresh 0..n-1 index.
train = df.loc[msk].reset_index(drop=True)
valid = df.loc[~msk].reset_index(drop=True)

## 1.3

In [42]:
# Separate the target column from the feature columns for each split.
train_inputs = train.drop(columns='default_payment')
train_label = train['default_payment']
print(train_inputs.shape)

valid_inputs = valid.drop(columns='default_payment')
valid_label = valid['default_payment']

# Fit the scaler on the training rows only, then apply those same statistics
# to both splits so no validation information leaks into the scaling.
scaler = StandardScaler().fit(train_inputs)
train_inputs_standardized = scaler.transform(train_inputs)
valid_inputs_standardized = scaler.transform(valid_inputs)

# Wrap the scaled arrays back into DataFrames carrying the original column names.
train_inputs_standardized_df = pd.DataFrame(
    train_inputs_standardized, columns=train_inputs.columns
)
valid_inputs_standardized_df = pd.DataFrame(
    valid_inputs_standardized, columns=valid_inputs.columns
)

(24018, 32)


## 1.4

In [44]:
# NumPy arrays of the standardized features and the raw labels, in the form
# the dataset class below converts to tensors.
train_input_ar = train_inputs_standardized_df.to_numpy()
train_label_ar = train_label.to_numpy()

valid_input_ar = valid_inputs_standardized_df.to_numpy()
valid_label_ar = valid_label.to_numpy()

class HWDataset(Dataset):
    """In-memory dataset pairing each feature row with its label.

    Both arrays are copied into ``float32`` tensors once, at construction,
    so indexing afterwards is a plain tensor lookup.
    """

    def __init__(self, input_ar, label_ar):
        """Store features and labels as ``float32`` tensors.

        Args:
            input_ar: Array-like of features; the first axis indexes samples.
            label_ar: Array-like of labels; the first axis indexes samples.

        Raises:
            ValueError: If the two arrays disagree on the number of samples.
        """
        self.inputs = torch.tensor(input_ar, dtype=torch.float32)
        self.labels = torch.tensor(label_ar, dtype=torch.float32)
        # Fail here rather than later with an out-of-range index (or with
        # silently ignored rows) when features and labels are misaligned.
        if self.inputs.shape[:1] != self.labels.shape[:1]:
            raise ValueError(
                "input_ar and label_ar must have the same number of samples, "
                "got %s and %s" % (tuple(self.inputs.shape[:1]),
                                   tuple(self.labels.shape[:1]))
            )

    def __len__(self):
        """Return the number of samples (length of the label tensor)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(inputs, label)`` pair stored at position ``idx``."""
        return self.inputs[idx], self.labels[idx]

# One dataset per split, built from the matching feature/label arrays.
train_data = HWDataset(input_ar=train_input_ar, label_ar=train_label_ar)
valid_data = HWDataset(input_ar=valid_input_ar, label_ar=valid_label_ar)

## 1.5

In [47]:
# Batches of 1000 rows for both splits. Only the training loader reshuffles
# each epoch; the validation loader keeps a fixed order.
train_dl = DataLoader(dataset=train_data, batch_size=1000, shuffle=True)
valid_dl = DataLoader(dataset=valid_data, batch_size=1000, shuffle=False)

## 1.6

In [48]:
# Single linear layer mapping the 32 input features to one raw output value
# per row (no activation inside the model).
model = nn.Sequential(
    nn.Linear(in_features=32, out_features=1),
)
model

Sequential(
  (0): Linear(in_features=32, out_features=1, bias=True)
)

In [49]:
# Pull one batch from the training loader so its tensors can be inspected.
x, y = next(iter(train_dl))

In [50]:
# Shapes of the feature batch and the label batch drawn above.
x.shape, y.shape

(torch.Size([1000, 32]), torch.Size([1000]))

## 1.7

In [51]:
def val_metric(model, dl):
    """Evaluate ``model`` on every batch of ``dl``.

    The model is switched to eval mode and left there; callers that keep
    training must call ``model.train()`` again.

    Args:
        model: Module producing one logit per input row, shaped either
            ``(batch, 1)`` or ``(batch,)``.
        dl: Iterable of ``(x, y)`` tensor batches, where ``y`` holds one
            binary target per row.

    Returns:
        Tuple ``(loss, auc)``: the binary cross-entropy averaged over all
        samples (not over batches, so a short final batch is not
        over-weighted) and the ROC AUC of the sigmoid probabilities.

    Raises:
        ValueError: If ``dl`` yields no samples.
    """
    model.eval()

    total_loss = 0.0
    n_samples = 0
    y_hats = []
    ys = []

    # Evaluation never back-propagates, so skip building the autograd graph.
    with torch.no_grad():
        for x, y in dl:
            # reshape(-1) instead of a bare squeeze(): squeeze() collapses a
            # single-row batch to a 0-d tensor that no longer matches y.
            y_hat = model(x).reshape(-1)
            y = y.float().reshape(-1)
            # Sum here and divide once at the end for a per-sample mean.
            loss = F.binary_cross_entropy_with_logits(y_hat, y, reduction="sum")
            total_loss += loss.item()
            n_samples += y.numel()
            y_hats.append(torch.sigmoid(y_hat).numpy())
            ys.append(y.numpy())

    if n_samples == 0:
        raise ValueError("val_metric received no samples to evaluate")

    ys = np.concatenate(ys)
    y_hats = np.concatenate(y_hats)

    return total_loss / n_samples, roc_auc_score(ys, y_hats)

## 1.8

In [64]:
def train_loop(model, train_dl, valid_dl, optimizer, epochs):
    """Fit ``model`` for ``epochs`` passes over ``train_dl``, logging each one.

    After every pass a single line is printed with the mean of the per-batch
    training losses plus the validation loss and AUC from ``val_metric``.

    Args:
        model: Module whose output must match ``y.unsqueeze(1)`` in shape.
        train_dl: Iterable of ``(x, y)`` training batches.
        valid_dl: Batches handed to ``val_metric`` after each pass.
        optimizer: Optimizer that updates the parameters of ``model``.
        epochs: Number of full passes over ``train_dl``.
    """
    def run_epoch():
        # One optimisation pass; returns the scalar loss of every batch.
        model.train()
        batch_losses = []
        for features, targets in train_dl:
            optimizer.zero_grad()
            logits = model(features.float())
            # Add a trailing axis to the targets so they line up with logits.
            batch_loss = F.binary_cross_entropy_with_logits(
                logits, targets.unsqueeze(1).float()
            )
            batch_loss.backward()
            optimizer.step()
            batch_losses.append(batch_loss.item())
        return batch_losses

    for _ in range(epochs):
        epoch_loss = np.mean(run_epoch())
        val_loss, val_auc = val_metric(model, valid_dl)
        report = (epoch_loss, val_loss, val_auc)
        print("train loss %.3f valid loss %.3f AUC %.3f" % report)

## 1.9

In [68]:
# NOTE(review): lr=0.8 is far above Adam's default of 1e-3, and the logged
# validation loss / AUC swing from epoch to epoch; confirm this rate is
# intended. Re-running this cell also continues from the already-trained
# `model` rather than from fresh weights.
optimizer = optim.Adam(params=model.parameters(), lr=0.8)
train_loop(model, train_dl, valid_dl, optimizer, epochs=20)

train loss 0.778 valid loss 0.609 AUC 0.694
train loss 0.551 valid loss 0.625 AUC 0.596
train loss 0.554 valid loss 0.576 AUC 0.645
train loss 0.544 valid loss 0.634 AUC 0.686
train loss 0.541 valid loss 0.596 AUC 0.639
train loss 0.532 valid loss 0.573 AUC 0.665
train loss 0.536 valid loss 0.589 AUC 0.650
train loss 0.512 valid loss 0.549 AUC 0.689
train loss 0.503 valid loss 0.738 AUC 0.486
train loss 0.591 valid loss 0.689 AUC 0.632
train loss 0.558 valid loss 0.588 AUC 0.680
train loss 0.533 valid loss 0.647 AUC 0.687
train loss 0.571 valid loss 0.660 AUC 0.594
train loss 0.548 valid loss 0.575 AUC 0.670
train loss 0.543 valid loss 0.633 AUC 0.677
train loss 0.558 valid loss 0.692 AUC 0.511
train loss 0.577 valid loss 0.572 AUC 0.680
train loss 0.523 valid loss 0.594 AUC 0.646
train loss 0.534 valid loss 0.770 AUC 0.645
train loss 0.586 valid loss 0.701 AUC 0.654
