import libraries

In [1]:
import os
import joblib as jl
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as du
from torch.utils.data import Dataset

read JUND data

In [2]:
class JUND_Dataset(Dataset):
    def __init__(self, data_dir):
        '''load X, y, w, a from data_dir
        convert all to float tensors'''
        super(JUND_Dataset, self).__init__()
        cur_dir = os.path.join(os.getcwd(), data_dir)
        self.X = jl.load(os.path.join(cur_dir, 'shard-0-X.joblib'))
        self.X = torch.tensor(self.X, dtype=torch.float)
        self.y = jl.load(os.path.join(cur_dir, 'shard-0-y.joblib'))
        self.y = torch.tensor(self.y, dtype=torch.float)
        self.w = jl.load(os.path.join(cur_dir, 'shard-0-w.joblib'))
        self.w = torch.tensor(self.w, dtype=torch.float)
        self.a = jl.load(os.path.join(cur_dir, 'shard-0-a.joblib'))
        self.a = torch.tensor(self.a, dtype=torch.float)
        
    def __len__(self):
        '''return len of dataset'''
        return self.X.shape[0]
    
    def __getitem__(self, idx):
        '''return X, y, w, and a values at index idx'''
        return self.X[idx,:], self.y[idx], self.w[idx], self.a[idx]
        

In [3]:
# create datasets and print basic stats
train_data = JUND_Dataset('train_dataset')
test_data = JUND_Dataset('test_dataset')
val_data = JUND_Dataset('valid_dataset')
print("train X", train_data.X.shape)
print("test X", test_data.X.shape)
print("val X", val_data.X.shape)
print("frac train pos", len(torch.where(train_data.y == 1)[0])/train_data.y.shape[0])
print("frac val pos", len(torch.where(val_data.y == 1)[0])/val_data.y.shape[0])
print("frac test pos", len(torch.where(test_data.y == 1)[0])/test_data.y.shape[0])
print("w_sum", train_data.w.sum(), val_data.w.sum(), test_data.w.sum())

train X torch.Size([276216, 101, 4])
test X torch.Size([34528, 101, 4])
val X torch.Size([34527, 101, 4])
frac train pos 0.004228574738610363
frac val pos 0.004228574738610363
frac test pos 0.0042284522706209455
w_sum tensor(276215.9375) tensor(34526.9961) tensor(34527.9961)


define MLP model

In [4]:
class MLP(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim, use_a=True,
                 dropout=0):
        '''in_dim: input layer dim
           hidden_dim: hidden layer dim
           out_dim: output layer dim
           use_a: use accessibility value?
           dropout: dropout probability
           '''
        
        super(MLP, self).__init__()
        self.dropout = dropout
        self.use_a = use_a
            
        # input is 101 x 4 array, so flatten into 404d vec
        self.flatten = nn.Flatten()
        
        #two fully connected layers
        self.fc1 = nn.Linear(in_dim, hidden_dim)
        
        #concat a or not? accessibility info
        if use_a:
            hidden_dim += 1
        
        self.fc2 = nn.Linear(hidden_dim, out_dim)

    def forward(self, x, a):
        #x is Bx101x4, a is Bx1 accessibility values          
        # since x is 101x4, flatten it first
        x = self.flatten(x)
        
        # compute output of fc1, and apply relu activation, followed by dropout
        x = F.relu(self.fc1(x))
        x = F.dropout(x, p=self.dropout)
        
        # concat x and a
        if self.use_a:
            x = torch.cat([x, a], dim=1)
            
       # compute output layer
        x = self.fc2(x)
        
        return x

In [5]:
def compute_correct(output, target, weight):
    '''compute the weights for correct prediction
       first apply sigmoid and predict class 1 if >= 0.5, 0 otherwise
    '''
    #use logsigmoid for log space computations
    output = F.logsigmoid(output.detach())
    pred = torch.where(output > F.logsigmoid(torch.tensor(0.5)), 
                       1, 0)

    # add up weights of correct predictions
    correct = torch.sum((pred == target)*weight)
    
    return correct.item()

Evaluation Loop: Used for Validation and Testing

In [6]:
def eval_model(model, data_loader):
    # set model in eval mode, since we are no longer training
    model.eval()
    eval_loss = 0.
    correct = 0.
    
    # turn of gradient computation, will speed up testing
    with torch.no_grad():
        for batch_idx, (data, target, weight, accessibility)\
            in enumerate(data_loader):
            # send batches to device
            data = data.to(device)
            target = target.to(device)
            weight = weight.to(device)
            accessibility = accessibility.to(device)

            # compute forward pass and loss
            output = model(data, accessibility)
            loss = F.binary_cross_entropy_with_logits(
                output, target, weight=weight)

            # sum up batch loss
            eval_loss += loss.item()

            # add up number of correct predictions
            correct += compute_correct(output, target, weight)
            
        # eval loss per example
        eval_loss /= (batch_idx+1)

        # final test accuracy
        eval_acc = correct / data_loader.dataset.w.sum().item()

    #put model back to training mode at end of eval
    model.train()
    return eval_loss, eval_acc

Set up training

In [10]:
device = f'cuda:0' if torch.cuda.is_available() else 'cpu'
print(f"using device: {device}")
batch_size = 1000
learning_rate = 0.5
epochs = 5
weight_decay = 0.
dropout = 0.5
use_a = True

# set model and optimizer
# input is 101x4 array 
# use 128d hidden layer
# output is 1d since there are 2 classes; use sigmoid

model = MLP(101*4, 128, 1, use_a, dropout)
    
optimizer = optim.Adam(model.parameters(), lr=learning_rate,
                      weight_decay=weight_decay)

# load training and validation data in batches
train_loader = du.DataLoader(dataset=train_data,
                             batch_size=batch_size,
                             shuffle=True)

val_loader = du.DataLoader(dataset=val_data,
                             batch_size=batch_size,
                             shuffle=True)
# send model over to device
model = model.to(device)

using device: cuda:0


Training loop over batches

In [11]:
for epoch in range(1, epochs + 1):
    sum_loss = 0.
    correct = 0.
    for batch_idx, (data, target, weight, accessibility)\
        in enumerate(train_loader):
        # send batch over to device
        data = data.to(device)
        target = target.to(device)
        weight = weight.to(device)
        accessibility = accessibility.to(device)
        
        # zero out prev gradients
        optimizer.zero_grad()
        
        # run the forward pass
        output = model(data, accessibility)
        # compute loss/error with weight per sample
        loss = F.binary_cross_entropy_with_logits(
                output, target, weight=weight)
        sum_loss += loss.item()
        
        #compute training accuracy
        correct += compute_correct(output, target, weight)

        # compute gradients and take a step
        loss.backward()
        optimizer.step()
        
    # average loss per example
    sum_loss /= (batch_idx+1)
    train_acc = correct / train_data.w.sum().item()
    print(f'Epoch: {epoch}, Loss: {sum_loss:.6e}, Acc: {train_acc:.4f}')
    if epoch % 5 == 0:  
        #how does validation do?
        val_loss, val_acc = eval_model(model, val_loader)
        print(f'Val Loss: {val_loss:.6e}, Val Acc: {val_acc:.4f}')

Epoch: 1, Loss: 1.315484e+01, Acc: 0.5552
Epoch: 2, Loss: 6.733035e+00, Acc: 0.6379
Epoch: 3, Loss: 6.945383e-01, Acc: 0.6705
Epoch: 4, Loss: 8.903910e-01, Acc: 0.6819
Epoch: 5, Loss: 5.680076e-01, Acc: 0.6955
Val Loss: 5.334630e-01, Val Acc: 0.7201


Now do testing

In [14]:
# load test data in batches
test_loader = du.DataLoader(dataset=test_data,
                            batch_size=batch_size,
                            shuffle=False)
test_loss, test_acc = eval_model(model, test_loader)
print(f'Test loss: {test_loss:.6e}, accuracy: {test_acc:.4f}')

Test loss: 5.249908e-01, accuracy: 0.7198
