In [125]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd
from torch.autograd import Variable

import evaluation
import csv

import numpy as np

In [120]:
vocab_size = 40000
embedding_size = 50
Y = 10
dropout = 0.3
kernel_size = 3
num_epochs = 5
batch_size = 32
log_interval = 100

In [133]:
#yield some tensors. file should hold data sorted by sequence length, for batching
def data_generator(filename, batch_size, Y):
    with open(filename, 'r') as infile:
        r = csv.reader(infile)
        #header
        next(r)
        cur_insts = []
        cur_labels = []
        cur_length = 0
        for row in r:
            #find the next batch_size instances with the same length
            text = row[1]
            length = int(row[3])
            if length > cur_length:
                if len(cur_insts) > 0:
                    #create the tensors
                    yield torch.LongTensor(cur_insts), torch.FloatTensor(cur_labels)
                    #clear
                    cur_insts = []
                    cur_labels = []
                cur_insts.append([int(w) for w in text.split()])
                labels = [int(l) for l in row[2].split(';')]
                cur_labels.append([1 if i in labels else 0 for i in range(Y)])
                #reset length
                cur_length = length
            else:
                if len(cur_insts) == batch_size:
                    #create the tensors
                    yield torch.LongTensor(cur_insts), torch.FloatTensor(cur_labels)
                    #clear
                    cur_insts = []
                    cur_labels = []
                cur_insts.append([int(w) for w in text.split()])
                labels = [int(l) for l in row[2].split(';')]
                cur_labels.append([1 if i in labels else 0 for i in range(Y)])

In [134]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.embed_drop = nn.Dropout(p=dropout)
        self.conv = nn.Conv1d(embedding_size, Y, kernel_size=kernel_size)
        self.conv_drop = nn.Dropout(p=dropout)
        self.fc = nn.Linear(Y, Y)
        
    def forward(self, x):
        x = self.embed_drop(self.embed(x))
        x = torch.transpose(x, 1, 2).contiguous()
        
        x = self.conv_drop(self.conv(x))
        
        x = F.tanh(F.max_pool1d(x, kernel_size=x.size()[2]))
        x = torch.squeeze(x, dim=2)
        
        x = self.fc(x)
        
        return F.sigmoid(x)

In [102]:
model = Net()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

In [126]:
def train(epoch, dataset):
    filename = '../mimicdata/notes_10_train_' + dataset + '_sorted.csv'
#     train_loader = data_generator(filename, batch_size, Y)
    #just sets the model into 'train' mode
    model.train()
    for batch_idx, (data, target) in enumerate(data_generator(filename, batch_size, Y)):
        data, target = Variable(data), Variable(target)
        #clear gradients
        optimizer.zero_grad()
        #forward computation
        output = model(data)
        loss = F.binary_cross_entropy(output, target)
        #backward pass
        loss.backward()
        #kick it in the right direction
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [batch #{}, batch_size {}, seq length {}]\tLoss: {:.6f}'.format(
                epoch+1, batch_idx, data.size()[0], data.size()[1], loss.data[0]))

In [151]:
def test(epoch, dataset):
    filename = '../mimicdata/notes_10_dev_' + dataset + '_sorted.csv'
#     test_loader = data_generator(filename, batch_size, Y)
    #set model to 'eval' mode
    model.eval()
    test_loss = 0
    y = []
    yhat = []
    yhat_raw = []
    for data, target in data_generator(filename, 1, Y):
        data, target = Variable(data, volatile=True), Variable(target)
        #predict
        output = model(data)
        test_loss += F.binary_cross_entropy(output, target)
        yhat_raw.append(output.data.numpy())
        output[output >= 0.5] = 1
        output[output < 0.5] = 0
        y.append(target.data.numpy())
        yhat.append(output.data.numpy())
        
    y = np.squeeze(np.array(y))
    yhat = np.squeeze(np.array(yhat))
    yhat_raw = np.squeeze(np.array(yhat))
    print(y.shape)
    print(yhat.shape)
    acc, prec, rec, f1 = evaluation.all_metrics(yhat, y)
    print("acc, prec, rec, f1")
    print(acc, prec, rec, f1)
    return y, yhat, yhat_raw

In [152]:
y = None
yhat = None
yhat_raw = None
for epoch in range(1):
    train(epoch, 'single')
    y,yhat, yhat_raw = test(epoch, 'single')

Train Epoch: 0 [batch #0, batch_size 1, seq length 6]	Loss: 0.209235
Train Epoch: 0 [batch #100, batch_size 31, seq length 116]	Loss: 0.270123
Train Epoch: 0 [batch #200, batch_size 7, seq length 160]	Loss: 0.286257
Train Epoch: 0 [batch #300, batch_size 32, seq length 196]	Loss: 0.261735
Train Epoch: 0 [batch #400, batch_size 21, seq length 235]	Loss: 0.262348
Train Epoch: 0 [batch #500, batch_size 32, seq length 278]	Loss: 0.293996
Train Epoch: 0 [batch #600, batch_size 32, seq length 324]	Loss: 0.292046
Train Epoch: 0 [batch #700, batch_size 32, seq length 373]	Loss: 0.286776
Train Epoch: 0 [batch #800, batch_size 14, seq length 466]	Loss: 0.253835
Train Epoch: 0 [batch #900, batch_size 2, seq length 567]	Loss: 0.233735
Train Epoch: 0 [batch #1000, batch_size 4, seq length 671]	Loss: 0.376782
Train Epoch: 0 [batch #1100, batch_size 2, seq length 781]	Loss: 0.347014
Train Epoch: 0 [batch #1200, batch_size 1, seq length 899]	Loss: 0.229172
Train Epoch: 0 [batch #1300, batch_size 1, se

  num = intersect_size(yhat, y, 1) / yhat.sum(axis=1)


In [156]:
yhat_raw

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32)