In [0]:
# Run this for pre-processesed dataset file
! git clone https://github.com/kendreaditya/ECG_DATA.git

Cloning into 'ECG_DATA'...
remote: Enumerating objects: 4, done.[K
remote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 4 (delta 0), reused 4 (delta 0), pack-reused 0[K
Unpacking objects: 100% (4/4), done.


In [0]:
import pickle
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim 
from torch.autograd import Variable

In [0]:
# Number of words for which we are storing an embedding
vocab_size    = 3000
# Number of dimensions of the embeddings
embedding_dim = 50
# Number of examples per mini-batch
batch_size    = 256
# Fixed length every encoded sequence is padded or truncated to
input_len     = 36
# Number of passes over the training data
epochs        = 10
# The running training loss is printed every `print_every` batches
print_every   = 1000
# Use the GPU only when one is actually available, so that the notebook
# also runs on CPU-only machines instead of failing on the first .cuda() call
cuda          = torch.cuda.is_available()

In [0]:
def load_files(data_path = 'ECG_DATA/', vocab_path = '../data/sentiment_vocabulary.pkl'):
    """Load the pickled dataset and the pickled vocabulary.

    Args:
        data_path:  path of the pickle file holding the dataset.
        vocab_path: path of the pickle file holding the vocabulary.

    Returns:
        A `(data, vocab)` tuple with the two unpickled objects.

    Raises:
        OSError: if either path cannot be opened as a file.

    NOTE(review): the default `data_path` is kept for backward compatibility,
    but 'ECG_DATA/' names a directory, which `open` cannot read. Pass the
    actual pickle file inside the cloned repository -- TODO confirm its name.
    """
    # NOTE(review): pickle.load can execute arbitrary code while unpickling;
    # only load files that come from a trusted source.
    with open(data_path, 'rb') as data_file:
        data = pickle.load(data_file)

    with open(vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    return data, vocab

This function creates a correspondence between the `vocab_size` most frequent words and the integers in $[1, \text{vocab\_size}]$. We use the index 0 both to encode rare words and to pad the word sequences to a uniform length.

In [0]:
def create_word_to_idx(vocab):
    """Assign 1-based indices to the highest-valued entries of `vocab`.

    The `(word, value)` pairs of `vocab` are ranked by value in descending
    order (ties keep their original dictionary order) and the first
    `vocab_size` of them (module-level setting) are numbered 1, 2, 3, ...
    Index 0 is never assigned here.

    Returns:
        A dict mapping each retained word to its rank.
    """
    ranked = sorted(vocab.items(), key = lambda entry: entry[1], reverse = True)

    word_to_idx = {}
    for rank, (word, _) in enumerate(ranked[:vocab_size], start = 1):
        word_to_idx[word] = rank

    return word_to_idx

In [0]:
def encode_data(data, word_to_idx, input_len):
    """Turn every `(words, target)` pair into a fixed-length index sequence.

    Args:
        data:        iterable of `(words, target)` pairs, `words` being a
                     sequence of string tokens.
        word_to_idx: dict mapping known words to their integer index.
        input_len:   length every encoded sequence must have.

    Returns:
        A list of `(text, indices, target)` triples, where `text` is the
        space-joined words and `indices` has exactly `input_len` entries.
    """
    encoded_data = []

    for tokens, label in data:
        # Words missing from the mapping are encoded as 0; anything beyond
        # input_len tokens is dropped.
        indices = [word_to_idx.get(token, 0) for token in tokens][:input_len]
        # Short sequences are right-padded with zeros (a no-op once the
        # sequence already has input_len entries).
        indices = indices + [0] * (input_len - len(indices))
        encoded_data.append((' '.join(tokens), indices, label))

    return encoded_data

In [0]:
def load_data(vocab_size, input_len, test_proportion = 0.2):
    """Load, encode, shuffle and split the dataset.

    Args:
        vocab_size:      accepted for interface compatibility.
                         NOTE(review): not referenced in this body -- the
                         vocabulary cut-off is applied by create_word_to_idx,
                         which is called without it.
        input_len:       length of every encoded sequence.
        test_proportion: fraction of the examples reserved for the test set.

    Returns:
        A `(training_data, test_data)` tuple of lists of encoded examples.
    """
    data, vocab = load_files()
    examples    = encode_data(data, create_word_to_idx(vocab), input_len)

    # The first `split_at` examples of the shuffled list form the training
    # set; the remainder is the test set.
    split_at = int(len(examples) * (1 - test_proportion))
    random.shuffle(examples)

    return examples[:split_at], examples[split_at:]

In [0]:
def batch_to_tensor(batch):
    """Split a batch of `(text, indices, target)` triples into model inputs.

    Returns:
        A `(tweets, inputs, targets)` tuple: the list of raw texts, a
        LongTensor built from the index sequences, and a LongTensor built
        from the targets.
    """
    tweets, encoded_inputs, labels = [], [], []
    for text, encoded, label in batch:
        tweets.append(text)
        encoded_inputs.append(encoded)
        labels.append(label)

    return tweets, torch.LongTensor(encoded_inputs), torch.LongTensor(labels)

In [0]:
def batch_generator(data, batch_size, shuffle = True):
    """Return a lazy iterator over tensor batches of `data`.

    Args:
        data:       list of encoded examples.
        batch_size: maximum number of examples per batch; the last batch
                    may be smaller.
        shuffle:    when true, batches are drawn from a shuffled copy of
                    `data` (the caller's list is left untouched).

    Returns:
        A generator yielding `batch_to_tensor(...)` of each consecutive slice.
    """
    # random.sample over the full length yields a shuffled copy; it runs
    # right away, while the batches themselves are built on demand.
    examples = random.sample(data, len(data)) if shuffle else data
    starts   = range(0, len(examples), batch_size)

    return (batch_to_tensor(examples[start: start + batch_size]) for start in starts)

In [0]:
def evaluate_model(cnn, criterion, train_data, test_data, batch_size):
    """Compute the average loss and the accuracy of `cnn` on both datasets.

    The model's train/eval mode is not changed here; the caller is expected
    to switch it beforehand. Tensors are moved to the GPU when the
    module-level `cuda` flag is set.

    Args:
        cnn:        model mapping a batch of inputs to per-class scores.
        criterion:  loss function called as `criterion(predictions, targets)`.
        train_data: list of encoded training examples.
        test_data:  list of encoded test examples.
        batch_size: number of examples evaluated at once.

    Returns:
        `((train_loss, train_accuracy), (test_loss, test_accuracy))`, all
        Python floats. Each loss is the mean of the per-batch losses and each
        accuracy is the fraction of correct predictions. Both values are NaN
        for an empty dataset.
    """
    def evaluate_model_data(data):
        batch_number     = 0
        total_loss       = 0.0
        total_correct    = 0
        total_prediction = 0
        # No gradients are needed for evaluation; disabling them avoids
        # building the autograd graph for every batch.
        with torch.no_grad():
            for _, inputs, targets in batch_generator(data, batch_size, shuffle = False):
                inputs            = inputs.cuda() if cuda else inputs
                targets           = targets.cuda() if cuda else targets
                predictions       = cnn(inputs)
                loss              = criterion(predictions, targets)
                # .item() extracts the Python number from a 0-dim tensor;
                # indexing such a tensor with [0] raises on PyTorch >= 0.4.
                total_loss       += loss.item()
                batch_number     += 1
                pred_classes      = predictions.max(dim = 1)[1]
                total_prediction += predictions.size(0)
                total_correct    += (pred_classes == targets).sum().item()

        if batch_number == 0:
            # Nothing was evaluated: report NaN instead of dividing by zero.
            return float('nan'), float('nan')

        average_loss     = total_loss / batch_number
        average_accuracy = total_correct / total_prediction

        return average_loss, average_accuracy

    return evaluate_model_data(train_data), evaluate_model_data(test_data)

In [0]:
def print_model_evaluation(cnn, epoch, criterion, train_data, test_data, batch_size):
    """Evaluate `cnn` on both datasets and print a one-line summary.

    The model is put in evaluation mode for the measurement and switched
    back to training mode afterwards. The printed label is `epoch + 1`.
    """
    cnn.eval()
    (train_loss, train_accuracy), (test_loss, test_accuracy) = evaluate_model(
        cnn, criterion, train_data, test_data, batch_size
    )
    cnn.train()

    label      = f'[{epoch + 1:3}] '
    train_part = f'train loss: {train_loss:.4f}, train accuracy: {100 * train_accuracy:.3f}%, '
    test_part  = f'test loss: {test_loss:.4f}, test accuracy: {100 * test_accuracy:.3f}%'
    print(label + train_part + test_part)

The network used in this notebook is a simple CNN with Batch Normalization and Dropout. In the forward pass, after computing the embedding of each word of the sequence, we have to swap the last two dimensions, because the embedding layer's output has a `(batch_size, seq_len, channels)` shape whereas the convolution layers expect `(batch_size, channels, seq_len)` as their input shape. More details on this issue can be found in [this topic](https://discuss.pytorch.org/t/inconsistent-dimension-ordering-for-1d-networks-ncl-vs-nlc-vs-lnc/14807).

In [0]:
class CNN(nn.Module):
    """1-D convolutional classifier over sequences of word indices.

    Architecture: embedding -> 2 x (conv, ReLU, batch norm, dropout) ->
    average pooling -> 2 x (conv, ReLU, batch norm, dropout) -> average
    pooling -> 2 x (linear, ReLU, batch norm, dropout) -> linear ->
    log-softmax.

    Args:
        vocab_size:    number of words that have their own index; indices
                       run from 0 to `vocab_size` inclusive, so the embedding
                       table has `vocab_size + 1` rows.
        input_len:     length of the input sequences. It is halved by each of
                       the two pooling layers, which fixes the size of the
                       first linear layer.
        embedding_dim: size of each word embedding, which is also the number
                       of input channels of the first convolution.
        num_classes:   number of output classes (default 4).
    """

    def __init__(self, vocab_size, input_len, embedding_dim, num_classes = 4):
        super(CNN, self).__init__()
        # forward() starts with an embedding lookup, so the layer has to exist.
        self.embedding     = nn.Embedding(vocab_size + 1, embedding_dim)
        # After the transpose in forward(), the channel axis holds the
        # embedding dimensions.
        self.conv1         = nn.Conv1d(embedding_dim, 64, 3, padding = 1)
        self.bn1           = nn.BatchNorm1d(64)
        # NOTE(review): in PyTorch `p` is the probability of zeroing a unit,
        # so 0.8 drops 80% of the activations -- confirm this is intended.
        self.dropout1      = nn.Dropout(p = 0.8)
        self.conv2         = nn.Conv1d(64 , 64 , 3, padding = 1)
        self.bn2           = nn.BatchNorm1d(64)
        self.dropout2      = nn.Dropout(p = 0.8)
        self.conv3         = nn.Conv1d(64 , 128, 3, padding = 1)
        self.bn3           = nn.BatchNorm1d(128)
        self.dropout3      = nn.Dropout(p = 0.8)
        self.conv4         = nn.Conv1d(128, 128, 3, padding = 1)
        self.bn4           = nn.BatchNorm1d(128)
        self.dropout4      = nn.Dropout(p = 0.8)
        # The convolutions keep the length (kernel 3, padding 1); each of the
        # two poolings floors it to half. For input_len = 36 this gives 9.
        self.flat_features = 128 * (input_len // 2 // 2)
        self.linear1       = nn.Linear(self.flat_features, 256)
        self.bn5           = nn.BatchNorm1d(256)
        self.dropout5      = nn.Dropout(p = 0.8)
        self.linear2       = nn.Linear(256, 256)
        self.bn6           = nn.BatchNorm1d(256)
        self.dropout6      = nn.Dropout(p = 0.8)
        self.linear3       = nn.Linear(256, num_classes)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of index sequences.

        Args:
            x: integer tensor of shape `(batch_size, input_len)`.

        Returns:
            Tensor of shape `(batch_size, num_classes)`.
        """
        x = self.embedding(x)
        # (batch, seq_len, channels) -> (batch, channels, seq_len) for Conv1d.
        x = x.transpose(1, 2).contiguous()
        x = self.dropout1(self.bn1(F.relu(self.conv1(x))))
        x = self.dropout2(self.bn2(F.relu(self.conv2(x))))
        x = F.avg_pool1d(x, 2)
        x = self.dropout3(self.bn3(F.relu(self.conv3(x))))
        x = self.dropout4(self.bn4(F.relu(self.conv4(x))))
        x = F.avg_pool1d(x, 2)
        # Flatten per example; keeping the batch axis explicit makes a wrong
        # sequence length fail in linear1 instead of silently mixing examples.
        x = x.view(x.size(0), -1)
        x = self.dropout5(self.bn5(F.relu(self.linear1(x))))
        x = self.dropout6(self.bn6(F.relu(self.linear2(x))))
        x = F.log_softmax(self.linear3(x), dim = 1)

        return x

In [0]:
# Build the train/test splits from the encoded dataset.
train_data, test_data = load_data(vocab_size, input_len)

# Instantiate the network and move it to the GPU when requested.
cnn = CNN(vocab_size, input_len, embedding_dim)
if cuda:
    cnn = cnn.cuda()

# The network outputs log-probabilities, hence the negative log-likelihood loss.
criterion = nn.NLLLoss()
optimizer = optim.Adam(cnn.parameters())

In [0]:
# Baseline evaluation of the untrained network. print_model_evaluation labels
# its line with `epoch + 1`, so this line is printed as [1] and the line after
# the first training epoch as [2].
print_model_evaluation(cnn, 0, criterion, train_data, test_data, batch_size)
for epoch in range(epochs):
    running_loss = 0
    for i, (_, inputs, targets) in enumerate(batch_generator(train_data, batch_size)):
        optimizer.zero_grad()
        inputs        = inputs.cuda() if cuda else inputs
        targets       = targets.cuda() if cuda else targets
        predictions   = cnn(inputs)
        loss          = criterion(predictions, targets)
        # .item() extracts the Python number from the 0-dim loss tensor;
        # indexing it with [0] raises on PyTorch >= 0.4.
        loss_value    = loss.item()
        running_loss += loss_value
        loss.backward()
        optimizer.step()

        # Report the mean loss of the last `print_every` batches.
        if i % print_every == print_every - 1:
            print(f'\t[{i + 1:6}] running_loss: {running_loss / print_every:.4f}')
            running_loss = 0

    print_model_evaluation(cnn, epoch + 1, criterion, train_data, test_data, batch_size)

[  1] train loss: 0.6943, train accuracy: 50.077%, test loss: 0.6944, test accuracy: 49.967%
	[  1000] running_loss: 0.6972
	[  2000] running_loss: 0.6430
	[  3000] running_loss: 0.5614
	[  4000] running_loss: 0.5199
[  2] train loss: 0.4674, train accuracy: 78.343%, test loss: 0.4769, test accuracy: 77.699%
	[  1000] running_loss: 0.4779
	[  2000] running_loss: 0.4718
	[  3000] running_loss: 0.4652
	[  4000] running_loss: 0.4615
[  3] train loss: 0.4166, train accuracy: 81.428%, test loss: 0.4396, test accuracy: 79.930%
	[  1000] running_loss: 0.4417
	[  2000] running_loss: 0.4381
	[  3000] running_loss: 0.4369
	[  4000] running_loss: 0.4373
[  4] train loss: 0.3918, train accuracy: 82.704%, test loss: 0.4296, test accuracy: 80.348%
	[  1000] running_loss: 0.4194
	[  2000] running_loss: 0.4191
	[  3000] running_loss: 0.4222
	[  4000] running_loss: 0.4203
[  5] train loss: 0.3772, train accuracy: 83.527%, test loss: 0.4271, test accuracy: 80.448%
	[  1000] running_loss: 0.4068
	[  2000

We can see that the network overfits the training data, even with dropout: the training accuracy keeps improving while the test accuracy levels off at around 80%.