In [21]:
from __future__ import print_function, division
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

from time import time
import math
import bcolz
import cPickle as pickle

In [22]:
def save_array(fname, arr): 
    c=bcolz.carray(arr, rootdir=fname, mode='w')
    c.flush()
    
def load_array(fname):
    return bcolz.open(fname)

In [3]:
# load the convolutional feature vectors
train_features = pickle.load(open("data/train_features.pkl", "rb"))
test_features = pickle.load(open("data/test_features.pkl", "rb"))
train_labels = load_array("data/train_labels.bc")
test_labels = load_array("data/test_labels.bc")

In [4]:
print(train_features.size())
print(train_labels.shape)
print(test_features.size())
print(test_labels.shape)

torch.Size([500, 32, 7, 35])
(500, 5)
torch.Size([200, 32, 7, 35])
(200, 5)


In [5]:
train_data = []
for i in range(train_features.size()[0]):
    train_data.append((train_features[i].view(-1), torch.LongTensor(train_labels[i])))
print(train_data[0][0].size(), train_data[0][1].shape)

test_data = []
for i in range(test_features.size()[0]):
    test_data.append((test_features[i].view(-1), torch.LongTensor(test_labels[i])))
print(test_data[0][0].size(), test_data[0][1].shape)

torch.Size([7840]) torch.Size([5])
torch.Size([7840]) torch.Size([5])


In [30]:
vocab_size = 10
embedding_dim = 7840
hidden_dim = 1024
dropout_prob = 0.1
target_size = 5

In [7]:
def time_since(since):
    s = time() - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [8]:
# train an LSTM to recognize digit sequences by using the convolutional features
class MNIST_Seq(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout_prob):
        super(MNIST_Seq, self).__init__()

        self.hidden_dim=hidden_dim

        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout_prob)
        self.decoder = nn.Linear(hidden_dim, vocab_size)


    def init_hidden(self):
        self.hidden = (Variable(torch.zeros(1, 1, self.hidden_dim)).cuda(),
                        Variable(torch.zeros(1, 1, self.hidden_dim)).cuda())

    def forward(self, context):
        lstm_out, self.hidden = self.lstm(context, self.hidden)
        lstm_dropped = self.dropout(lstm_out)
        decoded = self.decoder(lstm_dropped)
        #logprobs = F.log_softmax(decoded)
        return decoded

In [28]:
def evaluate(data):
    model.eval()
    correct_digits = 0
    total_digits = 0
    for this_input, this_target in data:
        model.init_hidden()
        input_var = Variable(torch.stack([this_input] * target_size)).cuda()
        output = model(input_var.view(target_size,1,-1))
        top_n, top_i = torch.topk(output, 1, dim=2)
        pred = top_i.cpu().data.view(5)
        correct_digits += (pred == this_target).sum()
        total_digits += 5
#         print("pred: ", pred.numpy())
#         print("targ: ", this_target.numpy())
#         print()
    return float(correct_digits) / total_digits

def train(inp, target):
    model.init_hidden()
    model.zero_grad()

    input_var = Variable(torch.stack([inp] * target_size)).cuda()

    output = model(input_var.view(target_size, 1, -1))
    loss = criterion(output.view(target_size, vocab_size), Variable(target).cuda())

    loss.backward()
    optimizer.step()

    return loss.data[0] / target_size

def train_epoch():
    model.train(True)
    for this_input, this_target in train_data:
        loss = train(this_input, this_target)
        #print loss
    return loss

In [31]:
model = MNIST_Seq(vocab_size, embedding_dim, hidden_dim, dropout_prob).cuda()
criterion = nn.CrossEntropyLoss()

In [32]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [34]:
n_epochs=25
print_every = 1
plot_every = 10

all_losses = []
loss_avg = 0

In [25]:
for param_group in optimizer.param_groups:
    param_group['lr'] = 0.0002

In [35]:
start = time()
for epoch in range(1, n_epochs + 1):
    loss = train_epoch()
    loss_avg += loss

    if epoch % print_every == 0:
        
        train_acc = evaluate(train_data)
        test_acc = evaluate(test_data)
        print('[%s (%d %d%%) %.4f (train_acc: %.3f, test_acc: %.3f)]' % (time_since(start), epoch, epoch / n_epochs * 100, loss, train_acc, test_acc))

    if epoch % plot_every == 0:
        all_losses.append(loss_avg / plot_every)
        loss_avg = 0

[0m 21s (1 4%) 0.4488 (train_acc: 0.336, test_acc: 0.316)]
[0m 43s (2 8%) 0.3666 (train_acc: 0.383, test_acc: 0.345)]
[1m 5s (3 12%) 0.3279 (train_acc: 0.439, test_acc: 0.369)]
[1m 27s (4 16%) 0.2935 (train_acc: 0.514, test_acc: 0.429)]
[1m 49s (5 20%) 0.2517 (train_acc: 0.548, test_acc: 0.460)]
[2m 11s (6 24%) 0.2208 (train_acc: 0.607, test_acc: 0.497)]
[2m 33s (7 28%) 0.2212 (train_acc: 0.655, test_acc: 0.513)]
[2m 54s (8 32%) 0.1822 (train_acc: 0.704, test_acc: 0.539)]
[3m 16s (9 36%) 0.2483 (train_acc: 0.726, test_acc: 0.564)]
[3m 38s (10 40%) 0.1750 (train_acc: 0.752, test_acc: 0.565)]
[4m 0s (11 44%) 0.1193 (train_acc: 0.772, test_acc: 0.584)]
[4m 22s (12 48%) 0.1198 (train_acc: 0.779, test_acc: 0.607)]
[4m 44s (13 52%) 0.1168 (train_acc: 0.794, test_acc: 0.619)]
[5m 6s (14 56%) 0.1049 (train_acc: 0.829, test_acc: 0.626)]
[5m 28s (15 60%) 0.1037 (train_acc: 0.854, test_acc: 0.642)]
[5m 49s (16 64%) 0.0744 (train_acc: 0.872, test_acc: 0.684)]
[6m 11s (17 68%) 0.0709 (train_acc: 0.