In [None]:
# default_exp models

In [67]:
#export
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# models

> API details:

In [3]:
# append our Aeye package
# TODO: fix this import issue when using the package.
import sys 
sys.path.append('../Aeye')

from preprocessing import Lang, tensorForImageCaption, get_preprocessed_data, SOS_token, EOS_token

In [4]:
feature_dict, sentence_list, lang = get_preprocessed_data('train')
sentence_list = random.sample(sentence_list, len(sentence_list))

In [55]:
img, sent = tensorForImageCaption(feature_dict, sentence_list[0], lang)

In [5]:
sentence_list

[('2478929971_9eb6c074b6.jpg',
  ['a',
   'group',
   'of',
   'boys',
   'and',
   'girls',
   'sit',
   'and',
   'talk',
   'together',
   'under',
   'a',
   'tree']),
 ('3229821595_77ace81c6b.jpg',
  ['a',
   'group',
   'of',
   'people',
   'are',
   'standing',
   'on',
   'a',
   'ledge',
   'overlooking',
   'low',
   'clouds']),
 ('241347803_afb04b12c4.jpg',
  ['this',
   'football',
   'team',
   'wears',
   'red',
   'shirts',
   'and',
   'red',
   'helmets']),
 ('894928353_002a3d5f06.jpg',
  ['the', 'three', 'children', 'are', 'playing', 'on', 'the', 'rails']),
 ('1247181182_35cabd76f3.jpg',
  ['the',
   'man',
   'is',
   'sitting',
   'at',
   'the',
   'top',
   'of',
   'a',
   'rocky',
   'mountain']),
 ('420355149_f2076770df.jpg',
  ['a',
   'little',
   'kid',
   'dressed',
   'in',
   'blue',
   'playing',
   'in',
   'a',
   'boat']),
 ('3601569729_bf4bf82768.jpg',
  ['there',
   'are',
   'riders',
   'and',
   'horses',
   'in',
   'a',
   'horse',
   'race',


## Encoder

In [13]:
#export

# this is the encoder rnn. for ecodings use a resnet
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_features, n_out_features):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        
        self.embeddings = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.fc = nn.Linear(self.hidden_size + n_features, n_out_features)
        
    def forward(self, img, input, hidden):
        embedded = self.embeddings(input)
        output = embedded
        
        # format img to feed into fc layer
        n = sent.size(0)
        img = torch.from_numpy(np.tile(img, (n, 1)))
        img = torch.unsqueeze(img, 1)
        
        # feed into fc layer
        output = torch.cat((img, output), dim=-1)
        output = F.relu(self.fc(output))
        
        output, hidden = self.gru(output, hidden)
        return output, hidden
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)
    

In [7]:
hidden_size = 512
input_size = lang.n_words

encoder = EncoderRNN(input_size, hidden_size, n_features=512, n_out_features=512)
encoder

EncoderRNN(
  (embeddings): Embedding(7376, 512)
  (gru): GRU(512, 512)
  (fc): Linear(in_features=1024, out_features=512, bias=True)
)

In [8]:
hidden = encoder.initHidden()
output, hidden = encoder(img, sent, hidden)

In [9]:
output.shape, hidden.shape

(torch.Size([14, 1, 512]), torch.Size([1, 1, 512]))

## Decoder

In [35]:
#export
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        
        self.embeddings = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, input, hidden):
        output = self.embeddings(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        print(output.shape)
        output = self.softmax(output)
        print(output.shape)
        return output, hidden
    
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [36]:
decoder = DecoderRNN(hidden_size, lang.n_words)
decoder

DecoderRNN(
  (embeddings): Embedding(7376, 512)
  (gru): GRU(512, 512)
  (out): Linear(in_features=512, out_features=7376, bias=True)
  (softmax): LogSoftmax()
)

In [37]:
target_len = sent.size(0)


In [64]:
decoder_input = torch.tensor([[SOS_token]], device=device)

output, hidden = decoder(decoder_input, hidden)
output.shape, hidden.shape

torch.Size([1, 1, 7376])
torch.Size([1, 1, 7376])


(torch.Size([1, 1, 7376]), torch.Size([1, 1, 512]))

In [41]:
_, output = output.topk(1)
output.shape, decoder_input.shape

(torch.Size([1, 1, 1]), torch.Size([1, 1]))

In [42]:

decoder(output, hidden)

torch.Size([1, 1, 7376])
torch.Size([1, 1, 7376])


(tensor([[[0., 0., 0.,  ..., 0., 0., 0.]]], grad_fn=<LogSoftmaxBackward>),
 tensor([[[-7.3828e-02,  3.5051e-01,  9.4528e-03,  1.2378e-01, -1.9783e-02,
            2.1174e-02, -1.9180e-01, -2.2130e-03, -1.1323e-01,  2.3941e-01,
            2.8382e-01,  1.4417e-01,  1.7306e-01, -2.1401e-01,  6.6446e-02,
            2.5142e-01, -3.1836e-01,  5.5944e-01,  2.2823e-02, -3.2213e-01,
           -2.5474e-01, -3.2703e-01, -2.9756e-02,  3.2013e-01, -9.9566e-02,
           -5.9875e-03,  1.3243e-01,  4.4257e-01,  2.5005e-01, -1.1452e-01,
           -2.7099e-01, -4.1905e-03,  1.1201e-01,  1.9497e-02,  6.8550e-02,
           -1.2297e-01, -2.1383e-01, -9.4193e-02, -2.2402e-01,  1.7501e-01,
           -1.7964e-01, -9.2075e-02, -1.3106e-01, -4.0252e-01, -8.9865e-02,
            2.5837e-01,  3.9030e-01,  4.5934e-02,  1.5891e-01, -2.9085e-01,
            2.6306e-02,  8.6565e-02,  7.8658e-02,  1.3097e-01, -1.0183e-02,
           -1.7066e-01,  4.7572e-02,  1.1945e-01,  3.8104e-02, -3.8584e-01,
           -3

In [66]:
citerion = nn.NLLLoss()
#sent = sent.unsqueeze(1)
print(sent[0].squeeze(0).shape, output.squeeze(0).shape)

citerion(output.squeeze(0), sent[0].squeeze(0))

torch.Size([1]) torch.Size([1, 7376])


tensor(0., grad_fn=<NllLossBackward>)