In [1]:
from model import EncoderCNN
from dataLoader import build_vocab, get_loader
import dataLoader
import torch
import torch.utils.data as data
from torchvision import transforms
import pickle

In [2]:
# Pick the compute device once: CUDA when PyTorch reports one, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. `model`, `dataLoader`) are picked up without restarting the kernel.
# NOTE(review): this cell runs after the import cell; consider moving it to the
# top of the notebook so it is active before the first project import.
%load_ext autoreload
%autoreload 2


In [4]:
# ---- Input locations and batch / preprocessing settings ----
vocab_path = 'data/vocab.pkl'
image_dir = 'data/images'
caption_json = 'data/captions.json'
data_json = 'data/val_split.json'
batch_size = 3
resize = 256
crop_size = 224

# Restore the vocabulary object from disk.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open(vocab_path, 'rb') as f:
    vocab = pickle.load(f)

# Image pipeline: resize, random crop and random horizontal flip, conversion to
# a tensor, then per-channel normalization with the given mean / std.
transform = transforms.Compose([
    transforms.Resize(resize),
    transforms.RandomCrop(crop_size),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406),
                         std=(0.229, 0.224, 0.225)),
])

# Build the loader from the project helper using the settings above.
data_loader = get_loader(
    image_dir=image_dir,
    caption_json=caption_json,
    data_json=data_json,
    vocabulary=vocab,
    transform=transform,
    batch_size=batch_size,
    shuffle=True,
)

In [5]:
# Pull one batch from the loader and unpack its four fields.
# The loader was created with shuffle=True, so this is presumably a random batch.
# In the recorded run the shapes were: images (3, 3, 224, 224), target (3, 6, 15),
# prob (3, 6).
(images, image_id, target, prob) = next(iter(data_loader))

In [6]:
images.shape

torch.Size([3, 3, 224, 224])

In [7]:
target.shape

torch.Size([3, 6, 15])

In [8]:
prob.shape

torch.Size([3, 6])

In [9]:
prob

tensor([[1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 0.]])

In [6]:
# Dimensionality of the image embedding: passed to EncoderCNN and WordRNN in
# later cells and checked against features.shape[1] in the encoder smoke test.
embed_size = 256

In [13]:
# `embed_size` (the image-embedding dimensionality) must already be defined by
# an earlier cell before this one runs.

#-#-#-# Do NOT modify the code below this line. #-#-#-#

# Build the CNN encoder. (Optional: pass extra constructor arguments if needed.)
encoder = EncoderCNN(embed_size)

# Put the encoder on the selected device (GPU when CUDA is available).
encoder.to(device)

# Put the sampled image batch on the same device, then encode it.
images = images.to(device)
features = encoder(images)

print('type(features):', type(features))
print('features.shape:', features.shape)

# Project sanity checks: the encoder must return a plain tensor whose first two
# dimensions are (batch_size, embed_size). Both comparisons are evaluated eagerly.
assert type(features) is torch.Tensor, "Encoder output needs to be a PyTorch Tensor."
assert all([features.shape[0] == batch_size,
            features.shape[1] == embed_size]), "The shape of the encoder output is incorrect."

type(features): <class 'torch.Tensor'>
features.shape: torch.Size([3, 256])


In [14]:
from model import SentenceRNN

In [15]:
# NOTE(review): `hiddem_size` looks like a typo for `hidden_size`. It is not read
# anywhere in this cell; the constructor below is called with literal 256s instead.
hiddem_size = 256

#-#-#-# Do NOT modify the code below this line. #-#-#-#

# Initialize the sentence-level RNN.
# NOTE(review): the meaning of the three positional 256s depends on SentenceRNN's
# signature in the `model` module (not shown here) — confirm and replace the
# literals with named settings.
sentRnn = SentenceRNN(256,256,256)

# Move the sentence RNN to GPU if CUDA is available.
sentRnn.to(device)
    
# Make sure the image batch is on the same device as the models.
images = images.to(device)

# Re-encode the images; the resulting features are the input to the sentence RNN.
features = encoder(images)

print('type(features):', type(features))
print('features.shape:', features.shape)

# Run the sentence RNN on the image features. It returns three values, unpacked
# here as (probs, topic, hiddens); a recorded run shows topic with shape (3, 1, 256).
probs, topic, hiddens = sentRnn(features = features)

type(features): <class 'torch.Tensor'>
features.shape: torch.Size([3, 256])


In [None]:
probs.shape

In [None]:
probs

In [None]:
topic.shape

In [None]:
hiddens.shape

In [None]:
from model import WordRNN

In [None]:
target[0].shape

In [11]:
from model import WordRNN, SentenceRNN

In [59]:
topic.shape

torch.Size([3, 1, 256])

In [86]:
# NOTE(review): in notebook order `target_0` is first assigned in a later cell
# (`target_0 = target[:,0,:].to(device)`), so this inspection presumably raises
# NameError on a fresh Restart & Run All — confirm and move it below that cell.
target_0.shape

torch.Size([3, 18])

In [16]:
# Number of features in the hidden state of the word-level RNN decoder.
hidden_size = 512

#-#-#-# Do NOT modify the code below this line. #-#-#-#

# Vocabulary size is the number of entries in the loaded vocab object.
vocab_size = len(vocab)

# Build the word-level decoder and place it on the selected device.
wordRnn = WordRNN(embed_size, hidden_size, vocab_size)
wordRnn.to(device)

# Take index 0 along the second axis of the targets for every sample
# (presumably the first sentence of each sample — TODO confirm) and make sure
# both decoder inputs live on the same device.
target_0 = target[:, 0, :].to(device)
topic = topic.to(device)
print(target_0.shape)
print(topic.shape)

# Feed the topic tensor and the selected targets through the decoder.
outputs = wordRnn(topic, target_0)

print('type(outputs):', type(outputs))
print('outputs.shape:', outputs.shape)

# Project sanity checks: plain tensor output whose first three dimensions are
# (batch_size, target length, vocab_size). All comparisons are evaluated eagerly.
assert type(outputs) is torch.Tensor, "Decoder output needs to be a PyTorch Tensor."
assert all([outputs.shape[0] == batch_size,
            outputs.shape[1] == target_0.shape[1],
            outputs.shape[2] == vocab_size]), "The shape of the decoder output is incorrect."
 

torch.Size([3, 20])
torch.Size([3, 1, 256])
topics torch.Size([3, 1, 256])
embedding torch.Size([3, 19, 256])
inner inputs torch.Size([3, 20, 256])
type(outputs): <class 'torch.Tensor'>
outputs.shape: torch.Size([3, 20, 4667])


In [70]:
outputs.shape

torch.Size([3, 18, 4667])

In [54]:
target_0.shape

torch.Size([6, 18])

In [101]:
# Upper bound on the number of sentRnn.sample steps taken by the sampling loop.
s_max = 6
# Passed as `max_len` to wordRnn.sample in the sampling loop.
n_max = 50
# Initial recurrent state handed to sentRnn.sample; the sampling loop overwrites
# it, so re-run this cell to start sampling from scratch again.
states = None

In [175]:
# Debug probe of one hierarchical sampling step.
#
# For each step the sentence RNN yields (p, topic, states); the word RNN then
# samples up to `n_max` token ids from `topic`. The ids of every row whose first
# thresholded probability column is 0 are zeroed out.
#
# The batch dimension is inferred with -1 rather than hard-coded, so the cell
# works for any `batch_size`, not only 3.
#
# NOTE(review): `states` is carried over between runs of this cell; re-run the
# cell that sets `states = None` first to get a reproducible result.
# NOTE(review): which column of `p` means "continue" vs "stop" is not visible
# here — confirm against sentRnn.sample.
for i in range(s_max):
    p, topic, states = sentRnn.sample(features, states)
    samples_ids = wordRnn.sample(topic, max_len=n_max)
    print(p)

    # Threshold the probabilities and drop the singleton middle axis.
    p = (p > 0.5).squeeze(1)
    print(p.shape)
    print(p)
    print(samples_ids.shape)

    # Column 0 as a (batch, 1) mask; -1 lets PyTorch infer the batch size.
    print(p[:, 0].view(-1, 1))
    print(torch.Tensor(samples_ids).to(device))

    # Broadcast the 0/1 mask over the token axis to blank out masked rows.
    samples_ids = samples_ids * p[:, 0].detach().cpu().numpy().reshape(-1, 1)
    print(samples_ids)

    # Only the first step is inspected; remove this `break` to run all s_max steps.
    break

torch.Size([3, 1, 2])
tensor([[[0.5179, 0.4821]],

        [[0.4756, 0.5244]],

        [[0.4941, 0.5059]]], device='cuda:0', grad_fn=<SoftmaxBackward>)
torch.Size([3, 2])
tensor([[1, 0],
        [0, 1],
        [0, 1]], device='cuda:0', dtype=torch.uint8)
(3, 50)
tensor([[1],
        [0],
        [0]], device='cuda:0', dtype=torch.uint8)
tensor([[2589., 2437., 1423.,  540.,  618., 4498.,  577.,  140.,  701., 2128.,
         3688.,   96., 3841., 4473., 1564., 2486., 1455., 4657., 4593., 2525.,
         4358., 3038., 3270., 2296., 2296., 3365., 1014., 2012.,  275., 2894.,
         4281., 2814.,  956., 4091., 1694., 3925., 3571., 1751., 4643., 2490.,
         2490., 4296., 1930., 1500., 3848., 1691., 2791., 1118., 4214., 1244.],
        [2589., 2437., 1423.,  540.,  618., 4498.,  577.,  140.,  701., 2128.,
         3688.,   96., 3841., 4473., 1564., 2486., 1455., 4657., 4593., 2525.,
         4358., 3038., 3270., 2296., 2296., 3365., 1014., 2012.,  275., 2894.,
         4281., 2814.,  95