# Downloading datasets

In [1]:
import os

if not os.path.exists('./data'):
    # Download the dataset
    !mkdir data
    !wget http://msvocds.blob.core.windows.net/annotations-1-0-3/captions_train-val2014.zip -P ./data/
    !wget http://images.cocodataset.org/zips/train2014.zip -P ./data/
    !wget http://images.cocodataset.org/zips/val2014.zip -P ./data/
    !unzip ./data/captions_train-val2014.zip -d ./data/
    !unzip ./data/train2014.zip -d ./data/
    !unzip ./data/val2014.zip -d ./data/

# Processing inputs

In [2]:
if not os.path.exists('./data/vocab.pkl'):
    # build a vocabulary list
    !python build_vocab.py   

if not os.path.exists('./data/resized2014'):
    # Resize all the images to bring them to shape 224x224
    !python resize.py

# Training

In [3]:
import argparse
import torch
import torch.nn as nn
import numpy as np
import os
import pickle
from data_loader import get_loader
from build_vocab import Vocabulary
from model import EncoderCNN, DecoderRNN
from torch.nn.utils.rnn import pack_padded_sequence
from torchvision import transforms
from types import SimpleNamespace

In [4]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# All run settings live in one namespace so later cells can read them as
# attributes (args.batch_size, args.model_path, ...).
args = SimpleNamespace(**{
    # Filesystem locations
    'model_path': './models/',
    'vocab_path': 'data/vocab.pkl',
    'image_dir': 'data/resized2014',
    'caption_path': 'data/annotations/captions_train2014.json',
    # Image preprocessing
    'crop_size': 224,
    # Logging / checkpoint cadence, in training steps
    'log_step': 10,
    'save_step': 1000,
    # Model sizes
    'embed_size': 256,
    'hidden_size': 512,
    'num_layers': 2,
    # Optimisation schedule
    'num_epochs': 2,
    'batch_size': 128,
    'num_workers': 2,
    'learning_rate': 0.001,
})

In [5]:
# Create the checkpoint directory. exist_ok=True makes the cell safe to re-run
# and avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(args.model_path, exist_ok=True)

# Image preprocessing, normalization for the pretrained resnet:
# random crop + random horizontal flip, then tensor conversion and
# per-channel normalisation.
transform = transforms.Compose([
    transforms.RandomCrop(args.crop_size),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406),
                         (0.229, 0.224, 0.225))])

# Load vocabulary wrapper.
# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# only load a vocab.pkl you generated yourself, never one from an untrusted source.
with open(args.vocab_path, 'rb') as f:
    vocab = pickle.load(f)

# Build data loader (shuffled, with the transform above applied by get_loader).
data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                         transform, args.batch_size,
                         shuffle=True, num_workers=args.num_workers)

# Build the models and move them to the selected device.
# len(vocab) is passed to the decoder as its third argument
# (presumably the output vocabulary size — confirm in model.py).
encoder = EncoderCNN(args.embed_size).to(device)
decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device)

# Loss and optimizer.
# Only the decoder plus the encoder's `linear` and `bn` sub-modules are handed
# to the optimizer; any other encoder parameters are not updated by it.
criterion = nn.CrossEntropyLoss()
params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
optimizer = torch.optim.Adam(params, lr=args.learning_rate)

loading annotations into memory...
Done (t=0.30s)
creating index...
index created!


In [6]:
print(args, '\n')


def _save_checkpoint(epoch_number, step_number):
    """Write the decoder and encoder state dicts to args.model_path.

    Files are named '<model>-<epoch_number>-<step_number>.ckpt'; callers pass
    1-based epoch and step numbers.
    """
    for model_name, model in (('decoder', decoder), ('encoder', encoder)):
        torch.save(model.state_dict(), os.path.join(
            args.model_path, '{}-{}-{}.ckpt'.format(model_name, epoch_number, step_number)))


# Train the models
total_step = len(data_loader)
for epoch in range(args.num_epochs):
    print('>>> Begin epoch {}'.format(epoch))

    for i, (images, captions, lengths) in enumerate(data_loader):
        # Set mini-batch dataset
        images = images.to(device)
        captions = captions.to(device)
        # [0] takes the packed data tensor, i.e. the captions with padding removed.
        targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

        # Forward, backward and optimize
        features = encoder(images)
        # assumes the decoder's outputs line up with the packed targets — confirm in model.py
        outputs = decoder(features, captions, lengths)
        loss = criterion(outputs, targets)
        decoder.zero_grad()
        encoder.zero_grad()
        loss.backward()
        optimizer.step()

        # Print log info (epoch and step are shown zero-based here).
        if i % args.log_step == 0:
            # Read the scalar once; it is used for both the loss and the perplexity.
            loss_value = loss.item()
            print('\rEpoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                  .format(epoch, args.num_epochs, i, total_step, loss_value, np.exp(loss_value)))
        else:
            print('.', end='')

        # Save the model checkpoints every save_step steps, and also on the
        # last step of each epoch. Without the second condition, the steps
        # after the last multiple of save_step (including the final weights
        # of the run) would never be written to disk.
        is_last_step = (i + 1) == total_step
        if (i + 1) % args.save_step == 0 or is_last_step:
            _save_checkpoint(epoch + 1, i + 1)

namespace(model_path='./models/', vocab_path='data/vocab.pkl', image_dir='data/resized2014', caption_path='data/annotations/captions_train2014.json', crop_size=224, log_step=10, save_step=1000, embed_size=256, hidden_size=512, num_layers=2, num_epochs=2, batch_size=128, num_workers=2, learning_rate=0.001) 

>>> Begin epoch 0
Epoch [0/2], Step [0/3236], Loss: 9.2067, Perplexity: 9964.0078
Epoch [0/2], Step [10/3236], Loss: 5.8962, Perplexity: 363.6424
Epoch [0/2], Step [20/3236], Loss: 5.4293, Perplexity: 227.9847
Epoch [0/2], Step [30/3236], Loss: 5.3328, Perplexity: 207.0119
Epoch [0/2], Step [40/3236], Loss: 5.0016, Perplexity: 148.6541
Epoch [0/2], Step [50/3236], Loss: 4.8911, Perplexity: 133.0957
Epoch [0/2], Step [60/3236], Loss: 4.7773, Perplexity: 118.7891
Epoch [0/2], Step [70/3236], Loss: 4.6424, Perplexity: 103.7909
Epoch [0/2], Step [80/3236], Loss: 4.5284, Perplexity: 92.6142
Epoch [0/2], Step [90/3236], Loss: 4.6010, Perplexity: 99.5826
Epoch [0/2], Step [100/3236], Loss: