In [1]:
import shutil
import os
import random
import pickle
import pandas as pd
import numpy as np
import spacy
import re
import sys

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
import torchvision.models as models


from collections import Counter,defaultdict
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

import nltk
from nltk.translate.bleu_score import corpus_bleu

import skimage.io
import skimage.transform
import cv2

In [2]:
os.makedirs('.kaggle/')

In [3]:
! mv kaggle.json .kaggle/kaggle.json

In [6]:
import kaggle



In [7]:
! chmod 600 /root/.kaggle/kaggle.json

chmod: cannot access '/root/.kaggle/kaggle.json': Permission denied


In [8]:
! kaggle datasets download ming666/flicker8k-dataset

Downloading flicker8k-dataset.zip to /home/ubuntu
100%|█████████████████████████████████████▉| 2.07G/2.08G [00:32<00:00, 70.1MB/s]
100%|██████████████████████████████████████| 2.08G/2.08G [00:32<00:00, 68.8MB/s]


In [31]:
! kaggle datasets download watts2/glove6b50dtxt

Downloading glove6b50dtxt.zip to /home/ubuntu
 96%|████████████████████████████████████▌ | 65.0M/67.7M [00:01<00:00, 42.5MB/s]
100%|██████████████████████████████████████| 67.7M/67.7M [00:01<00:00, 62.0MB/s]


In [32]:
! unzip glove6b50dtxt.zip

Archive:  glove6b50dtxt.zip
replace glove.6B.50d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [34]:
! mv glove.6B.50d.txt data/glove.6B.50d.txt

In [None]:
! unzip flicker8k-dataset.zip 

In [13]:
os.makedirs('data/')

In [14]:
! mv flickr8k_dataset/Flicker8k_Dataset data/Flicker8k_Dataset

In [15]:
! mv Flickr8k_text data/Flickr8k_text

In [16]:
! wget https://zhegan27.github.io/Papers/FlickrStyle_v0.9.zip


--2020-06-24 06:43:50--  https://zhegan27.github.io/Papers/FlickrStyle_v0.9.zip
Resolving zhegan27.github.io (zhegan27.github.io)... 185.199.109.153, 185.199.110.153, 185.199.111.153, ...
Connecting to zhegan27.github.io (zhegan27.github.io)|185.199.109.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 550592 (538K) [application/zip]
Saving to: ‘FlickrStyle_v0.9.zip’


2020-06-24 06:43:50 (44.0 MB/s) - ‘FlickrStyle_v0.9.zip’ saved [550592/550592]



In [None]:
! unzip FlickrStyle_v0.9.zip

In [18]:
! mv FlickrStyle_v0.9 data/FlickrStyle_v0.9

## Preprocessing

### select 7k train images and 1k validation images

In [19]:
img_lst = pickle.load(open( "data/FlickrStyle_v0.9/humor/train.p", "rb" ) )

In [2]:
torch.cuda.is_available()

True

In [21]:
def select_7k_images(c_type='humor'):
    '''8k -> 7k'''
    # Create folder if does not exist.
    if not os.path.exists('data/Flickr7k/'):
        os.makedirs('data/Flickr7k/')
    # open data/type/train.p
    img_lst = pickle.load(open( "data/FlickrStyle_v0.9/humor/train.p", "rb" ) )
    
    # copy imgs
    for img_name in img_lst:
        shutil.copyfile('data/Flicker8k_Dataset/' + img_name,
                        'data/Flickr7k/' + img_name)



In [22]:
# Run for first time 
select_7k_images(c_type='humor')

In [23]:
def select_validation_images():
    '''select test images randomly'''
    # get filenames in flickr7k, 30k_images
    filenames_7k = os.listdir('data/Flickr7k/')
    filenames_8k = os.listdir('data/Flicker8k_Dataset')

    filenames = list(set(filenames_8k) - set(filenames_7k))
    print("img_num: " + str(len(filenames)))

    # copy images
    validation_img_path = 'data/validation_images/'
    if not os.path.exists(validation_img_path):
        os.makedirs(validation_img_path)
    for img_name in filenames:
        shutil.copyfile('data/Flicker8k_Dataset/' + img_name,
                        validation_img_path + img_name)

In [24]:
# Run for first time
select_validation_images()

img_num: 1091


### select 7k factual captions

In [25]:
flickr8k_filename = "data/Flickr8k_text/Flickr8k.token.txt"

In [26]:
img_id_lst = pickle.load(open( "data/FlickrStyle_v0.9/humor/train.p", "rb" ) )

In [27]:
# get filenames in flickr7k_images
filenames = os.listdir('data/Flickr7k/')
# open factual caption: Flickr8k.token.txt
with open(flickr8k_filename, 'r') as f:
    res = f.readlines()

# write out
with open('data/factual_train.txt', 'w') as f:
    r = re.compile(r'#\d*')
    for line in res:
        img_id = r.split(line)[0]
        if img_id in img_id_lst:
            f.write(line)
            
with open('data/factual_valid.txt', 'w') as f:
    r = re.compile(r'#\d*')
    for line in res:
        img_id = r.split(line)[0]
        if img_id not in img_id_lst:
            f.write(line)

## Build vocab

In [3]:
def extract_captions(mode='factual'):
    ''' Extract captions from txt files to build vocabulary.'''
    if mode == 'factual':
        text = pd.read_csv("data/factual_train.txt", 
                           encoding= 'unicode_escape', names=['line'])
        text['caption'] = text['line'].str.split('\t', n=1, expand=True)[1]

    elif mode == 'humorous':
        text = pd.read_csv("data/FlickrStyle_v0.9/humor/funny_train.txt", 
                           encoding= 'unicode_escape', names=['caption'])
    else:
        text = pd.read_csv("data/FlickrStyle_v0.9/romantic/romantic_train.txt", 
                           encoding= 'unicode_escape', names=['caption'])
    return list(text.caption)

In [4]:
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
def sub_br(x): return re_br.sub("\n", x)

my_tok = spacy.load('en')
def spacy_tok(x): return [tok.text for tok in my_tok.tokenizer(sub_br(x))]

def loadGloveModel(gloveFile="data/glove.6B.50d.txt"):
    """ Loads word vectors into a dictionary."""
    f = open(gloveFile,'r')
    word_vecs = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        word_vecs[word] = np.array([float(val) for val in splitLine[1:]])
    return word_vecs

def get_vocab(mode_list=['factual', 'humorous']):
    """Computes Dict of counts of words.
    
    Computes the number of times a word is on a document.
    
    Delete rare words from vocab if they are not in word_vecs
    and don't have at least min_df occurrencies.
    """
    content = []
    for mode in mode_list:
        content.extend(extract_captions(mode))
    vocab = defaultdict(float)
    for line in content:
        words = set(spacy_tok(line))
        for word in words:
            vocab[word] += 1
    return vocab

def create_embedding_matrix(word_vecs, word_count, min_df=4, emb_size=50):
    """Creates embedding matrix from word vectors. """
    #word_count = delete_rare_words(word_vecs, word_count, min_df)
    V = len(word_count.keys()) + 2
    vocab2index = {}
    W = np.zeros((V, emb_size), dtype="float32")
    vocab = ["", "UNK"]
    # adding a vector for padding
    W[0] = np.zeros(emb_size, dtype='float32')
    # adding a vector for rare words 
    W[1] = np.random.uniform(-0.25, 0.25, emb_size)
    vocab2index["UNK"] = 1
    i = 2
    for word in word_count:
        if word in word_vecs:
            W[i] = word_vecs[word]
            vocab2index[word] = i
            vocab.append(word)
            i += 1
        else:
            W[i] = np.random.uniform(-0.25,0.25, emb_size)
            vocab2index[word] = i
            vocab.append(word)
            i += 1   
    return W, np.array(vocab), vocab2index

def encode_sentence_no_padding(s, vocab2index):
    return np.array([vocab2index.get(w, vocab2index["UNK"]) for w in s.split()])

In [5]:
word_vecs = loadGloveModel()
word_count = get_vocab(mode_list=['factual', 'humorous'])
# word_count = delete_rare_words(word_vecs, word_count)
pretrained_weight, vocab, vocab2index = create_embedding_matrix(word_vecs, word_count)

In [6]:
len(vocab)

10383

## Datasets and Dataloaders

In [7]:
def get_img_and_caption(caption_file):
    '''
    Get image name and caption from factual caption file.
    Returns array of tuples.
    '''
    text = pd.read_csv(caption_file, encoding= 'unicode_escape', names=['line'])
    caption = text['line'].str.split('\t', n=1, expand=True)[1]
    img = text['line'].str.split('#', n=1, expand=True)[0]
    return list(zip(img, caption))

In [8]:
def read_image(path):
    im = cv2.imread(str(path))
    return cv2.cvtColor(im, cv2.COLOR_BGR2RGB)

def resize_image(path, sz=112):
    im = read_image(path)
    return cv2.resize(im, (sz, sz))

def normalize(im):
    """Normalizes images with Imagenet stats."""
    imagenet_stats = np.array([[0.485, 0.456, 0.406], [0.229, 0.224, 0.225]])
    return (im - imagenet_stats[0])/imagenet_stats[1]

In [56]:
class Flickr7kDataset(Dataset):
    def __init__(self, img_dir, caption_file, vocab2index, transform=None):
        '''
        img_dir: Directory with all images.
        caption_file: Path to factual caption file.
        vocab2index: Vocab dictionary.
        transform: Optional transforms to apply.
        '''
        self.img_dir = img_dir
        self.img_caption_list = self._get_img_and_caption(caption_file)
        self.transform = transform

    def _get_img_and_caption(self, caption_file):
        '''
        Get image name and caption from factual caption file.
        Returns array of tuples.
        '''
        text = pd.read_csv(caption_file, encoding= 'unicode_escape', names=['line'])
        caption = text['line'].str.split('\t', n=1, expand=True)[1]
        img = text['line'].str.split('#', n=1, expand=True)[0]
        return list(zip(img, caption))

    def __len__(self):
        return len(self.img_caption_list)

    def __getitem__(self, idx):
        '''Returns image and caption embedding.'''
        img_name = self.img_caption_list[idx][0]
        img_path = os.path.join(self.img_dir, img_name)

        if self.transform is not None:
#             transform = transforms.Compose([Rescale((224, 224)),transforms.ToTensor()])
#             image = transform(image)
            image = resize_image(img_path, sz=56)
            image = normalize(image)
            image = np.rollaxis(image, 2)
            
        caption = self.img_caption_list[idx][1]
        caption = encode_sentence_no_padding(caption, vocab2index)
        
        return image, caption

In [57]:
img_path = "data/Flickr7k"
cap_path = "data/factual_train.txt"
flickr7k_ds = Flickr7kDataset(img_path, cap_path, vocab2index, transform=True)

In [None]:
os.makedirs('data/test')

In [None]:
! mv data/validation_images/3504479370_ff2d89a043.jpg data/test/3504479370_ff2d89a043.jpg

In [11]:
image, captions = flickr7k_ds[0]

In [12]:
image.shape

(3, 112, 112)

In [13]:
def collate_fn(data):
    '''create minibatch tensors from data(list of tuple(image, caption))'''
    images, captions = zip(*data)

    # images
    images = [torch.Tensor(im) for im in images]
    images = torch.stack(images, 0)

    # captions
    captions = [torch.Tensor(c) for c in captions]
    lengths = torch.LongTensor([len(c) for c in captions])
    captions_padded = pad_sequence(captions, batch_first=True, padding_value=0)
    
    return images, captions_padded, lengths

In [14]:
images, captions, lengths = collate_fn([flickr7k_ds[0], flickr7k_ds[1], flickr7k_ds[2]])

In [15]:
images[0].shape, captions[0].shape, lengths[0]

(torch.Size([3, 112, 112]), torch.Size([18]), tensor(18))

In [16]:
batch_size = 3
num_workers = 0
shuffle = True
flickr7k_dl = DataLoader(flickr7k_ds,
                         batch_size=batch_size,
                         shuffle=shuffle,
                         num_workers=num_workers,
                         collate_fn=collate_fn)

In [58]:
valid_img_path = 'data/validation_images'
valid_cap_path = "data/factual_valid.txt"
valid_ds = Flickr7kDataset(valid_img_path, valid_cap_path, vocab2index, transform=True)
valid_dl = DataLoader(flickr7k_ds,
                         batch_size=batch_size,
                         shuffle=shuffle,
                         num_workers=num_workers,
                         collate_fn=collate_fn)

In [18]:
images, captions, lengths = next(iter(flickr7k_dl))

In [19]:
images.shape, captions.shape, lengths.shape

(torch.Size([3, 3, 112, 112]), torch.Size([3, 15]), torch.Size([3]))

In [20]:
class FlickrStyle7kDataset(Dataset):
    def __init__(self, caption_file, vocab2index):
        '''
        caption_file: Path to styled caption file.
        vocab2index: Vocab dictionary.
        '''
        self.caption_list = self._get_caption(caption_file)

    def _get_caption(self, caption_file):
        '''Get list of captions from styled caption file.'''
        text = pd.read_csv(caption_file, encoding= 'unicode_escape', 
                           names=['caption'])
        return list(text.caption)

    def __len__(self):
        return len(self.caption_list)

    def __getitem__(self, idx):
        caption = self.caption_list[idx]
        caption = encode_sentence_no_padding(caption, vocab2index)
        return caption

In [21]:
def collate_fn_styled(captions):
    # captions
    captions = [torch.Tensor(c) for c in captions]
    lengths = torch.LongTensor([len(c) for c in captions])
    captions_padded = pad_sequence(captions, batch_first=True, padding_value=0)

    return captions_padded, lengths


In [59]:
#batch_size 
#shuffle = True
cap_path_styled = "data/FlickrStyle_v0.9/humor/funny_train.txt"
flickrstyle7k_ds = FlickrStyle7kDataset(cap_path_styled, vocab2index)

flickrstyle7k_dl = DataLoader(dataset=flickrstyle7k_ds,
                         batch_size=batch_size,
                         shuffle=shuffle,
                         num_workers=num_workers,
                         collate_fn=collate_fn_styled)

In [23]:
captions, lengths = next(iter(flickrstyle7k_dl))

In [24]:
captions.shape, lengths.shape

(torch.Size([3, 17]), torch.Size([3]))

## Loss

In [25]:
def sequence_mask(sequence_length, max_len=None):
    if max_len is None:
        max_len = sequence_length.data.max()
    batch_size = sequence_length.size(0)
    seq_range = torch.range(0, max_len - 1).long()
    seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
    if sequence_length.is_cuda:
        seq_range_expand = seq_range_expand.cuda()
    seq_length_expand = (sequence_length.unsqueeze(1)
                         .expand_as(seq_range_expand))
    return seq_range_expand < seq_length_expand


def masked_cross_entropy(logits, target, length):
    if torch.cuda.is_available():
        length = length.cuda()

    """
    Args:
        logits: A Variable containing a FloatTensor of size
            (batch, max_len, num_classes) which contains the
            unnormalized probability for each class.
        target: A Variable containing a LongTensor of size
            (batch, max_len) which contains the index of the true
            class for each corresponding step.
        length: A Variable containing a LongTensor of size (batch,)
            which contains the length of each data in a batch.
    Returns:
        loss: An average loss value masked by the length.
    """

    # logits_flat: (batch * max_len, num_classes)
    logits_flat = logits.view(-1, logits.size(-1))
    # log_probs_flat: (batch * max_len, num_classes)
    log_probs_flat = F.log_softmax(logits_flat)
    # target_flat: (batch * max_len, 1)
    target_flat = target.view(-1, 1)
    # losses_flat: (batch * max_len, 1)
    losses_flat = -torch.gather(log_probs_flat, dim=1, index=target_flat)
    # losses: (batch, max_len)
    losses = losses_flat.view(*target.size())
    # mask: (batch, max_len)
    mask = sequence_mask(sequence_length=length, max_len=target.size(1))
    losses = losses * mask.float()
    loss = losses.sum() / length.float().sum()
    return loss


In [26]:

length = torch.LongTensor([23, 21, 17])

print(sequence_mask(length))

tensor([[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True],
        [ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True, False, False],
        [ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True, False, False, False,
         False, False, False]])


  """


## Model

In [27]:
class EncoderCNN(nn.Module):
    def __init__(self, emb_dim):
        '''
        Load the pretrained ResNet152 and replace fc
        '''
        super(EncoderCNN, self).__init__()
        resnet = models.resnet152(pretrained=True)
        modules = list(resnet.children())[:-1]
        self.resnet = nn.Sequential(*modules)
        self.A = nn.Linear(resnet.fc.in_features, emb_dim)

    def forward(self, images):
        '''Extract the image feature vectors'''
        features = self.resnet(images)
        if torch.cuda.is_available():
            features = features.cuda()
        features = features.view(features.size(0), -1)
        features = self.A(features)
        return features

In [28]:
class FactoredLSTM(nn.Module):
    def __init__(self, emb_dim, hidden_dim, factored_dim,  vocab_size):
        super(FactoredLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size

        # embedding
        self.B = nn.Embedding(vocab_size, emb_dim)

        # factored lstm weights
        self.U_i = nn.Linear(factored_dim, hidden_dim)
        self.S_fi = nn.Linear(factored_dim, factored_dim)
        self.V_i = nn.Linear(emb_dim, factored_dim)
        self.W_i = nn.Linear(hidden_dim, hidden_dim)

        self.U_f = nn.Linear(factored_dim, hidden_dim)
        self.S_ff = nn.Linear(factored_dim, factored_dim)
        self.V_f = nn.Linear(emb_dim, factored_dim)
        self.W_f = nn.Linear(hidden_dim, hidden_dim)

        self.U_o = nn.Linear(factored_dim, hidden_dim)
        self.S_fo = nn.Linear(factored_dim, factored_dim)
        self.V_o = nn.Linear(emb_dim, factored_dim)
        self.W_o = nn.Linear(hidden_dim, hidden_dim)

        self.U_c = nn.Linear(factored_dim, hidden_dim)
        self.S_fc = nn.Linear(factored_dim, factored_dim)
        self.V_c = nn.Linear(emb_dim, factored_dim)
        self.W_c = nn.Linear(hidden_dim, hidden_dim)

        # h - humorous
        self.S_hi = nn.Linear(factored_dim, factored_dim)
        self.S_hf = nn.Linear(factored_dim, factored_dim)
        self.S_ho = nn.Linear(factored_dim, factored_dim)
        self.S_hc = nn.Linear(factored_dim, factored_dim)

        # r - romantic
        # self.S_ri = nn.Linear(factored_dim, factored_dim)
        # self.S_rf = nn.Linear(factored_dim, factored_dim)
        # self.S_ro = nn.Linear(factored_dim, factored_dim)
        # self.S_rc = nn.Linear(factored_dim, factored_dim)

        # weight for output
        self.C = nn.Linear(hidden_dim, vocab_size)

    def forward_step(self, embedded, h_0, c_0, mode):
        # transform embedded from emb_dim --> factored_dim
        i = self.V_i(embedded)
        f = self.V_f(embedded)
        o = self.V_o(embedded)
        c = self.V_c(embedded)
        
        # factored_dim --> factored_dim
        if mode == "factual":
            i = self.S_fi(i)
            f = self.S_ff(f)
            o = self.S_fo(o)
            c = self.S_fc(c)
        elif mode == "humorous":
            i = self.S_hi(i)
            f = self.S_hf(f)
            o = self.S_ho(o)
            c = self.S_hc(c)
        # elif mode == "romantic":
        #     i = self.S_ri(i)
        #     f = self.S_rf(f)
        #     o = self.S_ro(o)
        #     c = self.S_rc(c)
        else:
            sys.stderr.write("mode name wrong!")

        i_t = F.sigmoid(self.U_i(i.double()) + self.W_i(h_0.double()))
        f_t = F.sigmoid(self.U_f(f.double()) + self.W_f(h_0.double()))
        o_t = F.sigmoid(self.U_o(o.double()) + self.W_o(h_0.double()))
        c_tilda = F.tanh(self.U_c(c.double()) + self.W_c(h_0.double()))

        c_t = f_t * c_0 + i_t * c_tilda
        h_t = o_t * c_t

        outputs = self.C(h_t)

        return outputs, h_t, c_t

    def forward(self, captions, features=None, mode="factual"):
        '''
        Args:
            features: fixed vectors from images, [batch, emb_dim]
            captions: [batch, max_len]
            mode: type of caption to generate
        '''
        batch_size = captions.size(0)
        embedded = self.B(captions)  # [batch, max_len, emb_dim]
        # concat image features and captions
        if mode == "factual":
            if features is None:
                sys.stderr.write("features is None!")
            embedded = torch.cat((features.unsqueeze(1), embedded), 1)

        # initialize hidden state
        h_t = torch.Tensor(batch_size, self.hidden_dim)
        c_t = torch.Tensor(batch_size, self.hidden_dim)
        nn.init.uniform(h_t)
        nn.init.uniform(c_t)

        if torch.cuda.is_available():
            h_t = h_t.cuda()
            c_t = c_t.cuda()

        all_outputs = []
        # iterate
        for ix in range(embedded.size(1) - 1):
            emb = embedded[:, ix, :]
            outputs, h_t, c_t = self.forward_step(emb, h_t, c_t, mode=mode)
            all_outputs.append(outputs)

        all_outputs = torch.stack(all_outputs, 1)

        return all_outputs


## Training

In [29]:
#os.environ['CUDA_VISIBLE_DEVICES'] = '1'

# def eval_outputs(outputs, vocab):
#     # outputs: [batch, max_len - 1, vocab_size]
#     indices = torch.topk(outputs, 1)[1]
#     indices = indices.squeeze(2)
#     indices = indices.data
#     for i in range(len(indices)):
#         caption = [vocab.i2w[x] for x in indices[i]]
#         print(caption)

In [30]:
# batch_size = 50
# data_loader = get_data_loader(img_path, cap_path, vocab, batch_size)
# styled_data_loader = get_styled_data_loader(cap_path_styled, vocab, batch_size)

In [31]:
reverse_word_map = dict(map(reversed, vocab2index.items()))
reverse_word_map[0]='<PAD>'

In [32]:
reverse_word_map[3]

'an'

In [72]:
def val_metrics(encoder, decoder, valid_fact_dl):
    encoder.eval()
    decoder.eval()
    total = 0
    step = len(valid_fact_dl)
    sum_bleu = 0
    for n,(images, captions, lengths) in enumerate(valid_fact_dl):
        batch = lengths.shape[0]
        if torch.cuda.is_available():
            images = images.cuda()
            captions = captions.cuda()
        features = encoder(images.double())
        outputs = decoder(captions.long(), features.double(), mode="factual")
        loss = criterion(outputs[:, 1:, :].contiguous(),
                             captions[:, 1:].contiguous().long(), lengths - 1)
        #if n% 50 == 0:
        print("Validation Loss %.3f" % (loss.data.mean()))
        
        indices = torch.topk(outputs, 1)[1].squeeze(2).data
        for i in range(batch):
            predicted_caption = [reverse_word_map[x.item()] for x in indices[i]]
            actual_caption = [reverse_word_map[x.item()] for x in captions[i]]
            print(predicted_caption, actual_caption)
            bleu_score = corpus_bleu(predicted_caption, actual_caption)
            print(bleu_score)
            sum_bleu += bleu_score
            total += 1
        if n==3: break
    return sum_bleu/total

In [65]:
def train(encoder, decoder, 
          train_fact_dl, valid_fact_dl, style_dl, 
          epoch_num, optimizer_cap, optimizer_lang,
          total_cap_step, total_lang_step, model_path):
    
    for epoch in range(epoch_num):
        #caption
        for i, (images, captions, lengths) in enumerate(train_fact_dl):
            if torch.cuda.is_available():
                images = images.cuda()
                captions = captions.cuda()

            # forward, backward and optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images.double())
            outputs = decoder(captions.long(), features.double(), mode="factual")
            loss = criterion(outputs[:, 1:, :].contiguous(),
                             captions[:, 1:].contiguous().long(), lengths - 1)
            loss.backward()
            optimizer_cap.step()

            #print log
            if i % 50 == 0:
                print("Epoch [%d/%d], CAP, Step [%d/%d], Train Loss: %.4f"
                      % (epoch+1, epoch_num, i, total_cap_step,
                          loss.data.mean()))
            #if i==2: break
            # print("Epoch [%d/%d], CAP, Step [%d/%d], Loss: %.4f"
            #           % (epoch+1, epoch_num, i, total_cap_step,
            #               loss.data.mean()))

        #eval_outputs(outputs, vocab)
#         val_bleu = val_metrics(encoder, decoder, valid_fact_dl)
#         print("val_bleu %.3f" % (val_bleu))

        # language
        for i, (captions, lengths) in enumerate(style_dl):
            if torch.cuda.is_available():
                captions = captions.cuda()

            # forward, backward and optimize
            decoder.zero_grad()
            outputs = decoder(captions.long(), mode='humorous')
            loss = criterion(outputs, captions[:, 1:].contiguous().long(), lengths-1)
            loss.backward()
            optimizer_lang.step()

            #print log
            if i % 50 == 0:
                print("Epoch [%d/%d], LANG, Step [%d/%d], Loss: %.4f"
                      % (epoch+1, epoch_num, i, total_lang_step,
                          loss.data.mean()))
            #if i==3: break
            # print("Epoch [%d/%d], LANG, Step [%d/%d], Loss: %.4f"
            #           % (epoch+1, epoch_num, i, total_lang_step,
            #               loss.data.mean()))

        # save models
        torch.save(decoder.state_dict(),
                   os.path.join(model_path, 'decoder-%d.pkl' % (epoch + 1,)))

        torch.save(encoder.state_dict(),
                   os.path.join(model_path, 'encoder-%d.pkl' % (epoch + 1,)))

In [52]:
batch_size_1 = 20
batch_size_2 = 40

num_workers = 4
shuffle = True
flickr7k_dl = DataLoader(flickr7k_ds,
                         batch_size=batch_size_1,
                         shuffle=shuffle,
                         num_workers=num_workers,
                         collate_fn=collate_fn)
valid_dl = DataLoader(valid_ds,
                         batch_size=10,
                         shuffle=shuffle,
                         num_workers=num_workers,
                         collate_fn=collate_fn)
flickrstyle7k_dl = DataLoader(dataset=flickrstyle7k_ds,
                         batch_size=batch_size_2,
                         shuffle=shuffle,
                         num_workers=num_workers,
                         collate_fn=collate_fn_styled)

In [37]:
emb_dim = 300
hidden_dim = 512
factored_dim = 512
vocab_size = len(vocab)

encoder = EncoderCNN(emb_dim)
decoder = FactoredLSTM(emb_dim, hidden_dim, factored_dim, vocab_size)

encoder = encoder.double()
decoder = decoder.double()

if torch.cuda.is_available():
    encoder = encoder.cuda()
    decoder = decoder.cuda()

In [61]:
# train
total_cap_step = len(flickr7k_dl)
total_lang_step = len(flickrstyle7k_dl)
epoch_num = 5

In [40]:
# loss and optimizer
lr_caption = 0.0002
lr_language = 0.0005

criterion = masked_cross_entropy
cap_params = list(decoder.parameters()) + list(encoder.A.parameters())
lang_params = list(decoder.parameters())
optimizer_cap = torch.optim.Adam(cap_params, lr=lr_caption)
optimizer_lang = torch.optim.Adam(lang_params, lr=lr_language)

In [41]:
model_path = 'pretrained_models'
if not os.path.exists(model_path):
    os.makedirs(model_path)

In [46]:
# Task 1 - first epoch
train(encoder, decoder, 
      flickr7k_dl, valid_dl, flickrstyle7k_dl, 
      epoch_num, optimizer_cap, optimizer_lang,
      total_cap_step, total_lang_step,model_path)

  """


Epoch [1/5], CAP, Step [0/1750], Train Loss: 9.1242
Epoch [1/5], CAP, Step [50/1750], Train Loss: 5.1986
Epoch [1/5], CAP, Step [100/1750], Train Loss: 4.5383
Epoch [1/5], CAP, Step [150/1750], Train Loss: 4.6473
Epoch [1/5], CAP, Step [200/1750], Train Loss: 4.4983
Epoch [1/5], CAP, Step [250/1750], Train Loss: 4.4616
Epoch [1/5], CAP, Step [300/1750], Train Loss: 4.2857
Epoch [1/5], CAP, Step [350/1750], Train Loss: 4.4231
Epoch [1/5], CAP, Step [400/1750], Train Loss: 4.0600
Epoch [1/5], CAP, Step [450/1750], Train Loss: 4.0460
Epoch [1/5], CAP, Step [500/1750], Train Loss: 3.9267
Epoch [1/5], CAP, Step [550/1750], Train Loss: 3.5970
Epoch [1/5], CAP, Step [600/1750], Train Loss: 4.1289
Epoch [1/5], CAP, Step [650/1750], Train Loss: 3.7435
Epoch [1/5], CAP, Step [700/1750], Train Loss: 3.7922
Epoch [1/5], CAP, Step [750/1750], Train Loss: 3.8378
Epoch [1/5], CAP, Step [800/1750], Train Loss: 4.0018
Epoch [1/5], CAP, Step [850/1750], Train Loss: 3.9198
Epoch [1/5], CAP, Step [900/175

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


RuntimeError: CUDA out of memory. Tried to allocate 24.00 MiB (GPU 0; 7.44 GiB total capacity; 6.84 GiB already allocated; 17.56 MiB free; 7.02 GiB reserved in total by PyTorch)

In [60]:
# Task 2 - first epoch
train(encoder, decoder, 
      flickr7k_dl, valid_dl, flickrstyle7k_dl, 
      epoch_num, optimizer_cap, optimizer_lang,
      total_cap_step, total_lang_step,model_path)

  """


Epoch [1/5], LANG, Step [0/350], Loss: 6.6371
Epoch [1/5], LANG, Step [10/350], Loss: 5.5687
Epoch [1/5], LANG, Step [20/350], Loss: 5.0289
Epoch [1/5], LANG, Step [30/350], Loss: 5.2054
Epoch [1/5], LANG, Step [40/350], Loss: 5.2507
Epoch [1/5], LANG, Step [50/350], Loss: 4.7173
Epoch [1/5], LANG, Step [60/350], Loss: 4.9744
Epoch [1/5], LANG, Step [70/350], Loss: 4.3951
Epoch [1/5], LANG, Step [80/350], Loss: 4.9487
Epoch [1/5], LANG, Step [90/350], Loss: 4.9245
Epoch [1/5], LANG, Step [100/350], Loss: 4.7983
Epoch [1/5], LANG, Step [110/350], Loss: 4.8542
Epoch [1/5], LANG, Step [120/350], Loss: 4.2926
Epoch [1/5], LANG, Step [130/350], Loss: 4.9134
Epoch [1/5], LANG, Step [140/350], Loss: 4.4444
Epoch [1/5], LANG, Step [150/350], Loss: 4.4908
Epoch [1/5], LANG, Step [160/350], Loss: 5.0431
Epoch [1/5], LANG, Step [170/350], Loss: 4.2022
Epoch [1/5], LANG, Step [180/350], Loss: 4.7108
Epoch [1/5], LANG, Step [190/350], Loss: 4.8104
Epoch [1/5], LANG, Step [200/350], Loss: 4.8454
Epo

KeyboardInterrupt: 

In [66]:
# Task 1 + Task 2 : remaining 4 epochs
train(encoder, decoder, 
      flickr7k_dl, valid_dl, flickrstyle7k_dl, 
      epoch_num, optimizer_cap, optimizer_lang,
      total_cap_step, total_lang_step,model_path)

  """


Epoch [1/5], CAP, Step [0/1750], Train Loss: 3.9546
Epoch [1/5], CAP, Step [50/1750], Train Loss: 3.1573
Epoch [1/5], CAP, Step [100/1750], Train Loss: 3.7334
Epoch [1/5], CAP, Step [150/1750], Train Loss: 3.3264
Epoch [1/5], CAP, Step [200/1750], Train Loss: 3.4457
Epoch [1/5], CAP, Step [250/1750], Train Loss: 3.3678
Epoch [1/5], CAP, Step [300/1750], Train Loss: 3.4562
Epoch [1/5], CAP, Step [350/1750], Train Loss: 3.3256
Epoch [1/5], CAP, Step [400/1750], Train Loss: 3.9985
Epoch [1/5], CAP, Step [450/1750], Train Loss: 3.2154
Epoch [1/5], CAP, Step [500/1750], Train Loss: 3.5057
Epoch [1/5], CAP, Step [550/1750], Train Loss: 3.1751
Epoch [1/5], CAP, Step [600/1750], Train Loss: 3.1273
Epoch [1/5], CAP, Step [650/1750], Train Loss: 3.2453
Epoch [1/5], CAP, Step [700/1750], Train Loss: 3.3597
Epoch [1/5], CAP, Step [750/1750], Train Loss: 3.0249
Epoch [1/5], CAP, Step [800/1750], Train Loss: 3.8213
Epoch [1/5], CAP, Step [850/1750], Train Loss: 3.4782
Epoch [1/5], CAP, Step [900/175

Epoch [4/5], CAP, Step [1400/1750], Train Loss: 2.4476
Epoch [4/5], CAP, Step [1450/1750], Train Loss: 3.0776
Epoch [4/5], CAP, Step [1500/1750], Train Loss: 2.8273
Epoch [4/5], CAP, Step [1550/1750], Train Loss: 2.6042
Epoch [4/5], CAP, Step [1600/1750], Train Loss: 3.0436
Epoch [4/5], CAP, Step [1650/1750], Train Loss: 2.5882
Epoch [4/5], CAP, Step [1700/1750], Train Loss: 2.5414
Epoch [4/5], LANG, Step [0/350], Loss: 2.5110
Epoch [4/5], LANG, Step [50/350], Loss: 2.3639
Epoch [4/5], LANG, Step [100/350], Loss: 2.4572
Epoch [4/5], LANG, Step [150/350], Loss: 2.6094
Epoch [4/5], LANG, Step [200/350], Loss: 2.6223
Epoch [4/5], LANG, Step [250/350], Loss: 3.0079
Epoch [4/5], LANG, Step [300/350], Loss: 2.5911
Epoch [5/5], CAP, Step [0/1750], Train Loss: 2.8887


Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/pytorch_latest_p36/lib/python3.6/multiprocessing/queues.py", line 240, in _feed
    send_bytes(obj)
  File "/home/ubuntu/anaconda3/envs/pytorch_latest_p36/lib/python3.6/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/home/ubuntu/anaconda3/envs/pytorch_latest_p36/lib/python3.6/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/home/ubuntu/anaconda3/envs/pytorch_latest_p36/lib/python3.6/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe


KeyboardInterrupt: 

In [73]:
val_metrics(encoder, decoder, valid_dl)

  """


Validation Loss 2.969
['dog', 'are', 'a', 'a', 'man', 'in', 'down', 'a', 'snowy', '.', 'trail', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.'] ['People', 'watch', 'as', 'a', 'person', 'skis', 'down', 'a', 'mountain', '.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
1.0931616654031189e-231
['dog', 'black', 'dog', 'dog', 'white', 'dog', 'is', 'in', 'the', 'water', '.', 'a', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.'] ['A', 'spotted', 'black', 'and', 'white', 'dog', 'splashes', 'in', 'the', 'water', '.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
1.2097822504111573e-231
['black', 'old', 'in', 'is', 'on', 'a', 'of', 'a', 'art', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.'] ['An', 'unshaven', 'man', 'sits', 'in', 'front', 'of', 'an', 'evergreen', 'tree', '.', '<PAD>', '<PAD>'

Validation Loss 2.897
['black', 'people', 'are', 'standing', 'at', 'sitting', 'are', 'listening', 'around', 'to', 'each', 'brick', 'wall', '.', 'a', 'art', 'gallery', '.'] ['some', 'people', 'are', 'sitting', 'and', 'others', 'are', 'standing', 'next', 'to', 'a', 'false', 'wall', 'in', 'an', 'art', 'gallery', '.']
1.5881433504932496e-231
['the', 'man', 'is', 'climbing', 'a', 'wave', 'bike', 'through', 'a', 'trail', '.', 'down', '.', '.', '.', '.', '.', '.'] ['This', 'person', 'is', 'riding', 'a', 'BMX', 'bike', 'on', 'a', 'clifftop', '.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
1.2837951622885606e-231
['dog', 'and', 'with', 'mouth', 'collar', 'in', 'in', 'in', 'snow', 'leaves', '.', 'low', '.', '.', '.', '.', '.', '.'] ['Black', 'dog', 'with', 'orange', 'ball', 'approaches', 'camera', 'across', 'dead', 'leaves', '.', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
1.1441414120472538e-231
['woman', 'man', 'girl', 'is', 'off', 'rail', 'down', 'a', 'r

1.1921874587366841e-231

In [None]:
# encoder.load_state_dict(torch.load('pretrained_models/encoder-15.pkl'))
# decoder.load_state_dict(torch.load('pretrained_models/decoder-15.pkl'))

In [None]:
encoder.eval()
decoder.eval()

In [None]:
test_img_path = 'data/test/3504479370_ff2d89a043.jpg'

image = resize_image(img_path, sz=56)
image = normalize(image)
image = np.rollaxis(image, 2)

In [None]:
im = read_image(test_img_path)
im.show()

In [None]:
image.show()

In [None]:
features = encoder(image)
output = decoder.sample(features, mode="factual")

caption = 

In [None]:
indices = torch.topk(outputs, 1)[1].squeeze(2).data
for i in range(batch):
    predicted_caption = [reverse_word_map[x.item()] for x in indices[i]]