## Install requirements


In [2]:
!pip install https://download.pytorch.org/whl/cu100/torch-1.1.0-cp36-cp36m-linux_x86_64.whl
!pip install https://download.pytorch.org/whl/cu100/torchvision-0.3.0-cp36-cp36m-linux_x86_64.whl
!pip install tensorboardX
!pip install scikit-learn

Collecting torch==1.1.0 from https://download.pytorch.org/whl/cu100/torch-1.1.0-cp36-cp36m-linux_x86_64.whl
[?25l  Downloading https://download.pytorch.org/whl/cu100/torch-1.1.0-cp36-cp36m-linux_x86_64.whl (770.7MB)
[K     |████████████████████████████████| 770.7MB 77kB/s  eta 0:00:013    |██                              | 47.9MB 10.7MB/s eta 0:01:08     |████████████████                | 386.7MB 687kB/s eta 0:09:19     |████████████████████▎           | 487.2MB 12.3MB/s eta 0:00:24     |██████████████████████▎         | 537.7MB 63.9MB/s eta 0:00:04███████████████████████▋      | 617.7MB 11.5MB/s eta 0:00:14
Installing collected packages: torch
Successfully installed torch-1.1.0
Collecting torchvision==0.3.0 from https://download.pytorch.org/whl/cu100/torchvision-0.3.0-cp36-cp36m-linux_x86_64.whl
[?25l  Downloading https://download.pytorch.org/whl/cu100/torchvision-0.3.0-cp36-cp36m-linux_x86_64.whl (2.6MB)
[K     |████████████████████████████████| 2.6MB 403kB/s eta 0:00:01
Collecti

In [3]:
import os
import datetime

import torch
import torch.nn as nn
import torch.optim as optim
from tensorboardX import SummaryWriter

In [4]:
from data import get_loader, load_coco_files

phases = ['train', 'test_A']
batch_size = {'train': 200, 'test_A': 1000}
shuffle = {'train': True, 'test_A': False}
num_workers = {'train': 4, 'test_A': 1}
pin_memory = {'train': True, 'test_A': False}

loaders = {}
for phase in phases:
    folder_dir = os.path.join('/data/jeperez/COCO-2014-spanish/', phase)
    file_names = os.path.join(folder_dir, '{}_images_names.txt'.format(phase))
    file_vectors = os.path.join(folder_dir, '{}_images_vectors.bin'.format(phase))
    file_captions = os.path.join(folder_dir, '{}_captions.txt'.format(phase))
    images_names, visual_feats, captions = load_coco_files(file_names, file_vectors, file_captions, 2048)
    loaders[phase] = get_loader(images_names, visual_feats, captions, batch_size[phase], shuffle[phase], 
                              num_workers[phase], pin_memory[phase])
    
    if phase == 'train':
        train_names, train_texts = zip(*captions)

for s in ['{}: {}'.format(n, s) for n,s in zip(train_names[0:10], train_texts[0:10])]:
    print(s)

leyendo /data/jeperez/COCO-2014-spanish/train/train_images_names.txt
leyendo /data/jeperez/COCO-2014-spanish/train/train_images_vectors.bin
20000 vectores de largo 2048
leyendo /data/jeperez/COCO-2014-spanish/train/train_captions.txt
leyendo /data/jeperez/COCO-2014-spanish/test_A/test_A_images_names.txt
leyendo /data/jeperez/COCO-2014-spanish/test_A/test_A_images_vectors.bin
1000 vectores de largo 2048
leyendo /data/jeperez/COCO-2014-spanish/test_A/test_A_captions.txt
COCO_train2014_000000000086.jpg: Un hombre en un una vieja bicicleta de moda en el bosque
COCO_train2014_000000000086.jpg: Un hombre montado en una bicicleta de motor a través de un bosque.
COCO_train2014_000000000086.jpg: Un hombre sentado en una motocicleta en el bosque.
COCO_train2014_000000000086.jpg: Una persona que mira hacia abajo en algo mientras está sentado en una bicicleta.
COCO_train2014_000000000086.jpg: Una persona joven está en una vieja bicicleta muy adornado.
COCO_train2014_000000000077.jpg: un grupo de a

## Defines the divice to be used

In [6]:
from utils import get_freer_gpu

device = 'gpu'

if device == 'gpu' and torch.cuda.is_available():
    freer_gpu_id = get_freer_gpu()
    device = torch.device('cuda:{}'.format(freer_gpu_id))
    torch.cuda.empty_cache()
else:
    device = torch.device('cpu')

print(device)

cuda:0


## Seleccionar el modelo para representación de los textos


In [7]:
text_descriptor_name = 'embedding'

if text_descriptor_name == 'bow':
    from text_descriptors.bow import TextDescriptor
    text_descriptor = TextDescriptor(type='bow', texts=train_texts, lowecase=False, ngram_range=(1,1), 
                                     max_df=.8, min_df=.01)
    print(text_descriptor.descriptor.vocabulary_)
elif text_descriptor_name == 'tf-idf':
    from text_descriptors.bow import TextDescriptor
    text_descriptor = TextDescriptor(type='tf-idf', texts=train_texts, lowecase=False, ngram_range=(1,3), 
                                     max_df=.8, min_df=.01)
    print(text_descriptor.descriptor.vocabulary_)
elif text_descriptor_name == 'lsa':
    from text_descriptors.lsa import LSADescriptor
    text_descriptor = LSADescriptor(type='tf-idf', texts=train_texts, lowecase=False, ngram_range=(1,3), 
                                    max_df=.8, min_df=.01, n_components=100)
elif text_descriptor_name == 'embedding':
    from text_descriptors.embedding import WordEmbedding
    text_descriptor = WordEmbedding(texts=train_texts, lowecase=False, ngram_range=(1,1), max_df=.8, min_df=.01)
else:
    raise 'unknown descriptor {}'.format(text_descriptor_name)

print('descriptor size: {}'.format(text_descriptor.out_size))


descriptor size: 300


## Initialize the Regressor


In [14]:
from text_encoders.regressor import MLP, RNN

regression_model_name = 'rnn' # ['mlp', 'rnn']

if regression_model_name == 'mlp':
    regression_model = MLP(in_size=text_descriptor.out_size, h_size=5096, out_size=2048)
elif text_descriptor_name == 'embedding' and regression_model_name == 'rnn':
    regression_model = RNN(in_size=text_descriptor.out_size, h_size=2048)
else:
    raise 'unknown configuration: {} + {}'.format(text_descriptor_name, regression_model_name)
    
regression_model.to(device)

regression_model

RNN(
  (rnn): GRU(300, 2048, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
)

## loss function and optimizer


In [15]:
criterion = nn.MSELoss()

criterion

MSELoss()

## optimizers


In [16]:
encoder_optimizer = optim.Adam(regression_model.parameters(), lr=0.001)
if text_descriptor_name == 'embedding':
    embedding_optimizer = optim.Adam(text_descriptor.parameters(), lr=0.001)

encoder_optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0
)

# initialize tensorboard logger

In [17]:
exp_name = '{}-{}'.format(text_descriptor_name, regression_model_name)
datetime_str = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
writer = SummaryWriter(logdir=os.path.join('./log/runs/', '{}-{}'.format(exp_name, datetime_str)))

writer

<tensorboardX.writer.SummaryWriter at 0x7f3d41e8b9e8>

# Train Regression

In [19]:
epochs = 100
for e in range(epochs):
    print
    for phase in phases:
        loss_count = 0
        for i, (images_names, visual_feats, captions) in enumerate(loaders[phase]):
            with torch.set_grad_enabled(phase == 'train'):
                if regression_model_name == 'mlp':
                    descriptors = text_descriptor.transform(captions)
                    encodes = regression_model(descriptors)
                elif regression_model_name == 'rnn':
                    idx_texts = text_descriptor.word_to_idx(captions)
                    print(idx_texts)
                    idx_texts = torch.LongTensor(idx_texts)
                    descriptors = text_descriptor(idx_texts)
                    encodes = regression_model(descriptors)
            
                # Evaluate the loss function
                loss = criterion(encodes, visual_feats)
    
            if phase == 'train':
                loss.backward()
                encoder_optimizer.step()
                if regression_model_name == 'rnn':
                    embedding_optimizer.step()
            
            loss_count += loss.item()
            writer.add_scalar('{}-loss'.format(phase), loss, e * len(loaders[phase]) + i)


[[6, 76, 39, 22, -1, 39, -1, 145, -1], [6, 63, 109, 84, 120, 125, 128, 49, 46, -1, -1, 39, 144, -1], [6, 95, 125, 128, 49, 79, 85, -1, -1, 46, -1, 39, 144, -1], [7, -1, 64, 39, -1, -1, -1, -1, 130, 145, -1], [7, 34, 49, -1, 39, -1, 37, -1, 39, 93, -1, 37, -1, -1, -1, -1], [6, 38, -1, 37, 44, -1, 44, -1, 144, 96, 119, -1, -1, -1, 49, 46, -1], [6, 108, -1, 93, 125, 128, 49, 79, 104, -1, 39, 144, 17, 40, -1], [7, 114, -1, 49, -1, 37, 81, -1, 49, 145, -1], [0, -1, 39, 22, 39, -1, -1, 77, 10, 22, 49, 144, 28, 39, -1], [1, -1, 39, 22, 125, -1, 41, 40, -1, -1, 46, -1], [7, 91, 120, 132, 144, 139, -1, 86, 125, 55, 126, 49, 145, -1], [3, 62, 49, -1, 39, -1, -1, -1, 120, -1, 118, 145, -1], [-1, -1, -1, 39, -1, 39, -1, -1, -1, 49, 145, -1], [6, 69, -1, 136, 49, 145, -1, 49, 46, -1], [145, 110, 120, -1, 145, 138, 39, -1, 49, 145, 24, 39, 79, 32], [145, 91, 120, 132, 130, 145, -1, 39, 140, 86, -1, 67, -1, 49, 46, 9], [6, 61, 20, -1, 93, 120, -1, 49, 46, -1, 39, 79, -1], [7, 124, 39, 54, 37, 144, -1

ValueError: expected sequence of length 9 at dim 1 (got 14)

In [18]:
text_descriptor.cv.vocabulary_['muchacho']

KeyError: 'muchacho'