## Install requirements


In [2]:
!pip install https://download.pytorch.org/whl/cu100/torch-1.1.0-cp36-cp36m-linux_x86_64.whl
!pip install https://download.pytorch.org/whl/cu100/torchvision-0.3.0-cp36-cp36m-linux_x86_64.whl
!pip install tensorboardX
!pip install scikit-learn
!pip install nltk

Collecting torch==1.1.0 from https://download.pytorch.org/whl/cu100/torch-1.1.0-cp36-cp36m-linux_x86_64.whl
[?25l  Downloading https://download.pytorch.org/whl/cu100/torch-1.1.0-cp36-cp36m-linux_x86_64.whl (770.7MB)
[K     |████████████████████████████████| 770.7MB 77kB/s  eta 0:00:013    |██                              | 47.9MB 10.7MB/s eta 0:01:08     |████████████████                | 386.7MB 687kB/s eta 0:09:19     |████████████████████▎           | 487.2MB 12.3MB/s eta 0:00:24     |██████████████████████▎         | 537.7MB 63.9MB/s eta 0:00:04███████████████████████▋      | 617.7MB 11.5MB/s eta 0:00:14
Installing collected packages: torch
Successfully installed torch-1.1.0
Collecting torchvision==0.3.0 from https://download.pytorch.org/whl/cu100/torchvision-0.3.0-cp36-cp36m-linux_x86_64.whl
[?25l  Downloading https://download.pytorch.org/whl/cu100/torchvision-0.3.0-cp36-cp36m-linux_x86_64.whl (2.6MB)
[K     |████████████████████████████████| 2.6MB 403kB/s eta 0:00:01
Collecti

In [1]:
import os
import datetime

import torch
import torch.nn as nn
import torch.optim as optim
from tensorboardX import SummaryWriter
import numpy as np

In [2]:
from data import get_loader, load_coco_files

phases = ['train', 'test_A']
batch_size = {'train': 200, 'test_A': 1000}
shuffle = {'train': True, 'test_A': False}
num_workers = {'train': 4, 'test_A': 1}
pin_memory = {'train': True, 'test_A': False}

loaders = {}
coco_images_names = {}
for phase in phases:
    folder_dir = os.path.join('/data/jeperez/COCO-2014-spanish/', phase)
    file_names = os.path.join(folder_dir, '{}_images_names.txt'.format(phase))
    file_vectors = os.path.join(folder_dir, '{}_images_vectors.bin'.format(phase))
    file_captions = os.path.join(folder_dir, '{}_captions.txt'.format(phase))
    coco_images_names[phase], coco_visual_feats, coco_captions = load_coco_files(file_names, file_vectors, file_captions, 2048)
    loaders[phase] = get_loader(coco_images_names[phase], coco_visual_feats, coco_captions, batch_size[phase], shuffle[phase], 
                              num_workers[phase], pin_memory[phase])
    
    if phase == 'train':
        train_names, train_texts = zip(*coco_captions)

for s in ['{}: {}'.format(n, s) for n,s in zip(train_names[0:10], train_texts[0:10])]:
    print(s)

leyendo /data/jeperez/COCO-2014-spanish/train/train_images_names.txt
leyendo /data/jeperez/COCO-2014-spanish/train/train_images_vectors.bin
20000 vectores de largo 2048
leyendo /data/jeperez/COCO-2014-spanish/train/train_captions.txt
leyendo /data/jeperez/COCO-2014-spanish/test_A/test_A_images_names.txt
leyendo /data/jeperez/COCO-2014-spanish/test_A/test_A_images_vectors.bin
1000 vectores de largo 2048
leyendo /data/jeperez/COCO-2014-spanish/test_A/test_A_captions.txt
COCO_train2014_000000000086.jpg: Un hombre en un una vieja bicicleta de moda en el bosque
COCO_train2014_000000000086.jpg: Un hombre montado en una bicicleta de motor a través de un bosque.
COCO_train2014_000000000086.jpg: Un hombre sentado en una motocicleta en el bosque.
COCO_train2014_000000000086.jpg: Una persona que mira hacia abajo en algo mientras está sentado en una bicicleta.
COCO_train2014_000000000086.jpg: Una persona joven está en una vieja bicicleta muy adornado.
COCO_train2014_000000000077.jpg: un grupo de a

## Defines the divice to be used

In [3]:
from utils import get_freer_gpu

device = 'gpu'

if device == 'gpu' and torch.cuda.is_available():
    freer_gpu_id = get_freer_gpu()
    device = torch.device('cuda:{}'.format(freer_gpu_id))
    torch.cuda.empty_cache()
else:
    device = torch.device('cpu')

print(device)

cuda:2


## Seleccionar el modelo para representación de los textos


In [4]:
text_descriptor_name = 'tf-idf'

if text_descriptor_name == 'bow':
    from text_descriptors.bow import TextDescriptor
    text_descriptor = TextDescriptor(type='bow', texts=train_texts, lowecase=False, ngram_range=(1,1), 
                                     max_df=.8, min_df=.01)
elif text_descriptor_name == 'tf-idf':
    from text_descriptors.bow import TextDescriptor
    text_descriptor = TextDescriptor(type='tf-idf', texts=train_texts, lowecase=False, ngram_range=(1,3), 
                                     max_df=.8, min_df=.01)
elif text_descriptor_name == 'lsa':
    from text_descriptors.lsa import LSADescriptor
    text_descriptor = LSADescriptor(type='tf-idf', texts=train_texts, lowecase=False, ngram_range=(1,3), 
                                    max_df=.8, min_df=.01, n_components=100)
elif text_descriptor_name == 'embedding':
    from text_descriptors.embedding import WordEmbedding
    from vocabulary import Vocabulary
    vocab = Vocabulary(max_df=1, min_df=0)
    vocab.add_sentences(train_texts)
    vocab.add_words(['<unk>', '<pad>'])
    text_descriptor = WordEmbedding(num_embeddings=len(vocab), embedding_dim=300)
    text_descriptor.to(device)
else:
    raise 'unknown descriptor {}'.format(text_descriptor_name)

print('descriptor size: {}'.format(text_descriptor.out_size))


descriptor size: 291


## Initialize the Regressor


In [30]:
from text_encoders.regressor import MLP, RNN

regression_model_name = 'mlp' # ['mlp', 'rnn']

if regression_model_name == 'mlp':
    regression_model = MLP(in_size=text_descriptor.out_size, h_size=4096, out_size=2048)
elif text_descriptor_name == 'embedding' and regression_model_name == 'rnn':
    regression_model = RNN(in_size=text_descriptor.out_size, h_size=2048, num_layers=1, bidirectional=False, device=device)
else:
    raise 'unknown configuration: {} + {}'.format(text_descriptor_name, regression_model_name)
    
regression_model.to(device)
regression_model

MLP(
  (fc1): Linear(in_features=291, out_features=4096, bias=True)
  (relu1): ReLU()
  (drop_1): Dropout(p=0.2)
  (fc2): Linear(in_features=4096, out_features=2048, bias=True)
  (relu2): ReLU()
)

## loss function and optimizer


In [31]:
criterion = nn.MSELoss()
criterion

MSELoss()

## optimizers


In [32]:
encoder_optimizer = optim.SGD(regression_model.parameters(), lr=0.01)
print(encoder_optimizer)

if text_descriptor_name == 'embedding':
    embedding_optimizer = optim.Adam(text_descriptor.parameters(), lr=0.001)
    print(embedding_optimizer)

SGD (
Parameter Group 0
    dampening: 0
    lr: 0.01
    momentum: 0
    nesterov: False
    weight_decay: 0
)


# initialize tensorboard logger

In [33]:
exp_name = '{}-{}'.format(text_descriptor_name, regression_model_name)
datetime_str = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
writer = SummaryWriter(logdir=os.path.join('./log/runs/', '{}-{}'.format(exp_name, datetime_str)))
writer

<tensorboardX.writer.SummaryWriter at 0x7f4bf63e0470>

# Train Regression

In [34]:
epochs = 10
for e in range(epochs):
    print('epoch: {}'.format(e))
    for phase in phases:
        print('phase: {}'.format(phase))
        
        regression_model.train()
        if regression_model_name == 'rnn':
            text_descriptor.train()
        
        names = []
        encoded_vectors = []
        loss_count = 0
        for i, (images_names, visual_feats, captions) in enumerate(loaders[phase]):
#             print('{} - {}'.format(images_names[0], captions[0]))
            with torch.set_grad_enabled(phase == 'train'):
                if regression_model_name == 'mlp':
                    if text_descriptor_name == 'embedding':
                        idx_texts = vocab(captions, 20)
                        idx_texts = torch.LongTensor(idx_texts).to(device)
                        descriptors = text_descriptor(idx_texts)
                        descriptors = torch.mean(descriptors, dim=1)
                    else:
                        descriptors = text_descriptor.transform(captions)
                        descriptors = torch.FloatTensor(descriptors.toarray()).to(device)
                    encodes = regression_model(descriptors)
                elif regression_model_name == 'rnn':
                    idx_texts = vocab(captions, 20)
                    idx_texts = torch.LongTensor(idx_texts).to(device)
                    descriptors = text_descriptor(idx_texts)
                    encodes = regression_model(descriptors)
            
                # Evaluate the loss function
                loss = criterion(encodes, visual_feats.to(device))
    
            if phase == 'train':
                loss.backward()
                encoder_optimizer.step()
                if regression_model_name == 'rnn':
                    embedding_optimizer.step()
            else:
                encoded_vectors.append(encodes)
                names += images_names
            
            loss_count += loss.item()
            writer.add_scalar('{}-loss'.format(phase), loss, e * len(loaders[phase]) + i)
            if i%100 == 0:
                print('[{}/{}]'.format(i, len(loaders[phase])))
            
        print('loss: {}'.format(loss.item()/len(loaders[phase])))
    
        # compute measures
        metric='l2'
        k=5
        if phase != 'train':
            regression_model.eval()
            if regression_model_name == 'rnn':
                text_descriptor.eval()
                
            encoded_vectors = torch.cat(encoded_vectors, dim=0)
            coco_visual_feats = np.array(coco_visual_feats)
            print(coco_visual_feats.shape)
            print(encoded_vectors.size())
            
            avg_position = 0
            recall_at_k = 0
            mrr = 0
            for i, feats_vec in enumerate(encoded_vectors.cpu().numpy()):
                if metric == 'l2':
                    dist = np.sqrt(np.sum((coco_visual_feats - feats_vec) ** 2, axis=1))
                else:  # L1
                    dist = np.sqrt(np.sum((coco_visual_feats - feats_vec), axis=1))
                
                sorted_idx = sorted(range(coco_visual_feats.shape[0]), key=lambda x: dist[x])
                result_position = sorted_idx.index(coco_images_names[phase].index(names[i])) + 1
                avg_position += result_position
                recall_at_k += 1 if result_position <= k else 0
                mrr += 1/result_position
            writer.add_scalar('{}-avg_position'.format(phase), avg_position / len(encoded_vectors), e)
            writer.add_scalar('{}-recall@{}'.format(phase, k), recall_at_k / len(encoded_vectors), e)
            writer.add_scalar('{}-mrr'.format(phase), mrr / len(encoded_vectors), e)


epoch: 0
phase: train
[0/500]
[100/500]
[200/500]
[300/500]
[400/500]
loss: 0.001314208984375
phase: test_A
[0/5]
loss: 0.139220130443573
(1000, 2048)
torch.Size([5000, 2048])
epoch: 1
phase: train
[0/500]
[100/500]
[200/500]
[300/500]
[400/500]
loss: 0.0012340683937072754
phase: test_A
[0/5]
loss: 0.1269922971725464
(1000, 2048)
torch.Size([5000, 2048])
epoch: 2
phase: train
[0/500]
[100/500]
[200/500]
[300/500]
[400/500]
loss: 0.001212866187095642
phase: test_A
[0/5]
loss: 0.12327580451965332
(1000, 2048)
torch.Size([5000, 2048])
epoch: 3
phase: train
[0/500]
[100/500]
[200/500]
[300/500]
[400/500]
loss: 0.0011872925758361816
phase: test_A
[0/5]
loss: 0.12226094007492065
(1000, 2048)
torch.Size([5000, 2048])
epoch: 4
phase: train
[0/500]
[100/500]
[200/500]
[300/500]
[400/500]
loss: 0.0012205474376678466
phase: test_A
[0/5]
loss: 0.12330120801925659
(1000, 2048)
torch.Size([5000, 2048])
epoch: 5
phase: train
[0/500]
[100/500]
[200/500]
[300/500]
[400/500]
loss: 0.0011808987855911255


In [30]:
loaders

{'train': <torch.utils.data.dataloader.DataLoader at 0x7f6c79b6c518>,
 'test_A': <torch.utils.data.dataloader.DataLoader at 0x7f6c7472b978>}

In [61]:
encoded_vectors.size()

torch.Size([5000, 2048])