# STEP 2: Refining representations with neural networks added to improved pipeline

## Ranking function

In [3]:
"""Retrieval ranking function for the learnt representations from the official code of im2recipe paper"""
def ranker(im_vecs, instr_vecs, N = 1000, flag = "image"):
    idxs = range(N)

    glob_rank = []
    glob_recall = {1:0.0,5:0.0,10:0.0}
    for i in range(10):

        ids = random.sample(range(0,len(im_vecs)), N)
        
        im_sub = im_vecs[ids,:]
        instr_sub = instr_vecs[ids,:]

        if flag == "image":
            sims = np.dot(im_sub,instr_sub.T) # for im2recipe
        else:
            sims = np.dot(instr_sub,im_sub.T) # for recipe2im

        med_rank = []
        recall = {1:0.0,5:0.0,10:0.0}

        for ii in idxs:

            # name = ids_sub[ii]
            # get a column of similarities
            sim = sims[ii,:]

            # sort indices in descending order
            sorting = np.argsort(sim)[::-1].tolist()

            # find where the index of the pair sample ended up in the sorting
            pos = sorting.index(ii)

            if (pos+1) == 1:
                recall[1]+=1
            if (pos+1) <=5:
                recall[5]+=1
            if (pos+1)<=10:
                recall[10]+=1

            # store the position
            med_rank.append(pos+1)

        for i in recall.keys():
            recall[i]=recall[i]/N

        med = np.median(med_rank)

        for i in recall.keys():
            glob_recall[i]+=recall[i]
        glob_rank.append(med)

    for i in glob_recall.keys():
        glob_recall[i] = glob_recall[i]/10
    
    print ("Mean median", np.average(glob_rank))
    print ("Recall", glob_recall)

# Loading necessary files

In [2]:
# validation image and full text data
img_val = torch.load("img_val.pt")
text_val = torch.load("text_val.pt")

# train image and full text data
img_train = torch.load("img_train.pt")
text_train = torch.load("text_train.pt")

# test image and full text data
img_test = torch.load("img_test.pt")
text_test = torch.load("text_test.pt")

#individual text test data
ingredients_test = torch.load("test_ingredients.pt")
instructions_test = torch.load("test_instructions.pt")
title_test = torch.load("test_title.pt")

#individual text train data
ingredients_train = torch.load("train_ingredients.pt")
instructions_train = torch.load("train_instructions.pt")
title_train = torch.load("train_title.pt")

## 2a: Using MSE Loss to train NN

In [4]:
import matplotlib.pyplot as plt
import numpy as np

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

In [5]:
!export CUDA_VISIBLE_DEVICES='1,2,3,4'

device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

### Model definition and train loop

In [7]:
class EmbeddingDataset(Dataset):
    def __init__(self, image_emb, text_emb, transform=None):
        self.image_emb = torch.as_tensor(np.array(image_emb))
        self.text_emb = torch.as_tensor(np.array(text_emb))        
        self.transform = transform

    def __len__(self):
        return len(self.image_emb)

    def __getitem__(self, idx):
        return self.image_emb[idx], self.text_emb[idx]

In [8]:
class EmbeddingNetwork(nn.Module):
    def __init__(self, output_size, input_size=1024):
        super().__init__()

        self.layer1 = nn.Sequential(
            nn.Linear(input_size, 512),
            nn.BatchNorm1d(512),
            nn.Dropout(),
            nn.LeakyReLU()
        )
        self.layer2 = nn.Linear(512, output_size)

    def forward(self, x):
        x = self.layer1(x)
        return self.layer2(x)

In [None]:
class AverageMeter(object):
    # Utility function for timers
    
    def __init__(self):
        self.reset()
    
    def reset(self):
        self.val, self.avg, self.sum, self.count = 0, 0, 0, 0
    
    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [None]:
def train(train_loader, img_model, txt_model, criterion, optimizer, epoch):
    print('Starting training epoch {}'.format(epoch))
    img_model.train()
    txt_model.train()
    
    batch_time, data_time, losses = AverageMeter(), AverageMeter(), AverageMeter()
    end = time.time()
    running_loss = 0.
    last_loss = 0.
    optimizer.zero_grad()
    
    for i, (image_emb, text_emb) in enumerate(train_loader):
    
        # Use GPU if available
        if use_gpu: 
            image_emb, text_emb = image_emb.to(f'cuda:{img_model.device_ids[0]}'), text_emb.to(f'cuda:{txt_model.device_ids[0]}')

        data_time.update(time.time() - end)

        # Run forward pass
        out_image_emb = img_model(image_emb) 
        out_text_emb = txt_model(text_emb)
        loss = criterion(out_image_emb, out_text_emb) 

        # Compute gradient and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_time.update(time.time() - end)
        end = time.time()

        # Print model accuracy -- in the code below
        running_loss += loss.item()
        if i % 2000 == 0:
            last_loss = running_loss / 2000 # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            running_loss = 0.
        
        if i % 2000 == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                'Time {batch_time.val} ({batch_time.avg})\t'
                'Data {data_time.val} ({data_time.avg})\t'.format(
                  epoch, i, len(train_loader), batch_time=batch_time,
                 data_time=data_time)) 

    print('Finished training epoch {}'.format(epoch))

### Training: Dim = 512; all components

#### im2recipe and recipe2im

In [None]:
img_model = EmbeddingNetwork(512)
img_model= nn.DataParallel(img_model, device_ids=[1,2,3])
img_model.to(device);

txt_model = EmbeddingNetwork(512);
txt_model= nn.DataParallel(txt_model, device_ids=[1,2,3])
txt_model.to(device);

optimizer = torch.optim.Adam(img_model.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to(device)
    img_model = img_model.to(device)
    txt_model = txt_model.to(device)
    
train_dataset = EmbeddingDataset(img_train, text_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False)

best_losses = 1e10
epochs = 10
for epoch in range(epochs):

    train(train_loader, img_model, txt_model, criterion, optimizer, epoch)
        
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model.state_dict(), 'checkpoints/img-model-full-512-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model.state_dict(), 'checkpoints/txt-model-full-512-epoch-{}.pth'.format(epoch+1))

#### im2title and title2im

In [None]:
title_dataset = EmbeddingDataset(img_train, title_train)
title_loader = DataLoader(title_dataset, batch_size=64, shuffle=False)

img_model_title = EmbeddingNetwork(512)
img_model_title= nn.DataParallel(img_model_title, device_ids=[1,2,3])
img_model_title.to(device);

txt_model_title = EmbeddingNetwork(512);
txt_model_title= nn.DataParallel(txt_model_title, device_ids=[1,2,3])
txt_model_title.to(device);

optimizer = torch.optim.Adam(img_model_title.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to((f'cuda:{txt_model_title.device_ids[0]}'))
    img_model_title = txt_model_title.to((f'cuda:{txt_model_title.device_ids[0]}'))
    txt_model_title = txt_model_title.to((f'cuda:{txt_model_title.device_ids[0]}'))
    
epochs = 5
for epoch in range(epochs):

    train(title_loader, img_model_title, txt_model_title, criterion, optimizer, epoch)
        
  # Save checkpoint and replace old best model if current model is betterabs
    torch.save(img_model_title.state_dict(), 'checkpoints/img-model-title-512-epoch-{}.pth'.format(epoch+1))
    torch.save(txt_model_title.state_dict(), 'checkpoints/txt-model-title-512-epoch-{}.pth'.format(epoch+1))

#### im2ingredients and ingredients2im

In [None]:
img_model_ingredients = EmbeddingNetwork(512)
img_model_ingredients= nn.DataParallel(img_model_ingredients, device_ids=[1,2,3])
img_model_ingredients.to((f'cuda:{img_model_ingredients.device_ids[0]}'));

txt_model_ingredients = EmbeddingNetwork(512)
txt_model_ingredients= nn.DataParallel(txt_model_ingredients, device_ids=[1,2,3])
txt_model_ingredients.to((f'cuda:{txt_model_ingredients.device_ids[0]}'));

optimizer = torch.optim.Adam(img_model_ingredients.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

ingredients_dataset = EmbeddingDataset(img_train, ingredients_train)
ingredients_loader = DataLoader(ingredients_dataset, batch_size=64, shuffle=False)

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to((f'cuda:{img_model_ingredients.device_ids[0]}'))
    img_model_ingredients = img_model_ingredients.to((f'cuda:{img_model_ingredients.device_ids[0]}'))
    txt_model_ingredients = txt_model_ingredients.to((f'cuda:{img_model_ingredients.device_ids[0]}'))


epochs = 5
for epoch in range(epochs):
    train(ingredients_loader, img_model_ingredients, txt_model_ingredients, criterion, optimizer, epoch)
  # Save checkpoint and replace old best model if current model is betterabs

torch.save(img_model_ingredients.state_dict(), 'checkpoints/img-model-ingredients-512-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_ingredients.state_dict(), 'checkpoints/txt-model-ingredients-512-epoch-{}.pth'.format(epoch+1))

#### im2instructions and instructions2im

In [None]:
img_model_instructions = EmbeddingNetwork(512)
img_model_instructions= nn.DataParallel(img_model_instructions, device_ids=[1,2,3])
img_model_instructions.to((f'cuda:{img_model_instructions.device_ids[0]}'));
txt_model_instructions = EmbeddingNetwork(512)
txt_model_instructions= nn.DataParallel(txt_model_instructions, device_ids=[1,2,3])
txt_model_instructions.to((f'cuda:{txt_model_instructions.device_ids[0]}'));

optimizer = torch.optim.Adam(img_model_instructions.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

instructions_dataset = EmbeddingDataset(img_train, instructions_train)
instructions_loader = DataLoader(instructions_dataset, batch_size=64, shuffle=False)

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to((f'cuda:{txt_model_instructions.device_ids[0]}'))
    img_model_instructions = img_model_instructions.to((f'cuda:{txt_model_instructions.device_ids[0]}'))
    txt_model_instructions = txt_model_instructions.to((f'cuda:{txt_model_instructions.device_ids[0]}'))

epochs = 5
for epoch in range(epochs):
    train(ingredients_loader, img_model_instructions, txt_model_instructions, criterion, optimizer, epoch)
  # Save checkpoint and replace old best model if current model is betterabs

torch.save(img_model_instructions.state_dict(), 'checkpoints/img-model-instructions-512-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_instructions.state_dict(), 'checkpoints/txt-model-instructions-512-epoch-{}.pth'.format(epoch+1))

### Training: dims = 256; all components

#### im2recipe and recipe2im

In [None]:
img_model = EmbeddingNetwork(256)
img_model= nn.DataParallel(img_model, device_ids=[2,3])
img_model.to(device);
txt_model = EmbeddingNetwork(256);
txt_model= nn.DataParallel(txt_model, device_ids=[2,3])
txt_model.to(device);
optimizer = torch.optim.Adam(img_model.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to(device)
    img_model = img_model.to(device)
    txt_model = txt_model.to(device)

train_dataset = EmbeddingDataset(img_train, text_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False)

best_losses = 1e10
epochs = 10
for epoch in range(epochs):
    train(train_loader, img_model, txt_model, criterion, optimizer, epoch)
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model.state_dict(), 'checkpoints/img-model-full-256-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model.state_dict(), 'checkpoints/txt-model-full-256-epoch-{}.pth'.format(epoch+1))

#### im2title and title2im

In [None]:
title_dataset = EmbeddingDataset(img_train, title_train)
title_loader = DataLoader(title_dataset, batch_size=64, shuffle=False)

img_model_title = EmbeddingNetwork(256)
img_model_title= nn.DataParallel(img_model_title, device_ids=[2,3])
img_model_title.to(device);

txt_model_title = EmbeddingNetwork(256);
txt_model_title= nn.DataParallel(txt_model_title, device_ids=[2,3])
txt_model_title.to(device);

optimizer = torch.optim.Adam(img_model.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to((f'cuda:{txt_model_title.device_ids[0]}'))
    img_model_title = txt_model_title.to((f'cuda:{txt_model_title.device_ids[0]}'))
    txt_model_title = txt_model_title.to((f'cuda:{txt_model_title.device_ids[0]}'))

epochs = 5
for epoch in range(epochs):
    train(title_loader, img_model_title, txt_model_title, criterion, optimizer, epoch)
        
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model_title.state_dict(), 'checkpoints/img-model-title-256-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_title.state_dict(), 'checkpoints/txt-model-title-256-epoch-{}.pth'.format(epoch+1))

#### im2ingredients and ingredients2im

In [None]:
img_model_ingredients = EmbeddingNetwork(256)
img_model_ingredients= nn.DataParallel(img_model_ingredients, device_ids=[2,3])
img_model_ingredients.to((f'cuda:{img_model_ingredients.device_ids[0]}'));

txt_model_ingredients = EmbeddingNetwork(256)
txt_model_ingredients= nn.DataParallel(txt_model_ingredients, device_ids=[2,3])
txt_model_ingredients.to((f'cuda:{txt_model_ingredients.device_ids[0]}'));

optimizer = torch.optim.Adam(img_model_ingredients.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

ingredients_dataset = EmbeddingDataset(img_train, ingredients_train)
ingredients_loader = DataLoader(ingredients_dataset, batch_size=64, shuffle=False)

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to((f'cuda:{img_model_ingredients.device_ids[0]}'))
    img_model_ingredients = img_model_ingredients.to((f'cuda:{img_model_ingredients.device_ids[0]}'))
    txt_model_ingredients = txt_model_ingredients.to((f'cuda:{img_model_ingredients.device_ids[0]}'))


epochs = 5
for epoch in range(epochs):
    train(ingredients_loader, img_model_ingredients, txt_model_ingredients, criterion, optimizer, epoch)
  # Save checkpoint and replace old best model if current model is betterabs

torch.save(img_model_ingredients.state_dict(), 'checkpoints/img-model-ingredients-256-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_ingredients.state_dict(), 'checkpoints/txt-model-ingredients-256-epoch-{}.pth'.format(epoch+1))

#### im2instructions and instructions2im

In [None]:
img_model_instructions = EmbeddingNetwork(256)
img_model_instructions= nn.DataParallel(img_model_instructions, device_ids=[2,3])
img_model_instructions.to((f'cuda:{img_model_instructions.device_ids[0]}'));

txt_model_instructions = EmbeddingNetwork(256)
txt_model_instructions= nn.DataParallel(txt_model_instructions, device_ids=[2,3])
txt_model_instructions.to((f'cuda:{txt_model_instructions.device_ids[0]}'));

optimizer = torch.optim.Adam(img_model_instructions.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

instructions_dataset = EmbeddingDataset(img_train, instructions_train)
instructions_loader = DataLoader(instructions_dataset, batch_size=64, shuffle=False)

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to((f'cuda:{txt_model_instructions.device_ids[0]}'))
    img_model_instructions = img_model_instructions.to((f'cuda:{txt_model_instructions.device_ids[0]}'))
    txt_model_instructions = txt_model_instructions.to((f'cuda:{txt_model_instructions.device_ids[0]}'))


epochs = 5
for epoch in range(epochs):
    train(ingredients_loader, img_model_instructions, txt_model_instructions, criterion, optimizer, epoch)
  # Save checkpoint and replace old best model if current model is betterabs

torch.save(img_model_instructions.state_dict(), 'checkpoints/img-model-instructions-256-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_instructions.state_dict(), 'checkpoints/txt-model-instructions-256-epoch-{}.pth'.format(epoch+1))

### Training: dims = 128; all components

#### im2recipe and recipe2im

In [None]:
img_model = EmbeddingNetwork(128)
img_model= nn.DataParallel(img_model, device_ids=[2,3])
img_model.to(device);
txt_model = EmbeddingNetwork(128);
txt_model= nn.DataParallel(txt_model, device_ids=[2,3])
txt_model.to(device);
optimizer = torch.optim.Adam(img_model.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to(device)
    img_model = img_model.to(device)
    txt_model = txt_model.to(device)

train_dataset = EmbeddingDataset(img_train, text_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False)

best_losses = 1e10
epochs = 10
for epoch in range(epochs):
    train(train_loader, img_model, txt_model, criterion, optimizer, epoch)
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model.state_dict(), 'checkpoints/img-model-full-128-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model.state_dict(), 'checkpoints/txt-model-full-128-epoch-{}.pth'.format(epoch+1))

#### im2title and title2im

In [None]:
title_dataset = EmbeddingDataset(img_train, title_train)
title_loader = DataLoader(title_dataset, batch_size=64, shuffle=False)

img_model_title = EmbeddingNetwork(128)
img_model_title= nn.DataParallel(img_model_title, device_ids=[2,3])
img_model_title.to(device);

txt_model_title = EmbeddingNetwork(128);
txt_model_title= nn.DataParallel(txt_model_title, device_ids=[2,3])
txt_model_title.to(device);

optimizer = torch.optim.Adam(img_model.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to((f'cuda:{txt_model_title.device_ids[0]}'))
    img_model_title = txt_model_title.to((f'cuda:{txt_model_title.device_ids[0]}'))
    txt_model_title = txt_model_title.to((f'cuda:{txt_model_title.device_ids[0]}'))

epochs = 5
for epoch in range(epochs):
    train(title_loader, img_model_title, txt_model_title, criterion, optimizer, epoch)
        
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model_title.state_dict(), 'checkpoints/img-model-title-128-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_title.state_dict(), 'checkpoints/txt-model-title-128-epoch-{}.pth'.format(epoch+1))

#### im2ingredients and ingredients2im

In [None]:
img_model_ingredients = EmbeddingNetwork(128)
img_model_ingredients= nn.DataParallel(img_model_ingredients, device_ids=[2,3])
img_model_ingredients.to((f'cuda:{img_model_ingredients.device_ids[0]}'));

txt_model_ingredients = EmbeddingNetwork(128)
txt_model_ingredients= nn.DataParallel(txt_model_ingredients, device_ids=[2,3])
txt_model_ingredients.to((f'cuda:{txt_model_ingredients.device_ids[0]}'));

optimizer = torch.optim.Adam(img_model_ingredients.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

ingredients_dataset = EmbeddingDataset(img_train, ingredients_train)
ingredients_loader = DataLoader(ingredients_dataset, batch_size=64, shuffle=False)

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to((f'cuda:{img_model_ingredients.device_ids[0]}'))
    img_model_ingredients = img_model_ingredients.to((f'cuda:{img_model_ingredients.device_ids[0]}'))
    txt_model_ingredients = txt_model_ingredients.to((f'cuda:{img_model_ingredients.device_ids[0]}'))


epochs = 5
for epoch in range(epochs):
    train(ingredients_loader, img_model_ingredients, txt_model_ingredients, criterion, optimizer, epoch)
  # Save checkpoint and replace old best model if current model is betterabs

torch.save(img_model_ingredients.state_dict(), 'checkpoints/img-model-ingredients-128-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_ingredients.state_dict(), 'checkpoints/txt-model-ingredients-128-epoch-{}.pth'.format(epoch+1))

#### im2instructions and instructions2im

In [None]:
img_model_instructions = EmbeddingNetwork(128)
img_model_instructions= nn.DataParallel(img_model_instructions, device_ids=[2,3])
img_model_instructions.to((f'cuda:{img_model_instructions.device_ids[0]}'));

txt_model_instructions = EmbeddingNetwork(128)
txt_model_instructions= nn.DataParallel(txt_model_instructions, device_ids=[2,3])
txt_model_instructions.to((f'cuda:{txt_model_instructions.device_ids[0]}'));

optimizer = torch.optim.Adam(img_model_instructions.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

instructions_dataset = EmbeddingDataset(img_train, instructions_train)
instructions_loader = DataLoader(instructions_dataset, batch_size=64, shuffle=False)

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to((f'cuda:{txt_model_instructions.device_ids[0]}'))
    img_model_instructions = img_model_instructions.to((f'cuda:{txt_model_instructions.device_ids[0]}'))
    txt_model_instructions = txt_model_instructions.to((f'cuda:{txt_model_instructions.device_ids[0]}'))


epochs = 5
for epoch in range(epochs):
    train(ingredients_loader, img_model_instructions, txt_model_instructions, criterion, optimizer, epoch)
  # Save checkpoint and replace old best model if current model is betterabs

torch.save(img_model_instructions.state_dict(), 'checkpoints/img-model-instructions-128-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_instructions.state_dict(), 'checkpoints/txt-model-instructions-128-epoch-{}.pth'.format(epoch+1))

### Training: dims = 64; all components

#### im2recipe and recipe2im

In [None]:
img_model = EmbeddingNetwork(64)
img_model= nn.DataParallel(img_model, device_ids=[2,3])
img_model.to(device);
txt_model = EmbeddingNetwork(64);
txt_model= nn.DataParallel(txt_model, device_ids=[2,3])
txt_model.to(device);
optimizer = torch.optim.Adam(img_model.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to(device)
    img_model = img_model.to(device)
    txt_model = txt_model.to(device)

train_dataset = EmbeddingDataset(img_train, text_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False)

best_losses = 1e10
epochs = 10
for epoch in range(epochs):
    train(train_loader, img_model, txt_model, criterion, optimizer, epoch)
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model.state_dict(), 'checkpoints/img-model-full-64-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model.state_dict(), 'checkpoints/txt-model-full-64-epoch-{}.pth'.format(epoch+1))

#### im2title and title2im

In [None]:
title_dataset = EmbeddingDataset(img_train, title_train)
title_loader = DataLoader(title_dataset, batch_size=64, shuffle=False)

img_model_title = EmbeddingNetwork(64)
img_model_title= nn.DataParallel(img_model_title, device_ids=[2,3])
img_model_title.to(device);

txt_model_title = EmbeddingNetwork(64);
txt_model_title= nn.DataParallel(txt_model_title, device_ids=[2,3])
txt_model_title.to(device);

optimizer = torch.optim.Adam(img_model.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to((f'cuda:{txt_model_title.device_ids[0]}'))
    img_model_title = txt_model_title.to((f'cuda:{txt_model_title.device_ids[0]}'))
    txt_model_title = txt_model_title.to((f'cuda:{txt_model_title.device_ids[0]}'))

epochs = 5
for epoch in range(epochs):
    train(title_loader, img_model_title, txt_model_title, criterion, optimizer, epoch)
        
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model_title.state_dict(), 'checkpoints/img-model-title-64-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_title.state_dict(), 'checkpoints/txt-model-title-64-epoch-{}.pth'.format(epoch+1))

#### im2ingredients and ingredients2im

In [None]:
img_model_ingredients = EmbeddingNetwork(64)
img_model_ingredients= nn.DataParallel(img_model_ingredients, device_ids=[2,3])
img_model_ingredients.to((f'cuda:{img_model_ingredients.device_ids[0]}'));

txt_model_ingredients = EmbeddingNetwork(64)
txt_model_ingredients= nn.DataParallel(txt_model_ingredients, device_ids=[2,3])
txt_model_ingredients.to((f'cuda:{txt_model_ingredients.device_ids[0]}'));

optimizer = torch.optim.Adam(img_model_ingredients.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

ingredients_dataset = EmbeddingDataset(img_train, ingredients_train)
ingredients_loader = DataLoader(ingredients_dataset, batch_size=64, shuffle=False)

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to((f'cuda:{img_model_ingredients.device_ids[0]}'))
    img_model_ingredients = img_model_ingredients.to((f'cuda:{img_model_ingredients.device_ids[0]}'))
    txt_model_ingredients = txt_model_ingredients.to((f'cuda:{img_model_ingredients.device_ids[0]}'))


epochs = 5
for epoch in range(epochs):
    train(ingredients_loader, img_model_ingredients, txt_model_ingredients, criterion, optimizer, epoch)
  # Save checkpoint and replace old best model if current model is betterabs

torch.save(img_model_ingredients.state_dict(), 'checkpoints/img-model-ingredients-64-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_ingredients.state_dict(), 'checkpoints/txt-model-ingredients-64-epoch-{}.pth'.format(epoch+1))

#### im2instructions and instructions2im

In [None]:
img_model_instructions = EmbeddingNetwork(64)
img_model_instructions= nn.DataParallel(img_model_instructions, device_ids=[2,3])
img_model_instructions.to((f'cuda:{img_model_instructions.device_ids[0]}'));

txt_model_instructions = EmbeddingNetwork(64)
txt_model_instructions= nn.DataParallel(txt_model_instructions, device_ids=[2,3])
txt_model_instructions.to((f'cuda:{txt_model_instructions.device_ids[0]}'));

optimizer = torch.optim.Adam(img_model_instructions.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

instructions_dataset = EmbeddingDataset(img_train, instructions_train)
instructions_loader = DataLoader(instructions_dataset, batch_size=64, shuffle=False)

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to((f'cuda:{txt_model_instructions.device_ids[0]}'))
    img_model_instructions = img_model_instructions.to((f'cuda:{txt_model_instructions.device_ids[0]}'))
    txt_model_instructions = txt_model_instructions.to((f'cuda:{txt_model_instructions.device_ids[0]}'))


epochs = 5
for epoch in range(epochs):
    train(ingredients_loader, img_model_instructions, txt_model_instructions, criterion, optimizer, epoch)
  # Save checkpoint and replace old best model if current model is betterabs

torch.save(img_model_instructions.state_dict(), 'checkpoints/img-model-instructions-64-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_instructions.state_dict(), 'checkpoints/txt-model-instructions-64-epoch-{}.pth'.format(epoch+1))

### Dimensional Analysis on val data. Dims: [64, 128, 256, 512]; all components

#### 512

In [9]:
# im2recipe 512
img_model_full_512 = EmbeddingNetwork(512)
img_model_full_512 = nn.DataParallel(img_model_full_512, device_ids=[1])
img_model_full_512.load_state_dict(torch.load("checkpoints/img-model-full-512-epoch-10.pth"))
img_model_full_512.to((f'cuda:{img_model_full_512.device_ids[0]}'));
img_model_full_512.eval();
txt_model_full_512 = EmbeddingNetwork(512)
txt_model_full_512 = nn.DataParallel(txt_model_full_512, device_ids=[1])
txt_model_full_512.load_state_dict(torch.load("checkpoints/txt-model-full-512-epoch-10.pth"))
txt_model_full_512.to((f'cuda:{txt_model_full_512.device_ids[0]}'));
txt_model_full_512.eval();

#im2title 512
img_model_title_512 = EmbeddingNetwork(512)
img_model_title_512 = nn.DataParallel(img_model_title_512, device_ids=[1])
img_model_title_512.load_state_dict(torch.load("checkpoints/img-model-title-512-epoch-5.pth"))
img_model_title_512.to((f'cuda:{img_model_title_512.device_ids[0]}'));
img_model_title_512.eval();
txt_model_title_512 = EmbeddingNetwork(512)
txt_model_title_512 = nn.DataParallel(txt_model_title_512, device_ids=[1])
txt_model_title_512.load_state_dict(torch.load("checkpoints/txt-model-title-512-epoch-5.pth"))
txt_model_title_512.to((f'cuda:{txt_model_title_512.device_ids[0]}'));
txt_model_title_512.eval();

#im2instructions 512
img_model_instructions_512 = EmbeddingNetwork(512)
img_model_instructions_512 = nn.DataParallel(img_model_instructions_512, device_ids=[1])
img_model_instructions_512.load_state_dict(torch.load("checkpoints/img-model-instructions-512-epoch-5.pth"))
img_model_instructions_512.to((f'cuda:{img_model_instructions_512.device_ids[0]}'));
img_model_instructions_512.eval();
txt_model_instructions_512 = EmbeddingNetwork(512)
txt_model_instructions_512 = nn.DataParallel(txt_model_instructions_512, device_ids=[1])
txt_model_instructions_512.load_state_dict(torch.load("checkpoints/txt-model-instructions-512-epoch-5.pth"))
txt_model_instructions_512.to((f'cuda:{txt_model_instructions_512.device_ids[0]}'));
txt_model_instructions_512.eval();

#im2ingredients 512
img_model_ingredients_512 = EmbeddingNetwork(512)
img_model_ingredients_512 = nn.DataParallel(img_model_ingredients_512, device_ids=[1])
img_model_ingredients_512.load_state_dict(torch.load("checkpoints/img-model-ingredients-512-epoch-5.pth"))
img_model_ingredients_512.to((f'cuda:{img_model_full_512.device_ids[0]}'));
img_model_ingredients_512.eval();
txt_model_ingredients_512 = EmbeddingNetwork(512)
txt_model_ingredients_512 = nn.DataParallel(txt_model_ingredients_512, device_ids=[1])
txt_model_ingredients_512.load_state_dict(torch.load("checkpoints/txt-model-ingredients-512-epoch-5.pth"))
txt_model_ingredients_512.to((f'cuda:{txt_model_ingredients_512.device_ids[0]}'));
txt_model_ingredients_512.eval();

#### Full recipe

In [11]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 512))
text_val_nonlinear = np.zeros(shape = (len(img_val), 512))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_full_512(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_full_512(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2recipe and recipe2im
print("Running im2recipe for dims = 512 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2recipe for dims = 512 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2recipe for dims = 512 and sample = 1000
Mean median 4.9
Recall {1: 0.24130000000000001, 5: 0.5218, 10: 0.6375}
Running im2recipe for dims = 512 and sample = 10000
Mean median 41.5
Recall {1: 0.060719999999999996, 5: 0.19102, 10: 0.27913}


#### Title

In [12]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 512))
text_val_nonlinear = np.zeros(shape = (len(img_val), 512))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_title_512(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_title_512(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2title and title2im
print("Running im2title for dims = 512 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2title for dims = 512 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2title for dims = 512 and sample = 1000
Mean median 1.45
Recall {1: 0.49539999999999995, 5: 0.7586, 10: 0.8249000000000001}
Running im2title for dims = 512 and sample = 10000
Mean median 7.0
Recall {1: 0.2185, 5: 0.4593799999999999, 10: 0.56763}


#### Ingredients

In [13]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 512))
text_val_nonlinear = np.zeros(shape = (len(img_val), 512))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_ingredients_512(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_ingredients_512(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2instructions and instructions2im
print("Running im2ingredients for dims = 512 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2ingredients for dims = 512 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2ingredients for dims = 512 and sample = 1000
Mean median 11.9
Recall {1: 0.1045, 5: 0.3227999999999999, 10: 0.4681}
Running im2ingredients for dims = 512 and sample = 10000
Mean median 112.0
Recall {1: 0.01713, 5: 0.06742000000000001, 10: 0.11599}


#### Instructions

In [14]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 512))
text_val_nonlinear = np.zeros(shape = (len(img_val), 512))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_instructions_512(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_instructions_512(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2ingredients and ingredients2im
print("Running im2instructions for dims = 512 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2instructions for dims = 512 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2instructions for dims = 512 and sample = 1000
Mean median 9.15
Recall {1: 0.13710000000000003, 5: 0.38470000000000004, 10: 0.5317000000000001}
Running im2instructions for dims = 512 and sample = 10000
Mean median 84.15
Recall {1: 0.0245, 5: 0.09233, 10: 0.15209}


#### 256

In [None]:
# im2recipe 256
img_model_full_256 = EmbeddingNetwork(256)
img_model_full_256 = nn.DataParallel(img_model_full_256, device_ids=[1])
img_model_full_256.load_state_dict(torch.load("checkpoints/img-model-full-256-epoch-10.pth"))
img_model_full_256.to((f'cuda:{img_model_full_256.device_ids[0]}'));
img_model_full_256.eval();
txt_model_full_256 = EmbeddingNetwork(256)
txt_model_full_256 = nn.DataParallel(txt_model_full_256, device_ids=[1])
txt_model_full_256.load_state_dict(torch.load("checkpoints/txt-model-full-256-epoch-10.pth"))
txt_model_full_256.to((f'cuda:{txt_model_full_256.device_ids[0]}'));
txt_model_full_256.eval();

#im2title 256
img_model_title_256 = EmbeddingNetwork(256)
img_model_title_256 = nn.DataParallel(img_model_title_256, device_ids=[1])
img_model_title_256.load_state_dict(torch.load("checkpoints/img-model-title-256-epoch-5.pth"))
img_model_title_256.to((f'cuda:{img_model_title_256.device_ids[0]}'));
img_model_title_256.eval();
txt_model_title_256 = EmbeddingNetwork(256)
txt_model_title_256 = nn.DataParallel(txt_model_title_256, device_ids=[1])
txt_model_title_256.load_state_dict(torch.load("checkpoints/txt-model-title-256-epoch-5.pth"))
txt_model_title_256.to((f'cuda:{txt_model_title_256.device_ids[0]}'));
txt_model_title_256.eval();

#im2instructions 256
img_model_instructions_256 = EmbeddingNetwork(256)
img_model_instructions_256 = nn.DataParallel(img_model_instructions_256, device_ids=[1])
img_model_instructions_256.load_state_dict(torch.load("checkpoints/img-model-instructions-256-epoch-5.pth"))
img_model_instructions_256.to((f'cuda:{img_model_instructions_256.device_ids[0]}'));
img_model_instructions_256.eval();
txt_model_instructions_256 = EmbeddingNetwork(256)
txt_model_instructions_256 = nn.DataParallel(txt_model_instructions_256, device_ids=[1])
txt_model_instructions_256.load_state_dict(torch.load("checkpoints/txt-model-instructions-256-epoch-5.pth"))
txt_model_instructions_256.to((f'cuda:{txt_model_instructions_256.device_ids[0]}'));
txt_model_instructions_256.eval();

#im2ingredients 256
img_model_ingredients_256 = EmbeddingNetwork(256)
img_model_ingredients_256 = nn.DataParallel(img_model_ingredients_256, device_ids=[1])
img_model_ingredients_256.load_state_dict(torch.load("checkpoints/img-model-ingredients-256-epoch-5.pth"))
img_model_ingredients_256.to((f'cuda:{img_model_full_256.device_ids[0]}'));
img_model_ingredients_256.eval();
txt_model_ingredients_256 = EmbeddingNetwork(256)
txt_model_ingredients_256 = nn.DataParallel(txt_model_ingredients_256, device_ids=[1])
txt_model_ingredients_256.load_state_dict(torch.load("checkpoints/txt-model-ingredients-256-epoch-5.pth"))
txt_model_ingredients_256.to((f'cuda:{txt_model_ingredients_256.device_ids[0]}'));
txt_model_ingredients_256.eval();

#### Full recipe

In [None]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 256))
text_val_nonlinear = np.zeros(shape = (len(img_val), 256))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_full_256(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_full_256(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2recipe and recipe2im
print("Running im2recipe for dims = 256 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2recipe for dims = 256 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2recipe for dims = 256 and sample = 1000
Mean median 6.95
Recall {1: 0.2076, 5: 0.4608, 10: 0.571}
Running im2recipe for dims = 256 and sample = 10000
Mean median 59.2
Recall {1: 0.05, 5: 0.16153, 10: 0.24162}


#### Title

In [None]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 256))
text_val_nonlinear = np.zeros(shape = (len(img_val), 256))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_title_256(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_title_256(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2title and title2im
print("Running im2title for dims = 256 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2title for dims = 256 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2title for dims = 256 and sample = 1000
Mean median 2.0
Recall {1: 0.45620000000000005, 5: 0.7178, 10: 0.7921000000000001}
Running im2title for dims = 256 and sample = 10000
Mean median 9.1
Recall {1: 0.19510000000000002, 5: 0.41746999999999995, 10: 0.52024}


In [None]:
#### Ingredients

In [None]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 256))
text_val_nonlinear = np.zeros(shape = (len(img_val), 256))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_ingredients_256(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_ingredients_256(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2ingredients and ingredients2im
print("Running im2ingredients for dims = 256 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2ingredients for dims = 256 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2ingredients for dims = 256 and sample = 1000
Mean median 13.3
Recall {1: 0.09599999999999999, 5: 0.2969, 10: 0.4372}
Running im2ingredients for dims = 256 and sample = 10000
Mean median 127.05
Recall {1: 0.015189999999999999, 5: 0.06169, 10: 0.10497000000000001}


#### Instructions

In [None]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 256))
text_val_nonlinear = np.zeros(shape = (len(img_val), 256))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_instructions_256(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_instructions_256(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2instructions and instructions2im
print("Running im2instructions for dims = 256 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2instructions for dims = 256 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2instructions for dims = 256 and sample = 1000
Mean median 13.7
Recall {1: 0.1041, 5: 0.3077, 10: 0.44000000000000006}
Running im2instructions for dims = 256 and sample = 10000
Mean median 129.0
Recall {1: 0.01961, 5: 0.07319, 10: 0.12057999999999999}


#### 128

In [18]:
# im2recipe 128
img_model_full_128 = EmbeddingNetwork(128)
img_model_full_128 = nn.DataParallel(img_model_full_128, device_ids=[1])
img_model_full_128.load_state_dict(torch.load("checkpoints/img-model-full-128-epoch-10.pth", map_location = "cuda:1"))
img_model_full_128.to((f'cuda:{img_model_full_128.device_ids[0]}'));
img_model_full_128.eval();
txt_model_full_128 = EmbeddingNetwork(128)
txt_model_full_128 = nn.DataParallel(txt_model_full_128, device_ids=[1])
txt_model_full_128.load_state_dict(torch.load("checkpoints/txt-model-full-128-epoch-10.pth", map_location = "cuda:1"))
txt_model_full_128.to((f'cuda:{txt_model_full_128.device_ids[0]}'));
txt_model_full_128.eval();

#im2title 128
img_model_title_128 = EmbeddingNetwork(128)
img_model_title_128 = nn.DataParallel(img_model_title_128, device_ids=[1])
img_model_title_128.load_state_dict(torch.load("checkpoints/img-model-title-128-epoch-5.pth", map_location = "cuda:1"))
img_model_title_128.to((f'cuda:{img_model_title_128.device_ids[0]}'));
img_model_title_128.eval();
txt_model_title_128 = EmbeddingNetwork(128)
txt_model_title_128 = nn.DataParallel(txt_model_title_128, device_ids=[1])
txt_model_title_128.load_state_dict(torch.load("checkpoints/txt-model-title-128-epoch-5.pth", map_location = "cuda:1"))
txt_model_title_128.to((f'cuda:{txt_model_title_128.device_ids[0]}'));
txt_model_title_128.eval();

#im2instructions 128
img_model_instructions_128 = EmbeddingNetwork(128)
img_model_instructions_128 = nn.DataParallel(img_model_instructions_128, device_ids=[1])
img_model_instructions_128.load_state_dict(torch.load("checkpoints/img-model-instructions-128-epoch-5.pth", map_location = "cuda:1"))
img_model_instructions_128.to((f'cuda:{img_model_instructions_128.device_ids[0]}'));
img_model_instructions_128.eval();
txt_model_instructions_128 = EmbeddingNetwork(128)
txt_model_instructions_128 = nn.DataParallel(txt_model_instructions_128, device_ids=[1])
txt_model_instructions_128.load_state_dict(torch.load("checkpoints/txt-model-instructions-128-epoch-5.pth", map_location = "cuda:1"))
txt_model_instructions_128.to((f'cuda:{txt_model_instructions_128.device_ids[0]}'));
txt_model_instructions_128.eval();

#im2ingredients 128
img_model_ingredients_128 = EmbeddingNetwork(128)
img_model_ingredients_128 = nn.DataParallel(img_model_ingredients_128, device_ids=[1])
img_model_ingredients_128.load_state_dict(torch.load("checkpoints/img-model-ingredients-128-epoch-5.pth", map_location = "cuda:1"))
img_model_ingredients_128.to((f'cuda:{img_model_full_128.device_ids[0]}'));
img_model_ingredients_128.eval();
txt_model_ingredients_128 = EmbeddingNetwork(128)
txt_model_ingredients_128 = nn.DataParallel(txt_model_ingredients_128, device_ids=[1])
txt_model_ingredients_128.load_state_dict(torch.load("checkpoints/txt-model-ingredients-128-epoch-5.pth", map_location = "cuda:1"))
txt_model_ingredients_128.to((f'cuda:{txt_model_ingredients_128.device_ids[0]}'));
txt_model_ingredients_128.eval();

#### Full recipe

In [19]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 128))
text_val_nonlinear = np.zeros(shape = (len(img_val), 128))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_full_128(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_full_128(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2recipe and recipe2im
print("Running im2recipe for dims = 128 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2recipe for dims = 128 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2recipe for dims = 128 and sample = 1000
Mean median 13.25
Recall {1: 0.14609999999999998, 5: 0.3574, 10: 0.4633}
Running im2recipe for dims = 128 and sample = 10000
Mean median 122.0
Recall {1: 0.031409999999999993, 5: 0.10732, 10: 0.16516000000000003}


#### Title

In [20]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 128))
text_val_nonlinear = np.zeros(shape = (len(img_val), 128))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_title_128(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_title_128(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2title and title2im
print("Running im2title for dims = 128 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2title for dims = 128 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2title for dims = 128 and sample = 1000
Mean median 2.0
Recall {1: 0.40310000000000007, 5: 0.6633, 10: 0.7434999999999999}
Running im2title for dims = 128 and sample = 10000
Mean median 13.65
Recall {1: 0.16215000000000002, 5: 0.36424000000000006, 10: 0.4618300000000001}


#### Ingredients

In [21]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 128))
text_val_nonlinear = np.zeros(shape = (len(img_val), 128))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_ingredients_128(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_ingredients_128(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2ingredients and ingredients2im
print("Running im2ingredients for dims = 128 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2ingredients for dims = 128 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2ingredients for dims = 128 and sample = 1000
Mean median 36.0
Recall {1: 0.06910000000000001, 5: 0.2001, 10: 0.2903}
Running im2ingredients for dims = 128 and sample = 10000
Mean median 334.85
Recall {1: 0.014300000000000002, 5: 0.05009, 10: 0.07991}


#### Instructions

In [22]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 128))
text_val_nonlinear = np.zeros(shape = (len(img_val), 128))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_instructions_128(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_instructions_128(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2instructions and instructions2im
print("Running im2instructions for dims = 128 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2instructions for dims = 128 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2instructions for dims = 128 and sample = 1000
Mean median 46.65
Recall {1: 0.06220000000000001, 5: 0.18119999999999997, 10: 0.2584000000000001}
Running im2instructions for dims = 128 and sample = 10000
Mean median 448.1
Recall {1: 0.01051, 5: 0.04272, 10: 0.06999999999999999}


#### 64

In [26]:
# im2recipe 64
img_model_full_64 = EmbeddingNetwork(64)
img_model_full_64 = nn.DataParallel(img_model_full_64, device_ids=[1])
img_model_full_64.load_state_dict(torch.load("checkpoints/img-model-full-64-epoch-10.pth"))
img_model_full_64.to((f'cuda:{img_model_full_64.device_ids[0]}'));
img_model_full_64.eval();
txt_model_full_64 = EmbeddingNetwork(64)
txt_model_full_64 = nn.DataParallel(txt_model_full_64, device_ids=[1])
txt_model_full_64.load_state_dict(torch.load("checkpoints/txt-model-full-64-epoch-10.pth"))
txt_model_full_64.to((f'cuda:{txt_model_full_64.device_ids[0]}'));
txt_model_full_64.eval();

#im2title 64
img_model_title_64 = EmbeddingNetwork(64)
img_model_title_64 = nn.DataParallel(img_model_title_64, device_ids=[1])
img_model_title_64.load_state_dict(torch.load("checkpoints/img-model-title-64-epoch-5.pth"))
img_model_title_64.to((f'cuda:{img_model_title_64.device_ids[0]}'));
img_model_title_64.eval();
txt_model_title_64 = EmbeddingNetwork(64)
txt_model_title_64 = nn.DataParallel(txt_model_title_64, device_ids=[1])
txt_model_title_64.load_state_dict(torch.load("checkpoints/txt-model-title-64-epoch-5.pth"))
txt_model_title_64.to((f'cuda:{txt_model_title_64.device_ids[0]}'));
txt_model_title_64.eval();

#im2instructions 64
img_model_instructions_64 = EmbeddingNetwork(64)
img_model_instructions_64 = nn.DataParallel(img_model_instructions_64, device_ids=[1])
img_model_instructions_64.load_state_dict(torch.load("checkpoints/img-model-instructions-64-epoch-5.pth"))
img_model_instructions_64.to((f'cuda:{img_model_instructions_64.device_ids[0]}'));
img_model_instructions_64.eval();
txt_model_instructions_64 = EmbeddingNetwork(64)
txt_model_instructions_64 = nn.DataParallel(txt_model_instructions_64, device_ids=[1])
txt_model_instructions_64.load_state_dict(torch.load("checkpoints/txt-model-instructions-64-epoch-5.pth"))
txt_model_instructions_64.to((f'cuda:{txt_model_instructions_64.device_ids[0]}'));
txt_model_instructions_64.eval();

#im2ingredients 64
img_model_ingredients_64 = EmbeddingNetwork(64)
img_model_ingredients_64 = nn.DataParallel(img_model_ingredients_64, device_ids=[1])
img_model_ingredients_64.load_state_dict(torch.load("checkpoints/img-model-ingredients-64-epoch-5.pth"))
img_model_ingredients_64.to((f'cuda:{img_model_full_64.device_ids[0]}'));
img_model_ingredients_64.eval();
txt_model_ingredients_64 = EmbeddingNetwork(64)
txt_model_ingredients_64 = nn.DataParallel(txt_model_ingredients_64, device_ids=[1])
txt_model_ingredients_64.load_state_dict(torch.load("checkpoints/txt-model-ingredients-64-epoch-5.pth"))
txt_model_ingredients_64.to((f'cuda:{txt_model_ingredients_64.device_ids[0]}'));
txt_model_ingredients_64.eval();

#### Full recipe

In [27]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 64))
text_val_nonlinear = np.zeros(shape = (len(img_val), 64))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_full_64(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_full_64(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2recipe and recipe2im
print("Running im2recipe for dims = 64 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2recipe for dims = 64 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2recipe for dims = 512 and sample = 1000
Mean median 51.3
Recall {1: 0.056499999999999995, 5: 0.1654, 10: 0.2392}
Running im2recipe for dims = 512 and sample = 10000
Mean median 510.75
Recall {1: 0.009409999999999998, 5: 0.03598, 10: 0.05913}


#### Title

In [28]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 64))
text_val_nonlinear = np.zeros(shape = (len(img_val), 64))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_title_64(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_title_64(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2title and title2im
print("Running im2title for dims = 64 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2title for dims = 64 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2title for dims = 512 and sample = 1000
Mean median 4.9
Recall {1: 0.27979999999999994, 5: 0.5205, 10: 0.6196999999999999}
Running im2title for dims = 512 and sample = 10000
Mean median 38.7
Recall {1: 0.09568999999999998, 5: 0.24392, 10: 0.32765999999999995}


#### Ingredients

In [29]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 64))
text_val_nonlinear = np.zeros(shape = (len(img_val), 64))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_ingredients_64(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_ingredients_64(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2ingredients and ingredients2im
print("Running im2ingredients for dims = 64 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2ingredients for dims = 64 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2ingredients for dims = 512 and sample = 1000
Mean median 84.6
Recall {1: 0.0372, 5: 0.11200000000000002, 10: 0.1706}
Running im2ingredients for dims = 512 and sample = 10000
Mean median 828.8
Recall {1: 0.00601, 5: 0.022809999999999997, 10: 0.038959999999999995}


#### Instructions

In [30]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 64))
text_val_nonlinear = np.zeros(shape = (len(img_val), 64))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_instructions_64(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_instructions_64(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2instructions and instructions2im
print("Running im2instructions for dims = 64 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2instructions for dims = 64 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2instructions for dims = 512 and sample = 1000
Mean median 84.05
Recall {1: 0.028300000000000002, 5: 0.09469999999999999, 10: 0.1502}
Running im2instructions for dims = 512 and sample = 10000
Mean median 835.5
Recall {1: 0.005619999999999999, 5: 0.02037, 10: 0.033839999999999995}


### Evaluation and Ablation Studies

 We can see that dimensions = 512 has a better performance

In [None]:
# im2recipe 512
img_model_full_512 = EmbeddingNetwork(512)
img_model_full_512 = nn.DataParallel(img_model_full_512, device_ids=[1])
img_model_full_512.load_state_dict(torch.load("checkpoints/img-model-full-512-epoch-10.pth"))
img_model_full_512.to((f'cuda:{img_model_full_512.device_ids[0]}'));
img_model_full_512.eval();
txt_model_full_512 = EmbeddingNetwork(512)
txt_model_full_512 = nn.DataParallel(txt_model_full_512, device_ids=[1])
txt_model_full_512.load_state_dict(torch.load("checkpoints/txt-model-full-512-epoch-10.pth"))
txt_model_full_512.to((f'cuda:{txt_model_full_512.device_ids[0]}'));
txt_model_full_512.eval();

#im2title 512
img_model_title_512 = EmbeddingNetwork(512)
img_model_title_512 = nn.DataParallel(img_model_title_512, device_ids=[1])
img_model_title_512.load_state_dict(torch.load("checkpoints/img-model-title-512-epoch-5.pth"))
img_model_title_512.to((f'cuda:{img_model_title_512.device_ids[0]}'));
img_model_title_512.eval();
txt_model_title_512 = EmbeddingNetwork(512)
txt_model_title_512 = nn.DataParallel(txt_model_title_512, device_ids=[1])
txt_model_title_512.load_state_dict(torch.load("checkpoints/txt-model-title-512-epoch-5.pth"))
txt_model_title_512.to((f'cuda:{txt_model_title_512.device_ids[0]}'));
txt_model_title_512.eval();

#im2instructions 512
img_model_instructions_512 = EmbeddingNetwork(512)
img_model_instructions_512 = nn.DataParallel(img_model_instructions_512, device_ids=[1])
img_model_instructions_512.load_state_dict(torch.load("checkpoints/img-model-instructions-512-epoch-5.pth"))
img_model_instructions_512.to((f'cuda:{img_model_instructions_512.device_ids[0]}'));
img_model_instructions_512.eval();
txt_model_instructions_512 = EmbeddingNetwork(512)
txt_model_instructions_512 = nn.DataParallel(txt_model_instructions_512, device_ids=[1])
txt_model_instructions_512.load_state_dict(torch.load("checkpoints/txt-model-instructions-512-epoch-5.pth"))
txt_model_instructions_512.to((f'cuda:{txt_model_instructions_512.device_ids[0]}'));
txt_model_instructions_512.eval();

#im2ingredients 512
img_model_ingredients_512 = EmbeddingNetwork(512)
img_model_ingredients_512 = nn.DataParallel(img_model_ingredients_512, device_ids=[1])
img_model_ingredients_512.load_state_dict(torch.load("checkpoints/img-model-ingredients-512-epoch-5.pth"))
img_model_ingredients_512.to((f'cuda:{img_model_full_512.device_ids[0]}'));
img_model_ingredients_512.eval();
txt_model_ingredients_512 = EmbeddingNetwork(512)
txt_model_ingredients_512 = nn.DataParallel(txt_model_ingredients_512, device_ids=[1])
txt_model_ingredients_512.load_state_dict(torch.load("checkpoints/txt-model-ingredients-512-epoch-5.pth"))
txt_model_ingredients_512.to((f'cuda:{txt_model_ingredients_512.device_ids[0]}'));
txt_model_ingredients_512.eval();

#### Full recipe

In [None]:
img_test_nonlinear = np.zeros(shape = (len(img_test), 512))
text_test_nonlinear = np.zeros(shape = (len(img_test), 512))

for i in range(len(img_test)):
    img_test_nonlinear[i] = img_model_full_512(torch.Tensor(np.expand_dims(img_test[i], 0))).cpu().detach().numpy()
    text_test_nonlinear[i] = txt_model_full_512(torch.Tensor(np.expand_dims(text_test[i], 0))).cpu().detach().numpy()

# im2recipe and recipe2im
print("Running im2recipe for dims = 512 and sample = 1000")
ranker(img_test_nonlinear, text_test_nonlinear, 1000, "image")
print("Running im2recipe for dims = 512 and sample = 10000")
ranker(img_test_nonlinear, text_test_nonlinear, 10000, "image")

Running im2recipe for dims = 512 and sample = 1000
Mean median 5.2
Recall {1: 0.23480000000000004, 5: 0.5183000000000001, 10: 0.6353}
Running im2recipe for dims = 512 and sample = 10000
Mean median 43.1
Recall {1: 0.06054999999999999, 5: 0.18576, 10: 0.27418}


#### Title

In [None]:
img_test_nonlinear = np.zeros(shape = (len(img_test), 512))
text_test_nonlinear = np.zeros(shape = (len(img_test), 512))

for i in range(len(img_test)):
    img_test_nonlinear[i] = img_model_title_512(torch.Tensor(np.expand_dims(img_test[i], 0))).cpu().detach().numpy()
    text_test_nonlinear[i] = txt_model_title_512(torch.Tensor(np.expand_dims(title_test[i], 0))).cpu().detach().numpy()

# im2title and title2im
print("Running im2title for dims = 512 and sample = 1000")
ranker(img_test_nonlinear, text_test_nonlinear, 1000, "image")
print("Running im2title for dims = 512 and sample = 10000")
ranker(img_test_nonlinear, text_test_nonlinear, 10000, "image")

Running im2title for dims = 512 and sample = 1000
Mean median 24.6
Recall {1: 0.12349999999999998, 5: 0.29239999999999994, 10: 0.3778}
Running im2title for dims = 512 and sample = 10000
Mean median 247.35
Recall {1: 0.03185, 5: 0.09637999999999998, 10: 0.14421}


#### Ingredients

In [None]:
img_test_nonlinear = np.zeros(shape = (len(img_test), 512))
text_test_nonlinear = np.zeros(shape = (len(img_test), 512))

for i in range(len(img_test)):
    img_test_nonlinear[i] = img_model_ingredients_512(torch.Tensor(np.expand_dims(img_test[i], 0))).cpu().detach().numpy()
    text_test_nonlinear[i] = txt_model_ingredients_512(torch.Tensor(np.expand_dims(ingredients_test[i], 0))).cpu().detach().numpy()

# im2ingredients and ingredients2im
print("Running im2ingredients for dims = 512 and sample = 1000")
ranker(img_test_nonlinear, text_test_nonlinear, 1000, "image")
print("Running im2ingredients for dims = 512 and sample = 10000")
ranker(img_test_nonlinear, text_test_nonlinear, 10000, "image")

Running im2ingredients for dims = 512 and sample = 1000
Mean median 15.2
Recall {1: 0.08979999999999998, 5: 0.2829, 10: 0.41550000000000004}
Running im2ingredients for dims = 512 and sample = 10000
Mean median 141.25
Recall {1: 0.014410000000000001, 5: 0.05796, 10: 0.10085999999999999}


#### Instructions

In [None]:
img_test_nonlinear = np.zeros(shape = (len(img_test), 512))
text_test_nonlinear = np.zeros(shape = (len(img_test), 512))

for i in range(len(img_test)):
    img_test_nonlinear[i] = img_model_instructions_512(torch.Tensor(np.expand_dims(img_test[i], 0))).cpu().detach().numpy()
    text_test_nonlinear[i] = txt_model_instructions_512(torch.Tensor(np.expand_dims(instructions_test[i], 0))).cpu().detach().numpy()

# im2instructions and instructions2im
print("Running im2instructions for dims = 512 and sample = 1000")
ranker(img_test_nonlinear, text_test_nonlinear, 1000, "image")
print("Running im2instructions for dims = 512 and sample = 10000")
ranker(img_test_nonlinear, text_test_nonlinear, 10000, "image")

Running im2instructions for dims = 512 and sample = 1000
Mean median 21.85
Recall {1: 0.077, 5: 0.2365, 10: 0.35269999999999996}
Running im2instructions for dims = 512 and sample = 10000
Mean median 204.25
Recall {1: 0.012629999999999999, 5: 0.04911, 10: 0.08324000000000001}
