In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
import torchvision
from torchvision import datasets, models, transforms
from torch.utils.data import DataLoader, IterableDataset

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import time
import os
import copy
import json
import random

from PIL import Image

# from transformers import BertTokenizer, BertModel
import pickle

from sklearn.decomposition import PCA
# from cca_zoo.models import CCA

# from cca_zoo.deepmodels import architectures
# from cca_zoo.deepmodels import DVCCA, DCCA
# from cca_zoo.deepmodels.architectures import BaseEncoder, Encoder, Decoder
# from cca_zoo.deepmodels.dcca import _DCCA_base

from sklearn.preprocessing import normalize

# making sure that the whole embedding tensor is printed in output
torch.set_printoptions(threshold=10_000)

# Loading necessary files

In [2]:
img_val = torch.load("img_val.pt")
text_val = torch.load("text_val.pt")

img_train = torch.load("img_train.pt")
text_train = torch.load("text_train.pt")

img_test = torch.load("img_test.pt")
text_test = torch.load("text_test.pt")

ingredients_test = torch.load("test_ingredients.pt")
instructions_test = torch.load("test_instructions.pt")
title_test = torch.load("test_title.pt")

ingredients_train = torch.load("train_ingredients.pt")
instructions_train = torch.load("train_instructions.pt")
title_train = torch.load("train_title.pt")

## Ranking function

In [3]:
def ranker(im_vecs, instr_vecs, N = 1000, flag = "image"):
    # Ranker
    idxs = range(N)

    glob_rank = []
    glob_recall = {1:0.0,5:0.0,10:0.0}
    for i in range(10):

        ids = random.sample(range(0,len(im_vecs)), N)
        
        im_sub = im_vecs[ids,:]
        instr_sub = instr_vecs[ids,:]

        if flag == "image":
            sims = np.dot(im_sub,instr_sub.T) # for im2recipe
        else:
            sims = np.dot(instr_sub,im_sub.T) # for recipe2im

        med_rank = []
        recall = {1:0.0,5:0.0,10:0.0}

        for ii in idxs:

            # name = ids_sub[ii]
            # get a column of similarities
            sim = sims[ii,:]

            # sort indices in descending order
            sorting = np.argsort(sim)[::-1].tolist()

            # find where the index of the pair sample ended up in the sorting
            pos = sorting.index(ii)

            if (pos+1) == 1:
                recall[1]+=1
            if (pos+1) <=5:
                recall[5]+=1
            if (pos+1)<=10:
                recall[10]+=1

            # store the position
            med_rank.append(pos+1)

        for i in recall.keys():
            recall[i]=recall[i]/N

        med = np.median(med_rank)
#         print ("median", med)

        for i in recall.keys():
            glob_recall[i]+=recall[i]
        glob_rank.append(med)

    for i in glob_recall.keys():
        glob_recall[i] = glob_recall[i]/10
    
    print ("Mean median", np.average(glob_rank))
    print ("Recall", glob_recall)

# STEP 1

## im2recipe

### Dimensional Analysis with Val data

In [None]:
def determine_latent_dims(dims, size, flag = "image"):

    print("Applying CCA")
    cca = CCA(latent_dims = dims, random_state = 0)
    cca.fit((img_train, text_train))
    print("CCA done")
    
    print("Transforming")
    img_train_r, text_train_r = cca.transform((img_train, text_train))
    img_val_r, text_val_r = cca.transform((img_val, text_val))

    print("Results for latent dims:", str(dims), " and test sample:", str(size), " and im2recipe")
    ranker(img_val_r, text_val_r, size, flag)

##### For 1k samples

In [None]:
# For 1k samples
for dim in [2, 10, 25, 50, 100, 200, 500, 1000]:
    determine_latent_dims(dim, 1000)

Applying CCA
CCA done
Transforming
Results for latent dims: 2  and test sample: 1000  and im2recipe
Mean median 208.7
Recall {1: 0.0029000000000000002, 5: 0.015200000000000002, 10: 0.030700000000000005}
Applying CCA
CCA done
Transforming
Results for latent dims: 10  and test sample: 1000  and im2recipe
Mean median 22.65
Recall {1: 0.0509, 5: 0.20049999999999998, 10: 0.31489999999999996}
Applying CCA
CCA done
Transforming
Results for latent dims: 25  and test sample: 1000  and im2recipe
Mean median 5.0
Recall {1: 0.2132, 5: 0.524, 10: 0.6721000000000001}
Applying CCA
CCA done
Transforming
Results for latent dims: 50  and test sample: 1000  and im2recipe
Mean median 2.5
Recall {1: 0.3512, 5: 0.6982999999999999, 10: 0.8155999999999999}
Applying CCA
CCA done
Transforming
Results for latent dims: 100  and test sample: 1000  and im2recipe
Mean median 2.0
Recall {1: 0.41369999999999996, 5: 0.7453999999999998, 10: 0.8433999999999999}
Applying CCA
CCA done
Transforming
Results for latent dims: 

##### For 10k samples

In [None]:
# For 10k samples
for dim in [2, 10, 25, 50, 100, 200, 500, 1000]:
    determine_latent_dims(dim, 10000)

Applying CCA
CCA done
Transforming
Results for latent dims: 2  and test sample: 10000  and im2recipe
Mean median 201.25
Recall {1: 0.0033, 5: 0.0157, 10: 0.031100000000000006}
Applying CCA
CCA done
Transforming
Results for latent dims: 10  and test sample: 10000  and im2recipe
Mean median 22.8
Recall {1: 0.05349999999999999, 5: 0.20070000000000002, 10: 0.3192}
Applying CCA
CCA done
Transforming
Results for latent dims: 25  and test sample: 10000  and im2recipe
Mean median 4.85
Recall {1: 0.21070000000000003, 5: 0.5294000000000001, 10: 0.6819000000000001}
Applying CCA
CCA done
Transforming
Results for latent dims: 50  and test sample: 10000  and im2recipe
Mean median 2.6
Recall {1: 0.3436, 5: 0.6906000000000001, 10: 0.8056000000000001}
Applying CCA
CCA done
Transforming
Results for latent dims: 100  and test sample: 10000  and im2recipe
Mean median 2.0
Recall {1: 0.4053, 5: 0.7384000000000001, 10: 0.8400000000000001}
Applying CCA
CCA done
Transforming
Results for latent dims: 200  and t

### Ablation Studies with fixed dim = 500

##### For sample size = 1000, latent_dims = 500

In [None]:
# im2recipe
print("Applying CCA")
cca = CCA(latent_dims = 500, random_state = 0)
cca.fit((img_train, text_train))
print("CCA done")
print("Transforming")
img_train_r, text_train_r = cca.transform((img_train, text_train))
img_test_r, text_test_r = cca.transform((img_test, text_test))
print("Results for im2recipe: latent dims = 500, all the recipe fields")
ranker(img_test_r, text_test_r,  1000, "image")

# im2instructions
print("Applying CCA")
cca = CCA(latent_dims = 500, random_state = 0)
cca.fit((img_train, instructions_train))
print("CCA done")
img_test_r, instructions_test_r = cca.transform((img_test, instructions_test))
print("Results for im2instructions: latent dims = 500, only instructions")
ranker(img_test_r, instructions_test_r, 1000, "image")

# im2ingredients
print("Applying CCA")
cca = CCA(latent_dims = 500, random_state = 0)
cca.fit((img_train, ingredients_train))
print("CCA done")
img_test_r, ingredients_test_r = cca.transform((img_test, ingredients_test))
print("Results for im2ingredients: latent dims = 500, only ingredients")
ranker(img_test_r, ingredients_test_r,  1000, "image")

# im2title
print("Applying CCA")
cca = CCA(latent_dims = 500, random_state = 0)
cca.fit((img_train, title_train))
print("CCA done")
img_test_r, title_test_r = cca.transform((img_test, title_test))
print("Results for im2title: latent dims = 500, only title")
ranker(img_test_r, title_test_r,  1000, "image")

Applying CCA
CCA done
Transforming
Results for im2recipe: latent dims = 500, all the recipe fields
Mean median 1.0
Recall {1: 0.5516, 5: 0.7968000000000001, 10: 0.85}
Applying CCA
CCA done
Results for im2instructions: latent dims = 500, only instructions
Mean median 2.8
Recall {1: 0.3527, 5: 0.6108, 10: 0.6873}
Applying CCA
CCA done
Results for im2ingredients: latent dims = 500, only ingredients
Mean median 3.0
Recall {1: 0.35550000000000004, 5: 0.6089, 10: 0.6807000000000001}
Applying CCA
CCA done
Results for im2title: latent dims = 500, only title
Mean median 9.6
Recall {1: 0.22159999999999996, 5: 0.4396, 10: 0.5117}


##### For sample size = 10000, latent_dims = 500

In [None]:
# im2recipe
print("Applying CCA")
cca = CCA(latent_dims = 500, random_state = 0)
cca.fit((img_train, text_train))
print("CCA done")
print("Transforming")
img_train_r, text_train_r = cca.transform((img_train, text_train))
img_test_r, text_test_r = cca.transform((img_test, text_test))
print("Results for im2recipe: latent dims = 500, all the recipe fields")
ranker(img_test_r, text_test_r,  10000, "image")

# im2instructions
print("Applying CCA")
cca = CCA(latent_dims = 500, random_state = 0)
cca.fit((img_train, instructions_train))
print("CCA done")
img_test_r, instructions_test_r = cca.transform((img_test, instructions_test))
print("Results for im2instructions: latent dims = 500, only instructions")
ranker(img_test_r, instructions_test_r, 10000, "image")

# im2ingredients
print("Applying CCA")
cca = CCA(latent_dims = 500, random_state = 0)
cca.fit((img_train, ingredients_train))
print("CCA done")
img_test_r, ingredients_test_r = cca.transform((img_test, ingredients_test))
print("Results for im2ingredients: latent dims = 500, only ingredients")
ranker(img_test_r, ingredients_test_r,  10000, "image")

# im2title
print("Applying CCA")
cca = CCA(latent_dims = 500, random_state = 0)
cca.fit((img_train, title_train))
print("CCA done")
img_test_r, title_test_r = cca.transform((img_test, title_test))
print("Results for im2title: latent dims = 500, only title")
ranker(img_test_r, title_test_r,  10000, "image")

Applying CCA
CCA done
Transforming
Results for im2recipe: latent dims = 500, all the recipe fields
Mean median 1.0
Recall {1: 0.5408000000000002, 5: 0.7966000000000001, 10: 0.85}
Applying CCA
CCA done
Results for im2instructions: latent dims = 500, only instructions
Mean median 3.0
Recall {1: 0.3435, 5: 0.6043000000000001, 10: 0.6788000000000001}
Applying CCA
CCA done
Results for im2ingredients: latent dims = 500, only ingredients
Mean median 3.0
Recall {1: 0.36279999999999996, 5: 0.6005, 10: 0.6766000000000001}
Applying CCA
CCA done
Results for im2title: latent dims = 500, only title
Mean median 10.9
Recall {1: 0.2109, 5: 0.4307, 10: 0.4992}


## recipe2im

### Using Validation Data to figure out optimal latent_dims for 1k and 10k samples

In [None]:
def determine_latent_dims_recipe2im(dims, size, flag = "text"):

    print("Applying CCA")
    cca = CCA(latent_dims = dims, random_state = 0)
    cca.fit((text_train, img_train))
    print("CCA done")
    
    print("Transforming")
    text_train_r, img_train_r = cca.transform((img_train, text_train))
    text_val_r, img_val_r = cca.transform((text_val, img_val))

    print("Results for latent dims:", str(dims), " and test sample:", str(size), " and im2recipe")
    ranker(img_val_r, text_val_r, size, flag)

##### For 1k samples

In [None]:
# For 1k samples
for dim in [2, 10, 25, 50, 100, 200, 500, 1000]:
    determine_latent_dims_recipe2im(dim, 1000, "text")

Applying CCA
CCA done
Transforming
Results for latent dims: 2  and test sample: 1000  and im2recipe
Mean median 206.7
Recall {1: 0.0035000000000000005, 5: 0.016900000000000005, 10: 0.032900000000000006}
Applying CCA
CCA done
Transforming
Results for latent dims: 10  and test sample: 1000  and im2recipe
Mean median 22.45
Recall {1: 0.0506, 5: 0.1991, 10: 0.3191}
Applying CCA
CCA done
Transforming
Results for latent dims: 25  and test sample: 1000  and im2recipe
Mean median 5.0
Recall {1: 0.22290000000000001, 5: 0.5223000000000001, 10: 0.6703000000000001}
Applying CCA
CCA done
Transforming
Results for latent dims: 50  and test sample: 1000  and im2recipe
Mean median 2.6
Recall {1: 0.35409999999999997, 5: 0.6824, 10: 0.8046000000000001}
Applying CCA
CCA done
Transforming
Results for latent dims: 100  and test sample: 1000  and im2recipe
Mean median 2.0
Recall {1: 0.4122, 5: 0.7384999999999998, 10: 0.8432000000000001}
Applying CCA
CCA done
Transforming
Results for latent dims: 200  and tes

##### For 10k samples

In [None]:
# For 10k samples
for dim in [2, 10, 25, 50, 100, 200, 500, 1000]:
    determine_latent_dims_recipe2im(dim, 10000, "text")

Applying CCA
CCA done
Transforming
Results for latent dims: 2  and test sample: 10000  and im2recipe
Mean median 209.65
Recall {1: 0.0035000000000000005, 5: 0.015999999999999997, 10: 0.03250000000000001}
Applying CCA
CCA done
Transforming
Results for latent dims: 10  and test sample: 10000  and im2recipe
Mean median 22.85
Recall {1: 0.055700000000000006, 5: 0.2036, 10: 0.32120000000000004}
Applying CCA
CCA done
Transforming
Results for latent dims: 25  and test sample: 10000  and im2recipe
Mean median 4.9
Recall {1: 0.22090000000000004, 5: 0.5244000000000001, 10: 0.6756000000000001}
Applying CCA
CCA done
Transforming
Results for latent dims: 50  and test sample: 10000  and im2recipe
Mean median 2.7
Recall {1: 0.3515, 5: 0.6797000000000001, 10: 0.7995}
Applying CCA
CCA done
Transforming
Results for latent dims: 100  and test sample: 10000  and im2recipe
Mean median 2.0
Recall {1: 0.41339999999999993, 5: 0.7431000000000001, 10: 0.8404999999999999}
Applying CCA
CCA done
Transforming
Resul

### Ablation Studies with fixed dims = 500

##### For sample size = 1000, latent_dims = 500

In [None]:
# recipe2im
print("Applying CCA")
cca = CCA(latent_dims = 500, random_state = 0)
cca.fit((text_train, img_train))
print("CCA done")
print("Transforming")
text_train_r, img_train_r = cca.transform((text_train, img_train))
text_test_r, img_test_r = cca.transform((text_test, img_test))
print("Results for recipe2im: latent dims = 500, all the recipe fields")
ranker(img_test_r, text_test_r,  1000, "text")

# instructions2im
print("Applying CCA")
cca = CCA(latent_dims = 500, random_state = 0)
cca.fit((instructions_train, img_train))
print("CCA done")
instructions_test_r, img_test_r = cca.transform((instructions_test, img_test))
print("Results for instructions2im: latent dims = 500, only instructions")
ranker(img_test_r, instructions_test_r, 1000, "text")

# ingredients2im
print("Applying CCA")
cca = CCA(latent_dims = 500, random_state = 0)
cca.fit((ingredients_train, img_train))
print("CCA done")
ingredients_test_r, img_test_r = cca.transform((ingredients_test, img_test))
print("Results for ingredients2im: latent dims = 500, only ingredients")
ranker(img_test_r, ingredients_test_r,  1000, "text")

# title2im
print("Applying CCA")
cca = CCA(latent_dims = 500, random_state = 0)
cca.fit((title_train, img_train))
print("CCA done")
title_test_r, img_test_r  = cca.transform((title_test, img_test))
print("Results for title2im: latent dims = 500, only title")
ranker(img_test_r, title_test_r,  1000, "text")

Applying CCA
CCA done
Transforming
Results for recipe2im: latent dims = 500, all the recipe fields
Mean median 1.0
Recall {1: 0.5437000000000001, 5: 0.7927000000000001, 10: 0.8454}
Applying CCA
CCA done
Results for instructions2im: latent dims = 500, only instructions
Mean median 2.9
Recall {1: 0.36050000000000004, 5: 0.6161, 10: 0.6944}
Applying CCA
CCA done
Results for ingredients2im: latent dims = 500, only ingredients
Mean median 2.8
Recall {1: 0.371, 5: 0.6043999999999999, 10: 0.6749}
Applying CCA
CCA done
Results for title2im: latent dims = 500, only title
Mean median 9.8
Recall {1: 0.21880000000000002, 5: 0.43279999999999996, 10: 0.5057}


##### For sample size = 10000, latent_dims = 500

In [None]:
# recipe2im
print("Applying CCA")
cca = CCA(latent_dims = 500, random_state = 0)
cca.fit((text_train, img_train))
print("CCA done")
print("Transforming")
text_train_r, img_train_r = cca.transform((text_train, img_train))
text_test_r, img_test_r = cca.transform((text_test, img_test))
print("Results for recipe2im: latent dims = 500, all the recipe fields")
ranker(img_test_r, text_test_r,  10000, "text")

# instructions2im
print("Applying CCA")
cca = CCA(latent_dims = 500, random_state = 0)
cca.fit((instructions_train, img_train))
print("CCA done")
instructions_test_r, img_test_r = cca.transform((instructions_test, img_test))
print("Results for instructions2im: latent dims = 500, only instructions")
ranker(img_test_r, instructions_test_r, 10000, "text")

# ingredients2im
print("Applying CCA")
cca = CCA(latent_dims = 500, random_state = 0)
cca.fit((ingredients_train, img_train))
print("CCA done")
ingredients_test_r, img_test_r = cca.transform((ingredients_test, img_test))
print("Results for ingredients2im: latent dims = 500, only ingredients")
ranker(img_test_r, ingredients_test_r,  10000, "text")

# title2im
print("Applying CCA")
cca = CCA(latent_dims = 500, random_state = 0)
cca.fit((title_train, img_train))
print("CCA done")
title_test_r, img_test_r  = cca.transform((title_test, img_test))
print("Results for title2im: latent dims = 500, only title")
ranker(img_test_r, title_test_r,  10000, "text")

Applying CCA
CCA done
Transforming
Results for recipe2im: latent dims = 500, all the recipe fields
Mean median 1.0
Recall {1: 0.5551, 5: 0.7871, 10: 0.842}
Applying CCA
CCA done
Results for instructions2im: latent dims = 500, only instructions
Mean median 3.0
Recall {1: 0.3602, 5: 0.6132, 10: 0.6824}
Applying CCA
CCA done
Results for ingredients2im: latent dims = 500, only ingredients
Mean median 3.0
Recall {1: 0.3642, 5: 0.6056, 10: 0.6761}
Applying CCA
CCA done
Results for title2im: latent dims = 500, only title
Mean median 10.65
Recall {1: 0.21480000000000002, 5: 0.42969999999999997, 10: 0.5042000000000001}


# STEP 2

## Non Linear Embeddings - Normal Loss

In [4]:
import matplotlib.pyplot as plt
import numpy as np

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

In [5]:
!export CUDA_VISIBLE_DEVICES='2,3'

In [6]:
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")

### Model Creation dims = 512

In [7]:
class EmbeddingDataset(Dataset):
    def __init__(self, image_emb, text_emb, transform=None):
        self.image_emb = torch.as_tensor(np.array(image_emb))
        self.text_emb = torch.as_tensor(np.array(text_emb))        
        self.transform = transform

    def __len__(self):
        return len(self.image_emb)

    def __getitem__(self, idx):
        return self.image_emb[idx], self.text_emb[idx]

In [8]:
class EmbeddingNetwork(nn.Module):
    def __init__(self, output_size, input_size=1024):
        super().__init__()

        self.layer1 = nn.Sequential(
            nn.Linear(input_size, 512),
            nn.BatchNorm1d(512),
            nn.Dropout(),
            nn.LeakyReLU()
        )
        self.layer2 = nn.Linear(512, output_size)

    def forward(self, x):
        x = self.layer1(x)
        return self.layer2(x)

#### im2recipe and recipe2im

In [None]:
img_model = EmbeddingNetwork(512)
img_model= nn.DataParallel(img_model, device_ids=[1,2,3])
img_model.to(device);

txt_model = EmbeddingNetwork(512);
txt_model= nn.DataParallel(txt_model, device_ids=[1,2,3])
txt_model.to(device);

optimizer = torch.optim.Adam(img_model.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

In [None]:
use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to(device)
    img_model = img_model.to(device)
    txt_model = txt_model.to(device)

In [None]:
train_dataset = EmbeddingDataset(img_train, text_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False)

In [11]:
class AverageMeter(object):
    # Utility function for timers
    
    def __init__(self):
        self.reset()
    
    def reset(self):
        self.val, self.avg, self.sum, self.count = 0, 0, 0, 0
    
    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


In [9]:
def train(train_loader, img_model, txt_model, criterion, optimizer, epoch):
    print('Starting training epoch {}'.format(epoch))
    img_model.train()
    txt_model.train()
    
    batch_time, data_time, losses = AverageMeter(), AverageMeter(), AverageMeter()
    end = time.time()
    running_loss = 0.
    last_loss = 0.
    optimizer.zero_grad()
    
    for i, (image_emb, text_emb) in enumerate(train_loader):
    
        # Use GPU if available
        if use_gpu: 
            image_emb, text_emb = image_emb.to(f'cuda:{img_model.device_ids[0]}'), text_emb.to(f'cuda:{txt_model.device_ids[0]}')

        data_time.update(time.time() - end)

        # Run forward pass
        out_image_emb = img_model(image_emb) 
        out_text_emb = txt_model(text_emb)
        loss = criterion(out_image_emb, out_text_emb) 

        # Compute gradient and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_time.update(time.time() - end)
        end = time.time()

        # Print model accuracy -- in the code below
        running_loss += loss.item()
        if i % 2000 == 0:
            last_loss = running_loss / 2000 # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            running_loss = 0.
        
        if i % 2000 == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                'Time {batch_time.val} ({batch_time.avg})\t'
                'Data {data_time.val} ({data_time.avg})\t'.format(
                  epoch, i, len(train_loader), batch_time=batch_time,
                 data_time=data_time)) 

    print('Finished training epoch {}'.format(epoch))


In [None]:
best_losses = 1e10
epochs = 10

for epoch in range(epochs):

    train(train_loader, img_model, txt_model, criterion, optimizer, epoch)
        
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model.state_dict(), 'checkpoints/img-model-full-512-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model.state_dict(), 'checkpoints/txt-model-full-512-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.00012609273195266724
Epoch: [0][0/4400]	Time 0.04702162742614746 (0.04702162742614746)	Data 0.004031181335449219 (0.004031181335449219)	
  batch 2001 loss: 0.24717661296576263
Epoch: [0][2000/4400]	Time 0.016170024871826172 (0.02201926880988522)	Data 0.0010480880737304688 (0.0020143839194141943)	
  batch 4001 loss: 0.24701430730521678
Epoch: [0][4000/4400]	Time 0.007898330688476562 (0.021020093759099592)	Data 0.0008800029754638672 (0.0025363345052027398)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 0.00012473785132169723
Epoch: [1][0/4400]	Time 0.013044357299804688 (0.013044357299804688)	Data 0.0018548965454101562 (0.0018548965454101562)	
  batch 2001 loss: 0.24695295250415802
Epoch: [1][2000/4400]	Time 0.01843857765197754 (0.013877756532462223)	Data 0.010588645935058594 (0.003927832183570995)	
  batch 4001 loss: 0.2469778815358877
Epoch: [1][4000/4400]	Time 0.009508848190307617 (0.01323062686495887)	Data 0.00092673301

#### im2title and title2im

In [None]:
title_dataset = EmbeddingDataset(img_train, title_train)
title_loader = DataLoader(title_dataset, batch_size=64, shuffle=False)

In [None]:
img_model_title = EmbeddingNetwork(512)
img_model_title= nn.DataParallel(img_model_title, device_ids=[1,2,3])
img_model_title.to(device);

txt_model_title = EmbeddingNetwork(512);
txt_model_title= nn.DataParallel(txt_model_title, device_ids=[1,2,3])
txt_model_title.to(device);

optimizer = torch.optim.Adam(img_model_title.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

In [None]:
use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to((f'cuda:{txt_model_title.device_ids[0]}'))
    img_model_title = txt_model_title.to((f'cuda:{txt_model_title.device_ids[0]}'))
    txt_model_title = txt_model_title.to((f'cuda:{txt_model_title.device_ids[0]}'))

In [None]:
epochs = 5

for epoch in range(epochs):

    train(title_loader, img_model_title, txt_model_title, criterion, optimizer, epoch)
        
  # Save checkpoint and replace old best model if current model is betterabs
    torch.save(img_model_title.state_dict(), 'checkpoints/img-model-title-512-epoch-{}.pth'.format(epoch+1))
    torch.save(txt_model_title.state_dict(), 'checkpoints/txt-model-title-512-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.0002364463210105896
Epoch: [0][0/4400]	Time 0.032234907150268555 (0.032234907150268555)	Data 0.004739999771118164 (0.004739999771118164)	
  batch 2001 loss: 0.4691778804063797
Epoch: [0][2000/4400]	Time 0.019277572631835938 (0.021416319542560264)	Data 0.001470804214477539 (0.002138279843842727)	
  batch 4001 loss: 0.46928279390931127
Epoch: [0][4000/4400]	Time 0.012997865676879883 (0.021658227194014026)	Data 0.0013132095336914062 (0.0022000336879433708)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 0.0002340276688337326
Epoch: [1][0/4400]	Time 0.013715028762817383 (0.013715028762817383)	Data 0.002393484115600586 (0.002393484115600586)	
  batch 2001 loss: 0.4689103953242302
Epoch: [1][2000/4400]	Time 0.01427316665649414 (0.020105006634027346)	Data 0.0012676715850830078 (0.0022924063147335633)	
  batch 4001 loss: 0.4691453038007021
Epoch: [1][4000/4400]	Time 0.008884906768798828 (0.020155201372996593)	Data 0.0013082027435

#### im2ingredients and ingredients2im

In [None]:
img_model_ingredients = EmbeddingNetwork(512)
img_model_ingredients= nn.DataParallel(img_model_ingredients, device_ids=[1,2,3])
img_model_ingredients.to((f'cuda:{img_model_ingredients.device_ids[0]}'));

txt_model_ingredients = EmbeddingNetwork(512)
txt_model_ingredients= nn.DataParallel(txt_model_ingredients, device_ids=[1,2,3])
txt_model_ingredients.to((f'cuda:{txt_model_ingredients.device_ids[0]}'));

optimizer = torch.optim.Adam(img_model_ingredients.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

ingredients_dataset = EmbeddingDataset(img_train, ingredients_train)
ingredients_loader = DataLoader(ingredients_dataset, batch_size=64, shuffle=False)

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to((f'cuda:{img_model_ingredients.device_ids[0]}'))
    img_model_ingredients = img_model_ingredients.to((f'cuda:{img_model_ingredients.device_ids[0]}'))
    txt_model_ingredients = txt_model_ingredients.to((f'cuda:{img_model_ingredients.device_ids[0]}'))


epochs = 5
for epoch in range(epochs):
    train(ingredients_loader, img_model_ingredients, txt_model_ingredients, criterion, optimizer, epoch)
  # Save checkpoint and replace old best model if current model is betterabs

torch.save(img_model_ingredients.state_dict(), 'checkpoints/img-model-ingredients-512-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_ingredients.state_dict(), 'checkpoints/txt-model-ingredients-512-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 10.312287109375
Epoch: [0][0/4400]	Time 0.017222166061401367 (0.017222166061401367)	Data 0.0023643970489501953 (0.0023643970489501953)	
  batch 2001 loss: 8465.284108154297
Epoch: [0][2000/4400]	Time 0.017594575881958008 (0.012661087935951458)	Data 0.00970458984375 (0.004405748719039528)	
  batch 4001 loss: 8259.536986328125
Epoch: [0][4000/4400]	Time 0.03874492645263672 (0.012619327706296693)	Data 0.0299530029296875 (0.004399139980648911)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 4.2461435546875
Epoch: [1][0/4400]	Time 0.008536815643310547 (0.008536815643310547)	Data 0.0014150142669677734 (0.0014150142669677734)	
  batch 2001 loss: 8320.310545898437
Epoch: [1][2000/4400]	Time 0.009508609771728516 (0.012479719431742258)	Data 0.0010323524475097656 (0.00433090947259372)	
  batch 4001 loss: 8351.937978759766
Epoch: [1][4000/4400]	Time 0.008218765258789062 (0.01246253724874064)	Data 0.0009169578552246094 (0.00424889414586

#### im2instructions and instructions2im

In [None]:
img_model_instructions = EmbeddingNetwork(512)
img_model_instructions= nn.DataParallel(img_model_instructions, device_ids=[1,2,3])
img_model_instructions.to((f'cuda:{img_model_instructions.device_ids[0]}'));
txt_model_instructions = EmbeddingNetwork(512)
txt_model_instructions= nn.DataParallel(txt_model_instructions, device_ids=[1,2,3])
txt_model_instructions.to((f'cuda:{txt_model_instructions.device_ids[0]}'));

optimizer = torch.optim.Adam(img_model_instructions.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

instructions_dataset = EmbeddingDataset(img_train, instructions_train)
instructions_loader = DataLoader(instructions_dataset, batch_size=64, shuffle=False)

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to((f'cuda:{txt_model_instructions.device_ids[0]}'))
    img_model_instructions = img_model_instructions.to((f'cuda:{txt_model_instructions.device_ids[0]}'))
    txt_model_instructions = txt_model_instructions.to((f'cuda:{txt_model_instructions.device_ids[0]}'))


epochs = 5
for epoch in range(epochs):
    train(ingredients_loader, img_model_instructions, txt_model_instructions, criterion, optimizer, epoch)
  # Save checkpoint and replace old best model if current model is betterabs

torch.save(img_model_instructions.state_dict(), 'checkpoints/img-model-instructions-512-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_instructions.state_dict(), 'checkpoints/txt-model-instructions-512-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.0003244844079017639
Epoch: [0][0/4400]	Time 0.01202392578125 (0.01202392578125)	Data 0.0015544891357421875 (0.0015544891357421875)	
  batch 2001 loss: 0.26000524858385327
Epoch: [0][2000/4400]	Time 0.008465290069580078 (0.012482271618630992)	Data 0.0008780956268310547 (0.004153854188056423)	
  batch 4001 loss: 0.2534713015407324
Epoch: [0][4000/4400]	Time 0.01741790771484375 (0.012547566276346258)	Data 0.009671688079833984 (0.0042191943774310325)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 0.0001269243061542511
Epoch: [1][0/4400]	Time 0.00867319107055664 (0.00867319107055664)	Data 0.0012242794036865234 (0.0012242794036865234)	
  batch 2001 loss: 0.255437093809247
Epoch: [1][2000/4400]	Time 0.00827479362487793 (0.011620957156767076)	Data 0.0007474422454833984 (0.00357155559183299)	
  batch 4001 loss: 0.25655249509960415
Epoch: [1][4000/4400]	Time 0.034348487854003906 (0.012103707693243468)	Data 0.027225017547607422 (0.

### Model Creation dims = 256

#### im2recipe and recipe2im

In [17]:
img_model = EmbeddingNetwork(64)
img_model= nn.DataParallel(img_model, device_ids=[2,3])
img_model.to(device);
txt_model = EmbeddingNetwork(64);
txt_model= nn.DataParallel(txt_model, device_ids=[2,3])
txt_model.to(device);
optimizer = torch.optim.Adam(img_model.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to(device)
    img_model = img_model.to(device)
    txt_model = txt_model.to(device)

train_dataset = EmbeddingDataset(img_train, text_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False)

best_losses = 1e10
epochs = 10
for epoch in range(epochs):
    train(train_loader, img_model, txt_model, criterion, optimizer, epoch)
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model.state_dict(), 'checkpoints/img-model-full-64-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model.state_dict(), 'checkpoints/txt-model-full-64-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.00033096739649772646
Epoch: [0][0/4400]	Time 0.009809494018554688 (0.009809494018554688)	Data 0.0020685195922851562 (0.0020685195922851562)	
  batch 2001 loss: 0.25110939494520423
Epoch: [0][2000/4400]	Time 0.00599360466003418 (0.006085285242053045)	Data 0.0007460117340087891 (0.0007254072929965682)	
  batch 4001 loss: 0.2445601435303688
Epoch: [0][4000/4400]	Time 0.005810737609863281 (0.006053785537666334)	Data 0.0006694793701171875 (0.0007168109701204765)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 0.00012183801084756851
Epoch: [1][0/4400]	Time 0.0065135955810546875 (0.0065135955810546875)	Data 0.0011260509490966797 (0.0011260509490966797)	
  batch 2001 loss: 0.2472734719440341
Epoch: [1][2000/4400]	Time 0.006289243698120117 (0.006054450010788673)	Data 0.0007345676422119141 (0.0007097547141270063)	
  batch 4001 loss: 0.24817016860842706
Epoch: [1][4000/4400]	Time 0.0058710575103759766 (0.0060453876737534065)	Data 0.

#### im2title and title2im

In [18]:
title_dataset = EmbeddingDataset(img_train, title_train)
title_loader = DataLoader(title_dataset, batch_size=64, shuffle=False)

img_model_title = EmbeddingNetwork(64)
img_model_title= nn.DataParallel(img_model_title, device_ids=[2,3])
img_model_title.to(device);

txt_model_title = EmbeddingNetwork(64);
txt_model_title= nn.DataParallel(txt_model_title, device_ids=[2,3])
txt_model_title.to(device);

optimizer = torch.optim.Adam(img_model.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to((f'cuda:{txt_model_title.device_ids[0]}'))
    img_model_title = txt_model_title.to((f'cuda:{txt_model_title.device_ids[0]}'))
    txt_model_title = txt_model_title.to((f'cuda:{txt_model_title.device_ids[0]}'))

epochs = 5
for epoch in range(epochs):
    train(title_loader, img_model_title, txt_model_title, criterion, optimizer, epoch)
        
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model_title.state_dict(), 'checkpoints/img-model-title-64-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_title.state_dict(), 'checkpoints/txt-model-title-64-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.0002283381372690201
Epoch: [0][0/4400]	Time 0.007862329483032227 (0.007862329483032227)	Data 0.0012707710266113281 (0.0012707710266113281)	
  batch 2001 loss: 0.46875293447077276
Epoch: [0][2000/4400]	Time 0.005851030349731445 (0.006021185197691986)	Data 0.0006771087646484375 (0.0007055822102681569)	
  batch 4001 loss: 0.4688471542447805
Epoch: [0][4000/4400]	Time 0.005922555923461914 (0.00599160858226758)	Data 0.0006802082061767578 (0.0006984085835268783)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 0.00023445358872413636
Epoch: [1][0/4400]	Time 0.006246089935302734 (0.006246089935302734)	Data 0.0010914802551269531 (0.0010914802551269531)	
  batch 2001 loss: 0.46840228547155854
Epoch: [1][2000/4400]	Time 0.0060079097747802734 (0.0059432157690914676)	Data 0.0006971359252929688 (0.0006883508976789071)	
  batch 4001 loss: 0.46883764205873013
Epoch: [1][4000/4400]	Time 0.005984783172607422 (0.005949453841564328)	Data 0.00

#### im2ingredients and ingredients2im

In [19]:
img_model_ingredients = EmbeddingNetwork(64)
img_model_ingredients= nn.DataParallel(img_model_ingredients, device_ids=[2,3])
img_model_ingredients.to((f'cuda:{img_model_ingredients.device_ids[0]}'));

txt_model_ingredients = EmbeddingNetwork(64)
txt_model_ingredients= nn.DataParallel(txt_model_ingredients, device_ids=[2,3])
txt_model_ingredients.to((f'cuda:{txt_model_ingredients.device_ids[0]}'));

optimizer = torch.optim.Adam(img_model_ingredients.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

ingredients_dataset = EmbeddingDataset(img_train, ingredients_train)
ingredients_loader = DataLoader(ingredients_dataset, batch_size=64, shuffle=False)

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to((f'cuda:{img_model_ingredients.device_ids[0]}'))
    img_model_ingredients = img_model_ingredients.to((f'cuda:{img_model_ingredients.device_ids[0]}'))
    txt_model_ingredients = txt_model_ingredients.to((f'cuda:{img_model_ingredients.device_ids[0]}'))


epochs = 5
for epoch in range(epochs):
    train(ingredients_loader, img_model_ingredients, txt_model_ingredients, criterion, optimizer, epoch)
  # Save checkpoint and replace old best model if current model is betterabs

torch.save(img_model_ingredients.state_dict(), 'checkpoints/img-model-ingredients-64-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_ingredients.state_dict(), 'checkpoints/txt-model-ingredients-64-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.00033422619104385377
Epoch: [0][0/4400]	Time 0.01181483268737793 (0.01181483268737793)	Data 0.0015099048614501953 (0.0015099048614501953)	
  batch 2001 loss: 0.25932632664591077
Epoch: [0][2000/4400]	Time 0.006026506423950195 (0.006081557881528291)	Data 0.0006926059722900391 (0.0007135293770884943)	
  batch 4001 loss: 0.25341323935985566
Epoch: [0][4000/4400]	Time 0.006021738052368164 (0.006072451906840642)	Data 0.0006964206695556641 (0.0007056836574204533)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 0.00012086976319551468
Epoch: [1][0/4400]	Time 0.00619816780090332 (0.00619816780090332)	Data 0.0009703636169433594 (0.0009703636169433594)	
  batch 2001 loss: 0.25523896896839143
Epoch: [1][2000/4400]	Time 0.005952358245849609 (0.006008099699425495)	Data 0.0006701946258544922 (0.0006924394009888977)	
  batch 4001 loss: 0.2558343113809824
Epoch: [1][4000/4400]	Time 0.005926609039306641 (0.006048412210969322)	Data 0.000689

#### im2instructions and instructions2im

In [20]:
img_model_instructions = EmbeddingNetwork(64)
img_model_instructions= nn.DataParallel(img_model_instructions, device_ids=[2,3])
img_model_instructions.to((f'cuda:{img_model_instructions.device_ids[0]}'));

txt_model_instructions = EmbeddingNetwork(64)
txt_model_instructions= nn.DataParallel(txt_model_instructions, device_ids=[2,3])
txt_model_instructions.to((f'cuda:{txt_model_instructions.device_ids[0]}'));

optimizer = torch.optim.Adam(img_model_instructions.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

instructions_dataset = EmbeddingDataset(img_train, instructions_train)
instructions_loader = DataLoader(instructions_dataset, batch_size=64, shuffle=False)

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to((f'cuda:{txt_model_instructions.device_ids[0]}'))
    img_model_instructions = img_model_instructions.to((f'cuda:{txt_model_instructions.device_ids[0]}'))
    txt_model_instructions = txt_model_instructions.to((f'cuda:{txt_model_instructions.device_ids[0]}'))


epochs = 5
for epoch in range(epochs):
    train(ingredients_loader, img_model_instructions, txt_model_instructions, criterion, optimizer, epoch)
  # Save checkpoint and replace old best model if current model is betterabs

torch.save(img_model_instructions.state_dict(), 'checkpoints/img-model-instructions-64-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_instructions.state_dict(), 'checkpoints/txt-model-instructions-64-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.0003304431736469269
Epoch: [0][0/4400]	Time 0.007906436920166016 (0.007906436920166016)	Data 0.0015239715576171875 (0.0015239715576171875)	
  batch 2001 loss: 0.25847531884908675
Epoch: [0][2000/4400]	Time 0.006007671356201172 (0.0061836679955234175)	Data 0.0006887912750244141 (0.0007450405446843229)	
  batch 4001 loss: 0.2516851299479604
Epoch: [0][4000/4400]	Time 0.005887031555175781 (0.0061224357869082225)	Data 0.0006945133209228516 (0.0007264040255957739)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 0.0001218315213918686
Epoch: [1][0/4400]	Time 0.006499052047729492 (0.006499052047729492)	Data 0.0012738704681396484 (0.0012738704681396484)	
  batch 2001 loss: 0.2536966943666339
Epoch: [1][2000/4400]	Time 0.006014823913574219 (0.006115863467382824)	Data 0.0006887912750244141 (0.0007232024275261661)	
  batch 4001 loss: 0.2543358924239874
Epoch: [1][4000/4400]	Time 0.005895376205444336 (0.00610050372081052)	Data 0.00067

### Model Creation dims = 128

#### im2recipe and recipe2im

In [12]:
img_model = EmbeddingNetwork(128)
img_model= nn.DataParallel(img_model, device_ids=[2,3])
img_model.to(device);
txt_model = EmbeddingNetwork(128);
txt_model= nn.DataParallel(txt_model, device_ids=[2,3])
txt_model.to(device);
optimizer = torch.optim.Adam(img_model.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to(device)
    img_model = img_model.to(device)
    txt_model = txt_model.to(device)

train_dataset = EmbeddingDataset(img_train, text_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False)

best_losses = 1e10
epochs = 10
for epoch in range(epochs):
    train(train_loader, img_model, txt_model, criterion, optimizer, epoch)
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model.state_dict(), 'checkpoints/img-model-full-128-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model.state_dict(), 'checkpoints/txt-model-full-128-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.0003114860653877258
Epoch: [0][0/4400]	Time 4.029629468917847 (4.029629468917847)	Data 0.0025763511657714844 (0.0025763511657714844)	
  batch 2001 loss: 0.24822020929306746
Epoch: [0][2000/4400]	Time 0.006309986114501953 (0.008277938581597264)	Data 0.0007994174957275391 (0.0007816712180713843)	
  batch 4001 loss: 0.24229517125338315
Epoch: [0][4000/4400]	Time 0.006105184555053711 (0.007278278630186813)	Data 0.0007100105285644531 (0.0007796298858673088)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 0.00011981219798326493
Epoch: [1][0/4400]	Time 0.006433010101318359 (0.006433010101318359)	Data 0.0010907649993896484 (0.0010907649993896484)	
  batch 2001 loss: 0.24500784227997063
Epoch: [1][2000/4400]	Time 0.0060389041900634766 (0.006266078014840846)	Data 0.0006804466247558594 (0.0007230656436536981)	
  batch 4001 loss: 0.24615362104028463
Epoch: [1][4000/4400]	Time 0.006071805953979492 (0.0062487453736236355)	Data 0.000693

#### im2title and title2im

In [18]:
title_dataset = EmbeddingDataset(img_train, title_train)
title_loader = DataLoader(title_dataset, batch_size=64, shuffle=False)

img_model_title = EmbeddingNetwork(64)
img_model_title= nn.DataParallel(img_model_title, device_ids=[2,3])
img_model_title.to(device);

txt_model_title = EmbeddingNetwork(64);
txt_model_title= nn.DataParallel(txt_model_title, device_ids=[2,3])
txt_model_title.to(device);

optimizer = torch.optim.Adam(img_model.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to((f'cuda:{txt_model_title.device_ids[0]}'))
    img_model_title = txt_model_title.to((f'cuda:{txt_model_title.device_ids[0]}'))
    txt_model_title = txt_model_title.to((f'cuda:{txt_model_title.device_ids[0]}'))

epochs = 5
for epoch in range(epochs):
    train(title_loader, img_model_title, txt_model_title, criterion, optimizer, epoch)
        
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model_title.state_dict(), 'checkpoints/img-model-title-64-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_title.state_dict(), 'checkpoints/txt-model-title-64-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.0002283381372690201
Epoch: [0][0/4400]	Time 0.007862329483032227 (0.007862329483032227)	Data 0.0012707710266113281 (0.0012707710266113281)	
  batch 2001 loss: 0.46875293447077276
Epoch: [0][2000/4400]	Time 0.005851030349731445 (0.006021185197691986)	Data 0.0006771087646484375 (0.0007055822102681569)	
  batch 4001 loss: 0.4688471542447805
Epoch: [0][4000/4400]	Time 0.005922555923461914 (0.00599160858226758)	Data 0.0006802082061767578 (0.0006984085835268783)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 0.00023445358872413636
Epoch: [1][0/4400]	Time 0.006246089935302734 (0.006246089935302734)	Data 0.0010914802551269531 (0.0010914802551269531)	
  batch 2001 loss: 0.46840228547155854
Epoch: [1][2000/4400]	Time 0.0060079097747802734 (0.0059432157690914676)	Data 0.0006971359252929688 (0.0006883508976789071)	
  batch 4001 loss: 0.46883764205873013
Epoch: [1][4000/4400]	Time 0.005984783172607422 (0.005949453841564328)	Data 0.00

#### im2ingredients and ingredients2im

In [19]:
img_model_ingredients = EmbeddingNetwork(64)
img_model_ingredients= nn.DataParallel(img_model_ingredients, device_ids=[2,3])
img_model_ingredients.to((f'cuda:{img_model_ingredients.device_ids[0]}'));

txt_model_ingredients = EmbeddingNetwork(64)
txt_model_ingredients= nn.DataParallel(txt_model_ingredients, device_ids=[2,3])
txt_model_ingredients.to((f'cuda:{txt_model_ingredients.device_ids[0]}'));

optimizer = torch.optim.Adam(img_model_ingredients.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

ingredients_dataset = EmbeddingDataset(img_train, ingredients_train)
ingredients_loader = DataLoader(ingredients_dataset, batch_size=64, shuffle=False)

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to((f'cuda:{img_model_ingredients.device_ids[0]}'))
    img_model_ingredients = img_model_ingredients.to((f'cuda:{img_model_ingredients.device_ids[0]}'))
    txt_model_ingredients = txt_model_ingredients.to((f'cuda:{img_model_ingredients.device_ids[0]}'))


epochs = 5
for epoch in range(epochs):
    train(ingredients_loader, img_model_ingredients, txt_model_ingredients, criterion, optimizer, epoch)
  # Save checkpoint and replace old best model if current model is betterabs

torch.save(img_model_ingredients.state_dict(), 'checkpoints/img-model-ingredients-64-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_ingredients.state_dict(), 'checkpoints/txt-model-ingredients-64-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.00033422619104385377
Epoch: [0][0/4400]	Time 0.01181483268737793 (0.01181483268737793)	Data 0.0015099048614501953 (0.0015099048614501953)	
  batch 2001 loss: 0.25932632664591077
Epoch: [0][2000/4400]	Time 0.006026506423950195 (0.006081557881528291)	Data 0.0006926059722900391 (0.0007135293770884943)	
  batch 4001 loss: 0.25341323935985566
Epoch: [0][4000/4400]	Time 0.006021738052368164 (0.006072451906840642)	Data 0.0006964206695556641 (0.0007056836574204533)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 0.00012086976319551468
Epoch: [1][0/4400]	Time 0.00619816780090332 (0.00619816780090332)	Data 0.0009703636169433594 (0.0009703636169433594)	
  batch 2001 loss: 0.25523896896839143
Epoch: [1][2000/4400]	Time 0.005952358245849609 (0.006008099699425495)	Data 0.0006701946258544922 (0.0006924394009888977)	
  batch 4001 loss: 0.2558343113809824
Epoch: [1][4000/4400]	Time 0.005926609039306641 (0.006048412210969322)	Data 0.000689

#### im2instructions and instructions2im

In [20]:
img_model_instructions = EmbeddingNetwork(64)
img_model_instructions= nn.DataParallel(img_model_instructions, device_ids=[2,3])
img_model_instructions.to((f'cuda:{img_model_instructions.device_ids[0]}'));

txt_model_instructions = EmbeddingNetwork(64)
txt_model_instructions= nn.DataParallel(txt_model_instructions, device_ids=[2,3])
txt_model_instructions.to((f'cuda:{txt_model_instructions.device_ids[0]}'));

optimizer = torch.optim.Adam(img_model_instructions.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

instructions_dataset = EmbeddingDataset(img_train, instructions_train)
instructions_loader = DataLoader(instructions_dataset, batch_size=64, shuffle=False)

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to((f'cuda:{txt_model_instructions.device_ids[0]}'))
    img_model_instructions = img_model_instructions.to((f'cuda:{txt_model_instructions.device_ids[0]}'))
    txt_model_instructions = txt_model_instructions.to((f'cuda:{txt_model_instructions.device_ids[0]}'))


epochs = 5
for epoch in range(epochs):
    train(ingredients_loader, img_model_instructions, txt_model_instructions, criterion, optimizer, epoch)
  # Save checkpoint and replace old best model if current model is betterabs

torch.save(img_model_instructions.state_dict(), 'checkpoints/img-model-instructions-64-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_instructions.state_dict(), 'checkpoints/txt-model-instructions-64-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.0003304431736469269
Epoch: [0][0/4400]	Time 0.007906436920166016 (0.007906436920166016)	Data 0.0015239715576171875 (0.0015239715576171875)	
  batch 2001 loss: 0.25847531884908675
Epoch: [0][2000/4400]	Time 0.006007671356201172 (0.0061836679955234175)	Data 0.0006887912750244141 (0.0007450405446843229)	
  batch 4001 loss: 0.2516851299479604
Epoch: [0][4000/4400]	Time 0.005887031555175781 (0.0061224357869082225)	Data 0.0006945133209228516 (0.0007264040255957739)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 0.0001218315213918686
Epoch: [1][0/4400]	Time 0.006499052047729492 (0.006499052047729492)	Data 0.0012738704681396484 (0.0012738704681396484)	
  batch 2001 loss: 0.2536966943666339
Epoch: [1][2000/4400]	Time 0.006014823913574219 (0.006115863467382824)	Data 0.0006887912750244141 (0.0007232024275261661)	
  batch 4001 loss: 0.2543358924239874
Epoch: [1][4000/4400]	Time 0.005895376205444336 (0.00610050372081052)	Data 0.00067

### Model Creation dims = 64

#### im2recipe and recipe2im

In [17]:
img_model = EmbeddingNetwork(64)
img_model= nn.DataParallel(img_model, device_ids=[2,3])
img_model.to(device);
txt_model = EmbeddingNetwork(64);
txt_model= nn.DataParallel(txt_model, device_ids=[2,3])
txt_model.to(device);
optimizer = torch.optim.Adam(img_model.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to(device)
    img_model = img_model.to(device)
    txt_model = txt_model.to(device)

train_dataset = EmbeddingDataset(img_train, text_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False)

best_losses = 1e10
epochs = 10
for epoch in range(epochs):
    train(train_loader, img_model, txt_model, criterion, optimizer, epoch)
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model.state_dict(), 'checkpoints/img-model-full-64-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model.state_dict(), 'checkpoints/txt-model-full-64-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.00033096739649772646
Epoch: [0][0/4400]	Time 0.009809494018554688 (0.009809494018554688)	Data 0.0020685195922851562 (0.0020685195922851562)	
  batch 2001 loss: 0.25110939494520423
Epoch: [0][2000/4400]	Time 0.00599360466003418 (0.006085285242053045)	Data 0.0007460117340087891 (0.0007254072929965682)	
  batch 4001 loss: 0.2445601435303688
Epoch: [0][4000/4400]	Time 0.005810737609863281 (0.006053785537666334)	Data 0.0006694793701171875 (0.0007168109701204765)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 0.00012183801084756851
Epoch: [1][0/4400]	Time 0.0065135955810546875 (0.0065135955810546875)	Data 0.0011260509490966797 (0.0011260509490966797)	
  batch 2001 loss: 0.2472734719440341
Epoch: [1][2000/4400]	Time 0.006289243698120117 (0.006054450010788673)	Data 0.0007345676422119141 (0.0007097547141270063)	
  batch 4001 loss: 0.24817016860842706
Epoch: [1][4000/4400]	Time 0.0058710575103759766 (0.0060453876737534065)	Data 0.

#### im2title and title2im

In [18]:
title_dataset = EmbeddingDataset(img_train, title_train)
title_loader = DataLoader(title_dataset, batch_size=64, shuffle=False)

img_model_title = EmbeddingNetwork(64)
img_model_title= nn.DataParallel(img_model_title, device_ids=[2,3])
img_model_title.to(device);

txt_model_title = EmbeddingNetwork(64);
txt_model_title= nn.DataParallel(txt_model_title, device_ids=[2,3])
txt_model_title.to(device);

optimizer = torch.optim.Adam(img_model.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to((f'cuda:{txt_model_title.device_ids[0]}'))
    img_model_title = txt_model_title.to((f'cuda:{txt_model_title.device_ids[0]}'))
    txt_model_title = txt_model_title.to((f'cuda:{txt_model_title.device_ids[0]}'))

epochs = 5
for epoch in range(epochs):
    train(title_loader, img_model_title, txt_model_title, criterion, optimizer, epoch)
        
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model_title.state_dict(), 'checkpoints/img-model-title-64-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_title.state_dict(), 'checkpoints/txt-model-title-64-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.0002283381372690201
Epoch: [0][0/4400]	Time 0.007862329483032227 (0.007862329483032227)	Data 0.0012707710266113281 (0.0012707710266113281)	
  batch 2001 loss: 0.46875293447077276
Epoch: [0][2000/4400]	Time 0.005851030349731445 (0.006021185197691986)	Data 0.0006771087646484375 (0.0007055822102681569)	
  batch 4001 loss: 0.4688471542447805
Epoch: [0][4000/4400]	Time 0.005922555923461914 (0.00599160858226758)	Data 0.0006802082061767578 (0.0006984085835268783)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 0.00023445358872413636
Epoch: [1][0/4400]	Time 0.006246089935302734 (0.006246089935302734)	Data 0.0010914802551269531 (0.0010914802551269531)	
  batch 2001 loss: 0.46840228547155854
Epoch: [1][2000/4400]	Time 0.0060079097747802734 (0.0059432157690914676)	Data 0.0006971359252929688 (0.0006883508976789071)	
  batch 4001 loss: 0.46883764205873013
Epoch: [1][4000/4400]	Time 0.005984783172607422 (0.005949453841564328)	Data 0.00

#### im2ingredients and ingredients2im

In [19]:
img_model_ingredients = EmbeddingNetwork(64)
img_model_ingredients= nn.DataParallel(img_model_ingredients, device_ids=[2,3])
img_model_ingredients.to((f'cuda:{img_model_ingredients.device_ids[0]}'));

txt_model_ingredients = EmbeddingNetwork(64)
txt_model_ingredients= nn.DataParallel(txt_model_ingredients, device_ids=[2,3])
txt_model_ingredients.to((f'cuda:{txt_model_ingredients.device_ids[0]}'));

optimizer = torch.optim.Adam(img_model_ingredients.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

ingredients_dataset = EmbeddingDataset(img_train, ingredients_train)
ingredients_loader = DataLoader(ingredients_dataset, batch_size=64, shuffle=False)

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to((f'cuda:{img_model_ingredients.device_ids[0]}'))
    img_model_ingredients = img_model_ingredients.to((f'cuda:{img_model_ingredients.device_ids[0]}'))
    txt_model_ingredients = txt_model_ingredients.to((f'cuda:{img_model_ingredients.device_ids[0]}'))


epochs = 5
for epoch in range(epochs):
    train(ingredients_loader, img_model_ingredients, txt_model_ingredients, criterion, optimizer, epoch)
  # Save checkpoint and replace old best model if current model is betterabs

torch.save(img_model_ingredients.state_dict(), 'checkpoints/img-model-ingredients-64-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_ingredients.state_dict(), 'checkpoints/txt-model-ingredients-64-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.00033422619104385377
Epoch: [0][0/4400]	Time 0.01181483268737793 (0.01181483268737793)	Data 0.0015099048614501953 (0.0015099048614501953)	
  batch 2001 loss: 0.25932632664591077
Epoch: [0][2000/4400]	Time 0.006026506423950195 (0.006081557881528291)	Data 0.0006926059722900391 (0.0007135293770884943)	
  batch 4001 loss: 0.25341323935985566
Epoch: [0][4000/4400]	Time 0.006021738052368164 (0.006072451906840642)	Data 0.0006964206695556641 (0.0007056836574204533)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 0.00012086976319551468
Epoch: [1][0/4400]	Time 0.00619816780090332 (0.00619816780090332)	Data 0.0009703636169433594 (0.0009703636169433594)	
  batch 2001 loss: 0.25523896896839143
Epoch: [1][2000/4400]	Time 0.005952358245849609 (0.006008099699425495)	Data 0.0006701946258544922 (0.0006924394009888977)	
  batch 4001 loss: 0.2558343113809824
Epoch: [1][4000/4400]	Time 0.005926609039306641 (0.006048412210969322)	Data 0.000689

#### im2instructions and instructions2im

In [20]:
img_model_instructions = EmbeddingNetwork(64)
img_model_instructions= nn.DataParallel(img_model_instructions, device_ids=[2,3])
img_model_instructions.to((f'cuda:{img_model_instructions.device_ids[0]}'));

txt_model_instructions = EmbeddingNetwork(64)
txt_model_instructions= nn.DataParallel(txt_model_instructions, device_ids=[2,3])
txt_model_instructions.to((f'cuda:{txt_model_instructions.device_ids[0]}'));

optimizer = torch.optim.Adam(img_model_instructions.parameters(), lr=1e-2, weight_decay=0.0)
criterion = nn.MSELoss()

instructions_dataset = EmbeddingDataset(img_train, instructions_train)
instructions_loader = DataLoader(instructions_dataset, batch_size=64, shuffle=False)

use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to((f'cuda:{txt_model_instructions.device_ids[0]}'))
    img_model_instructions = img_model_instructions.to((f'cuda:{txt_model_instructions.device_ids[0]}'))
    txt_model_instructions = txt_model_instructions.to((f'cuda:{txt_model_instructions.device_ids[0]}'))


epochs = 5
for epoch in range(epochs):
    train(ingredients_loader, img_model_instructions, txt_model_instructions, criterion, optimizer, epoch)
  # Save checkpoint and replace old best model if current model is betterabs

torch.save(img_model_instructions.state_dict(), 'checkpoints/img-model-instructions-64-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_instructions.state_dict(), 'checkpoints/txt-model-instructions-64-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.0003304431736469269
Epoch: [0][0/4400]	Time 0.007906436920166016 (0.007906436920166016)	Data 0.0015239715576171875 (0.0015239715576171875)	
  batch 2001 loss: 0.25847531884908675
Epoch: [0][2000/4400]	Time 0.006007671356201172 (0.0061836679955234175)	Data 0.0006887912750244141 (0.0007450405446843229)	
  batch 4001 loss: 0.2516851299479604
Epoch: [0][4000/4400]	Time 0.005887031555175781 (0.0061224357869082225)	Data 0.0006945133209228516 (0.0007264040255957739)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 0.0001218315213918686
Epoch: [1][0/4400]	Time 0.006499052047729492 (0.006499052047729492)	Data 0.0012738704681396484 (0.0012738704681396484)	
  batch 2001 loss: 0.2536966943666339
Epoch: [1][2000/4400]	Time 0.006014823913574219 (0.006115863467382824)	Data 0.0006887912750244141 (0.0007232024275261661)	
  batch 4001 loss: 0.2543358924239874
Epoch: [1][4000/4400]	Time 0.005895376205444336 (0.00610050372081052)	Data 0.00067

### Dimensional Analysis. dims = 256 / 512 / 128 / 64

#### 512

In [26]:
# im2recipe 512
img_model_full_512 = EmbeddingNetwork(64)
img_model_full_512 = nn.DataParallel(img_model_full_512, device_ids=[1])
img_model_full_512.load_state_dict(torch.load("checkpoints/img-model-full-64-epoch-10.pth"))
img_model_full_512.to((f'cuda:{img_model_full_512.device_ids[0]}'));
img_model_full_512.eval();
txt_model_full_512 = EmbeddingNetwork(64)
txt_model_full_512 = nn.DataParallel(txt_model_full_512, device_ids=[1])
txt_model_full_512.load_state_dict(torch.load("checkpoints/txt-model-full-64-epoch-10.pth"))
txt_model_full_512.to((f'cuda:{txt_model_full_512.device_ids[0]}'));
txt_model_full_512.eval();

#im2title 512
img_model_title_512 = EmbeddingNetwork(64)
img_model_title_512 = nn.DataParallel(img_model_title_512, device_ids=[1])
img_model_title_512.load_state_dict(torch.load("checkpoints/img-model-title-64-epoch-5.pth"))
img_model_title_512.to((f'cuda:{img_model_title_512.device_ids[0]}'));
img_model_title_512.eval();
txt_model_title_512 = EmbeddingNetwork(64)
txt_model_title_512 = nn.DataParallel(txt_model_title_512, device_ids=[1])
txt_model_title_512.load_state_dict(torch.load("checkpoints/txt-model-title-64-epoch-5.pth"))
txt_model_title_512.to((f'cuda:{txt_model_title_512.device_ids[0]}'));
txt_model_title_512.eval();

#im2instructions 512
img_model_instructions_512 = EmbeddingNetwork(64)
img_model_instructions_512 = nn.DataParallel(img_model_instructions_512, device_ids=[1])
img_model_instructions_512.load_state_dict(torch.load("checkpoints/img-model-instructions-64-epoch-5.pth"))
img_model_instructions_512.to((f'cuda:{img_model_instructions_512.device_ids[0]}'));
img_model_instructions_512.eval();
txt_model_instructions_512 = EmbeddingNetwork(64)
txt_model_instructions_512 = nn.DataParallel(txt_model_instructions_512, device_ids=[1])
txt_model_instructions_512.load_state_dict(torch.load("checkpoints/txt-model-instructions-64-epoch-5.pth"))
txt_model_instructions_512.to((f'cuda:{txt_model_instructions_512.device_ids[0]}'));
txt_model_instructions_512.eval();

#im2ingredients 512
img_model_ingredients_512 = EmbeddingNetwork(64)
img_model_ingredients_512 = nn.DataParallel(img_model_ingredients_512, device_ids=[1])
img_model_ingredients_512.load_state_dict(torch.load("checkpoints/img-model-ingredients-64-epoch-5.pth"))
img_model_ingredients_512.to((f'cuda:{img_model_full_512.device_ids[0]}'));
img_model_ingredients_512.eval();
txt_model_ingredients_512 = EmbeddingNetwork(64)
txt_model_ingredients_512 = nn.DataParallel(txt_model_ingredients_512, device_ids=[1])
txt_model_ingredients_512.load_state_dict(torch.load("checkpoints/txt-model-ingredients-64-epoch-5.pth"))
txt_model_ingredients_512.to((f'cuda:{txt_model_ingredients_512.device_ids[0]}'));
txt_model_ingredients_512.eval();



In [27]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 64))
text_val_nonlinear = np.zeros(shape = (len(img_val), 64))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_full_512(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_full_512(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2recipe and recipe2im
print("Running im2recipe for dims = 512 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2recipe for dims = 512 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2recipe for dims = 512 and sample = 1000
Mean median 51.3
Recall {1: 0.056499999999999995, 5: 0.1654, 10: 0.2392}
Running im2recipe for dims = 512 and sample = 10000
Mean median 510.75
Recall {1: 0.009409999999999998, 5: 0.03598, 10: 0.05913}


In [28]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 64))
text_val_nonlinear = np.zeros(shape = (len(img_val), 64))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_title_512(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_title_512(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2title and title2im
print("Running im2title for dims = 512 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2title for dims = 512 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2title for dims = 512 and sample = 1000
Mean median 4.9
Recall {1: 0.27979999999999994, 5: 0.5205, 10: 0.6196999999999999}
Running im2title for dims = 512 and sample = 10000
Mean median 38.7
Recall {1: 0.09568999999999998, 5: 0.24392, 10: 0.32765999999999995}


In [29]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 64))
text_val_nonlinear = np.zeros(shape = (len(img_val), 64))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_ingredients_512(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_ingredients_512(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2title and title2im
print("Running im2ingredients for dims = 512 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2ingredients for dims = 512 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2ingredients for dims = 512 and sample = 1000
Mean median 84.6
Recall {1: 0.0372, 5: 0.11200000000000002, 10: 0.1706}
Running im2ingredients for dims = 512 and sample = 10000
Mean median 828.8
Recall {1: 0.00601, 5: 0.022809999999999997, 10: 0.038959999999999995}


In [30]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 64))
text_val_nonlinear = np.zeros(shape = (len(img_val), 64))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_instructions_512(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_instructions_512(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2title and title2im
print("Running im2instructions for dims = 512 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2instructions for dims = 512 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2instructions for dims = 512 and sample = 1000
Mean median 84.05
Recall {1: 0.028300000000000002, 5: 0.09469999999999999, 10: 0.1502}
Running im2instructions for dims = 512 and sample = 10000
Mean median 835.5
Recall {1: 0.005619999999999999, 5: 0.02037, 10: 0.033839999999999995}


#### 256

In [None]:

# im2recipe 256
img_model_full_512 = EmbeddingNetwork(256)
img_model_full_512 = nn.DataParallel(img_model_full_512, device_ids=[1])
img_model_full_512.load_state_dict(torch.load("checkpoints/img-model-full-256-epoch-10.pth"))
img_model_full_512.to((f'cuda:{img_model_full_512.device_ids[0]}'));
img_model_full_512.eval();
txt_model_full_512 = EmbeddingNetwork(256)
txt_model_full_512 = nn.DataParallel(txt_model_full_512, device_ids=[1])
txt_model_full_512.load_state_dict(torch.load("checkpoints/txt-model-full-256-epoch-10.pth"))
txt_model_full_512.to((f'cuda:{txt_model_full_512.device_ids[0]}'));
txt_model_full_512.eval();

#im2title 256
img_model_title_512 = EmbeddingNetwork(256)
img_model_title_512 = nn.DataParallel(img_model_title_512, device_ids=[1])
img_model_title_512.load_state_dict(torch.load("checkpoints/img-model-title-256-epoch-5.pth"))
img_model_title_512.to((f'cuda:{img_model_title_512.device_ids[0]}'));
img_model_title_512.eval();
txt_model_title_512 = EmbeddingNetwork(256)
txt_model_title_512 = nn.DataParallel(txt_model_title_512, device_ids=[1])
txt_model_title_512.load_state_dict(torch.load("checkpoints/txt-model-title-256-epoch-5.pth"))
txt_model_title_512.to((f'cuda:{txt_model_title_512.device_ids[0]}'));
txt_model_title_512.eval();

#im2instructions 256
img_model_instructions_512 = EmbeddingNetwork(256)
img_model_instructions_512 = nn.DataParallel(img_model_instructions_512, device_ids=[1])
img_model_instructions_512.load_state_dict(torch.load("checkpoints/img-model-instructions-256-epoch-5.pth"))
img_model_instructions_512.to((f'cuda:{img_model_instructions_512.device_ids[0]}'));
img_model_instructions_512.eval();
txt_model_instructions_512 = EmbeddingNetwork(256)
txt_model_instructions_512 = nn.DataParallel(txt_model_instructions_512, device_ids=[1])
txt_model_instructions_512.load_state_dict(torch.load("checkpoints/txt-model-instructions-256-epoch-5.pth"))
txt_model_instructions_512.to((f'cuda:{txt_model_instructions_512.device_ids[0]}'));
txt_model_instructions_512.eval();

#im2ingredients 256
img_model_ingredients_512 = EmbeddingNetwork(256)
img_model_ingredients_512 = nn.DataParallel(img_model_ingredients_512, device_ids=[1])
img_model_ingredients_512.load_state_dict(torch.load("checkpoints/img-model-ingredients-256-epoch-5.pth"))
img_model_ingredients_512.to((f'cuda:{img_model_full_512.device_ids[0]}'));
img_model_ingredients_512.eval();
txt_model_ingredients_512 = EmbeddingNetwork(256)
txt_model_ingredients_512 = nn.DataParallel(txt_model_ingredients_512, device_ids=[1])
txt_model_ingredients_512.load_state_dict(torch.load("checkpoints/txt-model-ingredients-256-epoch-5.pth"))
txt_model_ingredients_512.to((f'cuda:{txt_model_ingredients_512.device_ids[0]}'));
txt_model_ingredients_512.eval();

In [None]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 256))
text_val_nonlinear = np.zeros(shape = (len(img_val), 256))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_full_512(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_full_512(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2recipe and recipe2im
print("Running im2recipe for dims = 256 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2recipe for dims = 256 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2recipe for dims = 256 and sample = 1000
Mean median 6.95
Recall {1: 0.2076, 5: 0.4608, 10: 0.571}
Running im2recipe for dims = 256 and sample = 10000
Mean median 59.2
Recall {1: 0.05, 5: 0.16153, 10: 0.24162}


In [None]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 256))
text_val_nonlinear = np.zeros(shape = (len(img_val), 256))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_title_512(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_title_512(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2title and title2im
print("Running im2title for dims = 256 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2title for dims = 256 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2title for dims = 256 and sample = 1000
Mean median 2.0
Recall {1: 0.45620000000000005, 5: 0.7178, 10: 0.7921000000000001}
Running im2title for dims = 256 and sample = 10000
Mean median 9.1
Recall {1: 0.19510000000000002, 5: 0.41746999999999995, 10: 0.52024}


In [None]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 256))
text_val_nonlinear = np.zeros(shape = (len(img_val), 256))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_ingredients_512(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_ingredients_512(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2title and title2im
print("Running im2ingredients for dims = 256 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2ingredients for dims = 256 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2ingredients for dims = 256 and sample = 1000
Mean median 13.3
Recall {1: 0.09599999999999999, 5: 0.2969, 10: 0.4372}
Running im2ingredients for dims = 256 and sample = 10000
Mean median 127.05
Recall {1: 0.015189999999999999, 5: 0.06169, 10: 0.10497000000000001}


In [None]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 256))
text_val_nonlinear = np.zeros(shape = (len(img_val), 256))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_instructions_512(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_instructions_512(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2title and title2im
print("Running im2instructions for dims = 256 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2instructions for dims = 256 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2instructions for dims = 256 and sample = 1000
Mean median 13.7
Recall {1: 0.1041, 5: 0.3077, 10: 0.44000000000000006}
Running im2instructions for dims = 256 and sample = 10000
Mean median 129.0
Recall {1: 0.01961, 5: 0.07319, 10: 0.12057999999999999}


#### 128

In [26]:
# im2recipe 512
img_model_full_512 = EmbeddingNetwork(64)
img_model_full_512 = nn.DataParallel(img_model_full_512, device_ids=[1])
img_model_full_512.load_state_dict(torch.load("checkpoints/img-model-full-64-epoch-10.pth"))
img_model_full_512.to((f'cuda:{img_model_full_512.device_ids[0]}'));
img_model_full_512.eval();
txt_model_full_512 = EmbeddingNetwork(64)
txt_model_full_512 = nn.DataParallel(txt_model_full_512, device_ids=[1])
txt_model_full_512.load_state_dict(torch.load("checkpoints/txt-model-full-64-epoch-10.pth"))
txt_model_full_512.to((f'cuda:{txt_model_full_512.device_ids[0]}'));
txt_model_full_512.eval();

#im2title 512
img_model_title_512 = EmbeddingNetwork(64)
img_model_title_512 = nn.DataParallel(img_model_title_512, device_ids=[1])
img_model_title_512.load_state_dict(torch.load("checkpoints/img-model-title-64-epoch-5.pth"))
img_model_title_512.to((f'cuda:{img_model_title_512.device_ids[0]}'));
img_model_title_512.eval();
txt_model_title_512 = EmbeddingNetwork(64)
txt_model_title_512 = nn.DataParallel(txt_model_title_512, device_ids=[1])
txt_model_title_512.load_state_dict(torch.load("checkpoints/txt-model-title-64-epoch-5.pth"))
txt_model_title_512.to((f'cuda:{txt_model_title_512.device_ids[0]}'));
txt_model_title_512.eval();

#im2instructions 512
img_model_instructions_512 = EmbeddingNetwork(64)
img_model_instructions_512 = nn.DataParallel(img_model_instructions_512, device_ids=[1])
img_model_instructions_512.load_state_dict(torch.load("checkpoints/img-model-instructions-64-epoch-5.pth"))
img_model_instructions_512.to((f'cuda:{img_model_instructions_512.device_ids[0]}'));
img_model_instructions_512.eval();
txt_model_instructions_512 = EmbeddingNetwork(64)
txt_model_instructions_512 = nn.DataParallel(txt_model_instructions_512, device_ids=[1])
txt_model_instructions_512.load_state_dict(torch.load("checkpoints/txt-model-instructions-64-epoch-5.pth"))
txt_model_instructions_512.to((f'cuda:{txt_model_instructions_512.device_ids[0]}'));
txt_model_instructions_512.eval();

#im2ingredients 512
img_model_ingredients_512 = EmbeddingNetwork(64)
img_model_ingredients_512 = nn.DataParallel(img_model_ingredients_512, device_ids=[1])
img_model_ingredients_512.load_state_dict(torch.load("checkpoints/img-model-ingredients-64-epoch-5.pth"))
img_model_ingredients_512.to((f'cuda:{img_model_full_512.device_ids[0]}'));
img_model_ingredients_512.eval();
txt_model_ingredients_512 = EmbeddingNetwork(64)
txt_model_ingredients_512 = nn.DataParallel(txt_model_ingredients_512, device_ids=[1])
txt_model_ingredients_512.load_state_dict(torch.load("checkpoints/txt-model-ingredients-64-epoch-5.pth"))
txt_model_ingredients_512.to((f'cuda:{txt_model_ingredients_512.device_ids[0]}'));
txt_model_ingredients_512.eval();



In [27]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 64))
text_val_nonlinear = np.zeros(shape = (len(img_val), 64))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_full_512(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_full_512(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2recipe and recipe2im
print("Running im2recipe for dims = 512 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2recipe for dims = 512 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2recipe for dims = 512 and sample = 1000
Mean median 51.3
Recall {1: 0.056499999999999995, 5: 0.1654, 10: 0.2392}
Running im2recipe for dims = 512 and sample = 10000
Mean median 510.75
Recall {1: 0.009409999999999998, 5: 0.03598, 10: 0.05913}


In [28]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 64))
text_val_nonlinear = np.zeros(shape = (len(img_val), 64))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_title_512(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_title_512(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2title and title2im
print("Running im2title for dims = 512 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2title for dims = 512 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2title for dims = 512 and sample = 1000
Mean median 4.9
Recall {1: 0.27979999999999994, 5: 0.5205, 10: 0.6196999999999999}
Running im2title for dims = 512 and sample = 10000
Mean median 38.7
Recall {1: 0.09568999999999998, 5: 0.24392, 10: 0.32765999999999995}


In [29]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 64))
text_val_nonlinear = np.zeros(shape = (len(img_val), 64))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_ingredients_512(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_ingredients_512(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2title and title2im
print("Running im2ingredients for dims = 512 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2ingredients for dims = 512 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2ingredients for dims = 512 and sample = 1000
Mean median 84.6
Recall {1: 0.0372, 5: 0.11200000000000002, 10: 0.1706}
Running im2ingredients for dims = 512 and sample = 10000
Mean median 828.8
Recall {1: 0.00601, 5: 0.022809999999999997, 10: 0.038959999999999995}


In [30]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 64))
text_val_nonlinear = np.zeros(shape = (len(img_val), 64))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_instructions_512(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_instructions_512(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2title and title2im
print("Running im2instructions for dims = 512 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2instructions for dims = 512 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2instructions for dims = 512 and sample = 1000
Mean median 84.05
Recall {1: 0.028300000000000002, 5: 0.09469999999999999, 10: 0.1502}
Running im2instructions for dims = 512 and sample = 10000
Mean median 835.5
Recall {1: 0.005619999999999999, 5: 0.02037, 10: 0.033839999999999995}


#### 64

In [26]:
# im2recipe 512
img_model_full_512 = EmbeddingNetwork(64)
img_model_full_512 = nn.DataParallel(img_model_full_512, device_ids=[1])
img_model_full_512.load_state_dict(torch.load("checkpoints/img-model-full-64-epoch-10.pth"))
img_model_full_512.to((f'cuda:{img_model_full_512.device_ids[0]}'));
img_model_full_512.eval();
txt_model_full_512 = EmbeddingNetwork(64)
txt_model_full_512 = nn.DataParallel(txt_model_full_512, device_ids=[1])
txt_model_full_512.load_state_dict(torch.load("checkpoints/txt-model-full-64-epoch-10.pth"))
txt_model_full_512.to((f'cuda:{txt_model_full_512.device_ids[0]}'));
txt_model_full_512.eval();

#im2title 512
img_model_title_512 = EmbeddingNetwork(64)
img_model_title_512 = nn.DataParallel(img_model_title_512, device_ids=[1])
img_model_title_512.load_state_dict(torch.load("checkpoints/img-model-title-64-epoch-5.pth"))
img_model_title_512.to((f'cuda:{img_model_title_512.device_ids[0]}'));
img_model_title_512.eval();
txt_model_title_512 = EmbeddingNetwork(64)
txt_model_title_512 = nn.DataParallel(txt_model_title_512, device_ids=[1])
txt_model_title_512.load_state_dict(torch.load("checkpoints/txt-model-title-64-epoch-5.pth"))
txt_model_title_512.to((f'cuda:{txt_model_title_512.device_ids[0]}'));
txt_model_title_512.eval();

#im2instructions 512
img_model_instructions_512 = EmbeddingNetwork(64)
img_model_instructions_512 = nn.DataParallel(img_model_instructions_512, device_ids=[1])
img_model_instructions_512.load_state_dict(torch.load("checkpoints/img-model-instructions-64-epoch-5.pth"))
img_model_instructions_512.to((f'cuda:{img_model_instructions_512.device_ids[0]}'));
img_model_instructions_512.eval();
txt_model_instructions_512 = EmbeddingNetwork(64)
txt_model_instructions_512 = nn.DataParallel(txt_model_instructions_512, device_ids=[1])
txt_model_instructions_512.load_state_dict(torch.load("checkpoints/txt-model-instructions-64-epoch-5.pth"))
txt_model_instructions_512.to((f'cuda:{txt_model_instructions_512.device_ids[0]}'));
txt_model_instructions_512.eval();

#im2ingredients 512
img_model_ingredients_512 = EmbeddingNetwork(64)
img_model_ingredients_512 = nn.DataParallel(img_model_ingredients_512, device_ids=[1])
img_model_ingredients_512.load_state_dict(torch.load("checkpoints/img-model-ingredients-64-epoch-5.pth"))
img_model_ingredients_512.to((f'cuda:{img_model_full_512.device_ids[0]}'));
img_model_ingredients_512.eval();
txt_model_ingredients_512 = EmbeddingNetwork(64)
txt_model_ingredients_512 = nn.DataParallel(txt_model_ingredients_512, device_ids=[1])
txt_model_ingredients_512.load_state_dict(torch.load("checkpoints/txt-model-ingredients-64-epoch-5.pth"))
txt_model_ingredients_512.to((f'cuda:{txt_model_ingredients_512.device_ids[0]}'));
txt_model_ingredients_512.eval();



In [27]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 64))
text_val_nonlinear = np.zeros(shape = (len(img_val), 64))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_full_512(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_full_512(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2recipe and recipe2im
print("Running im2recipe for dims = 512 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2recipe for dims = 512 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2recipe for dims = 512 and sample = 1000
Mean median 51.3
Recall {1: 0.056499999999999995, 5: 0.1654, 10: 0.2392}
Running im2recipe for dims = 512 and sample = 10000
Mean median 510.75
Recall {1: 0.009409999999999998, 5: 0.03598, 10: 0.05913}


In [28]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 64))
text_val_nonlinear = np.zeros(shape = (len(img_val), 64))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_title_512(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_title_512(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2title and title2im
print("Running im2title for dims = 512 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2title for dims = 512 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2title for dims = 512 and sample = 1000
Mean median 4.9
Recall {1: 0.27979999999999994, 5: 0.5205, 10: 0.6196999999999999}
Running im2title for dims = 512 and sample = 10000
Mean median 38.7
Recall {1: 0.09568999999999998, 5: 0.24392, 10: 0.32765999999999995}


In [29]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 64))
text_val_nonlinear = np.zeros(shape = (len(img_val), 64))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_ingredients_512(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_ingredients_512(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2title and title2im
print("Running im2ingredients for dims = 512 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2ingredients for dims = 512 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2ingredients for dims = 512 and sample = 1000
Mean median 84.6
Recall {1: 0.0372, 5: 0.11200000000000002, 10: 0.1706}
Running im2ingredients for dims = 512 and sample = 10000
Mean median 828.8
Recall {1: 0.00601, 5: 0.022809999999999997, 10: 0.038959999999999995}


In [30]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 64))
text_val_nonlinear = np.zeros(shape = (len(img_val), 64))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_instructions_512(torch.Tensor(np.expand_dims(img_val[i], 0))).cpu().detach().numpy()
    text_val_nonlinear[i] = txt_model_instructions_512(torch.Tensor(np.expand_dims(text_val[i], 0))).cpu().detach().numpy()

# im2title and title2im
print("Running im2instructions for dims = 512 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2instructions for dims = 512 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2instructions for dims = 512 and sample = 1000
Mean median 84.05
Recall {1: 0.028300000000000002, 5: 0.09469999999999999, 10: 0.1502}
Running im2instructions for dims = 512 and sample = 10000
Mean median 835.5
Recall {1: 0.005619999999999999, 5: 0.02037, 10: 0.033839999999999995}


### Evaluation and Ablation Studies

 We can see that dimensions = 512 has a better performance

In [None]:
# im2recipe 512
img_model_full_512 = EmbeddingNetwork(512)
img_model_full_512 = nn.DataParallel(img_model_full_512, device_ids=[1])
img_model_full_512.load_state_dict(torch.load("checkpoints/img-model-full-512-epoch-10.pth"))
img_model_full_512.to((f'cuda:{img_model_full_512.device_ids[0]}'));
img_model_full_512.eval();
txt_model_full_512 = EmbeddingNetwork(512)
txt_model_full_512 = nn.DataParallel(txt_model_full_512, device_ids=[1])
txt_model_full_512.load_state_dict(torch.load("checkpoints/txt-model-full-512-epoch-10.pth"))
txt_model_full_512.to((f'cuda:{txt_model_full_512.device_ids[0]}'));
txt_model_full_512.eval();

#im2title 512
img_model_title_512 = EmbeddingNetwork(512)
img_model_title_512 = nn.DataParallel(img_model_title_512, device_ids=[1])
img_model_title_512.load_state_dict(torch.load("checkpoints/img-model-title-512-epoch-5.pth"))
img_model_title_512.to((f'cuda:{img_model_title_512.device_ids[0]}'));
img_model_title_512.eval();
txt_model_title_512 = EmbeddingNetwork(512)
txt_model_title_512 = nn.DataParallel(txt_model_title_512, device_ids=[1])
txt_model_title_512.load_state_dict(torch.load("checkpoints/txt-model-title-512-epoch-5.pth"))
txt_model_title_512.to((f'cuda:{txt_model_title_512.device_ids[0]}'));
txt_model_title_512.eval();

#im2instructions 512
img_model_instructions_512 = EmbeddingNetwork(512)
img_model_instructions_512 = nn.DataParallel(img_model_instructions_512, device_ids=[1])
img_model_instructions_512.load_state_dict(torch.load("checkpoints/img-model-instructions-512-epoch-5.pth"))
img_model_instructions_512.to((f'cuda:{img_model_instructions_512.device_ids[0]}'));
img_model_instructions_512.eval();
txt_model_instructions_512 = EmbeddingNetwork(512)
txt_model_instructions_512 = nn.DataParallel(txt_model_instructions_512, device_ids=[1])
txt_model_instructions_512.load_state_dict(torch.load("checkpoints/txt-model-instructions-512-epoch-5.pth"))
txt_model_instructions_512.to((f'cuda:{txt_model_instructions_512.device_ids[0]}'));
txt_model_instructions_512.eval();

#im2ingredients 512
img_model_ingredients_512 = EmbeddingNetwork(512)
img_model_ingredients_512 = nn.DataParallel(img_model_ingredients_512, device_ids=[1])
img_model_ingredients_512.load_state_dict(torch.load("checkpoints/img-model-ingredients-512-epoch-5.pth"))
img_model_ingredients_512.to((f'cuda:{img_model_full_512.device_ids[0]}'));
img_model_ingredients_512.eval();
txt_model_ingredients_512 = EmbeddingNetwork(512)
txt_model_ingredients_512 = nn.DataParallel(txt_model_ingredients_512, device_ids=[1])
txt_model_ingredients_512.load_state_dict(torch.load("checkpoints/txt-model-ingredients-512-epoch-5.pth"))
txt_model_ingredients_512.to((f'cuda:{txt_model_ingredients_512.device_ids[0]}'));
txt_model_ingredients_512.eval();



In [None]:
img_test_nonlinear = np.zeros(shape = (len(img_test), 512))
text_test_nonlinear = np.zeros(shape = (len(img_test), 512))

for i in range(len(img_test)):
    img_test_nonlinear[i] = img_model_full_512(torch.Tensor(np.expand_dims(img_test[i], 0))).cpu().detach().numpy()
    text_test_nonlinear[i] = txt_model_full_512(torch.Tensor(np.expand_dims(text_test[i], 0))).cpu().detach().numpy()

# im2recipe and recipe2im
print("Running im2recipe for dims = 512 and sample = 1000")
ranker(img_test_nonlinear, text_test_nonlinear, 1000, "image")
print("Running im2recipe for dims = 512 and sample = 10000")
ranker(img_test_nonlinear, text_test_nonlinear, 10000, "image")

Running im2recipe for dims = 512 and sample = 1000
Mean median 5.2
Recall {1: 0.23480000000000004, 5: 0.5183000000000001, 10: 0.6353}
Running im2recipe for dims = 512 and sample = 10000
Mean median 43.1
Recall {1: 0.06054999999999999, 5: 0.18576, 10: 0.27418}


In [None]:
img_test_nonlinear = np.zeros(shape = (len(img_test), 512))
text_test_nonlinear = np.zeros(shape = (len(img_test), 512))

for i in range(len(img_test)):
    img_test_nonlinear[i] = img_model_title_512(torch.Tensor(np.expand_dims(img_test[i], 0))).cpu().detach().numpy()
    text_test_nonlinear[i] = txt_model_title_512(torch.Tensor(np.expand_dims(title_test[i], 0))).cpu().detach().numpy()

# im2title and title2im
print("Running im2title for dims = 512 and sample = 1000")
ranker(img_test_nonlinear, text_test_nonlinear, 1000, "image")
print("Running im2title for dims = 512 and sample = 10000")
ranker(img_test_nonlinear, text_test_nonlinear, 10000, "image")

Running im2title for dims = 512 and sample = 1000
Mean median 24.6
Recall {1: 0.12349999999999998, 5: 0.29239999999999994, 10: 0.3778}
Running im2title for dims = 512 and sample = 10000
Mean median 247.35
Recall {1: 0.03185, 5: 0.09637999999999998, 10: 0.14421}


In [None]:
img_test_nonlinear = np.zeros(shape = (len(img_test), 512))
text_test_nonlinear = np.zeros(shape = (len(img_test), 512))

for i in range(len(img_test)):
    img_test_nonlinear[i] = img_model_ingredients_512(torch.Tensor(np.expand_dims(img_test[i], 0))).cpu().detach().numpy()
    text_test_nonlinear[i] = txt_model_ingredients_512(torch.Tensor(np.expand_dims(ingredients_test[i], 0))).cpu().detach().numpy()

# im2title and title2im
print("Running im2ingredients for dims = 512 and sample = 1000")
ranker(img_test_nonlinear, text_test_nonlinear, 1000, "image")
print("Running im2ingredients for dims = 512 and sample = 10000")
ranker(img_test_nonlinear, text_test_nonlinear, 10000, "image")

Running im2ingredients for dims = 512 and sample = 1000
Mean median 15.2
Recall {1: 0.08979999999999998, 5: 0.2829, 10: 0.41550000000000004}
Running im2ingredients for dims = 512 and sample = 10000
Mean median 141.25
Recall {1: 0.014410000000000001, 5: 0.05796, 10: 0.10085999999999999}


In [None]:
img_test_nonlinear = np.zeros(shape = (len(img_test), 512))
text_test_nonlinear = np.zeros(shape = (len(img_test), 512))

for i in range(len(img_test)):
    img_test_nonlinear[i] = img_model_instructions_512(torch.Tensor(np.expand_dims(img_test[i], 0))).cpu().detach().numpy()
    text_test_nonlinear[i] = txt_model_instructions_512(torch.Tensor(np.expand_dims(instructions_test[i], 0))).cpu().detach().numpy()

# im2title and title2im
print("Running im2instructions for dims = 512 and sample = 1000")
ranker(img_test_nonlinear, text_test_nonlinear, 1000, "image")
print("Running im2instructions for dims = 512 and sample = 10000")
ranker(img_test_nonlinear, text_test_nonlinear, 10000, "image")

Running im2instructions for dims = 512 and sample = 1000
Mean median 21.85
Recall {1: 0.077, 5: 0.2365, 10: 0.35269999999999996}
Running im2instructions for dims = 512 and sample = 10000
Mean median 204.25
Recall {1: 0.012629999999999999, 5: 0.04911, 10: 0.08324000000000001}


## 2.2 Non-Linear Embeddings - Triplet Loss

In [6]:
import matplotlib.pyplot as plt
import numpy as np

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

In [7]:
!export CUDA_VISIBLE_DEVICES='0,1,2,3'

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Negative Training Data Sampling

In [9]:
indices = list(range(0, len(text_train)))
random.seed(0)
random.shuffle(indices)

In [10]:
neg_text_train = [text_train[i] for i in indices]
neg_title_train = [title_train[i] for i in indices]
neg_ingredients_train = [ingredients_train[i] for i in indices]
neg_instructions_train = [instructions_train[i] for i in indices]

### Model Creation dims = 512

In [11]:
class EmbeddingDataset(Dataset):
    def __init__(self, anchor_emb, positive_emb, negative_emb, transform=None):
        self.anchor_emb = torch.as_tensor(np.array(anchor_emb))
        self.positive_emb = torch.as_tensor(np.array(positive_emb))
        self.negative_emb = torch.as_tensor(np.array(negative_emb))
        self.transform = transform

    def __len__(self):
        return len(self.anchor_emb)

    def __getitem__(self, idx):
        return self.anchor_emb[idx], self.positive_emb[idx], self.negative_emb[idx]

In [12]:
class EmbeddingNetwork(nn.Module):
    def __init__(self, output_size, input_size=1024):
        super().__init__()

        self.layer1 = nn.Sequential(
            nn.Linear(input_size, 512),
            nn.BatchNorm1d(512),
            nn.Dropout(),
            nn.LeakyReLU()
        )
        self.layer2 = nn.Linear(512, output_size)

    def forward(self, x):
        x = self.layer1(x)
        return self.layer2(x)

In [13]:
class AverageMeter(object):
    # Utility function for timers
    
    def __init__(self):
        self.reset()
    
    def reset(self):
        self.val, self.avg, self.sum, self.count = 0, 0, 0, 0
    
    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


In [14]:
def train(train_loader, img_model, txt_model, criterion, optimizer, epoch):
    print('Starting training epoch {}'.format(epoch))
    img_model.train()
    txt_model.train()
    
    batch_time, data_time, losses = AverageMeter(), AverageMeter(), AverageMeter()
    end = time.time()
    running_loss = 0.
    last_loss = 0.
    optimizer.zero_grad()
    
    for i, (anchor_emb, positive_emb, negative_emb) in enumerate(train_loader):
    
        # Use GPU if available
        if use_gpu: 
            anchor_emb, positive_emb, negative_emb = anchor_emb.to(device), positive_emb.to(device), negative_emb.to(device)

        data_time.update(time.time() - end)

        # Run forward pass
        out_anchor_emb = img_model(anchor_emb) 
        out_positive_emb = txt_model(positive_emb)
        out_negative_emb = txt_model(negative_emb)
        loss = criterion(out_anchor_emb, out_positive_emb, out_negative_emb) 

        # Compute gradient and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_time.update(time.time() - end)
        end = time.time()

        # Print model accuracy -- in the code below
        running_loss += loss.item()
        if i % 10000 == 0:
            last_loss = running_loss / 1000 # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            running_loss = 0.
        
        if i % 10000 == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                'Time {batch_time.val} ({batch_time.avg})\t'
                'Data {data_time.val} ({data_time.avg})\t'.format(
                  epoch, i, len(train_loader), batch_time=batch_time,
                 data_time=data_time)) 

    print('Finished training epoch {}'.format(epoch))


#### im2recipe and recipe2im

In [None]:
train_dataset = EmbeddingDataset(img_train, text_train, neg_text_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False)

In [None]:
img_model = EmbeddingNetwork(512)
# img_model= nn.DataParallel(img_model, device_ids=[2,3])
img_model.to(device);

txt_model = EmbeddingNetwork(512);
# txt_model= nn.DataParallel(txt_model, device_ids=[2,3])
txt_model.to(device);

optimizer = torch.optim.Adam(list(img_model.parameters()) + list(txt_model.parameters()), lr=1e-2, weight_decay=0.0)
criterion = nn.TripletMarginLoss(margin = 1)
# criterion = nn.TripletMarginWithDistanceLoss(distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y))

In [None]:
use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to(device)
    img_model = img_model.to(device)
    txt_model = txt_model.to(device)

In [None]:
best_losses = 1e10
epochs = 10

for epoch in range(epochs):

    train(train_loader, img_model, txt_model, criterion, optimizer, epoch)
        
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model.state_dict(), 'triplet_checkpoints/img-model-full-512-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model.state_dict(), 'triplet_checkpoints/txt-model-full-512-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.0011019824743270874
Epoch: [0][0/4400]	Time 0.2401421070098877 (0.2401421070098877)	Data 0.037871360778808594 (0.037871360778808594)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 0.0
Epoch: [1][0/4400]	Time 0.00605463981628418 (0.00605463981628418)	Data 0.002034425735473633 (0.002034425735473633)	
Finished training epoch 1
Starting training epoch 2
  batch 1 loss: 7.46917724609375e-06
Epoch: [2][0/4400]	Time 0.006739377975463867 (0.006739377975463867)	Data 0.002443075180053711 (0.002443075180053711)	
Finished training epoch 2
Starting training epoch 3
  batch 1 loss: 0.0
Epoch: [3][0/4400]	Time 0.0062716007232666016 (0.0062716007232666016)	Data 0.002272367477416992 (0.002272367477416992)	
Finished training epoch 3
Starting training epoch 4
  batch 1 loss: 1.800537109375e-05
Epoch: [4][0/4400]	Time 0.006798267364501953 (0.006798267364501953)	Data 0.0027828216552734375 (0.0027828216552734375)	
Finished training epoch 4
St

#### im2title and title2im

In [None]:
title_dataset = EmbeddingDataset(img_train, title_train, neg_title_train)
title_loader = DataLoader(title_dataset, batch_size=64, shuffle=False)

In [None]:
img_model_title = EmbeddingNetwork(512)
# img_model_title = nn.DataParallel(img_model_title, device_ids=[2,3])
img_model_title.to(device);

txt_model_title = EmbeddingNetwork(512);
# txt_model_title = nn.DataParallel(txt_model_title, device_ids=[2,3])
txt_model_title.to(device);

optimizer = torch.optim.Adam(list(img_model_title.parameters()) + list(txt_model_title.parameters()), lr=1e-2, weight_decay=0.0)
criterion = nn.TripletMarginLoss(margin = 1)
# criterion = nn.TripletMarginWithDistanceLoss(distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y))


In [None]:
use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to(device)
    img_model_title = img_model_title.to(device)
    txt_model_title = txt_model_title.to(device)

In [None]:
best_losses = 1e10
epochs = 5

for epoch in range(epochs):

    train(title_loader, img_model_title, txt_model_title, criterion, optimizer, epoch)
        
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model_title.state_dict(), 'triplet_checkpoints/img-model-title-512-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_title.state_dict(), 'triplet_checkpoints/txt-model-title-512-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.0009698835015296936
Epoch: [0][0/4400]	Time 0.00953817367553711 (0.00953817367553711)	Data 0.0030028820037841797 (0.0030028820037841797)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 0.00015620994567871094
Epoch: [1][0/4400]	Time 0.005942344665527344 (0.005942344665527344)	Data 0.002160310745239258 (0.002160310745239258)	
Finished training epoch 1
Starting training epoch 2
  batch 1 loss: 0.0001813507080078125
Epoch: [2][0/4400]	Time 0.0066792964935302734 (0.0066792964935302734)	Data 0.0028858184814453125 (0.0028858184814453125)	
Finished training epoch 2
Starting training epoch 3
  batch 1 loss: 0.00028369140625
Epoch: [3][0/4400]	Time 0.006560325622558594 (0.006560325622558594)	Data 0.002418994903564453 (0.002418994903564453)	
Finished training epoch 3
Starting training epoch 4
  batch 1 loss: 0.0001792449951171875
Epoch: [4][0/4400]	Time 0.006670236587524414 (0.006670236587524414)	Data 0.0025289058685302734 (0.002528

#### im2ingredients and ingredients2im

In [None]:
ingredients_dataset = EmbeddingDataset(img_train, ingredients_train, neg_ingredients_train)
ingredients_loader = DataLoader(ingredients_dataset, batch_size=64, shuffle=False)

In [None]:
img_model_ingredients = EmbeddingNetwork(512)
# img_model_ingredients = nn.DataParallel(img_model_ingredients, device_ids=[2,3])
img_model_ingredients.to(device);

txt_model_ingredients = EmbeddingNetwork(512);
# txt_model_ingredients = nn.DataParallel(txt_model_ingredients, device_ids=[2,3])
txt_model_ingredients.to(device);

optimizer = torch.optim.Adam(list(img_model_ingredients.parameters()) + list(txt_model_ingredients.parameters()), lr=1e-2, weight_decay=0.0)
criterion = nn.TripletMarginLoss(margin = 1)
# criterion = nn.TripletMarginWithDistanceLoss(distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y))


In [None]:
use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to(device)
    img_model_ingredients = img_model_ingredients.to(device)
    txt_model_ingredients = txt_model_ingredients.to(device)

In [None]:
best_losses = 1e10
epochs = 5

for epoch in range(epochs):

    train(ingredients_loader, img_model_ingredients, txt_model_ingredients, criterion, optimizer, epoch)
        
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model_ingredients.state_dict(), 'triplet_checkpoints/img-model-ingredients-512-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_ingredients.state_dict(), 'triplet_checkpoints/txt-model-ingredients-512-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.0013143726587295532
Epoch: [0][0/4400]	Time 0.011211156845092773 (0.011211156845092773)	Data 0.00505518913269043 (0.00505518913269043)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 0.00015041542053222657
Epoch: [1][0/4400]	Time 0.00642085075378418 (0.00642085075378418)	Data 0.002285480499267578 (0.002285480499267578)	
Finished training epoch 1
Starting training epoch 2
  batch 1 loss: 0.000117706298828125
Epoch: [2][0/4400]	Time 0.006246089935302734 (0.006246089935302734)	Data 0.002184629440307617 (0.002184629440307617)	
Finished training epoch 2
Starting training epoch 3
  batch 1 loss: 0.0001165313720703125
Epoch: [3][0/4400]	Time 0.007015228271484375 (0.007015228271484375)	Data 0.002960205078125 (0.002960205078125)	
Finished training epoch 3
Starting training epoch 4
  batch 1 loss: 3.69110107421875e-05
Epoch: [4][0/4400]	Time 0.006520986557006836 (0.006520986557006836)	Data 0.002350330352783203 (0.002350330352783203

#### im2instructions and instructions2im

In [None]:
instructions_dataset = EmbeddingDataset(img_train, instructions_train, neg_instructions_train)
instructions_loader = DataLoader(instructions_dataset, batch_size=64, shuffle=False)

In [None]:
img_model_instructions = EmbeddingNetwork(512)
# img_model_instructions = nn.DataParallel(img_model_instructions, device_ids=[2,3])
img_model_instructions.to(device);

txt_model_instructions = EmbeddingNetwork(512);
# txt_model_instructions = nn.DataParallel(txt_model_instructions, device_ids=[2,3])
txt_model_instructions.to(device);

optimizer = torch.optim.Adam(list(img_model_instructions.parameters()) + list(txt_model_instructions.parameters()), lr=1e-2, weight_decay=0.0)
criterion = nn.TripletMarginLoss(margin = 1)
# criterion = nn.TripletMarginWithDistanceLoss(distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y))


In [None]:
use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to(device)
    img_model_instructions = img_model_instructions.to(device)
    txt_model_instructions = txt_model_instructions.to(device)

In [None]:
best_losses = 1e10
epochs = 5

for epoch in range(epochs):

    train(instructions_loader, img_model_instructions, txt_model_instructions, criterion, optimizer, epoch)
        
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model_instructions.state_dict(), 'triplet_checkpoints/img-model-instructions-512-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_instructions.state_dict(), 'triplet_checkpoints/txt-model-instructions-512-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.0011441240310668946
Epoch: [0][0/4400]	Time 0.24729275703430176 (0.24729275703430176)	Data 0.040459632873535156 (0.040459632873535156)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 8.026313781738282e-05
Epoch: [1][0/4400]	Time 0.006369590759277344 (0.006369590759277344)	Data 0.0022590160369873047 (0.0022590160369873047)	
Finished training epoch 1
Starting training epoch 2
  batch 1 loss: 0.00019618606567382813
Epoch: [2][0/4400]	Time 0.006463050842285156 (0.006463050842285156)	Data 0.002434968948364258 (0.002434968948364258)	
Finished training epoch 2
Starting training epoch 3
  batch 1 loss: 2.0751953125e-05
Epoch: [3][0/4400]	Time 0.006967067718505859 (0.006967067718505859)	Data 0.0028460025787353516 (0.0028460025787353516)	
Finished training epoch 3
Starting training epoch 4
  batch 1 loss: 0.00011408233642578125
Epoch: [4][0/4400]	Time 0.007019996643066406 (0.007019996643066406)	Data 0.002835988998413086 (0.00283598

### Model Creation dims = 256

#### im2recipe and recipe2im

In [None]:
train_dataset = EmbeddingDataset(img_train, text_train, neg_text_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False)

In [None]:
img_model = EmbeddingNetwork(256)
# img_model= nn.DataParallel(img_model, device_ids=[2,3])
img_model.to(device);

txt_model = EmbeddingNetwork(256);
# txt_model= nn.DataParallel(txt_model, device_ids=[2,3])
txt_model.to(device);

optimizer = torch.optim.Adam(list(img_model.parameters()) + list(txt_model.parameters()), lr=1e-2, weight_decay=0.0)
criterion = nn.TripletMarginLoss(margin = 1)
# criterion = nn.TripletMarginWithDistanceLoss(distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y))


In [None]:
use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to(device)
    img_model = img_model.to(device)
    txt_model = txt_model.to(device)

In [None]:
best_losses = 1e10
epochs = 10

for epoch in range(epochs):

    train(train_loader, img_model, txt_model, criterion, optimizer, epoch)
        
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model.state_dict(), 'triplet_checkpoints/img-model-full-256-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model.state_dict(), 'triplet_checkpoints/txt-model-full-256-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.0011071594953536987
Epoch: [0][0/4400]	Time 0.009580373764038086 (0.009580373764038086)	Data 0.004029989242553711 (0.004029989242553711)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 2.946949005126953e-05
Epoch: [1][0/4400]	Time 0.006267070770263672 (0.006267070770263672)	Data 0.0022928714752197266 (0.0022928714752197266)	
Finished training epoch 1
Starting training epoch 2
  batch 1 loss: 0.0
Epoch: [2][0/4400]	Time 0.00625157356262207 (0.00625157356262207)	Data 0.002292156219482422 (0.002292156219482422)	
Finished training epoch 2
Starting training epoch 3
  batch 1 loss: 9.822845458984375e-05
Epoch: [3][0/4400]	Time 0.005419015884399414 (0.005419015884399414)	Data 0.0016505718231201172 (0.0016505718231201172)	
Finished training epoch 3
Starting training epoch 4
  batch 1 loss: 0.0
Epoch: [4][0/4400]	Time 0.006276130676269531 (0.006276130676269531)	Data 0.002288818359375 (0.002288818359375)	
Finished training epoch 4


#### im2title and title2im

In [None]:
title_dataset = EmbeddingDataset(img_train, title_train, neg_title_train)
title_loader = DataLoader(title_dataset, batch_size=64, shuffle=False)

In [None]:
img_model_title = EmbeddingNetwork(256)
# img_model_title = nn.DataParallel(img_model_title, device_ids=[2,3])
img_model_title.to(device);

txt_model_title = EmbeddingNetwork(256);
# txt_model_title = nn.DataParallel(txt_model_title, device_ids=[2,3])
txt_model_title.to(device);

optimizer = torch.optim.Adam(list(img_model_title.parameters()) + list(txt_model_title.parameters()), lr=1e-2, weight_decay=0.0)
criterion = nn.TripletMarginLoss(margin = 1)
# criterion = nn.TripletMarginWithDistanceLoss(distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y))


In [None]:
use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to(device)
    img_model_title = img_model_title.to(device)
    txt_model_title = txt_model_title.to(device)

In [None]:
best_losses = 1e10
epochs = 5

for epoch in range(epochs):

    train(title_loader, img_model_title, txt_model_title, criterion, optimizer, epoch)
        
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model_title.state_dict(), 'triplet_checkpoints/img-model-title-256-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_title.state_dict(), 'triplet_checkpoints/txt-model-title-256-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.0011362178325653076
Epoch: [0][0/4400]	Time 0.01126861572265625 (0.01126861572265625)	Data 0.0036783218383789062 (0.0036783218383789062)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 0.00026532459259033205
Epoch: [1][0/4400]	Time 0.0063397884368896484 (0.0063397884368896484)	Data 0.002401590347290039 (0.002401590347290039)	
Finished training epoch 1
Starting training epoch 2
  batch 1 loss: 0.00017798614501953125
Epoch: [2][0/4400]	Time 0.0063893795013427734 (0.0063893795013427734)	Data 0.002305269241333008 (0.002305269241333008)	
Finished training epoch 2
Starting training epoch 3
  batch 1 loss: 0.00026576995849609376
Epoch: [3][0/4400]	Time 0.006070852279663086 (0.006070852279663086)	Data 0.002187967300415039 (0.002187967300415039)	
Finished training epoch 3
Starting training epoch 4
  batch 1 loss: 0.0003370819091796875
Epoch: [4][0/4400]	Time 0.0063018798828125 (0.0063018798828125)	Data 0.0024487972259521484 (0.002

#### im2ingredients and ingredients2im

In [None]:
ingredients_dataset = EmbeddingDataset(img_train, ingredients_train, neg_ingredients_train)
ingredients_loader = DataLoader(ingredients_dataset, batch_size=64, shuffle=False)

In [None]:
img_model_ingredients = EmbeddingNetwork(256)
# img_model_ingredients = nn.DataParallel(img_model_ingredients, device_ids=[2,3])
img_model_ingredients.to(device);

txt_model_ingredients = EmbeddingNetwork(256);
# txt_model_ingredients = nn.DataParallel(txt_model_ingredients, device_ids=[2,3])
txt_model_ingredients.to(device);

optimizer = torch.optim.Adam(list(img_model_ingredients.parameters()) + list(txt_model_ingredients.parameters()), lr=1e-2, weight_decay=0.0)
criterion = nn.TripletMarginLoss(margin = 1)
# criterion = nn.TripletMarginWithDistanceLoss(distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y))


In [None]:
use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to(device)
    img_model_ingredients = img_model_ingredients.to(device)
    txt_model_ingredients = txt_model_ingredients.to(device)

In [None]:
best_losses = 1e10
epochs = 5

for epoch in range(epochs):

    train(ingredients_loader, img_model_ingredients, txt_model_ingredients, criterion, optimizer, epoch)
        
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model_ingredients.state_dict(), 'triplet_checkpoints/img-model-ingredients-256-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_ingredients.state_dict(), 'triplet_checkpoints/txt-model-ingredients-256-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.0011087838411331177
Epoch: [0][0/4400]	Time 0.1564500331878662 (0.1564500331878662)	Data 0.012141704559326172 (0.012141704559326172)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 0.0001292104721069336
Epoch: [1][0/4400]	Time 0.0044171810150146484 (0.0044171810150146484)	Data 0.0018279552459716797 (0.0018279552459716797)	
Finished training epoch 1
Starting training epoch 2
  batch 1 loss: 3.330230712890625e-06
Epoch: [2][0/4400]	Time 0.0043370723724365234 (0.0043370723724365234)	Data 0.0017855167388916016 (0.0017855167388916016)	
Finished training epoch 2
Starting training epoch 3
  batch 1 loss: 4.06494140625e-05
Epoch: [3][0/4400]	Time 0.004402637481689453 (0.004402637481689453)	Data 0.001814126968383789 (0.001814126968383789)	
Finished training epoch 3
Starting training epoch 4
  batch 1 loss: 2.51922607421875e-05
Epoch: [4][0/4400]	Time 0.004415035247802734 (0.004415035247802734)	Data 0.001775503158569336 (0.00177550

#### im2instructions and instructions2im

In [None]:
instructions_dataset = EmbeddingDataset(img_train, instructions_train, neg_instructions_train)
instructions_loader = DataLoader(instructions_dataset, batch_size=64, shuffle=False)

In [None]:
img_model_instructions = EmbeddingNetwork(256)
# img_model_instructions = nn.DataParallel(img_model_instructions, device_ids=[2,3])
img_model_instructions.to(device);

txt_model_instructions = EmbeddingNetwork(256);
# txt_model_instructions = nn.DataParallel(txt_model_instructions, device_ids=[2,3])
txt_model_instructions.to(device);

optimizer = torch.optim.Adam(list(img_model_instructions.parameters()) + list(txt_model_instructions.parameters()), lr=1e-2, weight_decay=0.0)
criterion = nn.TripletMarginLoss(margin = 1)
# criterion = nn.TripletMarginWithDistanceLoss(distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y))


In [None]:
use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to(device)
    img_model_instructions = img_model_instructions.to(device)
    txt_model_instructions = txt_model_instructions.to(device)

In [None]:
best_losses = 1e10
epochs = 5

for epoch in range(epochs):

    train(instructions_loader, img_model_instructions, txt_model_instructions, criterion, optimizer, epoch)
        
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model_instructions.state_dict(), 'triplet_checkpoints/img-model-instructions-256-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_instructions.state_dict(), 'triplet_checkpoints/txt-model-instructions-256-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.0011826374530792237
Epoch: [0][0/4400]	Time 0.012595415115356445 (0.012595415115356445)	Data 0.0024366378784179688 (0.0024366378784179688)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 8.850574493408204e-05
Epoch: [1][0/4400]	Time 0.00675201416015625 (0.00675201416015625)	Data 0.0028083324432373047 (0.0028083324432373047)	
Finished training epoch 1
Starting training epoch 2
  batch 1 loss: 1.4438629150390626e-05
Epoch: [2][0/4400]	Time 0.006068229675292969 (0.006068229675292969)	Data 0.0022687911987304688 (0.0022687911987304688)	
Finished training epoch 2
Starting training epoch 3
  batch 1 loss: 0.0
Epoch: [3][0/4400]	Time 0.006207466125488281 (0.006207466125488281)	Data 0.0023376941680908203 (0.0023376941680908203)	
Finished training epoch 3
Starting training epoch 4
  batch 1 loss: 3.316497802734375e-05
Epoch: [4][0/4400]	Time 0.006067991256713867 (0.006067991256713867)	Data 0.002235889434814453 (0.002235889434814453

### Model Creation dims = 128

#### im2recipe and recipe2im

In [15]:
train_dataset = EmbeddingDataset(img_train, text_train, neg_text_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False)

In [16]:
img_model = EmbeddingNetwork(128)
# img_model= nn.DataParallel(img_model, device_ids=[2,3])
img_model.to(device);

txt_model = EmbeddingNetwork(128);
# txt_model= nn.DataParallel(txt_model, device_ids=[2,3])
txt_model.to(device);

optimizer = torch.optim.Adam(list(img_model.parameters()) + list(txt_model.parameters()), lr=1e-2, weight_decay=0.0)
criterion = nn.TripletMarginLoss(margin = 1)
# criterion = nn.TripletMarginWithDistanceLoss(distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y))


In [17]:
use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to(device)
    img_model = img_model.to(device)
    txt_model = txt_model.to(device)

In [18]:
best_losses = 1e10
epochs = 10

for epoch in range(epochs):

    train(train_loader, img_model, txt_model, criterion, optimizer, epoch)
        
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model.state_dict(), 'triplet_checkpoints/img-model-full-128-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model.state_dict(), 'triplet_checkpoints/txt-model-full-128-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.0010089569091796876
Epoch: [0][0/4400]	Time 0.24431228637695312 (0.24431228637695312)	Data 0.033136606216430664 (0.033136606216430664)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 1.0908126831054688e-05
Epoch: [1][0/4400]	Time 0.006811380386352539 (0.006811380386352539)	Data 0.0026102066040039062 (0.0026102066040039062)	
Finished training epoch 1
Starting training epoch 2
  batch 1 loss: 9.984779357910156e-05
Epoch: [2][0/4400]	Time 0.0066530704498291016 (0.0066530704498291016)	Data 0.0027441978454589844 (0.0027441978454589844)	
Finished training epoch 2
Starting training epoch 3
  batch 1 loss: 0.00013487625122070312
Epoch: [3][0/4400]	Time 0.0061986446380615234 (0.0061986446380615234)	Data 0.002288818359375 (0.002288818359375)	
Finished training epoch 3
Starting training epoch 4
  batch 1 loss: 0.0
Epoch: [4][0/4400]	Time 0.00616002082824707 (0.00616002082824707)	Data 0.0018663406372070312 (0.0018663406372070312)	
Fi

#### im2title and title2im

In [19]:
title_dataset = EmbeddingDataset(img_train, title_train, neg_title_train)
title_loader = DataLoader(title_dataset, batch_size=64, shuffle=False)

In [20]:
img_model_title = EmbeddingNetwork(128)
# img_model_title = nn.DataParallel(img_model_title, device_ids=[2,3])
img_model_title.to(device);

txt_model_title = EmbeddingNetwork(128);
# txt_model_title = nn.DataParallel(txt_model_title, device_ids=[2,3])
txt_model_title.to(device);

optimizer = torch.optim.Adam(list(img_model_title.parameters()) + list(txt_model_title.parameters()), lr=1e-2, weight_decay=0.0)
criterion = nn.TripletMarginLoss(margin = 1)
# criterion = nn.TripletMarginWithDistanceLoss(distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y))


In [21]:
use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to(device)
    img_model_title = img_model_title.to(device)
    txt_model_title = txt_model_title.to(device)

In [22]:
best_losses = 1e10
epochs = 5

for epoch in range(epochs):

    train(title_loader, img_model_title, txt_model_title, criterion, optimizer, epoch)
        
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model_title.state_dict(), 'triplet_checkpoints/img-model-title-128-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_title.state_dict(), 'triplet_checkpoints/txt-model-title-128-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.0010203046798706054
Epoch: [0][0/4400]	Time 0.009414196014404297 (0.009414196014404297)	Data 0.003870248794555664 (0.003870248794555664)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 0.00015114307403564454
Epoch: [1][0/4400]	Time 0.005550861358642578 (0.005550861358642578)	Data 0.0017321109771728516 (0.0017321109771728516)	
Finished training epoch 1
Starting training epoch 2
  batch 1 loss: 0.00029604148864746096
Epoch: [2][0/4400]	Time 0.006967067718505859 (0.006967067718505859)	Data 0.0029146671295166016 (0.0029146671295166016)	
Finished training epoch 2
Starting training epoch 3
  batch 1 loss: 0.0002752838134765625
Epoch: [3][0/4400]	Time 0.006146669387817383 (0.006146669387817383)	Data 0.002075672149658203 (0.002075672149658203)	
Finished training epoch 3
Starting training epoch 4
  batch 1 loss: 0.0002822723388671875
Epoch: [4][0/4400]	Time 0.006098031997680664 (0.006098031997680664)	Data 0.002319812774658203 (0.0

#### im2ingredients and ingredients2im

In [23]:
ingredients_dataset = EmbeddingDataset(img_train, ingredients_train, neg_ingredients_train)
ingredients_loader = DataLoader(ingredients_dataset, batch_size=64, shuffle=False)

In [24]:
img_model_ingredients = EmbeddingNetwork(128)
# img_model_ingredients = nn.DataParallel(img_model_ingredients, device_ids=[2,3])
img_model_ingredients.to(device);

txt_model_ingredients = EmbeddingNetwork(128);
# txt_model_ingredients = nn.DataParallel(txt_model_ingredients, device_ids=[2,3])
txt_model_ingredients.to(device);

optimizer = torch.optim.Adam(list(img_model_ingredients.parameters()) + list(txt_model_ingredients.parameters()), lr=1e-2, weight_decay=0.0)
criterion = nn.TripletMarginLoss(margin = 1)
# criterion = nn.TripletMarginWithDistanceLoss(distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y))


In [25]:
use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to(device)
    img_model_ingredients = img_model_ingredients.to(device)
    txt_model_ingredients = txt_model_ingredients.to(device)

In [26]:
best_losses = 1e10
epochs = 5

for epoch in range(epochs):

    train(ingredients_loader, img_model_ingredients, txt_model_ingredients, criterion, optimizer, epoch)
        
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model_ingredients.state_dict(), 'triplet_checkpoints/img-model-ingredients-128-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_ingredients.state_dict(), 'triplet_checkpoints/txt-model-ingredients-128-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.0011006090641021728
Epoch: [0][0/4400]	Time 0.009591341018676758 (0.009591341018676758)	Data 0.0042994022369384766 (0.0042994022369384766)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 0.00011099052429199219
Epoch: [1][0/4400]	Time 0.006002664566040039 (0.006002664566040039)	Data 0.0017342567443847656 (0.0017342567443847656)	
Finished training epoch 1
Starting training epoch 2
  batch 1 loss: 3.583908081054687e-05
Epoch: [2][0/4400]	Time 0.005978822708129883 (0.005978822708129883)	Data 0.00231170654296875 (0.00231170654296875)	
Finished training epoch 2
Starting training epoch 3
  batch 1 loss: 4.360580444335937e-05
Epoch: [3][0/4400]	Time 0.008204460144042969 (0.008204460144042969)	Data 0.002305746078491211 (0.002305746078491211)	
Finished training epoch 3
Starting training epoch 4
  batch 1 loss: 7.928848266601563e-05
Epoch: [4][0/4400]	Time 0.0068166255950927734 (0.0068166255950927734)	Data 0.002844095230102539 (0.00

#### im2instructions and instructions2im

In [15]:
instructions_dataset = EmbeddingDataset(img_train, instructions_train, neg_instructions_train)
instructions_loader = DataLoader(instructions_dataset, batch_size=64, shuffle=False)

In [16]:
img_model_instructions = EmbeddingNetwork(128)
# img_model_instructions = nn.DataParallel(img_model_instructions, device_ids=[2,3])
img_model_instructions.to(device);

txt_model_instructions = EmbeddingNetwork(128);
# txt_model_instructions = nn.DataParallel(txt_model_instructions, device_ids=[2,3])
txt_model_instructions.to(device);

optimizer = torch.optim.Adam(list(img_model_instructions.parameters()) + list(txt_model_instructions.parameters()), lr=1e-2, weight_decay=0.0)
criterion = nn.TripletMarginLoss(margin = 1)
# criterion = nn.TripletMarginWithDistanceLoss(distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y))


In [17]:
use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to(device)
    img_model_instructions = img_model_instructions.to(device)
    txt_model_instructions = txt_model_instructions.to(device)

In [18]:
best_losses = 1e10
epochs = 5

for epoch in range(epochs):

    train(instructions_loader, img_model_instructions, txt_model_instructions, criterion, optimizer, epoch)
        
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model_instructions.state_dict(), 'triplet_checkpoints/img-model-instructions-128-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_instructions.state_dict(), 'triplet_checkpoints/txt-model-instructions-128-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.0009684301614761352
Epoch: [0][0/4400]	Time 0.24962973594665527 (0.24962973594665527)	Data 0.03155255317687988 (0.03155255317687988)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 0.00011184978485107422
Epoch: [1][0/4400]	Time 0.0062177181243896484 (0.0062177181243896484)	Data 0.0022830963134765625 (0.0022830963134765625)	
Finished training epoch 1
Starting training epoch 2
  batch 1 loss: 3.931236267089844e-05
Epoch: [2][0/4400]	Time 0.006750583648681641 (0.006750583648681641)	Data 0.002492189407348633 (0.002492189407348633)	
Finished training epoch 2
Starting training epoch 3
  batch 1 loss: 0.00016136932373046875
Epoch: [3][0/4400]	Time 0.0062749385833740234 (0.0062749385833740234)	Data 0.002020597457885742 (0.002020597457885742)	
Finished training epoch 3
Starting training epoch 4
  batch 1 loss: 0.000120208740234375
Epoch: [4][0/4400]	Time 0.006830930709838867 (0.006830930709838867)	Data 0.0025472640991210938 (0.002

### Model Creation dims = 64

#### im2recipe and recipe2im

In [24]:
train_dataset = EmbeddingDataset(img_train, text_train, neg_text_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False)

In [25]:
img_model = EmbeddingNetwork(64)
# img_model= nn.DataParallel(img_model, device_ids=[2,3])
img_model.to(device);

txt_model = EmbeddingNetwork(64);
# txt_model= nn.DataParallel(txt_model, device_ids=[2,3])
txt_model.to(device);

optimizer = torch.optim.Adam(list(img_model.parameters()) + list(txt_model.parameters()), lr=1e-2, weight_decay=0.0)
criterion = nn.TripletMarginLoss(margin = 1)
# criterion = nn.TripletMarginWithDistanceLoss(distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y))


In [26]:
use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to(device)
    img_model = img_model.to(device)
    txt_model = txt_model.to(device)

In [27]:
best_losses = 1e10
epochs = 10

for epoch in range(epochs):

    train(train_loader, img_model, txt_model, criterion, optimizer, epoch)
        
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model.state_dict(), 'triplet_checkpoints/img-model-full-64-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model.state_dict(), 'triplet_checkpoints/txt-model-full-64-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.0011592025756835937
Epoch: [0][0/4400]	Time 0.016534090042114258 (0.016534090042114258)	Data 0.00530552864074707 (0.00530552864074707)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 4.945969581604004e-05
Epoch: [1][0/4400]	Time 0.009243965148925781 (0.009243965148925781)	Data 0.002117633819580078 (0.002117633819580078)	
Finished training epoch 1
Starting training epoch 2
  batch 1 loss: 4.458427429199219e-06
Epoch: [2][0/4400]	Time 0.006767988204956055 (0.006767988204956055)	Data 0.002257108688354492 (0.002257108688354492)	
Finished training epoch 2
Starting training epoch 3
  batch 1 loss: 0.0
Epoch: [3][0/4400]	Time 0.0064907073974609375 (0.0064907073974609375)	Data 0.0022830963134765625 (0.0022830963134765625)	
Finished training epoch 3
Starting training epoch 4
  batch 1 loss: 0.0
Epoch: [4][0/4400]	Time 0.006551265716552734 (0.006551265716552734)	Data 0.0025415420532226562 (0.0025415420532226562)	
Finished training 

#### im2title and title2im

In [28]:
title_dataset = EmbeddingDataset(img_train, title_train, neg_title_train)
title_loader = DataLoader(title_dataset, batch_size=64, shuffle=False)

In [29]:
img_model_title = EmbeddingNetwork(64)
# img_model_title = nn.DataParallel(img_model_title, device_ids=[2,3])
img_model_title.to(device);

txt_model_title = EmbeddingNetwork(64);
# txt_model_title = nn.DataParallel(txt_model_title, device_ids=[2,3])
txt_model_title.to(device);

optimizer = torch.optim.Adam(list(img_model_title.parameters()) + list(txt_model_title.parameters()), lr=1e-2, weight_decay=0.0)
criterion = nn.TripletMarginLoss(margin = 1)
# criterion = nn.TripletMarginWithDistanceLoss(distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y))


In [30]:
use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to(device)
    img_model_title = img_model_title.to(device)
    txt_model_title = txt_model_title.to(device)

In [31]:
best_losses = 1e10
epochs = 5

for epoch in range(epochs):

    train(title_loader, img_model_title, txt_model_title, criterion, optimizer, epoch)
        
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model_title.state_dict(), 'triplet_checkpoints/img-model-title-64-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_title.state_dict(), 'triplet_checkpoints/txt-model-title-64-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.0009304186701774597
Epoch: [0][0/4400]	Time 0.011830329895019531 (0.011830329895019531)	Data 0.0037920475006103516 (0.0037920475006103516)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 0.00020077419281005858
Epoch: [1][0/4400]	Time 0.006789207458496094 (0.006789207458496094)	Data 0.002667665481567383 (0.002667665481567383)	
Finished training epoch 1
Starting training epoch 2
  batch 1 loss: 0.00010777568817138671
Epoch: [2][0/4400]	Time 0.0063877105712890625 (0.0063877105712890625)	Data 0.0018839836120605469 (0.0018839836120605469)	
Finished training epoch 2
Starting training epoch 3
  batch 1 loss: 0.00017299652099609376
Epoch: [3][0/4400]	Time 0.007192373275756836 (0.007192373275756836)	Data 0.0028738975524902344 (0.0028738975524902344)	
Finished training epoch 3
Starting training epoch 4
  batch 1 loss: 7.336044311523438e-05
Epoch: [4][0/4400]	Time 0.007213115692138672 (0.007213115692138672)	Data 0.002979040145874023

#### im2ingredients and ingredients2im

In [15]:
ingredients_dataset = EmbeddingDataset(img_train, ingredients_train, neg_ingredients_train)
ingredients_loader = DataLoader(ingredients_dataset, batch_size=64, shuffle=False)

In [16]:
img_model_ingredients = EmbeddingNetwork(64)
# img_model_ingredients = nn.DataParallel(img_model_ingredients, device_ids=[2,3])
img_model_ingredients.to(device);

txt_model_ingredients = EmbeddingNetwork(64);
# txt_model_ingredients = nn.DataParallel(txt_model_ingredients, device_ids=[2,3])
txt_model_ingredients.to(device);

optimizer = torch.optim.Adam(list(img_model_ingredients.parameters()) + list(txt_model_ingredients.parameters()), lr=1e-2, weight_decay=0.0)
criterion = nn.TripletMarginLoss(margin = 1)
# criterion = nn.TripletMarginWithDistanceLoss(distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y))


In [17]:
use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to(device)
    img_model_ingredients = img_model_ingredients.to(device)
    txt_model_ingredients = txt_model_ingredients.to(device)

In [18]:
best_losses = 1e10
epochs = 5

for epoch in range(epochs):

    train(ingredients_loader, img_model_ingredients, txt_model_ingredients, criterion, optimizer, epoch)
        
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model_ingredients.state_dict(), 'triplet_checkpoints/img-model-ingredients-64-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_ingredients.state_dict(), 'triplet_checkpoints/txt-model-ingredients-64-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.0010432976484298707
Epoch: [0][0/4400]	Time 0.23984885215759277 (0.23984885215759277)	Data 0.04573535919189453 (0.04573535919189453)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 4.1985034942626956e-05
Epoch: [1][0/4400]	Time 0.007517814636230469 (0.007517814636230469)	Data 0.0033102035522460938 (0.0033102035522460938)	
Finished training epoch 1
Starting training epoch 2
  batch 1 loss: 4.564476013183594e-05
Epoch: [2][0/4400]	Time 0.006379842758178711 (0.006379842758178711)	Data 0.0022394657135009766 (0.0022394657135009766)	
Finished training epoch 2
Starting training epoch 3
  batch 1 loss: 0.000120880126953125
Epoch: [3][0/4400]	Time 0.00734710693359375 (0.00734710693359375)	Data 0.003060579299926758 (0.003060579299926758)	
Finished training epoch 3
Starting training epoch 4
  batch 1 loss: 7.970809936523437e-05
Epoch: [4][0/4400]	Time 0.006373167037963867 (0.006373167037963867)	Data 0.0023865699768066406 (0.00238656

#### im2instructions and instructions2im

In [19]:
instructions_dataset = EmbeddingDataset(img_train, instructions_train, neg_instructions_train)
instructions_loader = DataLoader(instructions_dataset, batch_size=64, shuffle=False)

In [20]:
img_model_instructions = EmbeddingNetwork(64)
# img_model_instructions = nn.DataParallel(img_model_instructions, device_ids=[2,3])
img_model_instructions.to(device);

txt_model_instructions = EmbeddingNetwork(64);
# txt_model_instructions = nn.DataParallel(txt_model_instructions, device_ids=[2,3])
txt_model_instructions.to(device);

optimizer = torch.optim.Adam(list(img_model_instructions.parameters()) + list(txt_model_instructions.parameters()), lr=1e-2, weight_decay=0.0)
criterion = nn.TripletMarginLoss(margin = 1)
# criterion = nn.TripletMarginWithDistanceLoss(distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y))


In [21]:
use_gpu = torch.cuda.is_available()
# use_gpu = False
if use_gpu: 
    criterion = criterion.to(device)
    img_model_instructions = img_model_instructions.to(device)
    txt_model_instructions = txt_model_instructions.to(device)

In [22]:
best_losses = 1e10
epochs = 5

for epoch in range(epochs):

    train(instructions_loader, img_model_instructions, txt_model_instructions, criterion, optimizer, epoch)
        
  # Save checkpoint and replace old best model if current model is betterabs
torch.save(img_model_instructions.state_dict(), 'triplet_checkpoints/img-model-instructions-64-epoch-{}.pth'.format(epoch+1))
torch.save(txt_model_instructions.state_dict(), 'triplet_checkpoints/txt-model-instructions-64-epoch-{}.pth'.format(epoch+1))

Starting training epoch 0
  batch 1 loss: 0.0008711038827896118
Epoch: [0][0/4400]	Time 0.010983467102050781 (0.010983467102050781)	Data 0.004624366760253906 (0.004624366760253906)	
Finished training epoch 0
Starting training epoch 1
  batch 1 loss: 8.974552154541016e-05
Epoch: [1][0/4400]	Time 0.006373405456542969 (0.006373405456542969)	Data 0.002432584762573242 (0.002432584762573242)	
Finished training epoch 1
Starting training epoch 2
  batch 1 loss: 9.349822998046876e-05
Epoch: [2][0/4400]	Time 0.0062749385833740234 (0.0062749385833740234)	Data 0.0023622512817382812 (0.0023622512817382812)	
Finished training epoch 2
Starting training epoch 3
  batch 1 loss: 8.623123168945313e-05
Epoch: [3][0/4400]	Time 0.005517482757568359 (0.005517482757568359)	Data 0.0014796257019042969 (0.0014796257019042969)	
Finished training epoch 3
Starting training epoch 4
  batch 1 loss: 6.567001342773437e-05
Epoch: [4][0/4400]	Time 0.00663447380065918 (0.00663447380065918)	Data 0.002488851547241211 (0.002

### Dimensional Analysis. dims = 64 / 128 / 256 / 512

#### 512

In [None]:
# im2recipe 512
img_model_full_512 = EmbeddingNetwork(512)
# img_model_full_512 = nn.DataParallel(img_model_full_512, device_ids=[1])
img_model_full_512.load_state_dict(torch.load("triplet_checkpoints/img-model-full-512-epoch-10.pth"))
# img_model_full_512.to((f'cuda:{img_model_full_512.device_ids[0]}'));
img_model_full_512.to('cpu')
img_model_full_512.eval();
txt_model_full_512 = EmbeddingNetwork(512)
# txt_model_full_512 = nn.DataParallel(txt_model_full_512, device_ids=[1])
txt_model_full_512.load_state_dict(torch.load("triplet_checkpoints/txt-model-full-512-epoch-10.pth"))
# txt_model_full_512.to((f'cuda:{txt_model_full_512.device_ids[0]}'));
txt_model_full_512.to('cpu')
txt_model_full_512.eval();

# im2title 512
img_model_title_512 = EmbeddingNetwork(512)
# img_model_title_512 = nn.DataParallel(img_model_title_512, device_ids=[1])
img_model_title_512.load_state_dict(torch.load("triplet_checkpoints/img-model-title-512-epoch-5.pth"))
# img_model_title_512.to((f'cuda:{img_model_title_512.device_ids[0]}'));
img_model_title_512.to('cpu')
img_model_title_512.eval();
txt_model_title_512 = EmbeddingNetwork(512)
# txt_model_title_512 = nn.DataParallel(txt_model_title_512, device_ids=[1])
txt_model_title_512.load_state_dict(torch.load("triplet_checkpoints/txt-model-title-512-epoch-5.pth"))
# txt_model_title_512.to((f'cuda:{txt_model_title_512.device_ids[0]}'));
txt_model_title_512.to('cpu')
txt_model_title_512.eval();

# im2instructions 512
img_model_instructions_512 = EmbeddingNetwork(512)
# img_model_instructions_512 = nn.DataParallel(img_model_instructions_512, device_ids=[1])
img_model_instructions_512.load_state_dict(torch.load("triplet_checkpoints/img-model-instructions-512-epoch-5.pth"))
# img_model_instructions_512.to((f'cuda:{img_model_instructions_512.device_ids[0]}'));
img_model_instructions_512.to('cpu')
img_model_instructions_512.eval();
txt_model_instructions_512 = EmbeddingNetwork(512)
# txt_model_instructions_512 = nn.DataParallel(txt_model_instructions_512, device_ids=[1])
txt_model_instructions_512.load_state_dict(torch.load("triplet_checkpoints/txt-model-instructions-512-epoch-5.pth"))
# txt_model_instructions_512.to((f'cuda:{txt_model_instructions_512.device_ids[0]}'));
txt_model_instructions_512.to('cpu')
txt_model_instructions_512.eval();

#im2ingredients 512
img_model_ingredients_512 = EmbeddingNetwork(512)
# img_model_ingredients_512 = nn.DataParallel(img_model_ingredients_512, device_ids=[1])
img_model_ingredients_512.load_state_dict(torch.load("triplet_checkpoints/img-model-ingredients-512-epoch-5.pth"))
# img_model_ingredients_512.to((f'cuda:{img_model_full_512.device_ids[0]}'));
img_model_ingredients_512.to('cpu')
img_model_ingredients_512.eval();
txt_model_ingredients_512 = EmbeddingNetwork(512)
# txt_model_ingredients_512 = nn.DataParallel(txt_model_ingredients_512, device_ids=[1])
txt_model_ingredients_512.load_state_dict(torch.load("triplet_checkpoints/txt-model-ingredients-512-epoch-5.pth"))
# txt_model_ingredients_512.to((f'cuda:{txt_model_ingredients_512.device_ids[0]}'));
txt_model_ingredients_512.to('cpu')
txt_model_ingredients_512.eval();



In [None]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 512))
text_val_nonlinear = np.zeros(shape = (len(img_val), 512))

# img_val_nonlinear.to(device)
# text_val_nonlinear.to(device)

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_full_512(torch.Tensor(np.expand_dims(img_val[i], 0))).detach().numpy()
    text_val_nonlinear[i] = txt_model_full_512(torch.Tensor(np.expand_dims(text_val[i], 0))).detach().numpy()

# im2recipe and recipe2im
print("Running im2recipe for dims = 512 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2recipe for dims = 512 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2recipe for dims = 512 and sample = 1000
Mean median 2.4
Recall {1: 0.3538, 5: 0.6819, 10: 0.7863000000000001}
Running im2recipe for dims = 512 and sample = 10000
Mean median 15.9
Recall {1: 0.10740000000000001, 5: 0.3032, 10: 0.41979}


In [None]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 512))
text_val_nonlinear = np.zeros(shape = (len(img_val), 512))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_title_512(torch.Tensor(np.expand_dims(img_val[i], 0))).detach().numpy()
    text_val_nonlinear[i] = txt_model_title_512(torch.Tensor(np.expand_dims(text_val[i], 0))).detach().numpy()

# im2title and title2im
print("Running im2title for dims = 512 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2title for dims = 512 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2title for dims = 512 and sample = 1000
Mean median 5.1
Recall {1: 0.20360000000000006, 5: 0.5205, 10: 0.6624000000000001}
Running im2title for dims = 512 and sample = 10000
Mean median 42.3
Recall {1: 0.043919999999999994, 5: 0.15214, 10: 0.23866}


In [None]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 512))
text_val_nonlinear = np.zeros(shape = (len(img_val), 512))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_ingredients_512(torch.Tensor(np.expand_dims(img_val[i], 0))).detach().numpy()
    text_val_nonlinear[i] = txt_model_ingredients_512(torch.Tensor(np.expand_dims(text_val[i], 0))).detach().numpy()

# im2title and title2im
print("Running im2ingredients for dims = 512 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2ingredients for dims = 512 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2ingredients for dims = 512 and sample = 1000
Mean median 4.1
Recall {1: 0.263, 5: 0.5714999999999999, 10: 0.6995000000000001}
Running im2ingredients for dims = 512 and sample = 10000
Mean median 31.0
Recall {1: 0.06504, 5: 0.20672000000000001, 10: 0.30518999999999996}


In [None]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 512))
text_val_nonlinear = np.zeros(shape = (len(img_val), 512))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_instructions_512(torch.Tensor(np.expand_dims(img_val[i], 0))).detach().numpy()
    text_val_nonlinear[i] = txt_model_instructions_512(torch.Tensor(np.expand_dims(text_val[i], 0))).detach().numpy()

# im2title and title2im
print("Running im2instructions for dims = 512 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2instructions for dims = 512 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2instructions for dims = 512 and sample = 1000
Mean median 3.0
Recall {1: 0.3229, 5: 0.6366999999999999, 10: 0.7559}
Running im2instructions for dims = 512 and sample = 10000
Mean median 22.1
Recall {1: 0.08642999999999999, 5: 0.25167, 10: 0.36322000000000004}


#### 256

In [None]:

# im2recipe 256
img_model_full_256 = EmbeddingNetwork(256)
# img_model_full_256 = nn.DataParallel(img_model_full_256, device_ids=[1])
img_model_full_256.load_state_dict(torch.load("triplet_checkpoints/img-model-full-256-epoch-10.pth"))
# img_model_full_256.to((f'cuda:{img_model_full_256.device_ids[0]}'));
img_model_full_256.to('cpu')
img_model_full_256.eval();
txt_model_full_256 = EmbeddingNetwork(256)
# txt_model_full_256 = nn.DataParallel(txt_model_full_256, device_ids=[1])
txt_model_full_256.load_state_dict(torch.load("triplet_checkpoints/txt-model-full-256-epoch-10.pth"))
# txt_model_full_256.to((f'cuda:{txt_model_full_256.device_ids[0]}'));
txt_model_full_256.to('cpu')
txt_model_full_256.eval();

#im2title 256
img_model_title_256 = EmbeddingNetwork(256)
# img_model_title_256 = nn.DataParallel(img_model_title_256, device_ids=[1])
img_model_title_256.load_state_dict(torch.load("triplet_checkpoints/img-model-title-256-epoch-5.pth"))
# img_model_title_256.to((f'cuda:{img_model_title_256.device_ids[0]}'));
img_model_title_256.to('cpu')
img_model_title_256.eval();
txt_model_title_256 = EmbeddingNetwork(256)
# txt_model_title_256 = nn.DataParallel(txt_model_title_256, device_ids=[1])
txt_model_title_256.load_state_dict(torch.load("triplet_checkpoints/txt-model-title-256-epoch-5.pth"))
# txt_model_title_256.to((f'cuda:{txt_model_title_256.device_ids[0]}'));
txt_model_title_256.to('cpu')
txt_model_title_256.eval();

#im2instructions 256
img_model_instructions_256 = EmbeddingNetwork(256)
# img_model_instructions_256 = nn.DataParallel(img_model_instructions_256, device_ids=[1])
img_model_instructions_256.load_state_dict(torch.load("triplet_checkpoints/img-model-instructions-256-epoch-5.pth"))
# img_model_instructions_256.to((f'cuda:{img_model_instructions_256.device_ids[0]}'));
img_model_instructions_256.to('cpu')
img_model_instructions_256.eval();
txt_model_instructions_256 = EmbeddingNetwork(256)
# txt_model_instructions_256 = nn.DataParallel(txt_model_instructions_256, device_ids=[1])
txt_model_instructions_256.load_state_dict(torch.load("triplet_checkpoints/txt-model-instructions-256-epoch-5.pth"))
# txt_model_instructions_256.to((f'cuda:{txt_model_instructions_256.device_ids[0]}'));
txt_model_instructions_256.to('cpu')
txt_model_instructions_256.eval();

#im2ingredients 256
img_model_ingredients_256 = EmbeddingNetwork(256)
# img_model_ingredients_256 = nn.DataParallel(img_model_ingredients_256, device_ids=[1])
img_model_ingredients_256.load_state_dict(torch.load("triplet_checkpoints/img-model-ingredients-256-epoch-5.pth"))
# img_model_ingredients_256.to((f'cuda:{img_model_full_256.device_ids[0]}'));
img_model_ingredients_256.to('cpu')
img_model_ingredients_256.eval();
txt_model_ingredients_256 = EmbeddingNetwork(256)
# txt_model_ingredients_256 = nn.DataParallel(txt_model_ingredients_256, device_ids=[1])
txt_model_ingredients_256.load_state_dict(torch.load("triplet_checkpoints/txt-model-ingredients-256-epoch-5.pth"))
# txt_model_ingredients_256.to((f'cuda:{txt_model_ingredients_256.device_ids[0]}'));
txt_model_ingredients_256.to('cpu')
txt_model_ingredients_256.eval();

In [None]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 256))
text_val_nonlinear = np.zeros(shape = (len(img_val), 256))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_full_256(torch.Tensor(np.expand_dims(img_val[i], 0))).detach().numpy()
    text_val_nonlinear[i] = txt_model_full_256(torch.Tensor(np.expand_dims(text_val[i], 0))).detach().numpy()

# im2recipe and recipe2im
print("Running im2recipe for dims = 256 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2recipe for dims = 256 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2recipe for dims = 256 and sample = 1000
Mean median 2.1
Recall {1: 0.3718, 5: 0.697, 10: 0.7957}
Running im2recipe for dims = 256 and sample = 10000
Mean median 15.0
Recall {1: 0.11236000000000002, 5: 0.31418, 10: 0.43373999999999996}


In [None]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 256))
text_val_nonlinear = np.zeros(shape = (len(img_val), 256))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_title_256(torch.Tensor(np.expand_dims(img_val[i], 0))).detach().numpy()
    text_val_nonlinear[i] = txt_model_title_256(torch.Tensor(np.expand_dims(text_val[i], 0))).detach().numpy()

# im2title and title2im
print("Running im2title for dims = 256 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2title for dims = 256 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2title for dims = 256 and sample = 1000
Mean median 4.9
Recall {1: 0.2165, 5: 0.5333, 10: 0.6711}
Running im2title for dims = 256 and sample = 10000
Mean median 39.5
Recall {1: 0.04725999999999999, 5: 0.16085, 10: 0.24957}


In [None]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 256))
text_val_nonlinear = np.zeros(shape = (len(img_val), 256))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_ingredients_256(torch.Tensor(np.expand_dims(img_val[i], 0))).detach().numpy()
    text_val_nonlinear[i] = txt_model_ingredients_256(torch.Tensor(np.expand_dims(text_val[i], 0))).detach().numpy()

# im2ingredients and ingredients2im
print("Running im2ingredients for dims = 256 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2ingredients for dims = 256 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2ingredients for dims = 256 and sample = 1000
Mean median 3.9
Recall {1: 0.2761, 5: 0.5915, 10: 0.7167000000000001}
Running im2ingredients for dims = 256 and sample = 10000
Mean median 28.5
Recall {1: 0.0715, 5: 0.22025999999999998, 10: 0.32059000000000004}


In [None]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 256))
text_val_nonlinear = np.zeros(shape = (len(img_val), 256))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_instructions_256(torch.Tensor(np.expand_dims(img_val[i], 0))).detach().numpy()
    text_val_nonlinear[i] = txt_model_instructions_256(torch.Tensor(np.expand_dims(text_val[i], 0))).detach().numpy()

# im2instructions and instructions2im
print("Running im2instructions for dims = 256 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2instructions for dims = 256 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2instructions for dims = 256 and sample = 1000
Mean median 3.2
Recall {1: 0.3028, 5: 0.6179, 10: 0.74}
Running im2instructions for dims = 256 and sample = 10000
Mean median 24.7
Recall {1: 0.08179, 5: 0.24386000000000002, 10: 0.34886}


#### 128

In [19]:
# im2recipe 128
img_model_full_128 = EmbeddingNetwork(128)
# img_model_full_128 = nn.DataParallel(img_model_full_128, device_ids=[1])
img_model_full_128.load_state_dict(torch.load("triplet_checkpoints/img-model-full-128-epoch-10.pth"))
# img_model_full_128.to((f'cuda:{img_model_full_128.device_ids[0]}'));
img_model_full_128.to('cpu')
img_model_full_128.eval();
txt_model_full_128 = EmbeddingNetwork(128)
# txt_model_full_128 = nn.DataParallel(txt_model_full_128, device_ids=[1])
txt_model_full_128.load_state_dict(torch.load("triplet_checkpoints/txt-model-full-128-epoch-10.pth"))
# txt_model_full_128.to((f'cuda:{txt_model_full_128.device_ids[0]}'));
txt_model_full_128.to('cpu')
txt_model_full_128.eval();

#im2title 128
img_model_title_128 = EmbeddingNetwork(128)
# img_model_title_128 = nn.DataParallel(img_model_title_128, device_ids=[1])
img_model_title_128.load_state_dict(torch.load("triplet_checkpoints/img-model-title-128-epoch-5.pth"))
# img_model_title_128.to((f'cuda:{img_model_title_128.device_ids[0]}'));
img_model_title_128.to('cpu')
img_model_title_128.eval();
txt_model_title_128 = EmbeddingNetwork(128)
# txt_model_title_128 = nn.DataParallel(txt_model_title_128, device_ids=[1])
txt_model_title_128.load_state_dict(torch.load("triplet_checkpoints/txt-model-title-128-epoch-5.pth"))
# txt_model_title_128.to((f'cuda:{txt_model_title_128.device_ids[0]}'));
txt_model_title_128.to('cpu')
txt_model_title_128.eval();

#im2instructions 128
img_model_instructions_128 = EmbeddingNetwork(128)
# img_model_instructions_128 = nn.DataParallel(img_model_instructions_128, device_ids=[1])
img_model_instructions_128.load_state_dict(torch.load("triplet_checkpoints/img-model-instructions-128-epoch-5.pth"))
# img_model_instructions_128.to((f'cuda:{img_model_instructions_128.device_ids[0]}'));
img_model_instructions_128.to('cpu')
img_model_instructions_128.eval();
txt_model_instructions_128 = EmbeddingNetwork(128)
# txt_model_instructions_128 = nn.DataParallel(txt_model_instructions_128, device_ids=[1])
txt_model_instructions_128.load_state_dict(torch.load("triplet_checkpoints/txt-model-instructions-128-epoch-5.pth"))
# txt_model_instructions_128.to((f'cuda:{txt_model_instructions_128.device_ids[0]}'));
txt_model_instructions_128.to('cpu')
txt_model_instructions_128.eval();

#im2ingredients 128
img_model_ingredients_128 = EmbeddingNetwork(128)
# img_model_ingredients_128 = nn.DataParallel(img_model_ingredients_128, device_ids=[1])
img_model_ingredients_128.load_state_dict(torch.load("triplet_checkpoints/img-model-ingredients-128-epoch-5.pth"))
# img_model_ingredients_128.to((f'cuda:{img_model_full_128.device_ids[0]}'));
img_model_ingredients_128.to('cpu')
img_model_ingredients_128.eval();
txt_model_ingredients_128 = EmbeddingNetwork(128)
# txt_model_ingredients_128 = nn.DataParallel(txt_model_ingredients_128, device_ids=[1])
txt_model_ingredients_128.load_state_dict(torch.load("triplet_checkpoints/txt-model-ingredients-128-epoch-5.pth"))
# txt_model_ingredients_128.to((f'cuda:{txt_model_ingredients_128.device_ids[0]}'));
txt_model_ingredients_128.to('cpu')
txt_model_ingredients_128.eval();

In [20]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 128))
text_val_nonlinear = np.zeros(shape = (len(img_val), 128))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_full_128(torch.Tensor(np.expand_dims(img_val[i], 0))).detach().numpy()
    text_val_nonlinear[i] = txt_model_full_128(torch.Tensor(np.expand_dims(text_val[i], 0))).detach().numpy()

# im2recipe and recipe2im
print("Running im2recipe for dims = 128 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2recipe for dims = 128 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2recipe for dims = 128 and sample = 1000
Mean median 2.0
Recall {1: 0.3781, 5: 0.6936, 10: 0.797}
Running im2recipe for dims = 128 and sample = 10000
Mean median 13.9
Recall {1: 0.12241000000000002, 5: 0.32748000000000005, 10: 0.44809}


In [21]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 128))
text_val_nonlinear = np.zeros(shape = (len(img_val), 128))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_title_128(torch.Tensor(np.expand_dims(img_val[i], 0))).detach().numpy()
    text_val_nonlinear[i] = txt_model_title_128(torch.Tensor(np.expand_dims(text_val[i], 0))).detach().numpy()

# im2title and title2im
print("Running im2title for dims = 128 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2title for dims = 128 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2title for dims = 128 and sample = 1000
Mean median 5.1
Recall {1: 0.21220000000000003, 5: 0.5246000000000001, 10: 0.6697}
Running im2title for dims = 128 and sample = 10000
Mean median 41.6
Recall {1: 0.04628, 5: 0.15596, 10: 0.24558}


In [22]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 128))
text_val_nonlinear = np.zeros(shape = (len(img_val), 128))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_ingredients_128(torch.Tensor(np.expand_dims(img_val[i], 0))).detach().numpy()
    text_val_nonlinear[i] = txt_model_ingredients_128(torch.Tensor(np.expand_dims(text_val[i], 0))).detach().numpy()

# im2ingredients and ingredients2im
print("Running im2ingredients for dims = 128 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2ingredients for dims = 128 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2ingredients for dims = 128 and sample = 1000
Mean median 3.85
Recall {1: 0.26820000000000005, 5: 0.5885, 10: 0.7089000000000001}
Running im2ingredients for dims = 128 and sample = 10000
Mean median 29.0
Recall {1: 0.06841, 5: 0.21638000000000002, 10: 0.31759000000000004}


In [23]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 128))
text_val_nonlinear = np.zeros(shape = (len(img_val), 128))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_instructions_128(torch.Tensor(np.expand_dims(img_val[i], 0))).detach().numpy()
    text_val_nonlinear[i] = txt_model_instructions_128(torch.Tensor(np.expand_dims(text_val[i], 0))).detach().numpy()

# im2instructions and instructions2im
print("Running im2instructions for dims = 128 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2instructions for dims = 128 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2instructions for dims = 128 and sample = 1000
Mean median 3.0
Recall {1: 0.32460000000000006, 5: 0.6363, 10: 0.7506}
Running im2instructions for dims = 128 and sample = 10000
Mean median 21.8
Recall {1: 0.08918000000000001, 5: 0.25699000000000005, 10: 0.36541}


#### 64

In [23]:
# im2recipe 64
img_model_full_64 = EmbeddingNetwork(64)
# img_model_full_64 = nn.DataParallel(img_model_full_64, device_ids=[1])
img_model_full_64.load_state_dict(torch.load("triplet_checkpoints/img-model-full-64-epoch-10.pth"))
# img_model_full_64.to((f'cuda:{img_model_full_64.device_ids[0]}'));
img_model_full_64.to('cpu')
img_model_full_64.eval();
txt_model_full_64 = EmbeddingNetwork(64)
# txt_model_full_64 = nn.DataParallel(txt_model_full_64, device_ids=[1])
txt_model_full_64.load_state_dict(torch.load("triplet_checkpoints/txt-model-full-64-epoch-10.pth"))
# txt_model_full_64.to((f'cuda:{txt_model_full_64.device_ids[0]}'));
txt_model_full_64.to('cpu')
txt_model_full_64.eval();

#im2title 64
img_model_title_64 = EmbeddingNetwork(64)
# img_model_title_64 = nn.DataParallel(img_model_title_64, device_ids=[1])
img_model_title_64.load_state_dict(torch.load("triplet_checkpoints/img-model-title-64-epoch-5.pth"))
# img_model_title_64.to((f'cuda:{img_model_title_64.device_ids[0]}'));
img_model_title_64.to('cpu')
img_model_title_64.eval();
txt_model_title_64 = EmbeddingNetwork(64)
# txt_model_title_64 = nn.DataParallel(txt_model_title_64, device_ids=[1])
txt_model_title_64.load_state_dict(torch.load("triplet_checkpoints/txt-model-title-64-epoch-5.pth"))
# txt_model_title_64.to((f'cuda:{txt_model_title_64.device_ids[0]}'));
txt_model_title_64.to('cpu')
txt_model_title_64.eval();

#im2instructions 64
img_model_instructions_64 = EmbeddingNetwork(64)
# img_model_instructions_64 = nn.DataParallel(img_model_instructions_64, device_ids=[1])
img_model_instructions_64.load_state_dict(torch.load("triplet_checkpoints/img-model-instructions-64-epoch-5.pth"))
# img_model_instructions_64.to((f'cuda:{img_model_instructions_64.device_ids[0]}'));
img_model_instructions_64.to('cpu')
img_model_instructions_64.eval();
txt_model_instructions_64 = EmbeddingNetwork(64)
# txt_model_instructions_64 = nn.DataParallel(txt_model_instructions_64, device_ids=[1])
txt_model_instructions_64.load_state_dict(torch.load("triplet_checkpoints/txt-model-instructions-64-epoch-5.pth"))
# txt_model_instructions_64.to((f'cuda:{txt_model_instructions_64.device_ids[0]}'));
txt_model_instructions_64.to('cpu')
txt_model_instructions_64.eval();

#im2ingredients 64
img_model_ingredients_64 = EmbeddingNetwork(64)
# img_model_ingredients_64 = nn.DataParallel(img_model_ingredients_64, device_ids=[1])
img_model_ingredients_64.load_state_dict(torch.load("triplet_checkpoints/img-model-ingredients-64-epoch-5.pth"))
# img_model_ingredients_64.to((f'cuda:{img_model_full_64.device_ids[0]}'));
img_model_ingredients_64.to('cpu')
img_model_ingredients_64.eval();
txt_model_ingredients_64 = EmbeddingNetwork(64)
# txt_model_ingredients_64 = nn.DataParallel(txt_model_ingredients_64, device_ids=[1])
txt_model_ingredients_64.load_state_dict(torch.load("triplet_checkpoints/txt-model-ingredients-64-epoch-5.pth"))
# txt_model_ingredients_64.to((f'cuda:{txt_model_ingredients_64.device_ids[0]}'));
txt_model_ingredients_64.to('cpu')
txt_model_ingredients_64.eval();

In [24]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 64))
text_val_nonlinear = np.zeros(shape = (len(img_val), 64))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_full_64(torch.Tensor(np.expand_dims(img_val[i], 0))).detach().numpy()
    text_val_nonlinear[i] = txt_model_full_64(torch.Tensor(np.expand_dims(text_val[i], 0))).detach().numpy()

# im2recipe and recipe2im
print("Running im2recipe for dims = 64 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2recipe for dims = 64 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2recipe for dims = 64 and sample = 1000
Mean median 2.4
Recall {1: 0.35960000000000003, 5: 0.6848, 10: 0.7952}
Running im2recipe for dims = 64 and sample = 10000
Mean median 14.9
Recall {1: 0.11610999999999998, 5: 0.31612999999999997, 10: 0.43578}


In [25]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 64))
text_val_nonlinear = np.zeros(shape = (len(img_val), 64))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_title_64(torch.Tensor(np.expand_dims(img_val[i], 0))).detach().numpy()
    text_val_nonlinear[i] = txt_model_title_64(torch.Tensor(np.expand_dims(text_val[i], 0))).detach().numpy()

# im2title and title2im
print("Running im2title for dims = 64 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2title for dims = 64 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2title for dims = 64 and sample = 1000
Mean median 4.9
Recall {1: 0.21749999999999997, 5: 0.5327999999999999, 10: 0.6723}
Running im2title for dims = 64 and sample = 10000
Mean median 39.25
Recall {1: 0.046579999999999996, 5: 0.16155, 10: 0.25102}


In [26]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 64))
text_val_nonlinear = np.zeros(shape = (len(img_val), 64))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_ingredients_64(torch.Tensor(np.expand_dims(img_val[i], 0))).detach().numpy()
    text_val_nonlinear[i] = txt_model_ingredients_64(torch.Tensor(np.expand_dims(text_val[i], 0))).detach().numpy()

# im2ingredients and ingredients2im
print("Running im2ingredients for dims = 64 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2ingredients for dims = 64 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2ingredients for dims = 64 and sample = 1000
Mean median 4.0
Recall {1: 0.269, 5: 0.5833999999999999, 10: 0.7110999999999998}
Running im2ingredients for dims = 64 and sample = 10000
Mean median 28.6
Recall {1: 0.07246999999999999, 5: 0.22054, 10: 0.32203}


In [27]:
img_val_nonlinear = np.zeros(shape = (len(img_val), 64))
text_val_nonlinear = np.zeros(shape = (len(img_val), 64))

for i in range(len(img_val)):
    img_val_nonlinear[i] = img_model_instructions_64(torch.Tensor(np.expand_dims(img_val[i], 0))).detach().numpy()
    text_val_nonlinear[i] = txt_model_instructions_64(torch.Tensor(np.expand_dims(text_val[i], 0))).detach().numpy()

# im2instructions and instructions2im
print("Running im2instructions for dims = 64 and sample = 1000")
ranker(img_val_nonlinear, text_val_nonlinear, 1000, "image")
print("Running im2instructions for dims = 64 and sample = 10000")
ranker(img_val_nonlinear, text_val_nonlinear, 10000, "image")

Running im2instructions for dims = 64 and sample = 1000
Mean median 3.0
Recall {1: 0.3296, 5: 0.6529, 10: 0.7682}
Running im2instructions for dims = 64 and sample = 10000
Mean median 20.0
Recall {1: 0.09388999999999999, 5: 0.27048000000000005, 10: 0.38217}


### Evaluation and Ablation Studies

 We can see that dimensions = 256 has a better performance

In [None]:
# im2recipe 256
img_model_full_256 = EmbeddingNetwork(256)
# img_model_full_256 = nn.DataParallel(img_model_full_256, device_ids=[1])
img_model_full_256.load_state_dict(torch.load("triplet_checkpoints/img-model-full-256-epoch-10.pth"))
# img_model_full_256.to((f'cuda:{img_model_full_256.device_ids[0]}'));
img_model_full_256.to('cpu')
img_model_full_256.eval();
txt_model_full_256 = EmbeddingNetwork(256)
# txt_model_full_256 = nn.DataParallel(txt_model_full_256, device_ids=[1])
txt_model_full_256.load_state_dict(torch.load("triplet_checkpoints/txt-model-full-256-epoch-10.pth"))
# txt_model_full_256.to((f'cuda:{txt_model_full_256.device_ids[0]}'));
txt_model_full_256.to('cpu')
txt_model_full_256.eval();

#im2title 256
img_model_title_256 = EmbeddingNetwork(256)
# img_model_title_256 = nn.DataParallel(img_model_title_256, device_ids=[1])
img_model_title_256.load_state_dict(torch.load("triplet_checkpoints/img-model-title-256-epoch-5.pth"))
# img_model_title_256.to((f'cuda:{img_model_title_256.device_ids[0]}'));
img_model_title_256.to('cpu')
img_model_title_256.eval();
txt_model_title_256 = EmbeddingNetwork(256)
# txt_model_title_256 = nn.DataParallel(txt_model_title_256, device_ids=[1])
txt_model_title_256.load_state_dict(torch.load("triplet_checkpoints/txt-model-title-256-epoch-5.pth"))
# txt_model_title_256.to((f'cuda:{txt_model_title_256.device_ids[0]}'));
txt_model_title_256.to('cpu')
txt_model_title_256.eval();

#im2instructions 256
img_model_instructions_256 = EmbeddingNetwork(256)
# img_model_instructions_256 = nn.DataParallel(img_model_instructions_256, device_ids=[1])
img_model_instructions_256.load_state_dict(torch.load("triplet_checkpoints/img-model-instructions-256-epoch-5.pth"))
# img_model_instructions_256.to((f'cuda:{img_model_instructions_256.device_ids[0]}'));
img_model_instructions_256.to('cpu')
img_model_instructions_256.eval();
txt_model_instructions_256 = EmbeddingNetwork(256)
# txt_model_instructions_256 = nn.DataParallel(txt_model_instructions_256, device_ids=[1])
txt_model_instructions_256.load_state_dict(torch.load("triplet_checkpoints/txt-model-instructions-256-epoch-5.pth"))
# txt_model_instructions_256.to((f'cuda:{txt_model_instructions_256.device_ids[0]}'));
txt_model_instructions_256.to('cpu')
txt_model_instructions_256.eval();

#im2ingredients 256
img_model_ingredients_256 = EmbeddingNetwork(256)
# img_model_ingredients_256 = nn.DataParallel(img_model_ingredients_256, device_ids=[1])
img_model_ingredients_256.load_state_dict(torch.load("triplet_checkpoints/img-model-ingredients-256-epoch-5.pth"))
# img_model_ingredients_256.to((f'cuda:{img_model_full_256.device_ids[0]}'));
img_model_ingredients_256.to('cpu')
img_model_ingredients_256.eval();
txt_model_ingredients_256 = EmbeddingNetwork(256)
# txt_model_ingredients_256 = nn.DataParallel(txt_model_ingredients_256, device_ids=[1])
txt_model_ingredients_256.load_state_dict(torch.load("triplet_checkpoints/txt-model-ingredients-256-epoch-5.pth"))
# txt_model_ingredients_256.to((f'cuda:{txt_model_ingredients_256.device_ids[0]}'));
txt_model_ingredients_256.to('cpu')
txt_model_ingredients_256.eval();

In [None]:
img_test_nonlinear = np.zeros(shape = (len(img_test), 256))
text_test_nonlinear = np.zeros(shape = (len(img_test), 256))

for i in range(len(img_test)):
    img_test_nonlinear[i] = img_model_full_256(torch.Tensor(np.expand_dims(img_test[i], 0))).detach().numpy()
    text_test_nonlinear[i] = txt_model_full_256(torch.Tensor(np.expand_dims(text_test[i], 0))).detach().numpy()

# im2recipe and recipe2im
print("Running im2recipe for dims = 256 and sample = 1000")
ranker(img_test_nonlinear, text_test_nonlinear, 1000, "image")
print("Running im2recipe for dims = 256 and sample = 10000")
ranker(img_test_nonlinear, text_test_nonlinear, 10000, "image")

Running im2recipe for dims = 256 and sample = 1000
Mean median 2.0
Recall {1: 0.3794, 5: 0.6984, 10: 0.7993}
Running im2recipe for dims = 256 and sample = 10000
Mean median 15.1
Recall {1: 0.11216, 5: 0.31331000000000003, 10: 0.43186}


In [None]:
img_test_nonlinear = np.zeros(shape = (len(img_test), 256))
text_test_nonlinear = np.zeros(shape = (len(img_test), 256))

for i in range(len(img_test)):
    img_test_nonlinear[i] = img_model_title_256(torch.Tensor(np.expand_dims(img_test[i], 0))).detach().numpy()
    text_test_nonlinear[i] = txt_model_title_256(torch.Tensor(np.expand_dims(text_test[i], 0))).detach().numpy()

# im2title and title2im
print("Running im2title for dims = 256 and sample = 1000")
ranker(img_test_nonlinear, text_test_nonlinear, 1000, "image")
print("Running im2title for dims = 256 and sample = 10000")
ranker(img_test_nonlinear, text_test_nonlinear, 10000, "image")

Running im2title for dims = 256 and sample = 1000
Mean median 4.85
Recall {1: 0.217, 5: 0.5293, 10: 0.665}
Running im2title for dims = 256 and sample = 10000
Mean median 40.25
Recall {1: 0.04741000000000001, 5: 0.16038000000000002, 10: 0.25114000000000003}


In [None]:
img_test_nonlinear = np.zeros(shape = (len(img_test), 256))
text_test_nonlinear = np.zeros(shape = (len(img_test), 256))

for i in range(len(img_test)):
    img_test_nonlinear[i] = img_model_ingredients_256(torch.Tensor(np.expand_dims(img_test[i], 0))).detach().numpy()
    text_test_nonlinear[i] = txt_model_ingredients_256(torch.Tensor(np.expand_dims(text_test[i], 0))).detach().numpy()

# im2ingredients and ingredients2im
print("Running im2ingredients for dims = 256 and sample = 1000")
ranker(img_test_nonlinear, text_test_nonlinear, 1000, "image")
print("Running im2ingredients for dims = 256 and sample = 10000")
ranker(img_test_nonlinear, text_test_nonlinear, 10000, "image")

Running im2ingredients for dims = 256 and sample = 1000
Mean median 3.8
Recall {1: 0.276, 5: 0.5836, 10: 0.7084}
Running im2ingredients for dims = 256 and sample = 10000
Mean median 28.4
Recall {1: 0.07182999999999999, 5: 0.22010000000000002, 10: 0.31999}


In [None]:
img_test_nonlinear = np.zeros(shape = (len(img_test), 256))
text_test_nonlinear = np.zeros(shape = (len(img_test), 256))

for i in range(len(img_test)):
    img_test_nonlinear[i] = img_model_instructions_256(torch.Tensor(np.expand_dims(img_test[i], 0))).detach().numpy()
    text_test_nonlinear[i] = txt_model_instructions_256(torch.Tensor(np.expand_dims(text_test[i], 0))).detach().numpy()

# im2instructions and instructions2im
print("Running im2instructions for dims = 256 and sample = 1000")
ranker(img_test_nonlinear, text_test_nonlinear, 1000, "image")
print("Running im2instructions for dims = 256 and sample = 10000")
ranker(img_test_nonlinear, text_test_nonlinear, 10000, "image")

Running im2instructions for dims = 256 and sample = 1000
Mean median 3.4
Recall {1: 0.29660000000000003, 5: 0.6060999999999999, 10: 0.7335}
Running im2instructions for dims = 256 and sample = 10000
Mean median 24.7
Recall {1: 0.07762, 5: 0.23694999999999994, 10: 0.34320000000000006}
