# Data Preprocessing

In [None]:
import json

def get_file_data(file_path):
    """Load and return the JSON document stored at ``file_path``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces for the file).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # The context manager closes the handle even when decoding fails.
    # JSON text is UTF-8 by specification, so do not rely on the platform
    # default encoding.
    with open(file_path, encoding="utf-8") as file:
        return json.load(file)

def lookup_table(data):
    """Index the records in ``data`` by their ``'id'`` value.

    Args:
        data: Iterable of mappings, each containing an ``'id'`` key.

    Returns:
        A dict mapping every id to its record (the same object, not a copy).
        When an id occurs more than once, the last record wins.
    """
    return {record['id']: record for record in data}

def write_to_file(file_path, data):
    """Serialize ``data`` as JSON (4-space indent) into ``file_path``.

    The target file is created or truncated; its directory must already exist.
    """
    with open(file_path, mode="w") as handle:
        json.dump(obj=data, fp=handle, indent=4)

if __name__ == '__main__':

    # Layer 1 supplies the text records, layer 2 the image records.
    data1 = get_file_data('data/layer1.json')
    data2 = get_file_data('data/layer2.json')
    lookup = lookup_table(data1)

    # One bucket per split; the position in `split_names` is the bucket index.
    split_names = ['train', 'test', 'val']
    split_index = {name: position for position, name in enumerate(split_names)}
    text_data = [[] for _ in split_names]
    image_data = [[] for _ in split_names]

    for row in data2:
        if row['id'] not in lookup:
            # No text record with this id: the image record is left out.
            continue
        text = lookup[row['id']]
        # Keep only the id of each image entry (this rewrites `row` in place).
        row['images'] = [entry['id'] for entry in row['images']]
        # The split is decided by the text record's 'partition' field.
        bucket = split_index[text['partition']]
        text_data[bucket].append(text)
        image_data[bucket].append(row)

    # Write all text files first, then all image files.
    for name, texts in zip(split_names, text_data):
        write_to_file('data/' + name + '/text.json', texts)
    for name, images in zip(split_names, image_data):
        write_to_file('data/' + name + '/image.json', images)

# Image Encodings

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
import torchvision
from torchvision import datasets, models, transforms
from torch.utils.data import DataLoader, IterableDataset

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import time
import os
import copy
import json

from PIL import Image

from transformers import BertTokenizer, BertModel
import pickle

# Raise the summarization threshold so that tensors with up to 10,000 elements
# are printed in full instead of being abbreviated in cell output.
torch.set_printoptions(threshold=10_000)

In [None]:
# Select the first CUDA GPU when one is available, otherwise fall back to CPU.
# NOTE(review): none of the visible cells moves the model or its inputs to
# `device`, so as written the feature extraction appears to run on CPU - confirm.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Pretrained ResNet-50, used only to extract features (it is never trained here).
model_conv = torchvision.models.resnet50(pretrained=True)
# Switch to inference mode: a freshly constructed module is in training mode,
# where BatchNorm normalizes with the statistics of the current (single-image)
# batch and keeps updating its running averages on every forward pass.
model_conv.eval()
# Module whose output is captured as the image embedding.
last_layer = model_conv._modules.get('avgpool')

# transforming each image: fixed-size resize, centre crop, tensor conversion,
# then per-channel normalization with the given mean / std triples
data_transforms = transforms.Compose([
        transforms.Resize((256,256)),
        transforms.CenterCrop((224,224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

# test_dataset = MyDataset("D:/Projects/ML Project/test/", "E:/data/test/image.json", data_transforms)
# test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)

In [None]:
"""Function that hooks to the layer before the FCN in ResNet and extracts the output embedding from it"""

def get_vector(img):
    """Return the output of ``last_layer`` for a single preprocessed image.

    A temporary forward hook on the module-level ``last_layer`` copies that
    layer's output while ``model_conv`` processes ``img``.

    Args:
        img: Input passed straight to ``model_conv``. The hook reshapes the
            captured output to ``size(1)`` elements, so the input must hold
            exactly one image.

    Returns:
        A 1-D tensor of 2048 values (allocated with ``torch.zeros(2048)``)
        holding a copy of the hooked layer's output.
    """
    my_embedding = torch.zeros(2048)

    def copy_data(m, i, o):
        # Flatten the captured output to a vector and copy it out.
        my_embedding.copy_(o.detach().reshape(o.size(1)))

    h = last_layer.register_forward_hook(copy_data)
    try:
        # Inference only: no autograd graph is needed for feature extraction.
        with torch.no_grad():
            model_conv(img)
    finally:
        # Always detach the hook, otherwise a failed forward pass would leave
        # it registered and later calls would stack additional hooks.
        h.remove()

    return my_embedding

In [None]:
# file containing preprocessed image and text data
# file path to be changed for different embeddings

# The context manager closes the JSON file even if parsing fails.
with open('E:/data/test/image.json') as test_file:
    test_data = json.load(test_file)
output = []

# feature extraction logic
for row in test_data:
    recipe_id = row["id"]
    for image in row["images"]:
        # Images are stored in nested folders named after the first four
        # characters of the image file name.
        image_path = "D:/Projects/ML Project/test/" + image[0] + "/" + image[1] + "/" + image[2] + "/" + image[3] + "/" + image
        # Close every image file after use, and force three channels so the
        # 3-value mean/std normalization also works for grayscale, palette or
        # RGBA files.
        with Image.open(image_path) as img:
            transformed_image = data_transforms(img.convert("RGB"))

        # Add the batch dimension expected by the network.
        emb = get_vector(transformed_image.unsqueeze(0))

        # store embedding as (id, image_file_name, embedding)
        output.append((recipe_id, image, emb))

        # Progress report every 1000 embeddings.
        if len(output) % 1000 == 0:
            print(len(output))

In [None]:
# Persist the accumulated (id, image_file_name, embedding) tuples in one file.
torch.save(output, "E:/test_emb.pt")

# Text Encodings

In [None]:
# CHANGE FILENAME HERE
# The context manager closes the file even if JSON parsing fails.
with open('E:/data/train/text.json') as test_file:
    test_data = json.load(test_file)

In [None]:
class JsonDataset(IterableDataset):
    """Iterable dataset that turns recipe records into plain-text fields."""

    def __init__(self, file):
        # Despite its name, `file` is the iterable of already-parsed records.
        self.file = file

    def __iter__(self):
        """Yield ``(id, title, ingredients, instructions, full_text)`` per record.

        The ingredient and instruction texts are the space-joined ``"text"``
        values of the record's entries; ``full_text`` is title, ingredients
        and instructions joined by single spaces.
        """
        for record in self.file:
            record_id = record["id"]
            title = record["title"]
            ingredient_entries = record["ingredients"]
            instruction_entries = record["instructions"]

            ingredient_text = " ".join(entry["text"] for entry in ingredient_entries)
            instructions_text = " ".join(step["text"] for step in instruction_entries)
            full_text = " ".join((title, ingredient_text, instructions_text))

            yield record_id, title, ingredient_text, instructions_text, full_text

    
# Wrap the parsed records loaded earlier so they can be streamed one by one.
dataset = JsonDataset(test_data)
# batch_size=1: every iteration carries exactly one record.
# NOTE(review): the default collate step presumably wraps each string field in
# a length-1 sequence, which would explain the `[0][0]` id lookups in the
# alignment cells - confirm.
dataloader = DataLoader(dataset, batch_size=1)

In [None]:
# Tokenizer and encoder for the pretrained 'bert-base-uncased' checkpoint;
# both are used below to embed the recipe text fields.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

In [None]:
counter = 0


def _embed_and_append(text, pickle_path, recipe_id):
    """Embed ``text`` with BERT and append ``(recipe_id, vector)`` to a pickle file.

    The vector is the mean over the token axis of the model's last hidden
    state; input longer than 512 tokens is truncated by the tokenizer.
    """
    encoded = tokenizer(text, return_tensors='pt', max_length=512, truncation=True)
    # Inference only: without this the stored tensors would be attached to an
    # autograd graph and be saved with requires_grad=True.
    with torch.no_grad():
        hidden = model(**encoded)["last_hidden_state"]
    pooled = torch.mean(hidden, dim=1)

    # Append mode: every record is written as soon as it is computed, so an
    # interrupted run keeps everything produced so far.
    with open(pickle_path, 'ab') as f:
        pickle.dump((recipe_id, pooled.squeeze(0)), f)


# If a run fails part-way, it can be resumed by processing only the records
# whose `counter` value lies in the not-yet-written range.
for (recipe_id, title, ingredient_text, instructions_text, full_text) in dataloader:
    # One output file per text field; the order matches the original cell.
    _embed_and_append(title, 'E:/train_text_title_6.pkl', recipe_id)
    _embed_and_append(ingredient_text, 'E:/train_text_ingredients_6.pkl', recipe_id)
    _embed_and_append(instructions_text, 'E:/train_text_instructions_6.pkl', recipe_id)
    _embed_and_append(full_text, 'E:/train_text_full_6.pkl', recipe_id)

    print(counter)
    counter += 1


In [None]:
# check if encodings generated properly
emb = []
with open('E:/train_text_title_2.pkl', 'rb') as f:
    # The file holds one pickled record after another; pickle.load raises
    # EOFError once all of them have been read, which ends the loop.
    while True:
        try:
            emb.append(pickle.load(f))
        except EOFError:
            break

# Aligning Encodings

## Image Encodings

In [None]:
# Load the saved train-split image embeddings.
# NOTE(review): presumably a list of (id, image_file_name, embedding) tuples
# written by the image-encoding cells with the train paths - confirm.
img_train = torch.load("E:/train_emb.pt")

In [None]:
# Load the saved validation-split image embeddings.
# NOTE(review): presumably a list of (id, image_file_name, embedding) tuples
# written by the image-encoding cells with the val paths - confirm.
img_val = torch.load("E:/val_emb.pt")

In [None]:
# Load the test-split image embeddings: the list of
# (id, image_file_name, embedding) tuples saved by the image-encoding cells.
img_test = torch.load("E:/test_emb.pt")

In [None]:
def _load_pickle_stream(path):
    """Return every object that was pickled back-to-back into ``path``."""
    records = []
    with open(path, 'rb') as f:
        while True:
            try:
                records.append(pickle.load(f))
            except EOFError:
                # Normal end of the stream: no further pickled object.
                break
            except pickle.UnpicklingError as err:
                # Damaged or truncated data: keep what was read so far, but
                # report it instead of hiding it.
                print(f"{path}: stopped after {len(records)} records ({err})")
                break
    return records


# Test-split text embeddings, one list per text field.
text_test_full = _load_pickle_stream('E:/test_text_full.pkl')
text_test_ingredients = _load_pickle_stream('E:/test_text_ingredients.pkl')
text_test_instructions = _load_pickle_stream('E:/test_text_instructions.pkl')
text_test_title = _load_pickle_stream('E:/test_text_title.pkl')

In [None]:
def _load_pickle_stream(path):
    """Return every object that was pickled back-to-back into ``path``."""
    records = []
    with open(path, 'rb') as f:
        while True:
            try:
                records.append(pickle.load(f))
            except EOFError:
                # Normal end of the stream: no further pickled object.
                break
            except pickle.UnpicklingError as err:
                # Damaged or truncated data: keep what was read so far, but
                # report it instead of hiding it.
                print(f"{path}: stopped after {len(records)} records ({err})")
                break
    return records


# Validation-split text embeddings, one list per text field.
text_val_title = _load_pickle_stream('E:/val_text_title.pkl')
text_val_instructions = _load_pickle_stream('E:/val_text_instructions.pkl')
text_val_ingredients = _load_pickle_stream('E:/val_text_ingredients.pkl')
text_val_full = _load_pickle_stream('E:/val_text_full.pkl')

## Text Encodings

In [None]:
text_test_full_final = []
text_test_ingredients_final = []
text_test_instructions_final = []
text_test_title_final = []

# Emit one text embedding per image record. `j` only moves forward through the
# text lists and is not advanced after a match, so consecutive images sharing
# an id reuse the same text record.
j = 0
for image_record in img_test:
    recipe_id = image_record[0]

    # Skip text positions until all four lists carry this image's id.
    while not (text_test_full[j][0][0] == recipe_id
               and text_test_ingredients[j][0][0] == recipe_id
               and text_test_instructions[j][0][0] == recipe_id
               and text_test_title[j][0][0] == recipe_id):
        j += 1

    text_test_full_final.append(text_test_full[j][1])
    text_test_ingredients_final.append(text_test_ingredients[j][1])
    text_test_instructions_final.append(text_test_instructions[j][1])
    text_test_title_final.append(text_test_title[j][1])

In [None]:
# Write each aligned test-split list to its own pickle file.
for out_path, aligned in (
    ('E:/text_test_full_final.pkl', text_test_full_final),
    ('E:/text_test_instructions_final.pkl', text_test_instructions_final),
    ('E:/text_test_ingredients_final.pkl', text_test_ingredients_final),
    ('E:/text_test_title_final.pkl', text_test_title_final),
):
    with open(out_path, 'wb') as f:
        pickle.dump(aligned, f)

In [None]:
# Notebook inspection: number of validation title records that were loaded.
len(text_val_title)

In [None]:
# Notebook inspection: number of validation image records that were loaded.
len(img_val)

In [None]:
i, j = 0, 0
text_val_full_final = []
text_val_ingredients_final = []
text_val_instructions_final = []
text_val_title_final = []

# Emit one text embedding per image record. `j` only moves forward through the
# text lists and stays put after a match, so consecutive images sharing an id
# reuse the same text record.
while i < len(img_val):
    recipe_id = img_val[i][0]
    # Require the id at position j in all four text lists (as the test and
    # train alignment cells do); checking only the "full" list would silently
    # pair embeddings of different recipes if the lists were ever out of step.
    if text_val_full[j][0][0] == recipe_id and text_val_ingredients[j][0][0] == recipe_id and \
    text_val_instructions[j][0][0] == recipe_id and text_val_title[j][0][0] == recipe_id:
        text_val_full_final.append(text_val_full[j][1])
        text_val_ingredients_final.append(text_val_ingredients[j][1])
        text_val_instructions_final.append(text_val_instructions[j][1])
        text_val_title_final.append(text_val_title[j][1])

        i += 1
    else:
        j += 1

In [None]:
# Notebook inspection: number of aligned validation text embeddings.
len(text_val_full_final)

In [None]:
# Write each aligned validation-split list to its own pickle file.
for out_path, aligned in (
    ('E:/text_val_full_final.pkl', text_val_full_final),
    ('E:/text_val_instructions_final.pkl', text_val_instructions_final),
    ('E:/text_val_ingredients_final.pkl', text_val_ingredients_final),
    ('E:/text_val_title_final.pkl', text_val_title_final),
):
    with open(out_path, 'wb') as f:
        pickle.dump(aligned, f)

In [None]:
def _load_pickle_stream(path):
    """Return every object that was pickled back-to-back into ``path``."""
    records = []
    with open(path, 'rb') as f:
        while True:
            try:
                records.append(pickle.load(f))
            except EOFError:
                # Normal end of the stream: no further pickled object.
                break
            except pickle.UnpicklingError as err:
                # Damaged or truncated data: keep what was read so far, but
                # report it instead of hiding it.
                print(f"{path}: stopped after {len(records)} records ({err})")
                break
    return records


# The train embeddings are spread over four files per text field. The second
# file of every field has no underscore before its number.
_TRAIN_PART_SUFFIXES = ('_1', '2', '_3', '_200000')


def _load_train_text(field):
    """Concatenate, in file order, the records of all train files of ``field``."""
    records = []
    for suffix in _TRAIN_PART_SUFFIXES:
        records.extend(_load_pickle_stream('E:/train_text_' + field + suffix + '.pkl'))
    return records


text_train_title = _load_train_text('title')
text_train_ingredients = _load_train_text('ingredients')
text_train_instructions = _load_train_text('instructions')
text_train_full = _load_train_text('full')

In [None]:
# Drop image records whose id has no text embedding.
# The list is rebuilt instead of popped in place: removing element i and then
# advancing i skips the element that slides into the freed slot, so runs of
# unmatched records were only partly removed. A set makes each membership
# test constant-time instead of a scan over every text record.
text_ids = {record[0][0] for record in text_train_full}
img_train = [entry for entry in img_train if entry[0] in text_ids]

In [None]:
text_train_full_final = []
text_train_ingredients_final = []
text_train_instructions_final = []
text_train_title_final = []

# Emit one text embedding per image record. `j` only moves forward through the
# text lists and is not advanced after a match, so consecutive images sharing
# an id reuse the same text record.
j = 0
for image_record in img_train:
    recipe_id = image_record[0]

    # Skip text positions until all four lists carry this image's id.
    while not (text_train_full[j][0][0] == recipe_id
               and text_train_ingredients[j][0][0] == recipe_id
               and text_train_instructions[j][0][0] == recipe_id
               and text_train_title[j][0][0] == recipe_id):
        j += 1

    text_train_full_final.append(text_train_full[j][1])
    text_train_ingredients_final.append(text_train_ingredients[j][1])
    text_train_instructions_final.append(text_train_instructions[j][1])
    text_train_title_final.append(text_train_title[j][1])

In [None]:
# Keep only the third field of every image record for each split.
img_train_final = [record[2] for record in img_train]
img_val_final = [record[2] for record in img_val]
img_test_final = [record[2] for record in img_test]

In [None]:
# Save the per-split lists of image embeddings, one file per split.
for split_embeddings, out_path in (
    (img_train_final, "E:/img_train_final.pt"),
    (img_val_final, "E:/img_val_final.pt"),
    (img_test_final, "E:/img_test_final.pt"),
):
    torch.save(split_embeddings, out_path)

# Professor's Encodings

In [None]:
def _load_pickle_stream(path):
    """Return every object that was pickled back-to-back into ``path``.

    NOTE: unpickling can execute arbitrary code. These files were supplied
    externally, so only load them from a source you trust.
    """
    records = []
    with open(path, 'rb') as f:
        while True:
            try:
                records.append(pickle.load(f))
            except EOFError:
                # Normal end of the stream: no further pickled object.
                break
            except pickle.UnpicklingError as err:
                # Damaged or truncated data: keep what was read so far, but
                # report it instead of hiding it.
                print(f"{path}: stopped after {len(records)} records ({err})")
                break
    return records


# FULL DATA
prof_train_data_full = _load_pickle_stream('E:/embeddings_train1.pkl')
prof_test_data_full = _load_pickle_stream('E:/embeddings_test1.pkl')
prof_val_data_full = _load_pickle_stream('E:/embeddings_val1.pkl')

# INGREDIENTS DATA
prof_train_data_ingredients = _load_pickle_stream('E:/ingredients_embeddings_train.pkl')
prof_test_data_ingredients = _load_pickle_stream('E:/ingredients_embeddings_test.pkl')
prof_val_data_ingredients = _load_pickle_stream('E:/ingredients_embeddings_val.pkl')

# INSTRUCTIONS DATA
prof_train_data_instructions = _load_pickle_stream('E:/instructions_embeddings_train.pkl')
prof_val_data_instructions = _load_pickle_stream('E:/instructions_embeddings_val.pkl')
prof_test_data_instructions = _load_pickle_stream('E:/instructions_embeddings_test.pkl')

# TITLE DATA
prof_train_data_title = _load_pickle_stream('E:/title_embeddings_train.pkl')
prof_val_data_title = _load_pickle_stream('E:/title_embeddings_val.pkl')
prof_test_data_title = _load_pickle_stream('E:/title_embeddings_test.pkl')

In [None]:
# keep changing file name
# Copy the entries of the first element of the first loaded record into a
# plain list, index by index, and save that list.
val_title_source = prof_val_data_title[0][0]
embeddings = [val_title_source[position] for position in range(len(val_title_source))]

torch.save(embeddings, "E:/val_title.pt")