In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from model import EncoderRNN, AttnDecoderRNN
import json
import helpers


encoder_dict = torch.load('./model-concat.pt', map_location=torch.device('cpu'))['encoder_state_dict']
decoder_dict = torch.load('./model-concat.pt', map_location=torch.device('cpu'))['decoder_state_dict']
    
with open('../project_data/project_train_data_instr.json') as json_file:
    train_data = json.load(json_file)

In [2]:
N_EPOCHS = 15
LEARNING_RATE = 0.01
REPORT_EVERY = 1000
HIDDEN_DIM = 256
#BATCH_SIZE = 20
#N_LAYERS = 1
teacher_forcing_ratio = 1
TRAIN_SET_SIZE = 1000
n_words = 43863
MAX_LENGTH = 159

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.set_num_threads(10)

encoder = EncoderRNN(n_words, HIDDEN_DIM).to(device)
decoder = AttnDecoderRNN(HIDDEN_DIM, n_words, max_length=MAX_LENGTH).to(device)

encoder.load_state_dict(encoder_dict)
decoder.load_state_dict(decoder_dict)

<All keys matched successfully>

In [3]:
encoder.eval()
decoder.eval()

recipe_step_pairs, idx2word, word2idx, ml = helpers.get_tensor_data()
n_words = len(word2idx)
print(recipe_step_pairs[0])

Number of short ingredient lists:  130567
Average ingredient list length: 14.175872007959267
No ingredients filtered
Max instruction step length:  70
Number of long instructions:  61032
Average instruction length: 149.95270527301457
Total instruction steps:  489828
Recipes filtered:  61455
Recipes left after filtering:  75241
Recipe step pairs:  223824
New max length:  159
tensor([[43860],
        [   34],
        [   35],
        [   36],
        [    1],
        [   37],
        [   38],
        [   39],
        [   40],
        [   41],
        [   42],
        [   43],
        [   27],
        [   44],
        [    2],
        [   45],
        [   46],
        [   47],
        [   48],
        [   49],
        [   50],
        [   51],
        [   52],
        [   27],
        [   53],
        [   27],
        [43862]])
<SOS> unsalted butter onion flour sugar powder soda cheese frozen corn kernels roasted marinated red bell peppers basil Preheat oven to 400 degrees F ( 205 degrees 

In [5]:
from random import choice
from helpers import idx_to_words
from nltk.tokenize import sent_tokenize, word_tokenize
import re

def evaluate(encoder, decoder, input_tensor, gold_standard):
    with torch.no_grad():
        max_length = MAX_LENGTH
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden(device)

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
        loss = 0
        
        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[word2idx['<SOS>']]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if di < len(gold_standard):
                loss += loss_function(decoder_output, gold_standard[di])
            else:
                loss += loss_function(decoder_output, gold_standard[-1])
            if topi.item() == word2idx['<EOS>']:
                if di < len(gold_standard) and gold_standard[di] !=  word2idx['<EOS>']:
                    for dj in range(di, len(gold_standard)):
                        loss += loss_function(decoder_output, gold_standard[dj])
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(idx2word[str(topi.item())])

            decoder_input = topi.squeeze().detach()

        return decoded_words, loss.item()/len(gold_standard), decoder_attentions

    
def random_evaluate(evaluation_data, n=10):
    for i in range(n):
        pair = choice(evaluation_data)
        print('Instruction step', idx_to_words(pair[0], idx2word))
        print('Next step', idx_to_words(pair[1], idx2word))
        output_words, loss, attentions = evaluate(encoder, decoder, pair[0].to(device), pair[1].to(device))
        output_sentence = ' '.join(output_words)
        print('Generated instructions', output_sentence)
        print("Loss: ", loss)
        print('')
        
        
def evaluate_with_given_input(instruction_pair):
    output_words, loss, attentions = evaluate(encoder, decoder, instruction_pair[0].to(device), instruction_pair[1].to(device))
    output_sentence = ' '.join(output_words)
    return output_sentence, loss, attentions

    
def tokenize(instruction_step):
    words_tokenized = word_tokenize(instruction_step)
    return words_tokenized


def add_helper_tokens(step_tokenized):
    new_step = ['<SOS>']
    new_step.extend(step_tokenized)
    new_step.append('<EOS>')
    return new_step

def to_idx_repr(tokenized_instruction):
    idx_list = [word2idx[w] if w in word2idx else word2idx['<LN>'] for w in tokenized_instruction]
    return idx_list
    

def prepare_input_instruction(text):
    tokenized = tokenize(text)
    tokenized_h = add_helper_tokens(tokenized)
    idx_list = to_idx_repr(tokenized_h)
    return idx_list


def remove_helper_tokens(text):
    helpers_r = r'(<SOS>)|(<EOS>)'
    cleaned_text = re.sub(helpers_r, "", text, count=2)
    return cleaned_text


def generate_next_steps(first_step):
    print('Input: ', first_step)
    steps = []
    made_up_instruction = first_step
    i = 1
    while len(steps) < 10 and made_up_instruction != "<SOS> <EOS>":
        tensor = prepare_input_instruction(made_up_instruction)
        made_up_instruction = evaluate_with_given_input(tensor)
        steps.append(made_up_instruction)
        print(i,".", remove_helper_tokens(made_up_instruction))
        i = i + 1

        
def get_instruction_steps(recipes, ingredients):
    recipe_step_pairs = []
    for i, recipe in enumerate(recipes):
        ingr_str = " ".join(ingredients[i])
        ingr = prepare_input_instruction(ingr_str)
        for i, instr_step in enumerate(recipe[:-1]):
            ingr_tensor = ingr[:-1]
            instr_tensor = instr_step[1:]
            ingr_tensor.extend(instr_tensor)
            ingr_tensor = torch.tensor(ingr_tensor).view(-1, 1)
            target = torch.tensor(recipe[i+1]).view(-1, 1)
            recipe_step_pairs.append((ingr_tensor, target))
    print("Recipe step pairs: ", len(recipe_step_pairs))
    return recipe_step_pairs


def preprocess_ingredients(ingredient_data):
    # Extract quantity and quantity variable information
    amount_r = r'((\d{1,2}|½|¼)(\/\d)?(\s\d\/\d)?(\s\(\d{1,2}\sounce\))?)'
    measure_r = r'(cup(s)?|teaspoon(s)?|packet(s)?|box(es)?|package(s)?|tablespoon(s)?|ounce(s)?|pinch|square(s)?|pound(s)?|slice(s)?|bunch|cube(s)?|can(s)?|pint(s)?|drop(s)?|quart(s)?)'
    random_notes_r = r'(\(.*\))'

    parsed_ingredients_per_recipe = []
    for rec in ingredient_data:
        parsed_ingredients = {}
        for ing in rec:
            amount = re.search(amount_r, ing)
            measure = re.search(measure_r, ing)
            content = re.sub(amount_r, "", ing, count=1)
            content = re.sub(measure_r, "", content, count=1)
            content = re.sub(random_notes_r, "", content)
            content = content.strip()
            if amount and measure:
                amount_re = re.sub(random_notes_r, "", amount.group(0))
                parsed_ingredients[content] = (amount_re, measure.group(0))
            elif amount:
                amount_re = re.sub(random_notes_r, "", amount.group(0))
                parsed_ingredients[content] = (amount_re, "")
            elif measure:
                parsed_ingredients[content] = ("", measure.group(0))
            else:
                parsed_ingredients[content] = ("", "")
        parsed_ingredients_per_recipe.append(parsed_ingredients)

    print(len(parsed_ingredients_per_recipe))
    print(parsed_ingredients_per_recipe[0])

    # Get ingredient names
    ingr_names_per_recipe = []
    for ingr in parsed_ingredients_per_recipe:
        ingr_names = [key.strip() for key in ingr.keys()]
        ingr_names_per_recipe.append(ingr_names)

    ingr_names_comma = r'.*,'
    ingr_names_end = r'\s(\S+)$'

    simple_ingr_names = []
    for rec in ingr_names_per_recipe:
        simple_rec = []
        for i in rec:
            name = re.search(ingr_names_comma, i)
            if not name:
                name = re.search(ingr_names_end, i)
            if name:
                simple_rec.append(name.group(0).replace(',','').strip())
        simple_ingr_names.append(simple_rec)
    print(simple_ingr_names[0])
    
    # Create a list and set of all the ingredients together
    list_of_ingredients = []
    for rec in simple_ingr_names:
        list_of_ingredients.extend(rec)

    ingr_counts = Counter(list_of_ingredients)
    print(ingr_counts.most_common(30))
    set_of_ingredients = set(list_of_ingredients)
    print("Number of simplified ingredients: ", len(set_of_ingredients))
    return ingr_names_per_recipe


def preprocess_instruction_data_from_recipes(recipes, limit):
    preprocessed = []
    filtered_out = 0
    rm_indices = []
    for i, rec in enumerate(recipes):
        ingredients, instructions = rec
        rec_steps = []
        use_rec = True
        for step in instructions:
            if len(step) < limit:
                ingr_str = " ".join(ingredients)
                instr_tensor = prepare_input_instruction(step)
                rec_steps.append(instr_tensor)
            else:
                use_rec = False
                filtered_out = filtered_out + 1
                rm_indices.append(i)
        if use_rec:
            preprocessed.append(rec_steps)
    print(filtered_out, " recipes filtered out")
    return preprocessed, rm_indices


#made_up_instruction = "chicken Italian-seasoned bread crumbs small onion cloves garlic taste oil Mix ground chicken , 1/4 cup bread crumbs , onion , egg , garlic , salt , and black pepper in a bowl . Moisten hands and shape chicken mixture , 2 tablespoons at a time , into flat , oval-shaped patties ."
#generate_next_steps(made_up_instruction)

loss_function = nn.NLLLoss()
random_evaluate(recipe_step_pairs)

Instruction step <SOS> Paste water cans Hunt ’ s® Diced Tomatoes with Basil Garlic and Oregano Bring to a boil over high heat . Reduce heat to low ; simmer 10 minutes , stirring occasionally . <EOS>
Next step <SOS> Serve over hot cooked pasta . <EOS>
Generated instructions <SOS> Meanwhile , heat a large skillet , heat remaining 1 tablespoon oil and moderately high heat until hot but not smoking , then add garlic , cook until fragrant , about 10 minutes . <EOS>
Loss:  41.80839157104492

Instruction step <SOS> pudding butter shredded coconut cherries Drop teaspoonfuls of the yellow cake mixture onto ungreased baking sheets . Make a slight indentation in the center of each cookie with 2 fingers . Sprinkle 1/4 cup coconut on top ; press gently into the center . Place a maraschino cherry in the center of each cookie . <EOS>
Next step <SOS> Bake in the preheated oven until edges look dry and bottoms are golden , 8 to 10 minutes . <EOS>
Generated instructions <SOS> Bake in the preheated oven 

In [6]:
from collections import Counter

cookstr = [json.loads(line) for line in open('../../original_data/cookstr-recipes.json', 'r')]

In [7]:
test_ingr = [rec['ingredients'] for rec in cookstr]
test_instr = [rec['instructions'] for rec in cookstr]

preprocessed_ingredients = preprocess_ingredients(test_ingr)
print(preprocessed_ingredients[0])

recipes = [(preprocessed_ingredients[i], test_instr[i]) for i, rec in enumerate(preprocessed_ingredients)]

limit = 120
prcessed, rm_indices = preprocess_instruction_data_from_recipes(recipes, limit)
print(prcessed[0])

preprocessed_ingredients = [ing for i, ing in enumerate(preprocessed_ingredients) if i not in rm_indices]

test_data_steps = get_instruction_steps(prcessed, preprocessed_ingredients)

print(helpers.idx_to_words(test_data_steps[0][0], idx2word))
print(helpers.idx_to_words(test_data_steps[0][1], idx2word))


7918
{'softened butter': ('1', 'tablespoon'), 'flour': ('2', 'tablespoons'), 'sifted cake flour': ('3', 'cups'), 'double-acting baking powder': ('4', 'teaspoons'), 'salt': ('½', 'teaspoon'), 'unsalted butter, at room temperature': ('8', 'ounces'), 'granulated sugar': ('2', 'cups'), 'eggs, at room temperature': ('4', ''), 'milk, at room temperature': ('1', 'cup'), 'to 1½  vanilla extract': ('1', 'teaspoons'), '¾  strained orange juice': ('', 'cup'), 'lemon juice': ('2', 'tablespoons'), '¾  granulated sugar': ('', 'cup'), 'finely grated orange rind': ('1', 'tablespoon')}
['butter', 'flour', 'powder', 'unsalted butter', 'sugar', 'eggs', 'milk', 'extract', 'juice', 'juice', 'sugar', 'rind']
[('oil', 3711), ('pepper', 3065), ('salt', 2158), ('sugar', 1815), ('flour', 1559), ('juice', 1375), ('butter', 1158), ('powder', 1013), ('vinegar', 953), ('sauce', 944), ('leaves', 940), ('cream', 917), ('water', 874), ('parsley', 787), ('cheese', 746), ('taste', 706), ('onion', 687), ('milk', 654), ('

In [8]:
total_loss = 0

for t in test_data_steps:
    output, loss, attention = evaluate_with_given_input(t)
    total_loss += loss
    
print("Average loss for test set: ", total_loss/len(test_data_steps))


Average loss for test set:  12.771939133316128


In [9]:
random_evaluate(test_data_steps)

Instruction step <SOS> cooked short grain white rice fresh orange juice bananas , ripe fresh lemon juice egg yolk egg whites fat free powdered milk sugar Puree all ingredients in blender ; transfer to a glass baking dish . <EOS>
Next step <SOS> Set baking dish in a larger pan filled with 2 to 3 inches of water . <EOS>
Generated instructions <SOS> Bake in the preheated oven until the toothpick inserted into the center comes out clean , about 45 minutes . <EOS>
Loss:  12.997452629937065

Instruction step <SOS> olive oil fresh orange juice fresh lemon juice tomato sauce Recaito ¼ fresh tuna papaya , cut into chunks dark-brown sugar scallions , chopped garlic clove , minced red cayenne pepper half salt , half <LN> mixture In a glass dish , cover fish with marinade . <EOS>
Next step <SOS> Refrigerate for 2 hours , turning fish at least once . <EOS>
Generated instructions <SOS> Bake in the preheated oven until the knife inserted into the center of the loaf comes out clean , about 45 minutes 