In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from model import EncoderRNN, AttnDecoderRNN
import json
import helpers


encoder_dict = torch.load('./model-concat.pt', map_location=torch.device('cpu'))['encoder_state_dict']
decoder_dict = torch.load('./model-concat.pt', map_location=torch.device('cpu'))['decoder_state_dict']
    
with open('../project_data/project_train_data_instr.json') as json_file:
    train_data = json.load(json_file)

In [3]:
N_EPOCHS = 15
LEARNING_RATE = 0.01
REPORT_EVERY = 1000
HIDDEN_DIM = 256
#BATCH_SIZE = 20
#N_LAYERS = 1
teacher_forcing_ratio = 1
TRAIN_SET_SIZE = 1000
n_words = 43863
MAX_LENGTH = 159

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.set_num_threads(10)

encoder = EncoderRNN(n_words, HIDDEN_DIM).to(device)
decoder = AttnDecoderRNN(HIDDEN_DIM, n_words, max_length=MAX_LENGTH).to(device)

encoder.load_state_dict(encoder_dict)
decoder.load_state_dict(decoder_dict)

<All keys matched successfully>

In [4]:
encoder.eval()
decoder.eval()

recipe_step_pairs, idx2word, word2idx, ml = helpers.get_tensor_data()
n_words = len(word2idx)
print(recipe_step_pairs[2])

Number of short ingredient lists:  130567
Average ingredient list length: 14.175872007959267
No ingredients filtered
Max instruction step length:  70
Number of long instructions:  61032
Average instruction length: 149.95270527301457
Total instruction steps:  489828
Recipes filtered:  61455
Recipes left after filtering:  75241
Recipe step pairs:  223824
New max length:  159
tensor([[43860],
        [   34],
        [   35],
        [   36],
        [    1],
        [   37],
        [   38],
        [   39],
        [   40],
        [   41],
        [   42],
        [   43],
        [   27],
        [   44],
        [    2],
        [   45],
        [   46],
        [   47],
        [   48],
        [   49],
        [   50],
        [   51],
        [   52],
        [   27],
        [   53],
        [   27],
        [43862]])
<SOS> unsalted butter onion flour sugar powder soda cheese frozen corn kernels roasted marinated red bell peppers basil Preheat oven to 400 degrees F ( 205 degrees 

In [5]:
print(recipe_step_pairs[2])

(tensor([[43860],
        [    0],
        [    1],
        [    2],
        [    3],
        [    4],
        [    5],
        [    6],
        [    7],
        [    8],
        [    9],
        [   10],
        [   11],
        [   12],
        [   13],
        [   14],
        [   15],
        [   16],
        [   54],
        [   55],
        [   56],
        [   57],
        [    3],
        [   49],
        [   32],
        [    5],
        [   49],
        [    4],
        [   49],
        [   58],
        [   49],
        [   45],
        [   32],
        [    6],
        [   37],
        [   59],
        [   60],
        [   27],
        [   44],
        [   61],
        [   62],
        [    1],
        [   45],
        [   63],
        [   56],
        [   64],
        [   47],
        [   65],
        [   66],
        [   67],
        [   68],
        [   27],
        [43862]]), tensor([[43860],
        [   69],
        [   70],
        [   45],
        [   71],
        [  

In [103]:
from random import choice
from helpers import idx_to_words
from nltk.tokenize import sent_tokenize, word_tokenize
import re

def evaluate(encoder, decoder, input_tensor, gold_standard):
    with torch.no_grad():
        max_length = MAX_LENGTH
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden(device)

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
        loss = 0
        
        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[word2idx['<SOS>']]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if di < len(gold_standard):
                loss += loss_function(decoder_output, gold_standard[di])
            else:
                loss += loss_function(decoder_output, gold_standard[-1])
            if topi.item() == word2idx['<EOS>']:
                if di < len(gold_standard) and gold_standard[di] !=  word2idx['<EOS>']:
                    for dj in range(di, len(gold_standard)):
                        loss += loss_function(decoder_output, gold_standard[dj])
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(idx2word[str(topi.item())])

            decoder_input = topi.squeeze().detach()

        return decoded_words, loss.item()/len(gold_standard), decoder_attentions

    
def random_evaluate(evaluation_data, n=10):
    for i in range(n):
        pair = choice(evaluation_data)
        print('Instruction step', idx_to_words(pair[0], idx2word))
        print('Next step', idx_to_words(pair[1], idx2word))
        output_words, loss, attentions = evaluate(encoder, decoder, pair[0].to(device), pair[1].to(device))
        output_sentence = ' '.join(output_words)
        print('Generated instructions', output_sentence)
        print("Loss: ", loss)
        print('')
        
        
def evaluate_with_given_input(pair):
    #print('Instruction step', idx_to_words(pair[0], idx2word))
    #print('Next step', idx_to_words(pair[1], idx2word))
    #print(pair[0])
    #print(pair[1])
    output_words, loss, attentions = evaluate(encoder, decoder, pair[0].to(device), pair[1].to(device))
    output_sentence = ' '.join(output_words)
    #print('Generated instructions', output_sentence)
    return output_sentence, loss, attentions

    
def tokenize(instruction_step):
    words_tokenized = word_tokenize(instruction_step)
    return words_tokenized


def add_helper_tokens(step_tokenized):
    new_step = ['<SOS>']
    new_step.extend(step_tokenized)
    new_step.append('<EOS>')
    return new_step

def to_idx_repr(tokenized_instruction):
    idx_list = [word2idx[w] if w in word2idx else word2idx['<LN>'] for w in tokenized_instruction]
    return idx_list
    

def prepare_input_instruction(text):
    tokenized = tokenize(text)
    tokenized_h = add_helper_tokens(tokenized)
    idx_list = to_idx_repr(tokenized_h)
    return idx_list

def prepare_input_instruction_eval(text):
    tokenized = tokenize(text)
    tokenized_h = add_helper_tokens(tokenized[3:-3])
    tensor = to_idx_repr(tokenized_h)
    return tensor

def remove_helper_tokens(text):
    helpers_r = r'(<SOS>)|(<EOS>)'
    cleaned_text = re.sub(helpers_r, "", text, count=2)
    return cleaned_text


def generate_next_steps(first_step):
    print('Input: ', first_step)
    steps = []
    made_up_instruction = first_step
    i = 1
    while len(steps) < 10 and made_up_instruction != "<SOS> <EOS>":
        tensor_input = torch.tensor([prepare_input_instruction(made_up_instruction)]).view(-1, 1)
        tensor_target = torch.tensor([prepare_input_instruction("dummy target")]).view(-1, 1)
        #print(tensor_input)
        #print(tensor_target)
        made_up_instruction, loss, attn = evaluate_with_given_input((tensor_input, tensor_target))
        #print(made_up_instruction)
        steps.append(made_up_instruction)
        printable = remove_helper_tokens(made_up_instruction)
        print(i,".", printable)
        i = i + 1

        
def get_instruction_steps(recipes, ingredients):
    recipe_step_pairs = []
    for i, recipe in enumerate(recipes):
        ingr_str = " ".join(ingredients[i])
        ingr = prepare_input_instruction(ingr_str)
        for i, instr_step in enumerate(recipe[:-1]):
            ingr_tensor = ingr[:-1]
            instr_tensor = instr_step[1:]
            ingr_tensor.extend(instr_tensor)
            ingr_tensor = torch.tensor(ingr_tensor).view(-1, 1)
            target = torch.tensor(recipe[i+1]).view(-1, 1)
            recipe_step_pairs.append((ingr_tensor, target))
    print("Recipe step pairs: ", len(recipe_step_pairs))
    return recipe_step_pairs


def preprocess_ingredients(ingredient_data):
    # Extract quantity and quantity variable information
    amount_r = r'((\d{1,2}|½|¼)(\/\d)?(\s\d\/\d)?(\s\(\d{1,2}\sounce\))?)'
    measure_r = r'(cup(s)?|teaspoon(s)?|packet(s)?|box(es)?|package(s)?|tablespoon(s)?|ounce(s)?|pinch|square(s)?|pound(s)?|slice(s)?|bunch|cube(s)?|can(s)?|pint(s)?|drop(s)?|quart(s)?)'
    random_notes_r = r'(\(.*\))'

    parsed_ingredients_per_recipe = []
    for rec in ingredient_data:
        parsed_ingredients = {}
        for ing in rec:
            amount = re.search(amount_r, ing)
            measure = re.search(measure_r, ing)
            content = re.sub(amount_r, "", ing, count=1)
            content = re.sub(measure_r, "", content, count=1)
            content = re.sub(random_notes_r, "", content)
            content = content.strip()
            if amount and measure:
                amount_re = re.sub(random_notes_r, "", amount.group(0))
                parsed_ingredients[content] = (amount_re, measure.group(0))
            elif amount:
                amount_re = re.sub(random_notes_r, "", amount.group(0))
                parsed_ingredients[content] = (amount_re, "")
            elif measure:
                parsed_ingredients[content] = ("", measure.group(0))
            else:
                parsed_ingredients[content] = ("", "")
        parsed_ingredients_per_recipe.append(parsed_ingredients)

    print(len(parsed_ingredients_per_recipe))
    print(parsed_ingredients_per_recipe[0])

    # Get ingredient names
    ingr_names_per_recipe = []
    for ingr in parsed_ingredients_per_recipe:
        ingr_names = [key.strip() for key in ingr.keys()]
        ingr_names_per_recipe.append(ingr_names)

    ingr_names_comma = r'.*,'
    ingr_names_end = r'\s(\S+)$'

    simple_ingr_names = []
    for rec in ingr_names_per_recipe:
        simple_rec = []
        for i in rec:
            name = re.search(ingr_names_comma, i)
            if not name:
                name = re.search(ingr_names_end, i)
            if name:
                simple_rec.append(name.group(0).replace(',','').strip())
        simple_ingr_names.append(simple_rec)
    print(simple_ingr_names[0])
    
    # Create a list and set of all the ingredients together
    list_of_ingredients = []
    for rec in simple_ingr_names:
        list_of_ingredients.extend(rec)

    ingr_counts = Counter(list_of_ingredients)
    print(ingr_counts.most_common(30))
    set_of_ingredients = set(list_of_ingredients)
    print("Number of simplified ingredients: ", len(set_of_ingredients))
    return ingr_names_per_recipe


def preprocess_instruction_data_from_recipes(recipes, limit):
    preprocessed = []
    filtered_out = 0
    rm_indices = []
    for i, rec in enumerate(recipes):
        ingredients, instructions = rec
        rec_steps = []
        use_rec = True
        for step in instructions:
            if len(step) < limit:
                ingr_str = " ".join(ingredients)
                instr_tensor = prepare_input_instruction(step)
                rec_steps.append(instr_tensor)
            else:
                use_rec = False
                filtered_out = filtered_out + 1
                rm_indices.append(i)
        if use_rec:
            preprocessed.append(rec_steps)
    print(filtered_out, " recipes filtered out")
    return preprocessed, rm_indices


#made_up_instruction = "chicken Italian-seasoned bread crumbs small onion cloves garlic taste oil Mix ground chicken , 1/4 cup bread crumbs , onion , egg , garlic , salt , and black pepper in a bowl . Moisten hands and shape chicken mixture , 2 tablespoons at a time , into flat , oval-shaped patties ."
#generate_next_steps(made_up_instruction)

loss_function = nn.NLLLoss()
random_evaluate(recipe_step_pairs)

Instruction step <SOS> breasts almonds crumbs granules long-grain brown rice Bake uncovered 30 minutes or until chicken is golden and crisp and meat is white in center when tested with a knife . <EOS>
Next step <SOS> Serve with hot cooked rice . <EOS>
Generated instructions <SOS> In a medium bowl , combine the chicken and the chicken and and toss to coat . <EOS>
Loss:  18.904644012451172

Instruction step <SOS> Filling pie cream cheese frozen whipped topping flakes Sprinkle top with additional 2 tablespoons of toasted coconut flakes if desired . <EOS>
Next step <SOS> Cover and chill for 2 hours , or until firm . <EOS>
Generated instructions <SOS> Bake for 15 minutes or until crust is golden brown . <EOS>
Loss:  8.565440838153545

Instruction step <SOS> refrigerated pie crusts margarine sweet onions cream cheese Cascadian Farm® frozen organic spinach Melt butter in 10-inch skillet . Cook onion in butter 2 to 3 minutes , stirring constantly , until tender . <EOS>
Next step <SOS> Stir tog

In [7]:
from collections import Counter

cookstr = [json.loads(line) for line in open('../../original_data/cookstr-recipes.json', 'r')]

In [8]:
test_ingr = [rec['ingredients'] for rec in cookstr]
test_instr = [rec['instructions'] for rec in cookstr]

preprocessed_ingredients = preprocess_ingredients(test_ingr)
print(preprocessed_ingredients[0])

recipes = [(preprocessed_ingredients[i], test_instr[i]) for i, rec in enumerate(preprocessed_ingredients)]

limit = 120
prcessed, rm_indices = preprocess_instruction_data_from_recipes(recipes, limit)
print(prcessed[0])

preprocessed_ingredients = [ing for i, ing in enumerate(preprocessed_ingredients) if i not in rm_indices]

test_data_steps = get_instruction_steps(prcessed, preprocessed_ingredients)

print(helpers.idx_to_words(test_data_steps[1][0], idx2word))
print(helpers.idx_to_words(test_data_steps[1][1], idx2word))


7918
{'softened butter': ('1', 'tablespoon'), 'flour': ('2', 'tablespoons'), 'sifted cake flour': ('3', 'cups'), 'double-acting baking powder': ('4', 'teaspoons'), 'salt': ('½', 'teaspoon'), 'unsalted butter, at room temperature': ('8', 'ounces'), 'granulated sugar': ('2', 'cups'), 'eggs, at room temperature': ('4', ''), 'milk, at room temperature': ('1', 'cup'), 'to 1½  vanilla extract': ('1', 'teaspoons'), '¾  strained orange juice': ('', 'cup'), 'lemon juice': ('2', 'tablespoons'), '¾  granulated sugar': ('', 'cup'), 'finely grated orange rind': ('1', 'tablespoon')}
['butter', 'flour', 'powder', 'unsalted butter', 'sugar', 'eggs', 'milk', 'extract', 'juice', 'juice', 'sugar', 'rind']
[('oil', 3711), ('pepper', 3065), ('salt', 2158), ('sugar', 1815), ('flour', 1559), ('juice', 1375), ('butter', 1158), ('powder', 1013), ('vinegar', 953), ('sauce', 944), ('leaves', 940), ('cream', 917), ('water', 874), ('parsley', 787), ('cheese', 746), ('taste', 706), ('onion', 687), ('milk', 654), ('

In [9]:
total_loss = 0
outputs = []

for t in test_data_steps:
    output, loss, attention = evaluate_with_given_input(t)
    total_loss += loss
    outputs.append(output)
    
print("Average loss for test set: ", total_loss/len(test_data_steps))


Average loss for test set:  12.771939133316128


In [83]:
test_data_steps[0]

(tensor([[43860],
         [ 4619],
         [ 3912],
         [   13],
         [ 1987],
         [ 2015],
         [20513],
         [  170],
         [21208],
         [  957],
         [  170],
         [  385],
         [  294],
         [  113],
         [   57],
         [   74],
         [   90],
         [   29],
         [ 1095],
         [   49],
         [   45],
         [24938],
         [   19],
         [  394],
         [   57],
         [ 3894],
         [   27],
         [43862]]), tensor([[43860],
         [  181],
         [   90],
         [   29],
         [ 1300],
         [  723],
         [   45],
         [  914],
         [  123],
         [   57],
         [ 1047],
         [   19],
         [  843],
         [   27],
         [43862]]))

In [130]:
generate_next_steps("chicken borth garlic onion salt pepper Preheat the oven")

Input:  chicken borth garlic onion salt pepper Preheat the oven
1 .  In a medium bowl , mix together the garlic , paprika , paprika , salt , and pepper . 
2 .  In a separate bowl , mix together the egg , and the . 
3 .  In a separate bowl , mix together the eggs , milk , and butter . Stir in the milk and Pour into the prepared pan . 
4 .  Bake for 30 minutes in the preheated oven . or until the knife inserted into the center comes out clean . Allow to cool slightly before serving . 
5 .  In a medium bowl , combine the cream , butter , and the sugar . Stir until well combined . 
6 .  Pour into the greased bowl floured blender , Add blend until smooth . 
7 .  In a separate bowl , combine the eggs , milk , and milk . Stir until the mixture is well mixed . 
8 .  In a separate bowl , whisk together the eggs , milk , and milk . Pour into milk mixture into the prepared baking dish . 
9 .  Bake for 30 minutes or until the knife inserted into the center comes out clean . Allow to cool slightly 

In [131]:
test_input = "milk sugar chocolate strawberries flour"

generate_next_steps(test_input)

Input:  milk sugar chocolate strawberries flour
1 .  In a separate bowl , beat egg white , milk , Stir until chocolate mixture is just moistened . 
2 .  Pour into prepared pan . Bake at 350 degrees F ( 175 degrees C ) for 45 minutes . 
3 .  Remove from oven , let cool slightly . 
4 .  In a bowl , whisk together 1 cup sugar , salt , and pepper . 
5 .  In a separate bowl , whisk together 1 cup of the butter , and sugar until light and fluffy . 
6 .  Stir together butter , milk , and salt , a bowl until the mixture resembles coarse crumbs . 
7 .  Bake in the preheated oven until the knife inserted into the center comes out clean , about 1 hour . 
8 .  Combine the remaining ingredients in a bowl ; Pour over the prepared baking dish . 
9 .  Bake in the preheated oven until the knife inserted into the center comes out clean , about 45 minutes . 
10 .  Cool the 5 minutes before serving . 


In [60]:
from nltk.translate import bleu_score, meteor_score
from nltk.metrics import scores
from rouge_score import rouge_scorer


avg_prec = 0
avg_recall = 0
avg_fscore = 0
avg_bleu = 0
#avg_rouge = []
avg_meteor = 0
avg_len = 0

N = len(test_data_steps)
results = []
targets = []
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=False)


for i, t in enumerate(test_data_steps):
    input_step = [str(s) for s in t[0].flatten().tolist()]
    target = [str(s) for s in t[1].flatten().tolist()]
    targets.append(target)
    result = outputs[i]
    #print(input_step)
    #print(target)
    #print(result)
    result_vec = [str(r) for r in prepare_input_instruction_eval(result)]
    results.append(result_vec)
    avg_len += len(result_vec)
    # sanity check
    #prep = idx_to_words(result_vec, idx2word)
    #print(result_vec)
    precision = scores.precision(set(result_vec), set(target))
    avg_prec += precision
    recall = scores.recall(set(result_vec), set(target))
    avg_recall += recall
    f_score = scores.f_measure(set(result_vec), set(target))
    avg_fscore += f_score
    bleu = bleu_score.sentence_bleu([target], result)
    avg_bleu += bleu
    rouge = scorer.score(" ".join(target), " ".join(result))
    #print(rouge)
    #avg_rouge.append(rouge['rougeL']['precision']
    meteor = meteor_score.single_meteor_score(" ".join(target), " ".join(result))
    avg_meteor += meteor

print("Average precision: ", avg_prec/N)
print("Average recall: ", avg_recall/N)
#print("F1-measure: ", avg_fscore/N)

print("Average BLEU: ", avg_bleu/N)
print("Average METEOR: ", avg_meteor/N)
#print("Average ROUGE-L: ", avg_bleu/N)
print("Average step length: ", avg_len/N)

Average precision:  0.35466079099718434
Average recall:  0.3903075913261745
Average BLEU:  4.9959912399462084e-234
Average METEOR:  0.0018386536329180355
Average step length:  18.997867803837952
