In [20]:
import requests
import bs4
from bs4 import BeautifulSoup as soup
import json
import pandas as pd
from pandas import DataFrame

import spacy
from spacy import displacy
from spacy.matcher import Matcher
from spacy.tokens import Span
nlp = spacy.load('en_core_web_sm')
from string import punctuation
from collections import defaultdict
import itertools
from word2number import w2n
from Levenshtein import distance
from fractions import Fraction

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [2]:
# Basic scraper

def get_ld_json(url: str) -> dict:
    """Download a page and decode the contents of its first JSON-LD script tag.

    Args:
        url: Address of the page to fetch.

    Returns:
        Whatever ``json.loads`` produces for the first
        ``<script type="application/ld+json">`` element on the page.
        NOTE(review): the caller in this notebook indexes the result with
        ``[1]``, so for the pages used here this is presumably a list rather
        than a dict -- confirm before relying on the annotation.

    Raises:
        requests.RequestException: on connection failure, timeout, or an HTTP
            error status.
        ValueError: if the page contains no JSON-LD script tag, or if its
            contents are not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError`` subclass).
    """
    parser = "html.parser"
    # A timeout keeps a stalled server from hanging the notebook forever.
    req = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    req.raise_for_status()
    page = soup(req.text, parser)
    tag = page.find("script", {"type": "application/ld+json"})
    if tag is None:
        raise ValueError("No application/ld+json script tag found at " + url)
    return json.loads("".join(tag.contents))

In [3]:
# Jsonld parser

def _recipe_node(jsonld):
    """Pick the node typed ``Recipe`` out of a decoded JSON-LD payload."""
    if isinstance(jsonld, dict):
        graph = jsonld.get("@graph")
        candidates = graph if isinstance(graph, list) else [jsonld]
    else:
        candidates = jsonld

    for node in candidates:
        if isinstance(node, dict):
            node_type = node.get("@type")
            # "@type" may be a single string or a list of strings.
            types = node_type if isinstance(node_type, list) else [node_type]
            if "Recipe" in types:
                return node

    # Nothing is labelled as a Recipe: keep the historical behaviour of
    # taking the second entry of the payload.
    return jsonld[1]


def parse_json(url):
    """Fetch a recipe page and extract its name, ingredients and steps.

    Args:
        url: Address of the recipe page, passed straight to ``get_ld_json``.

    Returns:
        A ``(name, ingredients, steps)`` tuple where ``name`` is the recipe's
        ``name`` field, ``ingredients`` is its ``recipeIngredient`` field and
        ``steps`` is a flat list of sentences taken from the ``text`` of each
        ``recipeInstructions`` entry.

    Raises:
        KeyError / IndexError: if the payload lacks the expected fields.
    """
    jsonld = get_ld_json(url)
    useful = _recipe_node(jsonld)
    name = useful['name']
    ingredients = useful["recipeIngredient"]
    instructions = useful["recipeInstructions"]

    steps = []
    for step in instructions:
        # Split on every period and keep each non-empty fragment. Filtering
        # on emptiness (rather than always discarding the last fragment)
        # keeps a final sentence that has no trailing period.
        # NOTE: this also splits inside decimals such as "1.5".
        for sentence in step['text'].split("."):
            sentence = sentence.strip()
            if sentence:
                steps.append(sentence)

    return name, ingredients, steps


In [4]:
# Ingredients Function

def parse_ingredients(ingredients):
    """Break raw ingredient strings into five positional fields.

    Two input shapes are handled:

    * ``number ("(...)") (unit) (adjective) ingredient(, other)``
    * anything containing the exact (case-sensitive) text ``to taste``

    Args:
        ingredients: Iterable of ingredient strings.

    Returns:
        A list with one five-element list per input string, laid out as
        ``[amount, unit, ingredient, descriptor, preparation]``. Fields that
        could not be filled are left as ``""``.
    """
    ingredients_parsed = []

    for i in ingredients:
        curr_arr = ["", "", "", "", ""]
        if 'to taste' not in i:
            # Everything after the first ", " is treated as preparation and
            # kept out of the part-of-speech analysis.
            root_phrase, _, other_piece = i.partition(", ")
            curr_arr[4] = other_piece

            # A parenthetical such as "(8 ounce)" seeds the descriptor and is
            # removed from the phrase. partition() tolerates a missing ")":
            # the remainder becomes the descriptor and nothing is appended.
            before, opening, rest = root_phrase.partition("(")
            if opening:
                inside, _, after = rest.partition(")")
                curr_arr[3] = inside
                root_phrase = before.strip() + after

            for index, token in enumerate(nlp(root_phrase)):
                if index == 0:
                    # First token is taken as the amount.
                    curr_arr[0] = token.text
                elif index == 1:
                    # Second token is the unit unless it is an adjective.
                    # NOTE: the adjective case overwrites any parenthetical
                    # stored in the descriptor above.
                    if token.pos_ != "ADJ":
                        curr_arr[1] = token.text
                    else:
                        curr_arr[3] = token.text
                elif token.dep_ == "ROOT":
                    # The syntactic root is taken as the ingredient itself.
                    curr_arr[2] = token.text
                else:
                    curr_arr[3] = (curr_arr[3] + " " + token.text).strip()
        else:
            # No amount/unit here: the root is the ingredient and every other
            # token is accumulated into the descriptor.
            curr_arr[0] = "to taste"
            for token in nlp(i.replace("to taste", "")):
                if token.dep_ == "ROOT":
                    curr_arr[2] = token.text
                else:
                    curr_arr[3] = (curr_arr[3] + " " + token.text).strip()
        ingredients_parsed.append(curr_arr)

    return ingredients_parsed

In [49]:
# Methods function

def find_methods(step):
    """Extract cooking verbs from one instruction, with any attached duration.

    The sentence is parsed with the module-level ``nlp`` pipeline. Every token
    tagged ``VERB`` becomes a method. Objects (``pobj``/``dobj``) are linked
    to a verb by walking up the dependency heads, and numeric modifiers of
    such an object are recorded as that verb's times.

    Args:
        step: Instruction sentence as a string.

    Returns:
        A dict mapping each verb's lowercased text to
        ``{'times': [float, ...], 'scale': str or None}``, where ``scale`` is
        the text of the last object linked to the verb. Returns ``{}`` when
        the sentence contains no verb.
    """
    doc = nlp(step)
    methods = {}    # verb token -> {'times': [...], 'scale': ...}
    timescale = {}  # object token -> token it hangs off (later: owning verb)
    times = {}      # numeric token -> token it modifies
    connector = {}  # token -> its head, used to climb towards a verb
    for entity in doc:
        if entity.pos_ == "VERB":
            methods[entity] = {'times': [], 'scale': None}
        if entity.dep_ == "pobj" or entity.dep_ == 'dobj':
            timescale[entity] = entity.head
        if entity.pos_ == "NUM" and (entity.dep_ == 'quantmod' or entity.dep_ == 'nummod'):
            times[entity] = entity.head
        if str(entity.text) != str(entity.head):
            connector[entity] = entity.head

    if not methods:
        return {}

    # When a verb hangs off a non-verb head, flip that edge so that climbing
    # from the head reaches the verb.
    for m in methods:
        if m in connector:
            n = connector[m]
            if n not in methods:
                connector[n] = m
                connector.pop(m)

    # Climb from each object until a verb is reached or the chain runs out.
    for scale in timescale:
        while timescale[scale] not in methods:
            if timescale[scale] not in connector:
                break
            timescale[scale] = connector[timescale[scale]]

    # A number modifying another number inherits that number's target.
    for num in times:
        if times[num] in times:
            times[num] = times[times[num]]

    for s in timescale:
        owner = timescale[s]
        if owner not in methods:
            # The climb above never reached a verb; nothing to attach.
            continue
        for t in times:
            if times[t] == s:
                # Try a numeric/fraction literal first ("15", "1/2"), then a
                # spelled-out number ("five"). Unparseable tokens are skipped
                # on purpose, but interpreter-level interrupts still propagate.
                try:
                    value = float(Fraction(str(t)))
                except Exception:
                    try:
                        value = float(w2n.word_to_num(str(t)))
                    except Exception:
                        continue
                methods[owner]['times'].append(value)
        methods[owner]['scale'] = str(s)

    return {str(m).lower(): methods[m] for m in methods}

In [6]:
# Tools function

def find_tool(sentence,filter_list):
    """Collect noun chunks that look like tools in an instruction sentence.

    The sentence is lowercased and parsed with the module-level ``nlp``
    pipeline. A noun chunk is kept when the head of its root token is one of
    a fixed set of prepositions and the root's text is not in ``filter_list``.

    Args:
        sentence: Instruction sentence as a string.
        filter_list: Words that must not be reported as a tool.

    Returns:
        List of matching noun-chunk texts, in sentence order.
    """
    in_list = ['in','into','on','to','with','onto','through']
    parsed = nlp(sentence.lower())
    return [
        chunk.text
        for chunk in parsed.noun_chunks
        if chunk.root.text not in filter_list and chunk.root.head.text in in_list
    ]

In [7]:
# Steps function

def write_step(number, ingredients, tools, methods):
    """Render one recipe step as a single summary line.

    Args:
        number: Step label; converted with ``str()`` so ints are accepted.
        ingredients: Iterable of ingredient names (strings).
        tools: Iterable of tool names (strings).
        methods: Mapping of method name to ``{'times': list, 'scale': ...}``.
            With no times only the name is shown; with one time that value is
            shown; with several the list's ``str()`` is shown. A ``None``
            scale is omitted rather than raising.

    Returns:
        ``'Step <number>: Ingredient(s): ...; Cooking Method(s): ...;
        Tool(s): ....'`` where every listed item is followed by ``', '``.
    """
    def _listing(items):
        # Each item, including the last, is followed by ', ' so the output
        # format stays exactly as it was.
        return ''.join(item + ', ' for item in items)

    method_parts = []
    for m in methods:
        info = methods[m]
        times = info['times']
        if not times:
            method_parts.append(m)
            continue
        duration = str(times[0]) if len(times) == 1 else str(times)
        scale = info['scale']
        if scale is not None:
            duration = duration + ' ' + str(scale)
        method_parts.append(m + ' for ' + duration)

    return (
        'Step ' + str(number) + ': Ingredient(s): ' + _listing(ingredients)
        + '; Cooking Method(s): ' + _listing(method_parts)
        + '; Tool(s): ' + _listing(tools)
        + '.'
    )

In [8]:
# Exercise write_step with hand-written inputs: one method without a
# duration and one with two candidate durations.
ingredients = ['apple', 'pear', 'banana']
tools = ['bowl', 'pan']
methods = {
    'mix': {'times': [], 'scale': 'bowl'},
    'cool': {'times': [35.0, 45.0], 'scale': 'minutes'},
}

s = write_step('apples', ingredients, tools, methods)
print(s)

Name: apples; Ingredient(s): apple, pear, banana, ; Cooking Method(s): mix, cool for [35.0, 45.0] minutes, ; Tool(s): bowl, pan, .


In [53]:
# Recipe page to analyse. The commented-out URL is an alternative page.
# url = "https://www.allrecipes.com/recipe/263484/chef-johns-chocolate-croissants/?internalSource=hub%20recipe&referringContentType=Search"
url = "https://www.allrecipes.com/recipe/263384/spicy-shrimp-pad-thai/?internalSource=hub%20recipe&referringContentType=Search"

# Fetch the page, then show the recipe name, its steps and its raw
# ingredient strings, one per line.
name, ingred, instructions = parse_json(url)
print(name, instructions, ingred, sep="\n")
    
    
    # get ingredients first
#     foods = [i for i in ingredients if i in sentence]

Spicy Shrimp Pad Thai
['Place noodles in a large bowl and cover with hot water', 'Set aside until noodles are softened, about 15 minutes', 'Drain and rinse thoroughly', 'Heat oil in a wok over medium heat', 'Cook and stir onion and garlic until onion is translucent, about 5 minutes', 'Combine rice wine vinegar, ketchup, fish sauce, chile sauce, peanut butter, soy sauce, lime juice, sugar, red pepper flakes, and cayenne pepper together in a bowl', 'Set aside', 'Add shrimp to the wok', 'Cook and stir until pink, about 4 minutes', 'Add the ketchup mixture and stir', 'Move all ingredients to one side to clear a small space in the pan', 'Pour in eggs and cook without stirring until partially set, about 3 minutes', 'Stir the drained noodles into the shrimp and egg mixture', 'Add 1/2 cup peanuts, 1/4 cup bean sprouts, and carrots', 'Cook and stir until noodles are heated through, about 2 minutes', 'Garnish with the remaining peanuts, bean sprouts, lime wedges, green onions, and cilantro']
['1

In [54]:
# Getting ingredients ------------------------------------------------

def _core_name(text):
    """Normalise a descriptor/ingredient phrase into a matchable core name.

    Re-joins hyphenated words that were tokenised as ``a - b`` and removes the
    standalone word ``and``. Matching whole words keeps names that merely
    contain those letters (e.g. "brandy") intact.
    """
    text = text.replace(' - ', '-')
    return ' '.join(word for word in text.split() if word != 'and')


ingredients = parse_ingredients(ingred)
core = []
for i in ingredients:
    # parse_ingredients yields [amount, unit, ingredient, descriptor, prep];
    # after this in-place reverse: i[1] = descriptor, i[2] = ingredient,
    # i[3] = unit.
    i.reverse()
    if i[1] and i[2]:
        core.append(_core_name(' '.join(i[1:3])))
    elif i[1]:
        # Same clean-up as the branch above (previously only the hyphen fix
        # was applied here because of a misplaced parenthesis).
        core.append(_core_name(i[1]))
    elif i[2]:
        core.append(i[2])
    else:
        core.append(i[3])

# Drop duplicates while keeping first-seen order.
core = list(dict.fromkeys(core))
print(core)
print("\n")
# Tools setup --------------------------------------------------------

# Words that must never be reported as tools: a few fixed terms, every word
# of every parsed ingredient field, and the words of the recipe name.
ing = itertools.chain.from_iterable(ingredients)
ing = list(dict.fromkeys(ing))
filter_list = ['mixture','ingredients','degrees',
                'temperature','f', 'fahrenheit', 'c', 'celsius']
for ingr in ing:
    for word in ingr.split():
        filter_list.append(word)
for w in name.lower().split():
    filter_list.append(w)
filter_list = list(dict.fromkeys(filter_list))

# Getting tools and methods

tools = []
for i, sentence in enumerate(instructions):
    print(sentence)
    lst = []
    for chunks in sentence.split():
        # For each word of the step, pick the core ingredient that shares a
        # word-substring with it and has the smallest edit distance.
        theone = None
        minimum = 10000000
        for full_item in core:
            if any(ind in chunks for ind in full_item.split()):
                d = distance(full_item, chunks)
                if d < minimum:
                    minimum = d
                    theone = full_item
        lst.append(theone)

    # Remove misses (None) and duplicates, keeping first-seen order.
    lst = list(dict.fromkeys([l for l in lst if l]))

    t = find_tool(sentence,filter_list)
    m = find_methods(sentence)
    s = write_step(str(i+1), lst, t, m)
    print(s)
    print("\n")
    
#     for each in t:
#         tools.append(each)
# tools = list(dict.fromkeys(tools))


['8 ounce dried rice noodles', 'peanut oil', 'small onion', 'garlic', 'rice wine vinegar', 'ketchup', 'fish sauce', 'sweet chile sauce', 'creamy peanut butter', 'light soy sauce', 'fresh lime juice', 'white sugar', 'red pepper flakes', 'cayenne pepper', 'uncooked medium shrimp', 'eggs', 'unsalted dry-roasted peanuts', 'bean sprouts', 'carrots shredded', 'lime', 'green onions chopped', 'coarsely cilantro chopped']


Place noodles in a large bowl and cover with hot water
Step 1: Ingredient(s): 8 ounce dried rice noodles, ; Cooking Method(s): cover, ; Tool(s): a large bowl, hot water, .


Set aside until noodles are softened, about 15 minutes
Step 2: Ingredient(s): 8 ounce dried rice noodles, ; Cooking Method(s): set, softened, ; Tool(s): .


Drain and rinse thoroughly
Step 3: Ingredient(s): ; Cooking Method(s): drain, ; Tool(s): .


Heat oil in a wok over medium heat
Step 4: Ingredient(s): peanut oil, uncooked medium shrimp, ; Cooking Method(s): ; Tool(s): a wok, .


Cook and stir onion 