In [198]:
import networkx as nx
from networkx.readwrite import json_graph

from nltk.corpus import wordnet as wn
from textblob import Word
from spacy.symbols import *
import spacy
nlp = spacy.load('en_core_web_sm')

from word2number import w2n
from fractions import Fraction

from bs4 import BeautifulSoup

import warnings
warnings.filterwarnings("ignore")

In [214]:
# Execute the companion notebook so that the names it defines are available in this kernel.
# NOTE(review): `fetch_url`, called by make_formatted_recipe below, is not defined in this
# notebook and is presumably provided by Get_Recipes.ipynb -- TODO confirm.
%run "./Get_Recipes.ipynb"

In [None]:
# WordNet synsets used as category anchors while parsing ingredient lines.

def _first_synset(lemma):
    """Return the first synset TextBlob reports for `lemma`."""
    return Word(lemma).get_synsets()[0]


# food categories
food = Word('food')
_food_senses = food.get_synsets()
food_syn1 = _food_senses[0]  # expected: Synset('food.n.01')
food_syn2 = _food_senses[1]  # expected: Synset('food.n.02')
food_syn3 = _first_synset('oil')  # expected: Synset('oil.n.01')

# quantity / unit categories
quant_syn1 = _first_synset('mass_unit')  # expected: Synset('mass_unit.n.01')
quant_syn2 = _first_synset('containerful')  # expected: Synset('containerful.n.01')

In [255]:
def get_num(word):
    """Sum the run of number-like tokens that ends at `word`.

    Starting at `word` and walking leftwards (``nbor(-1)``), every token whose
    ``like_num`` flag is set contributes to the total, so "1 1/2" yields 1.5.

    Args:
        word: a spaCy-style token exposing ``like_num``, ``text``, ``i`` and
            ``nbor``.

    Returns:
        The accumulated amount; 0 when `word` itself is not number-like or
        none of the numerals could be parsed.
    """
    neighbor = word
    amount = 0
    while neighbor.like_num:
        try:
            # handles "2", "1.5" and "1/2"
            amount += float(Fraction(str(neighbor.text)))
        except (ValueError, ZeroDivisionError):
            # Spelled-out numbers ("two") are only used when nothing has been
            # accumulated yet, as in the original logic.
            if amount == 0:
                try:
                    amount += float(w2n.word_to_num(str(neighbor.text)))
                except ValueError:
                    # number-like token that neither parser understands: skip it
                    pass
        # Stop at the first token of the document: nbor(-1) would raise
        # IndexError there (or wrap around to the last token on old spaCy).
        if neighbor.i == 0:
            break
        neighbor = neighbor.nbor(-1)
    return amount

In [217]:
def _amount_or_zero(token):
    """Return get_num(token), or 0 when the numeral cannot be parsed."""
    try:
        return get_num(token)
    except (ValueError, IndexError):
        return 0


def get_ingredient(sent):
    """Extract one ingredient description from a sentence of an ingredient list.

    Every token with noun synsets is classified through the hypernym closure of
    those synsets: tokens below one of the food synsets (and not below
    ``quant_syn2``) that are tagged NN/NNS become the ingredient word (the last
    such token wins); tokens below ``quant_syn1`` or ``quant_syn2`` become the
    unit, with the amount read from the numerals immediately to their left.

    Args:
        sent: iterable of spaCy tokens (e.g. a sentence span).

    Returns:
        A dict with the keys "amount", "unit", "descript" and "name", or None
        when no ingredient word was found.
    """
    ingredient = {"amount": 0, "unit": None}
    ingredient_word = None

    for word in sent:
        word_ = Word(word.text)
        synsets = word_.get_synsets(pos=wn.NOUN)
        if not synsets:
            continue

        # all hypernyms reachable from any noun sense of this token
        hypernyms = set()
        for synset in synsets:
            hypernyms.update(synset.closure(lambda s: s.hypernyms()))

        is_food = (food_syn1 in hypernyms
                   or food_syn2 in hypernyms
                   or food_syn3 in hypernyms)
        is_container = quant_syn2 in hypernyms

        # get ingredient
        if is_food and not is_container and word.tag_ in ("NN", "NNS"):
            ingredient_word = word

        # get quantity unit and the quantity in front of it
        if quant_syn1 in hypernyms or is_container:
            ingredient["unit"] = word_.singularize()
            # a unit at the very start of the text has no left neighbour
            if word.i > 0:
                ingredient["amount"] = _amount_or_zero(word.nbor(-1))
            else:
                ingredient["amount"] = 0

    ingredient["descript"] = []
    # An explicit check replaces the former bare `except`, which hid real
    # errors (and KeyboardInterrupt) behind a silent None.
    if ingredient_word is None:
        return None

    ingredient["name"] = ingredient_word.lemma_
    for child in ingredient_word.children:
        if child.pos_ == "ADJ":
            # adjectives describe the ingredient
            ingredient["descript"].append(child.lemma_)
        elif ingredient["amount"] == 0 and child.like_num:
            # no unit-based amount was found: use a numeral attached to the noun
            ingredient["amount"] = _amount_or_zero(child)

    return ingredient

In [218]:
def get_ingredient_list(text):
    """Parse a newline-separated ingredient listing.

    Hyphens are replaced by spaces, then every line is run through the spaCy
    pipeline and each resulting sentence is handed to ``get_ingredient``.

    Args:
        text: the ingredient listing, as text or UTF-8 encoded bytes.

    Returns:
        A dict mapping ``Word(name)`` to the ingredient dict returned by
        ``get_ingredient``; a later ingredient with the same name replaces an
        earlier one.
    """
    # Decode byte strings up front. On Python 2 `bytes` is `str`, so this is
    # the same conversion the former `unicode(text, 'utf-8')` performed, and it
    # also works on Python 3 where `unicode` does not exist.
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    text = text.replace("-", " ")
    ingredients_set = {}
    for line in text.split("\n"):
        doc = nlp(line)
        for sent in doc.sents:
            ingredient = get_ingredient(sent)
            if ingredient is not None:
                ingredients_set[Word(ingredient["name"])] = ingredient

    return ingredients_set

In [219]:
def has_match(text, ingredients_set, threshold=0.3):
    """Find the known ingredient that `text` refers to, if any.

    An exact key match wins. Otherwise the first noun synset of `text` is
    compared with the first noun synset of every known ingredient, and the
    first ingredient whose WordNet path similarity exceeds `threshold` is
    returned.

    Args:
        text: the word (lemma) to look up.
        ingredients_set: mapping keyed by ingredient name.
        threshold: exclusive lower bound on the path similarity for a fuzzy
            match; defaults to 0.3.

    Returns:
        The matching key of `ingredients_set` (or `text` itself on an exact
        match), or None when nothing matches.
    """
    if text in ingredients_set:
        return text

    text_synsets = Word(text).get_synsets(pos=wn.NOUN)
    if not text_synsets:
        return None

    for ingredient in ingredients_set:
        ingredient_synsets = Word(ingredient).get_synsets(pos=wn.NOUN)
        if not ingredient_synsets:
            continue
        similarity = text_synsets[0].path_similarity(ingredient_synsets[0])
        # path_similarity yields None when no path connects the synsets;
        # comparing None with a float raises TypeError on Python 3.
        if similarity is not None and similarity > threshold:
            return ingredient
    return None

In [220]:
_HEAT_KEYWORDS = ('celsius', 'c', 'fahrenheit', 'f', 'heat', 'temperature')

# (exclusive lower bound, label) pairs, hottest first; anything at or below the
# last bound is labelled "no".
_CELSIUS_BANDS = ((175, "very high"), (100, "high"), (80, "medium"), (25, "low"))
_FAHRENHEIT_BANDS = ((350, "very high"), (210, "high"), (180, "medium"), (77, "low"))


def _match_heat_keyword(text):
    """Return the heat keyword `text` stands for, or None."""
    lowered = text.lower()
    if lowered in _HEAT_KEYWORDS:
        return lowered

    text_synsets = Word(text).get_synsets()
    if not text_synsets:
        return None
    for keyword in _HEAT_KEYWORDS:
        keyword_synsets = Word(keyword).get_synsets()
        for text_syn in text_synsets:
            for keyword_syn in keyword_synsets:
                similarity = text_syn.path_similarity(keyword_syn)
                # None (no connecting path, e.g. verb vs. noun) cannot be
                # compared with a float on Python 3.
                if similarity is not None and similarity >= 0.8:
                    return keyword
    return None


def _parse_degrees(text):
    """Parse a numeral such as "350" or "1,000"; None if it is not a number."""
    try:
        return float(text.replace(",", ""))
    except ValueError:
        return None


def _band_label(degrees, bands):
    """Map a temperature onto the label of the first band it exceeds."""
    for lower_bound, label in bands:
        if degrees > lower_bound:
            return label
    return "no"


def has_heat(word):
    """Classify the heat level a token expresses, if it expresses one.

    The token is matched against the heat keywords (celsius/c, fahrenheit/f,
    heat, temperature), either literally or through WordNet path similarity
    (>= 0.8). The result then depends on the matched keyword and on the
    token's syntactic children.

    Args:
        word: a spaCy-style token exposing ``pos_``, ``text`` and ``children``
            (children expose ``lemma_``, ``like_num`` and ``text``).

    Returns:
        One of "no", "low", "medium", "high", "very high", or None when the
        token carries no usable heat information.
    """
    if word.pos_ == "PUNCT":
        return None

    match = _match_heat_keyword(word.text)
    if match is None:
        return None

    if match == "temperature":
        # only "room temperature" is meaningful here
        for child in word.children:
            if child.lemma_ == "room":
                return "no"
        return None

    if match == "heat":
        for child in word.children:
            for level in ("high", "medium", "low"):
                if child.lemma_ == level:
                    return level
        return "medium"  # if unknown, use medium

    # A unit keyword: grade the first numeric child against the unit's bands.
    bands = _CELSIUS_BANDS if match in ("celsius", "c") else _FAHRENHEIT_BANDS
    for child in word.children:
        if child.like_num:
            # SECURITY: the token text comes from a scraped web page; it used
            # to be passed to eval(). It is now parsed as a plain number, and
            # numerals that cannot be parsed (e.g. "ten") are skipped.
            degrees = _parse_degrees(child.text)
            if degrees is not None:
                return _band_label(degrees, bands)
    return None

In [221]:
def create_graph():
    """Return a new, empty networkx directed graph."""
    return nx.DiGraph()

In [222]:
def get_instruction_step(sent, g, i, lastnode, ingredients_set):
    """Add the nodes and edges for one instruction sentence to the graph.

    Each token is first looked up among the known ingredients; tokens that are
    not ingredients are checked for heat information. A step node `i` of type
    "heat" is created when the sentence mentions heat or introduces an
    ingredient that is not in the graph yet. Without an explicit heat mention
    the step inherits the label of the previous step node.

    Args:
        sent: iterable of spaCy tokens forming the sentence.
        g: the graph being built (networkx DiGraph).
        i: id to use for this sentence's step node.
        lastnode: id of the most recent step node, or None if there is none.
        ingredients_set: mapping of ingredient name to ingredient dict.

    Returns:
        The id of the most recent step node: `i` if a node was created,
        otherwise `lastnode` unchanged.
    """
    heat_label = None  # label of the last heat mention in the sentence, if any
    new_ingredients = []

    for word in sent:
        # check for new ingredients in flow
        match = has_match(word.lemma_, ingredients_set)
        if match:
            if not g.has_node(match) and match not in new_ingredients:
                new_ingredients.append(match)
        else:
            heat_amount = has_heat(word)
            if heat_amount is not None:
                heat_label = heat_amount

    if heat_label is None and not new_ingredients:
        # nothing in this sentence changes the graph
        return lastnode

    step_attrs = {"type": "heat", "label": "no"}
    if heat_label is not None:
        step_attrs["label"] = heat_label
    elif lastnode is not None:
        # no heat mentioned: carry over the heat level of the previous step
        for node_id, data in g.nodes(data=True):
            if node_id == lastnode:
                if data.get("type") == "heat":
                    step_attrs["label"] = data["label"]
                break

    # Attributes are passed as keyword arguments: the `attr_dict=` parameter
    # only exists in networkx 1.x, while keywords work in 1.x and 2.x alike.
    g.add_node(i, **step_attrs)
    # `is not None` rather than truthiness: step id 0 is a valid node
    if lastnode is not None:
        g.add_edge(lastnode, i)
    lastnode = i

    for name in new_ingredients:
        g.add_node(name, type="ingredient", label=name,
                   ingredient=ingredients_set[name])
        g.add_edge(name, lastnode)

    return lastnode

In [223]:
def addDoneNode(g, lastnode):
    """Append the terminal "done" node to the graph.

    Args:
        g: the graph being built (networkx DiGraph).
        lastnode: id of the final step node, or None when no step node was
            ever created.
    """
    # Keyword attributes work on networkx 1.x and 2.x; `attr_dict=` is 1.x only.
    g.add_node("done", type="done", label="done")
    # Without any step node there is nothing to connect; adding an edge from
    # None would create a bogus node (1.x) or raise (2.x).
    if lastnode is not None:
        g.add_edge(lastnode, "done")

In [224]:
def get_instruction_steps(text, ingredients_set):
    """Turn a recipe's preparation text into a node-link graph structure.

    Each sentence found by the spaCy pipeline is passed, together with its
    position, to ``get_instruction_step``; a "done" node is attached at the end.

    Args:
        text: the preparation text.
        ingredients_set: mapping of ingredient name to ingredient dict.

    Returns:
        The graph as produced by ``json_graph.node_link_data``.
    """
    # Preprocessing: turn degree symbols into standalone unit letters.
    degree_units = (
        (u"°F", u" F "),
        (u"°f", u" f "),
        (u"°C", u" C "),
        (u"°c", u" c "),
    )
    for symbol, letter in degree_units:
        text = text.replace(symbol, letter)

    doc = nlp(text.strip())
    graph = create_graph()
    previous = None
    for step, sent in enumerate(doc.sents):
        previous = get_instruction_step(sent, graph, step, previous, ingredients_set)
    addDoneNode(graph, previous)
    return json_graph.node_link_data(graph)

In [242]:
def _parse_number(raw):
    """Parse scraped numeric text such as "12", "3.5", "3.5/4" or "88/100.0".

    Returns None for None, an int for plain integers and a float otherwise;
    "a/b" is evaluated as a true (floating point) division.
    """
    if raw is None:
        return None
    text = str(raw).strip().replace(",", "")
    if "/" in text:
        numerator, denominator = text.split("/", 1)
        return float(numerator) / float(denominator)
    try:
        return int(text)
    except ValueError:
        return float(text)


def make_formatted_recipe(url):
    """Download a recipe page and convert it into a structured dict.

    The page is fetched with ``fetch_url`` and parsed with BeautifulSoup. Each
    of the CSS selectors used below must match exactly one element.

    Args:
        url: address of the recipe page.

    Returns:
        A dict with the keys "source", "title", "rating", "rating_count",
        "make_again_rating", "ingredients" and "steps".

    Raises:
        AssertionError: when a selector does not match exactly one element.
    """
    recipe = {"source": url}
    page = fetch_url(url)
    soup = BeautifulSoup(page, 'html.parser')

    def select_one(selector):
        # every field is expected exactly once on the page
        found = soup.select(selector)
        assert len(found) == 1, "expected exactly one %r element, found %d" % (selector, len(found))
        return found[0]

    recipe["title"] = select_one(".title-source h1").string.strip()

    # SECURITY: these values come from a remote page and used to be passed to
    # eval(); they are now parsed as plain numbers. This also avoids Python 2
    # integer division for ratios such as "3/4".
    recipe["rating"] = _parse_number(select_one(".rating").string)
    recipe["rating_count"] = _parse_number(select_one(".reviews-count").string)

    make_again = select_one(".prepare-again-rating span").string
    if make_again is not None:
        # "88%" -> "88/100.0" -> 0.88
        make_again = make_again.replace("%", "/100.0")
    recipe["make_again_rating"] = _parse_number(make_again)

    ingredients_text = select_one(".ingredient-groups")
    ingredients_list = get_ingredient_list("\n".join(ingredients_text.findAll(text=True)))
    recipe["ingredients"] = ingredients_list

    steps = select_one(".preparation-groups")
    steps_text = "".join(steps.findAll(text=True))
    recipe["steps"] = get_instruction_steps(steps_text, ingredients_list)

    return recipe