Update USDA MyPlate and NIH Healthy Eating (#622)
jedwards1230 committed Oct 13, 2022
1 parent 837da9d commit 2181154
Showing 7 changed files with 2,533 additions and 11 deletions.
95 changes: 85 additions & 10 deletions recipe_scrapers/nihhealthyeating.py
@@ -1,17 +1,27 @@
# mypy: disallow_untyped_defs=False
from typing import List, Optional
from attr import dataclass
from ._abstract import AbstractScraper
from ._exceptions import ElementNotFoundInHtml
from ._utils import get_minutes, get_yields, normalize_string


@dataclass
class IngredientGroup:
    ingredients: List[str]
    purpose: Optional[
        str
    ] = None  # this group of ingredients is {purpose} (e.g. "For the dressing")


class NIHHealthyEating(AbstractScraper):
    @classmethod
    def host(cls):
        return "healthyeating.nhlbi.nih.gov"

    def title(self):
        # This content must be present for all recipes on this website.
-        return self.soup.h1.get_text().strip()
+        return normalize_string(self.soup.h1.get_text())

    def total_time(self):
        # This content must be present for all recipes on this website.
@@ -60,20 +70,66 @@ def image(self):

        return image_relative_url

-    def ingredients(self):
+    def ingredient_groups(self) -> List[IngredientGroup]:
        # This content must be present for recipes on this website.
        ingredients_div = self.soup.find("div", {"id": "ingredients"})
        section = []

        if ingredients_div is None:
            raise ElementNotFoundInHtml("Ingredients not found.")

        # Check whether there is more than one list of ingredients
        ingredients_h4_sections = ingredients_div.find_all("h4")

        # Ingredients are broken down into sections
        # https://healthyeating.nhlbi.nih.gov/recipedetail.aspx?linkId=11&cId=1&rId=5

        if len(ingredients_h4_sections) >= 2:
            ingredients_sections = ingredients_div.find_all("tr")
            for ingredients_section in ingredients_sections:
                items = ingredients_section.find("p").get_text().strip().split("\n")
                # create an ingredient group for each section
                res = IngredientGroup(
                    ingredients=items,
                    purpose=normalize_string(ingredients_section.find("h4").get_text()),
                )
                section.append(res)
            return section

        # Default case
        ingredients_p = ingredients_div.findAll("p")
        ingredients = [normalize_string(para.get_text()) for para in ingredients_p]

-        return [
+        ingredients_list = [
            ing for ing in ingredients if not ing.lower().startswith("recipe cards")
        ]

        # Edge case: ingredients are a mix of single main ingredients and a single subsection
        # https://healthyeating.nhlbi.nih.gov/recipedetail.aspx?linkId=0&cId=10&rId=163

        if len(ingredients_h4_sections) == 1:
            items = (
                ingredients_div.find("h4")
                .find_next_sibling("p")
                .get_text()
                .strip()
                .split("\n")
            )
            group = IngredientGroup(
                purpose=normalize_string(ingredients_h4_sections[0].get_text()),
                ingredients=items,
            )
            section.append(group)
            section.append(IngredientGroup(ingredients=ingredients_list[:-1]))
            return section

        return [IngredientGroup(ingredients_list)]

    def ingredients(self) -> List[str]:
        results = []
        for ingredient_group in self.ingredient_groups():
            results.extend(ingredient_group.ingredients)
        return results

    def instructions(self):
        # This content must be present for recipes on this website.
        directions_div = self.soup.find("div", {"id": "recipe_directions"})
@@ -95,7 +151,7 @@ def nutrients(self):
            self.soup.find("div", {"id": "nutrition_info"}).find("table").find_all("tr")
        ):
            for element in s.find_all("td"):
-                if element.text.strip() != "":
+                if element.get_text().strip() != "":
                    elements.append(normalize_string(element.get_text()))

        for i in range(0, len(elements), 2):
@@ -107,31 +163,50 @@

    def description(self):
        return normalize_string(
-            self.soup.find("p", {"class": "recipe_detail_subtext"}).text.strip()
+            self.soup.find("p", {"class": "recipe_detail_subtext"}).get_text()
        )

    def prep_time(self):
        return get_minutes(
            self.soup.find("table", {"class": "recipe_time_table"})
            .find_all("td")[0]
-            .text.strip()
+            .get_text()
        )

    def cook_time(self):
        return get_minutes(
            self.soup.find("table", {"class": "recipe_time_table"})
            .find_all("td")[1]
-            .text.strip()
+            .get_text()
        )

    def serving_size(self):
        return normalize_string(
            self.soup.find("table", {"class": "recipe_time_table"})
            .find_all("td")[3]
-            .text.strip()
+            .get_text()
        )

    def recipe_source(self):
        return normalize_string(
-            self.soup.find("div", {"id": "Recipe_Source"}).text.split(": ")[1].strip()
+            self.soup.find("div", {"id": "Recipe_Source"}).get_text().split(": ")[1]
        )

    def recipe_cards(self):
        recipe_cards_maker = self.soup.find("strong", string="Recipe Cards:")

        if recipe_cards_maker is None:
            return None

        recipe_cards = []
        recipe_cards_maker_siblings = recipe_cards_maker.next_siblings
        for recipe_cards_maker_sibling in recipe_cards_maker_siblings:
            link = recipe_cards_maker_sibling.find("a")
            if recipe_cards_maker_sibling.name == "li":
                recipe_cards.append(
                    {
                        "size": normalize_string(recipe_cards_maker_sibling.get_text()),
                        "url": link.get("href"),
                    }
                )
        return recipe_cards
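
For illustration, a minimal usage sketch (not part of this commit) of the grouped-ingredient API added above. It assumes the package's top-level scrape_me() helper routes this host to NIHHealthyEating; the URL is the sectioned-recipe example already cited in the code comments.

# Sketch only: exercising the new ingredient_groups() and ingredients().
from recipe_scrapers import scrape_me

scraper = scrape_me(
    "https://healthyeating.nhlbi.nih.gov/recipedetail.aspx?linkId=11&cId=1&rId=5"
)

# One IngredientGroup per h4 section; the heading becomes the group's purpose.
for group in scraper.ingredient_groups():
    print(group.purpose or "(ungrouped)")
    for ingredient in group.ingredients:
        print(f"  - {ingredient}")

# ingredients() flattens the groups, keeping the method's original return shape
# for existing callers.
print(scraper.ingredients())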
41 changes: 41 additions & 0 deletions recipe_scrapers/usdamyplate.py
@@ -69,3 +69,44 @@ def instructions(self):
        instructions = div.find("div", {"class": "field__item"})

return "\n".join(instructions.stripped_strings)

def nutrients(self):
nutrition = {}

table = self.soup.find(
"form", {"class": "mp-recipe-full__nutrition-form"}
).find("table")
rows = table.find_all("tr")

elements = []
for row in rows:
cols = row.find_all("td")
cols = [ele.text.strip() for ele in cols]
elements.append([ele for ele in cols if ele])

for el in elements:
if len(el) > 1:
nutrition[el[0]] = el[1]

return nutrition

def serving_size(self):
return normalize_string(
self.soup.find("div", {"class": "field--name-field-recipe-serving-size"})
.find("span", {"class": "field__item"})
.get_text()
)

def description(self):
return normalize_string(
self.soup.find("div", {"class": "mp-recipe-full__description"})
.find("p")
.get_text()
)

def recipe_source(self):
return normalize_string(
self.soup.find("span", {"class": "field--name-field-source"})
.find("p")
.get_text()
)
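
Likewise, a short sketch (not from this commit) of the new USDA MyPlate accessors. The recipe URL below is a hypothetical placeholder, and scrape_me() is assumed to resolve myplate.gov pages to this scraper.

# Sketch only: the URL is a made-up placeholder.
from recipe_scrapers import scrape_me

scraper = scrape_me("https://www.myplate.gov/recipes/example-recipe")

print(scraper.serving_size())

# nutrients() pairs each table row's label cell with its value cell.
for label, value in scraper.nutrients().items():
    print(f"{label}: {value}")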
File renamed without changes.
