Update USDA MyPlate and NIH Healthy Eating (#622)
jedwards1230 committed Oct 13, 2022
1 parent 837da9d commit 2181154
Showing 7 changed files with 2,533 additions and 11 deletions.
95 changes: 85 additions & 10 deletions recipe_scrapers/nihhealthyeating.py
@@ -1,17 +1,27 @@
# mypy: disallow_untyped_defs=False
from typing import List, Optional
from attr import dataclass
from ._abstract import AbstractScraper
from ._exceptions import ElementNotFoundInHtml
from ._utils import get_minutes, get_yields, normalize_string


@dataclass
class IngredientGroup:
    ingredients: List[str]
    purpose: Optional[
        str
    ] = None  # this group of ingredients is {purpose} (e.g. "For the dressing")


class NIHHealthyEating(AbstractScraper):
    @classmethod
    def host(cls):
        return "healthyeating.nhlbi.nih.gov"

    def title(self):
        # This content must be present for all recipes on this website.
-        return self.soup.h1.get_text().strip()
+        return normalize_string(self.soup.h1.get_text())

    def total_time(self):
        # This content must be present for all recipes on this website.
@@ -60,20 +70,66 @@ def image(self):

        return image_relative_url

-    def ingredients(self):
+    def ingredient_groups(self) -> List[IngredientGroup]:
        # This content must be present for recipes on this website.
        ingredients_div = self.soup.find("div", {"id": "ingredients"})
        section = []

        if ingredients_div is None:
            raise ElementNotFoundInHtml("Ingredients not found.")

        # Check whether there is more than one list of ingredients
        ingredients_h4_sections = ingredients_div.find_all("h4")

        # Ingredients are broken down into sections
        # https://healthyeating.nhlbi.nih.gov/recipedetail.aspx?linkId=11&cId=1&rId=5

        if len(ingredients_h4_sections) >= 2:
            ingredients_sections = ingredients_div.find_all("tr")
            for ingredients_section in ingredients_sections:
                items = ingredients_section.find("p").get_text().strip().split("\n")
                # create an ingredient group for each section
                res = IngredientGroup(
                    ingredients=items,
                    purpose=normalize_string(ingredients_section.find("h4").get_text()),
                )
                section.append(res)
            return section

        # Default case
        ingredients_p = ingredients_div.findAll("p")
        ingredients = [normalize_string(para.get_text()) for para in ingredients_p]

-        return [
+        ingredients_list = [
            ing for ing in ingredients if not ing.lower().startswith("recipe cards")
        ]

        # Edge case: ingredients are a mix of single main ingredients and a single subsection
        # https://healthyeating.nhlbi.nih.gov/recipedetail.aspx?linkId=0&cId=10&rId=163

        if len(ingredients_h4_sections) == 1:
            items = (
                ingredients_div.find("h4")
                .find_next_sibling("p")
                .get_text()
                .strip()
                .split("\n")
            )
            group = IngredientGroup(
                purpose=normalize_string(ingredients_h4_sections[0].get_text()),
                ingredients=items,
            )
            section.append(group)
            section.append(IngredientGroup(ingredients=ingredients_list[:-1]))
            return section

        return [IngredientGroup(ingredients_list)]

    def ingredients(self) -> List[str]:
        results = []
        for ingredient_group in self.ingredient_groups():
            results.extend(ingredient_group.ingredients)
        return results

    def instructions(self):
        # This content must be present for recipes on this website.
        directions_div = self.soup.find("div", {"id": "recipe_directions"})
@@ -95,7 +151,7 @@ def nutrients(self):
            self.soup.find("div", {"id": "nutrition_info"}).find("table").find_all("tr")
        ):
            for element in s.find_all("td"):
-                if element.text.strip() != "":
+                if element.get_text().strip() != "":
                    elements.append(normalize_string(element.get_text()))

        for i in range(0, len(elements), 2):
@@ -107,31 +163,50 @@

    def description(self):
        return normalize_string(
-            self.soup.find("p", {"class": "recipe_detail_subtext"}).text.strip()
+            self.soup.find("p", {"class": "recipe_detail_subtext"}).get_text()
        )

    def prep_time(self):
        return get_minutes(
            self.soup.find("table", {"class": "recipe_time_table"})
            .find_all("td")[0]
-            .text.strip()
+            .get_text()
        )

    def cook_time(self):
        return get_minutes(
            self.soup.find("table", {"class": "recipe_time_table"})
            .find_all("td")[1]
-            .text.strip()
+            .get_text()
        )

    def serving_size(self):
        return normalize_string(
            self.soup.find("table", {"class": "recipe_time_table"})
            .find_all("td")[3]
-            .text.strip()
+            .get_text()
        )

    def recipe_source(self):
        return normalize_string(
-            self.soup.find("div", {"id": "Recipe_Source"}).text.split(": ")[1].strip()
+            self.soup.find("div", {"id": "Recipe_Source"}).get_text().split(": ")[1]
        )

    def recipe_cards(self):
        recipe_cards_maker = self.soup.find("strong", string="Recipe Cards:")

        if recipe_cards_maker is None:
            return None

        recipe_cards = []
        recipe_cards_maker_siblings = recipe_cards_maker.next_siblings
        for recipe_cards_maker_sibling in recipe_cards_maker_siblings:
            link = recipe_cards_maker_sibling.find("a")
            if recipe_cards_maker_sibling.name == "li":
                recipe_cards.append(
                    {
                        "size": normalize_string(recipe_cards_maker_sibling.get_text()),
                        "url": link.get("href"),
                    }
                )
        return recipe_cards
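
For illustration, a minimal usage sketch (not part of this commit) of the grouped-ingredient API added above. It assumes the package's top-level scrape_me() helper routes this host to NIHHealthyEating; the URL is the sectioned-recipe example already cited in the code comments.

# Sketch only: exercising the new ingredient_groups() and ingredients().
from recipe_scrapers import scrape_me

scraper = scrape_me(
    "https://healthyeating.nhlbi.nih.gov/recipedetail.aspx?linkId=11&cId=1&rId=5"
)

# One IngredientGroup per h4 section; the heading becomes the group's purpose.
for group in scraper.ingredient_groups():
    print(group.purpose or "(ungrouped)")
    for ingredient in group.ingredients:
        print(f"  - {ingredient}")

# ingredients() flattens the groups, keeping the method's original return shape
# for existing callers.
print(scraper.ingredients())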
41 changes: 41 additions & 0 deletions recipe_scrapers/usdamyplate.py
@@ -69,3 +69,44 @@ def instructions(self):
        instructions = div.find("div", {"class": "field__item"})

return "\n".join(instructions.stripped_strings)

def nutrients(self):
nutrition = {}

table = self.soup.find(
"form", {"class": "mp-recipe-full__nutrition-form"}
).find("table")
rows = table.find_all("tr")

elements = []
for row in rows:
cols = row.find_all("td")
cols = [ele.text.strip() for ele in cols]
elements.append([ele for ele in cols if ele])

for el in elements:
if len(el) > 1:
nutrition[el[0]] = el[1]

return nutrition

def serving_size(self):
return normalize_string(
self.soup.find("div", {"class": "field--name-field-recipe-serving-size"})
.find("span", {"class": "field__item"})
.get_text()
)

def description(self):
return normalize_string(
self.soup.find("div", {"class": "mp-recipe-full__description"})
.find("p")
.get_text()
)

def recipe_source(self):
return normalize_string(
self.soup.find("span", {"class": "field--name-field-source"})
.find("p")
.get_text()
)
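
Likewise, a short sketch (not from this commit) of the new USDA MyPlate accessors. The recipe URL below is a hypothetical placeholder, and scrape_me() is assumed to resolve myplate.gov pages to this scraper.

# Sketch only: the URL is a made-up placeholder.
from recipe_scrapers import scrape_me

scraper = scrape_me("https://www.myplate.gov/recipes/example-recipe")

print(scraper.serving_size())

# nutrients() pairs each table row's label cell with its value cell.
for label, value in scraper.nutrients().items():
    print(f"{label}: {value}")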
File renamed without changes.
