Commit
* Add README inclusion test
* Update tests/library/test_readme.py
  Co-authored-by: Joey <7505194+jknndy@users.noreply.github.com>
* Update tests/library/test_readme.py
  Co-authored-by: Joey <7505194+jknndy@users.noreply.github.com>
* Improve test README code
* Start refactoring
* Update readme test
* Update test to account for full domain name customization
* Update README to adhere to format
* Improve messaging
* Support changing subdomains in host
* Include updated README instructions
* Index-of-scrapers: refactor code for conciseness.
* Refactor and start fixing tests
* Fix tests
* Update weightwatcherspublic.json
* Tests: use py3.8-compatible string-prefix-removal implementation.
  Co-authored-by: Joey <7505194+jknndy@users.noreply.github.com>
* Syntax fixup for prefix-removal logic. Relates-to commit 56b4462.
* Revert "Syntax fixup for prefix-removal logic."
  This reverts commit 14eb36f.
* Revert "Tests: use py3.8-compatible string-prefix-removal implementation."
  This reverts commit 56b4462.
* Tests: use py3.8-compatible string-prefix-removal implementation.
  Co-authored-by: Joey <7505194+jknndy@users.noreply.github.com>
* Tests: apply black formatting to test_readme module.
* Update README.rst
* Update test_readme.py
* test class updates
* fixes from upstream
* Update how-to-develop-scraper.md (slight doc formatting change)

---------

Co-authored-by: Joey <7505194+jknndy@users.noreply.github.com>
Co-authored-by: James Addison <james@reciperadar.com>
Co-authored-by: James Addison <55152140+jayaddison@users.noreply.github.com>
1 parent 188c5fd · commit c6887ca
Showing 9 changed files with 260 additions and 84 deletions.
@@ -0,0 +1,9 @@
# mypy: allow-untyped-defs

from .mob import Mob


class MobKitchen(Mob):
    @classmethod
    def host(cls):
        return "mobkitchen.co.uk"
@@ -0,0 +1,162 @@
import re
import unittest
from collections import defaultdict
from typing import Dict, List, Optional, Tuple

from recipe_scrapers import SCRAPERS, AbstractScraper

START_LIST = "-----------------------"
END_LIST = "(*) offline saved files only"

ScraperIndex = Dict[str, Tuple[AbstractScraper, List[str]]]


def get_scraper_domains():
    scraper_domains = defaultdict(list)
    for domain, scraper in SCRAPERS.items():
        primary_domain = scraper.host()
        if domain == primary_domain:
            scraper_domains[scraper].insert(0, domain)
        else:
            scraper_domains[scraper].append(domain)
    return scraper_domains


def get_scraper_index() -> ScraperIndex:
    scraper_index: ScraperIndex = {}
    for scraper_instance, domains in get_scraper_domains().items():
        shared_prefix = get_shared_prefix(domains)

        if not shared_prefix:
            # Treat all as primary domains
            for domain in domains:
                scraper_index[domain] = (scraper_instance, [domain])
            continue

        # Index the primary domain and include their secondary domains minus the shared prefix
        primary_domain = scraper_instance.host()
        secondary_domains = [
            domain[len(shared_prefix) :] if domain.startswith(shared_prefix) else domain
            for domain in domains
            if domain != shared_prefix
        ]
        scraper_index[primary_domain] = (scraper_instance, secondary_domains)

    # Produce the index sorted by primary domain name
    return scraper_index


def get_shared_prefix(domains: List[str]) -> str:
    """
    Find the longest-common-prefix of the domains
    """
    if not domains:
        return ""

    shared_prefix = domains[0]
    for domain in domains[1:]:
        while not domain.startswith(shared_prefix):
            shared_prefix = shared_prefix[:-1]
            if not shared_prefix:
                return ""

    if "." in shared_prefix:
        shared_prefix, _ = shared_prefix.rsplit(".", 1)

    return shared_prefix


def get_secondary_domains(
    scraper_index: ScraperIndex, primary_domain: str
) -> List[str]:
    _, suffixes = scraper_index[primary_domain]
    return [suffix for suffix in suffixes if not primary_domain.endswith(suffix)]


def parse_primary_line(line: str) -> Optional[Tuple[str, str]]:
    match = re.search(
        r"^- `https?://(?:www\.)?([^/\s]+)[^<]*<https?://(?:www\.)?([^/\s]*)[^>]*>`_(?: \(\*\))?$",
        line,
    )
    if match:
        groups = match.groups()
        if len(groups) == 2:
            return groups
    return None


def parse_secondary_line(line: str) -> List[Tuple[str, str]]:
    return re.findall(r"`(\.[^\s]+)\s<https?://(?:www\.)?([^/>]+)[^>]*>`_", line)


def get_list_lines() -> List[str]:
    list_lines: List[str] = []
    with open("README.rst") as f:
        started_list = False
        for line in f:
            stripped_line = line.strip()
            if stripped_line == START_LIST:
                started_list = True
                continue

            if not started_list or not stripped_line:
                continue

            if stripped_line == END_LIST:
                break

            list_lines.append(line)
    return list_lines


class TestReadme(unittest.TestCase):

    def test_includes(self):
        scraper_index = get_scraper_index()
        primary_domains = sorted(scraper_index.keys())
        lines = get_list_lines()
        current_line_index = 0

        for primary_host in primary_domains:
            current_line = lines[current_line_index]
            parse_result = parse_primary_line(current_line)

            if not parse_result:
                self.fail(f"Invalid line: {current_line}")

            name_host, value_host = parse_result
            self.assertEqual(
                name_host,
                value_host,
                "The name and value hyperlink portions have different hosts.",
            )
            self.assertEqual(
                name_host,
                primary_host,
                f"The host ({name_host}) doesn't match the expected host ({primary_host})",
            )

            current_line_index += 1
            secondary_hosts = get_secondary_domains(scraper_index, primary_host)

            if secondary_hosts:
                current_line = lines[current_line_index]
                parse_result = parse_secondary_line(current_line)

                if not parse_result:
                    self.fail(f"Invalid line: {current_line}")

                sorted_secondary_hosts = sorted(secondary_hosts)
                for i, secondary_host in enumerate(sorted_secondary_hosts):
                    if i >= len(parse_result):
                        self.fail(
                            f"Missing top level domain(s) for primary domain {primary_host}"
                        )

                    top_level_domain = parse_result[i][0]
                    self.assertEqual(
                        secondary_host,
                        top_level_domain,
                        f"Expected top level domain {secondary_host}, got {top_level_domain} for primary domain {primary_host}",
                    )
                current_line_index += 1
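To make the parsing helpers above concrete, the sketch below is not part of the commit: it calls parse_primary_line, parse_secondary_line and get_shared_prefix from the test module with invented README-style lines and domains that merely match the regular expressions; none of this text is copied from README.rst.

# Illustrative sketch, not part of the commit; the sample lines and domains are invented.
primary = "- `https://www.example.com/ <https://www.example.com>`_"
secondary = "(`.co.uk <https://example.co.uk>`_), (`.de <https://example.de>`_)"

print(parse_primary_line(primary))      # expected: ('example.com', 'example.com')
print(parse_secondary_line(secondary))  # expected: [('.co.uk', 'example.co.uk'), ('.de', 'example.de')]

# get_shared_prefix takes the longest common prefix and drops any partial label after the last dot.
print(get_shared_prefix(["example.com", "example.co.uk", "example.de"]))  # expected: "example"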