Merge b55cf47 into 188c5fd

hhursev · Jun 8, 2024 · 0265b0a · 0265b0a
2 parents 188c5fd + b55cf47
commit 0265b0a
Show file tree

Hide file tree

Showing 9 changed files with 260 additions and 84 deletions.
diff --git a/README.rst b/README.rst
diff --git a/docs/how-to-develop-scraper.md b/docs/how-to-develop-scraper.md
@@ -205,7 +205,13 @@ Where `ClassName` is the name that you used earlier to generate the scraper.
 > [!TIP]
 > It is also recommended that you manually test the scraper with a couple of different recipes from the website, to check that there aren't any special cases the scraper will need to handle. You don't need to create test cases for each of these.
 
-## 6. Open a pull request
+## 6. Update the README
+
+Add the website's domain to the supported scraper list in README.rst, ensuring alphabetical order.
+
+If your site supports multiple top-level domains (e.g. `.com.au`, `.co.uk`, `.at`) then list these on an indented entry under the primary domain (the default value of `host()` when no arguments are provided). For an example of this, check out the `hellofresh` listings.
+
+## 7. Open a pull request
 
 Once you have finished developing the scraper and test case, you can commit the files to git and push them to GitHub. You should also update the README.rst to list the site, alphabetically, under the [Scrapers available for:](https://github.com/hhursev/recipe-scrapers#scrapers-available-for) header.
 

diff --git a/recipe_scrapers/__init__.py b/recipe_scrapers/__init__.py
@@ -199,6 +199,7 @@
 from .ministryofcurry import MinistryOfCurry
 from .misya import Misya
 from .mob import Mob
+from .mobkitchen import MobKitchen
 from .modernhoney import ModernHoney
 from .momontimeout import MomOnTimeout
 from .momswithcrockpots import MomsWithCrockPots
@@ -591,8 +592,8 @@
     Minimalistbaker.host(): Minimalistbaker,
     MinistryOfCurry.host(): MinistryOfCurry,
     Misya.host(): Misya,
-    Mob.host(domain="mob.co.uk"): Mob,
-    Mob.host(domain="mobkitchen.co.uk"): Mob,
+    Mob.host(): Mob,
+    MobKitchen.host(): MobKitchen,
     MomsWithCrockPots.host(): MomsWithCrockPots,
     MonsieurCuisine.host(): MonsieurCuisine,
     MotherThyme.host(): MotherThyme,

diff --git a/recipe_scrapers/mob.py b/recipe_scrapers/mob.py
@@ -14,8 +14,8 @@ def __init__(self, *args, **kwargs):
         )["props"]["pageProps"]["recipe"]
 
     @classmethod
-    def host(cls, domain="mob.co.uk"):
-        return domain
+    def host(cls):
+        return "mob.co.uk"
 
     def author(self):
         chefs = self.recipe_json.get("chefs", [])

diff --git a/recipe_scrapers/mobkitchen.py b/recipe_scrapers/mobkitchen.py
@@ -0,0 +1,9 @@
+# mypy: allow-untyped-defs
+
+from .mob import Mob
+
+
+class MobKitchen(Mob):
+    @classmethod
+    def host(cls):
+        return "mobkitchen.co.uk"
diff --git a/recipe_scrapers/weightwatchers.py b/recipe_scrapers/weightwatchers.py
@@ -9,7 +9,7 @@
 class WeightWatchers(AbstractScraper):
     @classmethod
     def host(cls):
-        return "www.weightwatchers.com"
+        return "weightwatchers.com"
 
     def author(self):
         return "WeightWatchers"

diff --git a/recipe_scrapers/weightwatcherspublic.py b/recipe_scrapers/weightwatcherspublic.py
@@ -8,7 +8,7 @@
 class WeightWatchersPublic(WeightWatchers):
     @classmethod
     def host(cls):
-        return "www.weightwatchers.com"
+        return "weightwatchers.com"
 
     def _find_data_container(self):
         return self.soup.find("div", {"class": "HorizontalList_list__GESs0"})

diff --git a/tests/library/test_readme.py b/tests/library/test_readme.py
@@ -0,0 +1,162 @@
+import re
+import unittest
+from collections import defaultdict
+from typing import Dict, List, Optional, Tuple
+
+from recipe_scrapers import SCRAPERS, AbstractScraper
+
+START_LIST = "-----------------------"
+END_LIST = "(*) offline saved files only"
+
+ScraperIndex = Dict[str, Tuple[AbstractScraper, List[str]]]
+
+
+def get_scraper_domains():
+    scraper_domains = defaultdict(list)
+    for domain, scraper in SCRAPERS.items():
+        primary_domain = scraper.host()
+        if domain == primary_domain:
+            scraper_domains[scraper].insert(0, domain)
+        else:
+            scraper_domains[scraper].append(domain)
+    return scraper_domains
+
+
+def get_scraper_index() -> ScraperIndex:
+    scraper_index: ScraperIndex = {}
+    for scraper_instance, domains in get_scraper_domains().items():
+        shared_prefix = get_shared_prefix(domains)
+
+        if not shared_prefix:
+            # Treat all as primary domains
+            for domain in domains:
+                scraper_index[domain] = (scraper_instance, [domain])
+            continue
+
+        # Index the primary domain and include their secondary domains minus the shared prefix
+        primary_domain = scraper_instance.host()
+        secondary_domains = [
+            domain[len(shared_prefix) :] if domain.startswith(shared_prefix) else domain
+            for domain in domains
+            if domain != shared_prefix
+        ]
+        scraper_index[primary_domain] = (scraper_instance, secondary_domains)
+
+    # NOTE: the index is not sorted here; callers sort the primary domains themselves
+    return scraper_index
+
+
+def get_shared_prefix(domains: List[str]) -> str:
+    """
+    Find the longest-common-prefix of the domains
+    """
+    if not domains:
+        return ""
+
+    shared_prefix = domains[0]
+    for domain in domains[1:]:
+        while not domain.startswith(shared_prefix):
+            shared_prefix = shared_prefix[:-1]
+            if not shared_prefix:
+                return ""
+
+    if "." in shared_prefix:
+        shared_prefix, _ = shared_prefix.rsplit(".", 1)
+
+    return shared_prefix
+
+
+def get_secondary_domains(
+    scraper_index: ScraperIndex, primary_domain: str
+) -> List[str]:
+    _, suffixes = scraper_index[primary_domain]
+    return [suffix for suffix in suffixes if not primary_domain.endswith(suffix)]
+
+
+def parse_primary_line(line: str) -> Optional[Tuple[str, str]]:
+    match = re.search(
+        r"^- `https?://(?:www\.)?([^/\s]+)[^<]*<https?://(?:www\.)?([^/\s]*)[^>]*>`_(?: \(\*\))?$",
+        line,
+    )
+    if match:
+        groups = match.groups()
+        if len(groups) == 2:
+            return groups
+    return None
+
+
+def parse_secondary_line(line: str) -> List[Tuple[str, str]]:
+    return re.findall(r"`(\.[^\s]+)\s<https?://(?:www\.)?([^/>]+)[^>]*>`_", line)
+
+
+def get_list_lines() -> List[str]:
+    list_lines: List[str] = []
+    with open("README.rst") as f:
+        started_list = False
+        for line in f:
+            stripped_line = line.strip()
+            if stripped_line == START_LIST:
+                started_list = True
+                continue
+
+            if not started_list or not stripped_line:
+                continue
+
+            if stripped_line == END_LIST:
+                break
+
+            list_lines.append(line)
+    return list_lines
+
+
+class TestReadme(unittest.TestCase):
+
+    def test_includes(self):
+        scraper_index = get_scraper_index()
+        primary_domains = sorted(scraper_index.keys())
+        lines = get_list_lines()
+        current_line_index = 0
+
+        for primary_host in primary_domains:
+            current_line = lines[current_line_index]
+            parse_result = parse_primary_line(current_line)
+
+            if not parse_result:
+                self.fail(f"Invalid line: {current_line}")
+
+            name_host, value_host = parse_result
+            self.assertEqual(
+                name_host,
+                value_host,
+                "The name and value hyperlink portions have different hosts.",
+            )
+            self.assertEqual(
+                name_host,
+                primary_host,
+                f"The host ({name_host}) doesn't match the expected host ({primary_host})",
+            )
+
+            current_line_index += 1
+            secondary_hosts = get_secondary_domains(scraper_index, primary_host)
+
+            if secondary_hosts:
+                current_line = lines[current_line_index]
+                parse_result = parse_secondary_line(current_line)
+
+                if not parse_result:
+                    self.fail(f"Invalid line: {current_line}")
+
+                sorted_secondary_hosts = sorted(secondary_hosts)
+                for i, secondary_host in enumerate(sorted_secondary_hosts):
+                    if i >= len(parse_result):
+                        self.fail(
+                            f"Missing top level domain(s) for primary domain {primary_host}"
+                        )
+
+                    top_level_domain = parse_result[i][0]
+                    self.assertEqual(
+                        secondary_host,
+                        top_level_domain,
+                        f"Expected top level domain {secondary_host}, got {top_level_domain} for primary domain {primary_host}",
+                    )
+                current_line_index += 1
diff --git a/tests/test_data/weightwatchers.com/weightwatcherspublic.json b/tests/test_data/weightwatchers.com/weightwatcherspublic.json
@@ -1,23 +1,23 @@
 {
-  "author": null,
+  "author": "WeightWatchers",
   "canonical_url": "https://www.weightwatchers.com/de/rezept/kartoffelgulasch/562a9b02873e1afb2a3c4c13",
   "site_name": null,
   "host": "weightwatchers.com",
   "language": "en",
   "title": "Kartoffelgulasch",
   "ingredients": [
-    "800 g Kartoffeln vorwiegend festkochend",
-    "2 Stück, mittelgroß Zwiebel/n",
-    "2 Stück, rot Paprika",
-    "2 Stück, grün Paprika",
-    "1 EL, gehackt Petersilie",
-    "2 Stück, gelb Paprika",
+    "800 g Kartoffeln; vorwiegend festkochend",
+    "2 Stück Zwiebel/n; mittelgroß",
+    "2 Stück Paprika; rot",
+    "2 Stück Paprika; grün",
+    "1 EL Petersilie; gehackt",
+    "2 Stück Paprika; gelb",
     "250 g Tomaten, frisch",
     "4 Stück Wiener Würstchen",
     "2 TL Pflanzenöl, Rapsöl/Sonnenblumenöl",
     "2 EL Tomatenmark",
-    "250 ml Gemüsebouillon/Gemüsebrühe, zubereitet (1 TL Instantpulver)",
-    "1 TL Oregano gehackt",
+    "250 ml Gemüsebouillon/Gemüsebrühe, zubereitet; (1 TL Instantpulver)",
+    "1 TL Oregano; gehackt",
     "1 TL Paprikapulver",
     "1 Prise(n) Salz/Jodsalz",
     "1 Prise(n) Pfeffer",
@@ -26,18 +26,18 @@
   "ingredient_groups": [
     {
       "ingredients": [
-        "800 g Kartoffeln vorwiegend festkochend",
-        "2 Stück, mittelgroß Zwiebel/n",
-        "2 Stück, rot Paprika",
-        "2 Stück, grün Paprika",
-        "1 EL, gehackt Petersilie",
-        "2 Stück, gelb Paprika",
+        "800 g Kartoffeln; vorwiegend festkochend",
+        "2 Stück Zwiebel/n; mittelgroß",
+        "2 Stück Paprika; rot",
+        "2 Stück Paprika; grün",
+        "1 EL Petersilie; gehackt",
+        "2 Stück Paprika; gelb",
         "250 g Tomaten, frisch",
         "4 Stück Wiener Würstchen",
         "2 TL Pflanzenöl, Rapsöl/Sonnenblumenöl",
         "2 EL Tomatenmark",
-        "250 ml Gemüsebouillon/Gemüsebrühe, zubereitet (1 TL Instantpulver)",
-        "1 TL Oregano gehackt",
+        "250 ml Gemüsebouillon/Gemüsebrühe, zubereitet; (1 TL Instantpulver)",
+        "1 TL Oregano; gehackt",
         "1 TL Paprikapulver",
         "1 Prise(n) Salz/Jodsalz",
         "1 Prise(n) Pfeffer",
@@ -55,12 +55,12 @@
   "yields": "4 servings",
   "description": "Das Rezept zaubert ein saftiges, würziges Gericht auf den Tisch und schmeckt garantiert.",
   "total_time": 40,
-  "cook_time": null,
+  "cook_time": 0,
   "prep_time": 40,
   "nutrients": {
-    "calories": "137 kcal"
+    "points": "13 Points&reg; value"
   },
-  "image": "https://cmx.weightwatchers.com/assets-proxy/weight-watchers/image/upload/t_WINE_EXTRALARGE/h7wo0hbnwcleucj30sbw.jpg",
+  "image": "https://cmx.weightwatchers.com/assets-proxy/weight-watchers/image/upload/q_auto/h7wo0hbnwcleucj30sbw.jpg?auto=webp",
   "keywords": [
     "Ohne Meeresfrüchte",
     "Ohne Fisch",