Add README inclusion test (#1105)
* Add README inclusion test

* Update tests/library/test_readme.py

Co-authored-by: Joey <7505194+jknndy@users.noreply.github.com>

* Update tests/library/test_readme.py

Co-authored-by: Joey <7505194+jknndy@users.noreply.github.com>

* Improve test README code

* Start refactoring

* Update readme test

* Update test to account for full domain name customization

* Update README to adhere to format

* Improve messaging

* Support changing subdomains in host

* Include updated README instructions

* Index-of-scrapers: refactor code for conciseness.

* Refactor and start fixing tests

* Fix tests

* Update weightwatcherspublic.json

* Tests: use py3.8-compatible string-prefix-removal implementation.

Co-authored-by: Joey <7505194+jknndy@users.noreply.github.com>

* Syntax fixup for prefix-removal logic.

Relates-to commit 56b4462.

* Revert "Syntax fixup for prefix-removal logic."

This reverts commit 14eb36f.

* Revert "Tests: use py3.8-compatible string-prefix-removal implementation."

This reverts commit 56b4462.

* Tests: use py3.8-compatible string-prefix-removal implementation.

Co-authored-by: Joey <7505194+jknndy@users.noreply.github.com>

* Tests: apply black formatting to test_readme module.

* Update README.rst

* Update test_readme.py

* test class updates

* fixes from upstream

* Update how-to-develop-scraper.md

slight doc formatting change

---------

Co-authored-by: Joey <7505194+jknndy@users.noreply.github.com>
Co-authored-by: James Addison <james@reciperadar.com>
Co-authored-by: James Addison <55152140+jayaddison@users.noreply.github.com>
4 people committed Jun 8, 2024
1 parent 188c5fd commit c6887ca
Showing 9 changed files with 260 additions and 84 deletions.
112 changes: 55 additions & 57 deletions README.rst

Large diffs are not rendered by default.

8 changes: 7 additions & 1 deletion docs/how-to-develop-scraper.md
@@ -205,7 +205,13 @@ Where `ClassName` is the name that you used earlier to generate the scraper.
> [!TIP]
> It is also recommended that you manually test the scraper with a couple of different recipes from the website, to check that there aren't any special cases the scraper will need to handle. You don't need to create test cases for each of these.
-## 6. Open a pull request
+## 6. Update the README
+
+Add the website's domain to the supported scraper list in README.rst, keeping the list in alphabetical order.
+
+If your site supports multiple top-level domains (e.g. `.com.au`, `.co.uk`, `.at`, etc.), list these in an indented entry under the primary domain (the default value of `host()` when no arguments are provided). For an example of this, check out the `hellofresh` listings.
+
+## 7. Open a pull request

Once you have finished developing the scraper and test case, you can commit the files to git and push them to GitHub. You should also update the README.rst to list the site, alphabetically, under the [Scrapers available for:](https://github.com/hhursev/recipe-scrapers#scrapers-available-for) header.

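For reference, the README entry format that the new test enforces looks roughly like this (illustrative domains; the shape is inferred from the regular expressions in `tests/library/test_readme.py` below, so treat it as a sketch rather than the canonical format):

```
- `https://hellofresh.com/ <https://hellofresh.com>`_
    (`.co.uk <https://hellofresh.co.uk>`_, `.de <https://hellofresh.de>`_)
```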
5 changes: 3 additions & 2 deletions recipe_scrapers/__init__.py
@@ -199,6 +199,7 @@
from .ministryofcurry import MinistryOfCurry
from .misya import Misya
from .mob import Mob
+from .mobkitchen import MobKitchen
from .modernhoney import ModernHoney
from .momontimeout import MomOnTimeout
from .momswithcrockpots import MomsWithCrockPots
@@ -591,8 +592,8 @@
    Minimalistbaker.host(): Minimalistbaker,
    MinistryOfCurry.host(): MinistryOfCurry,
    Misya.host(): Misya,
-    Mob.host(domain="mob.co.uk"): Mob,
-    Mob.host(domain="mobkitchen.co.uk"): Mob,
+    Mob.host(): Mob,
+    MobKitchen.host(): MobKitchen,
    MomsWithCrockPots.host(): MomsWithCrockPots,
    MonsieurCuisine.host(): MonsieurCuisine,
    MotherThyme.host(): MotherThyme,
4 changes: 2 additions & 2 deletions recipe_scrapers/mob.py
@@ -14,8 +14,8 @@ def __init__(self, *args, **kwargs):
)["props"]["pageProps"]["recipe"]

@classmethod
def host(cls, domain="mob.co.uk"):
return domain
def host(cls):
return "mob.co.uk"

def author(self):
chefs = self.recipe_json.get("chefs", [])
9 changes: 9 additions & 0 deletions recipe_scrapers/mobkitchen.py
@@ -0,0 +1,9 @@
# mypy: allow-untyped-defs

from .mob import Mob


class MobKitchen(Mob):
    @classmethod
    def host(cls):
        return "mobkitchen.co.uk"
2 changes: 1 addition & 1 deletion recipe_scrapers/weightwatchers.py
@@ -9,7 +9,7 @@
class WeightWatchers(AbstractScraper):
    @classmethod
    def host(cls):
-        return "www.weightwatchers.com"
+        return "weightwatchers.com"

    def author(self):
        return "WeightWatchers"
2 changes: 1 addition & 1 deletion recipe_scrapers/weightwatcherspublic.py
@@ -8,7 +8,7 @@
class WeightWatchersPublic(WeightWatchers):
    @classmethod
    def host(cls):
-        return "www.weightwatchers.com"
+        return "weightwatchers.com"

    def _find_data_container(self):
        return self.soup.find("div", {"class": "HorizontalList_list__GESs0"})
162 changes: 162 additions & 0 deletions tests/library/test_readme.py
@@ -0,0 +1,162 @@
import re
import unittest
from collections import defaultdict
from typing import Dict, List, Optional, Tuple

from recipe_scrapers import SCRAPERS, AbstractScraper

START_LIST = "-----------------------"
END_LIST = "(*) offline saved files only"

ScraperIndex = Dict[str, Tuple[AbstractScraper, List[str]]]


def get_scraper_domains():
    scraper_domains = defaultdict(list)
    for domain, scraper in SCRAPERS.items():
        primary_domain = scraper.host()
        if domain == primary_domain:
            scraper_domains[scraper].insert(0, domain)
        else:
            scraper_domains[scraper].append(domain)
    return scraper_domains
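# Illustrative example: if HelloFresh registers "hellofresh.com" (the default
# host() value) along with several country-specific domains, the mapping
# returned here looks like
# {HelloFresh: ["hellofresh.com", "hellofresh.co.uk", ...]},
# with the primary domain guaranteed to come first.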


def get_scraper_index() -> ScraperIndex:
    scraper_index: ScraperIndex = {}
    for scraper_instance, domains in get_scraper_domains().items():
        shared_prefix = get_shared_prefix(domains)

        if not shared_prefix:
            # Treat all as primary domains
            for domain in domains:
                scraper_index[domain] = (scraper_instance, [domain])
            continue

        # Index the primary domain, recording its secondary domains with the
        # shared prefix removed
        primary_domain = scraper_instance.host()
        secondary_domains = [
            domain[len(shared_prefix) :] if domain.startswith(shared_prefix) else domain
            for domain in domains
            if domain != shared_prefix
        ]
        scraper_index[primary_domain] = (scraper_instance, secondary_domains)

    # Return the index; callers sort the primary-domain keys when iterating
    return scraper_index
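# Illustrative example: for HelloFresh the shared prefix would be
# "hellofresh", so the entry becomes
# {"hellofresh.com": (HelloFresh, [".com", ".co.uk", ...])}; the primary
# domain's own suffix is filtered out later by get_secondary_domains().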


def get_shared_prefix(domains: List[str]) -> str:
    """
    Find the longest common prefix of the domains, trimmed back to the last
    complete dot-separated label
    """
    if not domains:
        return ""

    shared_prefix = domains[0]
    for domain in domains[1:]:
        while not domain.startswith(shared_prefix):
            shared_prefix = shared_prefix[:-1]
            if not shared_prefix:
                return ""

    if "." in shared_prefix:
        shared_prefix, _ = shared_prefix.rsplit(".", 1)

    return shared_prefix
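# Illustrative example: get_shared_prefix(["hellofresh.com",
# "hellofresh.co.uk"]) narrows to the common prefix "hellofresh.co", trims
# back past the final "." and returns "hellofresh"; domains with nothing in
# common yield "".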


def get_secondary_domains(
    scraper_index: ScraperIndex, primary_domain: str
) -> List[str]:
    _, suffixes = scraper_index[primary_domain]
    return [suffix for suffix in suffixes if not primary_domain.endswith(suffix)]
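# Illustrative example: for primary domain "hellofresh.com" indexed with
# suffixes [".com", ".co.uk"], the ".com" suffix is dropped because the
# primary domain already ends with it, leaving [".co.uk"].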


def parse_primary_line(line: str) -> Optional[Tuple[str, str]]:
    match = re.search(
        r"^- `https?://(?:www\.)?([^/\s]+)[^<]*<https?://(?:www\.)?([^/\s]*)[^>]*>`_(?: \(\*\))?$",
        line,
    )
    if match:
        groups = match.groups()
        if len(groups) == 2:
            return groups
    return None
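# Illustrative example: a README line such as
#   - `https://hellofresh.com/ <https://hellofresh.com>`_
# parses to ("hellofresh.com", "hellofresh.com"); lines in any other shape
# return None.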


def parse_secondary_line(line: str) -> List[Tuple[str, str]]:
    return re.findall(r"`(\.[^\s]+)\s<https?://(?:www\.)?([^/>]+)[^>]*>`_", line)
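# Illustrative example: an indented README line containing
#   (`.co.uk <https://hellofresh.co.uk>`_, `.de <https://hellofresh.de>`_)
# yields [(".co.uk", "hellofresh.co.uk"), (".de", "hellofresh.de")].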


def get_list_lines() -> List[str]:
    list_lines: List[str] = []
    with open("README.rst") as f:
        started_list = False
        for line in f:
            stripped_line = line.strip()
            if stripped_line == START_LIST:
                started_list = True
                continue

            if not started_list or not stripped_line:
                continue

            if stripped_line == END_LIST:
                break

            list_lines.append(line)
    return list_lines
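# Illustrative behaviour: this collects the non-blank README lines between
# the START_LIST underline and the END_LIST footnote, in file order.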


class TestReadme(unittest.TestCase):

    def test_includes(self):
        scraper_index = get_scraper_index()
        primary_domains = sorted(scraper_index.keys())
        lines = get_list_lines()
        current_line_index = 0

        for primary_host in primary_domains:
            current_line = lines[current_line_index]
            parse_result = parse_primary_line(current_line)

            if not parse_result:
                self.fail(f"Invalid line: {current_line}")

            name_host, value_host = parse_result
            self.assertEqual(
                name_host,
                value_host,
                "The name and value hyperlink portions have different hosts.",
            )
            self.assertEqual(
                name_host,
                primary_host,
                f"The host ({name_host}) doesn't match the expected host ({primary_host})",
            )

            current_line_index += 1
            secondary_hosts = get_secondary_domains(scraper_index, primary_host)

            if secondary_hosts:
                current_line = lines[current_line_index]
                parse_result = parse_secondary_line(current_line)

                if not parse_result:
                    self.fail(f"Invalid line: {current_line}")

                sorted_secondary_hosts = sorted(secondary_hosts)
                for i, secondary_host in enumerate(sorted_secondary_hosts):
                    if i >= len(parse_result):
                        self.fail(
                            f"Missing top level domain(s) for primary domain {primary_host}"
                        )

                    top_level_domain = parse_result[i][0]
                    self.assertEqual(
                        secondary_host,
                        top_level_domain,
                        f"Expected top level domain {secondary_host}, got {top_level_domain} for primary domain {primary_host}",
                    )
                current_line_index += 1
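A note on running this check in isolation (assuming the repository's standard unittest layout; the exact invocation may vary): `python -m unittest tests/library/test_readme.py` from the project root should execute only the README inclusion test.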
40 changes: 20 additions & 20 deletions tests/test_data/weightwatchers.com/weightwatcherspublic.json
@@ -1,23 +1,23 @@
 {
-  "author": null,
+  "author": "WeightWatchers",
   "canonical_url": "https://www.weightwatchers.com/de/rezept/kartoffelgulasch/562a9b02873e1afb2a3c4c13",
   "site_name": null,
   "host": "weightwatchers.com",
   "language": "en",
   "title": "Kartoffelgulasch",
   "ingredients": [
-    "800 g Kartoffeln vorwiegend festkochend",
-    "2 Stück, mittelgroß Zwiebel/n",
-    "2 Stück, rot Paprika",
-    "2 Stück, grün Paprika",
-    "1 EL, gehackt Petersilie",
-    "2 Stück, gelb Paprika",
+    "800 g Kartoffeln; vorwiegend festkochend",
+    "2 Stück Zwiebel/n; mittelgroß",
+    "2 Stück Paprika; rot",
+    "2 Stück Paprika; grün",
+    "1 EL Petersilie; gehackt",
+    "2 Stück Paprika; gelb",
     "250 g Tomaten, frisch",
     "4 Stück Wiener Würstchen",
     "2 TL Pflanzenöl, Rapsöl/Sonnenblumenöl",
     "2 EL Tomatenmark",
-    "250 ml Gemüsebouillon/Gemüsebrühe, zubereitet (1 TL Instantpulver)",
-    "1 TL Oregano gehackt",
+    "250 ml Gemüsebouillon/Gemüsebrühe, zubereitet; (1 TL Instantpulver)",
+    "1 TL Oregano; gehackt",
     "1 TL Paprikapulver",
     "1 Prise(n) Salz/Jodsalz",
     "1 Prise(n) Pfeffer",
@@ -26,18 +26,18 @@
   "ingredient_groups": [
     {
       "ingredients": [
-        "800 g Kartoffeln vorwiegend festkochend",
-        "2 Stück, mittelgroß Zwiebel/n",
-        "2 Stück, rot Paprika",
-        "2 Stück, grün Paprika",
-        "1 EL, gehackt Petersilie",
-        "2 Stück, gelb Paprika",
+        "800 g Kartoffeln; vorwiegend festkochend",
+        "2 Stück Zwiebel/n; mittelgroß",
+        "2 Stück Paprika; rot",
+        "2 Stück Paprika; grün",
+        "1 EL Petersilie; gehackt",
+        "2 Stück Paprika; gelb",
         "250 g Tomaten, frisch",
         "4 Stück Wiener Würstchen",
         "2 TL Pflanzenöl, Rapsöl/Sonnenblumenöl",
         "2 EL Tomatenmark",
-        "250 ml Gemüsebouillon/Gemüsebrühe, zubereitet (1 TL Instantpulver)",
-        "1 TL Oregano gehackt",
+        "250 ml Gemüsebouillon/Gemüsebrühe, zubereitet; (1 TL Instantpulver)",
+        "1 TL Oregano; gehackt",
         "1 TL Paprikapulver",
         "1 Prise(n) Salz/Jodsalz",
         "1 Prise(n) Pfeffer",
@@ -55,12 +55,12 @@
   "yields": "4 servings",
   "description": "Das Rezept zaubert ein saftiges, würziges Gericht auf den Tisch und schmeckt garantiert.",
   "total_time": 40,
-  "cook_time": null,
+  "cook_time": 0,
   "prep_time": 40,
   "nutrients": {
-    "calories": "137 kcal"
+    "points": "13 Points&reg; value"
   },
-  "image": "https://cmx.weightwatchers.com/assets-proxy/weight-watchers/image/upload/t_WINE_EXTRALARGE/h7wo0hbnwcleucj30sbw.jpg",
+  "image": "https://cmx.weightwatchers.com/assets-proxy/weight-watchers/image/upload/q_auto/h7wo0hbnwcleucj30sbw.jpg?auto=webp",
   "keywords": [
     "Ohne Meeresfrüchte",
     "Ohne Fisch",
