Skip to content

Commit

Permalink
Merge pull request #292 from jodal/update-datasets
Browse files Browse the repository at this point in the history
Update GS1 datasets
  • Loading branch information
jodal authored Jun 30, 2024
2 parents 646044d + 2252df1 commit 74bd7b6
Show file tree
Hide file tree
Showing 10 changed files with 453 additions and 434 deletions.
57 changes: 16 additions & 41 deletions scripts/download_gs1_ai.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,11 @@
import json
from typing import List

import bs4
import httpx

from biip.gs1 import GS1ApplicationIdentifier

AI_URL = "https://www.gs1.org/standards/barcodes/application-identifiers"
AI_URL = "https://ref.gs1.org/ai/GS1_Application_Identifiers.jsonld"


def main() -> None:
Expand All @@ -24,58 +23,34 @@ def download(url: str) -> bytes:
return httpx.get(url, timeout=30).content


def parse(html_content: bytes) -> List[GS1ApplicationIdentifier]:
def parse(json_content: bytes) -> List[GS1ApplicationIdentifier]:
"""Parse the downloaded data into GS1ApplicationIdentifier objects."""
result: List[GS1ApplicationIdentifier] = []

page = bs4.BeautifulSoup(html_content, "html.parser")
datatable = page.find("table", {"class": ["datatable"]})
assert isinstance(datatable, bs4.element.Tag)
tbody = datatable.find("tbody")
assert isinstance(tbody, bs4.element.Tag)
data = json.loads(json_content)

for row in tbody.find_all("tr"):
columns = row.find_all("td")
for row in data["applicationIdentifiers"]:
if "applicationIdentifier" not in row:
continue
result.append(
GS1ApplicationIdentifier(
ai=columns[0].text.strip(),
description=columns[1].text.strip(),
format=columns[2].text.strip(),
data_title=_fix_data_title(columns[3].text.strip()),
fnc1_required=columns[4].text.strip() == "Yes",
pattern=_fix_pattern(columns[5].text.strip()),
ai=row["applicationIdentifier"],
description=row["description"],
format=row["formatString"],
data_title=row["title"],
fnc1_required=row["fnc1required"],
pattern=rf"^{row['applicationIdentifier']}{_fix_pattern(row['regex'])}$",
)
)

return result


def _fix_data_title(value: str) -> str:
"""Remove HTML elements from the data title."""
if "<sup>" in value:
value = value.replace("<sup>", "")
if "</sup>" in value:
value = value.replace("</sup>", "")

return value


def _fix_pattern(value: str) -> str:
"""Fix regular expression metacharacters that are missing their backslash prefix."""
if r"(d" in value:
value = value.replace(r"(d", r"(\d")

if "x" in value:
parts = value.split("x")
new_parts: List[str] = []
for part in parts[:-1]:
if part.endswith("\\"):
new_parts.append(part)
else:
new_parts.append(part + "\\")
new_parts.append(parts[-1])
value = "x".join(new_parts)

"""Fix errors in regex patterns."""
# Add missing opening square bracket to the regex for AI 723x
if value == r"(!%-?A-Z_a-z\x22]{3,30})":
return r"([!%-?A-Z_a-z\x22]{3,30})"
return value


Expand Down
4 changes: 2 additions & 2 deletions src/biip/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@
- GTIN: Failed to parse '123' as GTIN: Expected 8, 12, 13, or 14 digits, got 3.
- UPC: Failed to parse '123' as UPC: Expected 6, 7, 8, or 12 digits, got 3.
- SSCC: Failed to parse '123' as SSCC: Expected 18 digits, got 3.
- GS1: Failed to match '123' with GS1 AI (12) pattern '^12(\d{6})$'.
"""
- GS1: Failed to match '123' with GS1 AI (12) pattern '^12(\d{2}(?:0\d|1[0-2])(?:[0-2]\d|3[01]))$'.
""" # noqa: E501

from importlib.metadata import ( # pyright: ignore[reportMissingImports]
PackageNotFoundError, # pyright: ignore[reportUnknownVariableType]
Expand Down
Loading

0 comments on commit 74bd7b6

Please sign in to comment.