WIP

kurtmckee · Apr 21, 2023 · 7fa6924 · 7fa6924
1 parent d8a1986
commit 7fa6924
Show file tree

Hide file tree

Showing 84 changed files with 964 additions and 41 deletions.
diff --git a/.editorconfig b/.editorconfig
@@ -8,5 +8,5 @@ indent_style = space
 insert_final_newline = true
 trim_trailing_whitespace = true
 
-[{*.yaml,*.yml}]
+[{*.yaml,*.yml,*.json}]
 indent_size = 2
diff --git a/feedparser/api.py b/feedparser/api.py
@@ -327,7 +327,7 @@ def _parse_file_inplace(
         try:
             saxparser.parse(source)
         except xml.sax.SAXException as e:
-            result["bozo"] = 1
+            result["bozo"] = True
             result["bozo_exception"] = feed_parser.exc or e
             use_strict_parser = False
 
@@ -358,14 +358,17 @@ def _parse_file_inplace(
         # flag that the JSON parser should be tried.
         if not (feed_parser.entries or feed_parser.feeddata or feed_parser.version):
             use_json_parser = True
+            result["bozo"] = False
+            result.pop("bozo_exception", None)
 
     if use_json_parser:
         result["version"] = None
         feed_parser = JSONParser(baseuri, baselang, "utf-8")
+        feed_parser.sanitize_html = sanitize_html
         try:
             feed_parser.feed(stream_factory.get_file())
         except Exception as e:
-            result["bozo"] = 1
+            result["bozo"] = True
             result["bozo_exception"] = e
 
     result["feed"] = feed_parser.feeddata

diff --git a/feedparser/parsers/json.py b/feedparser/parsers/json.py
@@ -29,20 +29,15 @@
 
 from ..datetimes import _parse_date
 from ..sanitizer import sanitize_html
-from ..util import FeedParserDict
+from ..util import FeedParserDict, looks_like_html
+
+JSON_VERSIONS = {
+    "https://jsonfeed.org/version/1": "json1",
+    "https://jsonfeed.org/version/1.1": "json11",
+}
 
 
 class JSONParser:
-    VERSIONS = {
-        "https://jsonfeed.org/version/1": "json1",
-        "https://jsonfeed.org/version/1.1": "json11",
-    }
-    FEED_FIELDS = (
-        ("title", "title"),
-        ("icon", "image"),
-        ("home_page_url", "link"),
-        ("description", "description"),
-    )
     ITEM_FIELDS = (
         ("title", "title"),
         ("id", "guid"),
@@ -55,6 +50,7 @@ def __init__(self, baseuri=None, baselang=None, encoding=None):
         self.baseuri = baseuri or ""
         self.lang = baselang or None
         self.encoding = encoding or "utf-8"  # character encoding
+        self.sanitize_html = False
 
         self.version = None
         self.feeddata = FeedParserDict()
@@ -64,20 +60,144 @@ def __init__(self, baseuri=None, baselang=None, encoding=None):
     def feed(self, file):
         data = json.load(file)
 
-        v = data.get("version", "")
+        # If the file parses as JSON, assume it's a JSON feed.
+        self.version = "json"
         try:
-            self.version = self.VERSIONS[v]
-        except KeyError:
-            raise ValueError("Unrecognized JSONFeed version '%s'" % v)
+            self.version = JSON_VERSIONS[data["version"].strip()]
+        except (AttributeError, KeyError, TypeError):
+            pass
+
+        # Handle `title`, if it exists.
+        title = data.get("title")
+        if isinstance(title, str):
+            title = title.strip()
+            is_html = looks_like_html(title)
+            content_type = "text/html" if is_html else "text/plain"
+            if is_html and self.sanitize_html:
+                title = sanitize_html(title, encoding=None, _type=content_type)
+            self.feeddata["title"] = title
+            self.feeddata["title_detail"] = {
+                "value": title,
+                "type": content_type,
+            }
+
+        # Handle `description`, if it exists.
+        description = data.get("description")
+        if isinstance(description, str):
+            description = description.strip()
+            is_html = looks_like_html(description)
+            content_type = "text/html" if is_html else "text/plain"
+            if is_html and self.sanitize_html:
+                description = sanitize_html(
+                    description, encoding=None, _type=content_type
+                )
+            self.feeddata["subtitle"] = description
+            self.feeddata["subtitle_detail"] = {
+                "value": description,
+                "type": content_type,
+            }
+
+        # Handle `feed_url`, if it exists.
+        feed_url = data.get("feed_url")
+        if isinstance(feed_url, str):
+            feed_url = feed_url.strip()
+            # The feed URL is also...sigh...the feed ID.
+            self.feeddata["id"] = feed_url
+            self.feeddata.setdefault("links", []).append(
+                {
+                    "href": feed_url,
+                    "rel": "self",
+                }
+            )
+            if "title" in self.feeddata:
+                self.feeddata["links"][-1]["title"] = self.feeddata["title"]
+
+        # Handle `home_page_url`, if it exists.
+        home_page_url = data.get("home_page_url")
+        if isinstance(home_page_url, str):
+            home_page_url = home_page_url.strip()
+            self.feeddata["link"] = home_page_url
+            self.feeddata.setdefault("links", []).append(
+                {
+                    "href": home_page_url,
+                    "rel": "alternate",
+                }
+            )
+
+        # Handle `icon`, if it exists.
+        icon = data.get("icon")
+        if isinstance(icon, str):
+            self.feeddata["image"] = {"href": icon.strip()}
 
-        for src, dst in self.FEED_FIELDS:
-            if src in data:
-                self.feeddata[dst] = data[src]
-        if "author" in data:
-            self.parse_author(data["author"], self.feeddata)
-        # TODO: hubs; expired has no RSS equivalent
+        # Handle `favicon`, if it exists.
+        favicon = data.get("favicon")
+        if isinstance(favicon, str):
+            self.feeddata["icon"] = favicon.strip()
 
-        self.entries = [self.parse_entry(e) for e in data["items"]]
+        # Handle `user_comment`, if it exists.
+        user_comment = data.get("user_comment")
+        if isinstance(user_comment, str):
+            user_comment = user_comment.strip()
+            is_html = looks_like_html(user_comment)
+            content_type = "text/html" if is_html else "text/plain"
+            if is_html and self.sanitize_html:
+                user_comment = sanitize_html(
+                    user_comment, encoding=None, _type=content_type
+                )
+            self.feeddata["info"] = user_comment
+            self.feeddata["info_detail"] = {
+                "value": user_comment,
+                "type": content_type,
+            }
+
+        # Handle `next_url`, if it exists.
+        next_url = data.get("next_url")
+        if isinstance(next_url, str):
+            next_url = next_url.strip()
+            self.feeddata.setdefault("links", []).append(
+                {
+                    "href": next_url,
+                    "rel": "next",
+                }
+            )
+
+        # Handle `expired`, if it exists.
+        expired = data.get("expired", ...)
+        if expired is not ...:
+            # The spec claims that only boolean true means "finished".
+            self.feeddata["complete"] = expired is True
+
+        # Handle `hubs`, if it exists.
+        hubs = data.get("hubs", ...)
+        if hubs is not ...:
+            self.feeddata["hubs"] = []
+            if isinstance(hubs, list):
+                for hub in hubs:
+                    if not isinstance(hub, dict):
+                        continue
+                    url = hub.get("url")
+                    type_ = hub.get("type")
+                    if not (isinstance(url, str) and isinstance(type_, str)):
+                        continue
+                    self.feeddata["hubs"].append(
+                        {
+                            "url": url.strip(),
+                            "type": type_.strip(),
+                        }
+                    )
+
+        # TODO: TEST AUTHOR PARSING THOROUGHLY
+        # TODO: REFACTOR AUTHOR.NAME TESTS SO *THEY* TEST MISSING NAME KEYS.
+        author_singular = data.get("author")
+        if isinstance(author_singular, dict):
+            parsed_author = self._parse_author(author_singular)
+            if parsed_author:
+                self.feeddata["authors"] = [parsed_author]
+                self.feeddata["author_detail"] = parsed_author
+            if "name" in parsed_author:
+                self.feeddata["author"] = parsed_author["name"]
+
+        self.entries = [self.parse_entry(e) for e in data.get("items", ())]
 
     def parse_entry(self, e):
         entry = FeedParserDict()
@@ -107,23 +227,39 @@ def parse_entry(self, e):
             entry["category"] = e["tags"]
 
         if "author" in e:
-            self.parse_author(e["author"], entry)
+            self._parse_author(e["author"])
 
         if "attachments" in e:
             entry["enclosures"] = [self.parse_attachment(a) for a in e["attachments"]]
 
         return entry
 
     @staticmethod
-    def parse_author(parent, dest):
-        dest["author_detail"] = detail = FeedParserDict()
-        if "name" in parent:
-            dest["author"] = detail["name"] = parent["name"]
-        if "url" in parent:
-            if parent["url"].startswith("mailto:"):
-                detail["email"] = parent["url"][7:]
-            else:
-                detail["href"] = parent["url"]
+    def _parse_author(info: dict[str, str]) -> dict[str, str]:
+        parsed_author: dict[str, str] = {}
+
+        name = info.get("name")
+        if isinstance(name, str):
+            parsed_author["name"] = name.strip()
+
+        url = info.get("url")
+        if isinstance(url, str):
+            url = url.strip()
+            parsed_author["href"] = url
+            # URLs can be email addresses.
+            # However, only a "mailto:" URI supports options like:
+            #
+            #   mailto:user@domain.example?subject=Feed
+            #
+            # Caution is required when converting the URL to an email.
+            if url.startswith("mailto:"):
+                parsed_author["email"], _, _ = url[7:].partition("?")
+
+        avatar = info.get("avatar")
+        if isinstance(avatar, str):
+            parsed_author["image"] = avatar.strip()
+
+        return parsed_author
 
     @staticmethod
     def parse_attachment(attachment):

diff --git a/tests/json/README.rst b/tests/json/README.rst
@@ -0,0 +1,47 @@
+JSON feed tests
+===============
+
+The files in this directory exercise the JSON feed parser.
+
+
+``*.json``
+----------
+
+Each file in this directory contains a single JSON object.
+The object will contain a ``__tests`` key whose value is a list of expected conditions.
+Each condition is a Python expression that is evaluated with ``eval()`` after the feed is parsed.
+
+For example:
+
+..  code-block:: json
+
+    {
+      "__tests": [
+        "version == 'json11'"
+      ],
+      "version": "https://jsonfeed.org/version/1.1"
+    }
+
+Files in this directory are automatically found, parsed, and tested.
+
+When writing tests please consider the following:
+
+*   Test incorrect value types that do not match the JSON feed requirements.
+    For example, test null values where a string is required.
+
+*   Test incorrect value types that are not hashable in Python.
+    For example, test lists where a string is required.
+
+*   Test incorrect value types that are also iterable in Python.
+    For example, test a JSON object where a list is required.
+
+*   Write ``eval()`` strings using Unicode escape sequences as needed.
+    This forces the test and JSON feed content to agree.
+    For example:
+
+    ..  code-block:: json
+
+        {
+          "__tests": ["title == '\ud83d\ude0e'"],
+          "title": "😎"
+        }
diff --git a/tests/json/author-avatar-no-name.json b/tests/json/author-avatar-no-name.json
@@ -0,0 +1,12 @@
+{
+  "__description": "author.avatar: missing a `name` key",
+  "__tests": [
+    "len(feed['author_detail'].keys()) == 1",
+    "feed['author_detail']['image'] == 'https://domain.example/i.jpeg'",
+    "len(feed['authors'][0].keys()) == 1",
+    "feed['authors'][0]['image'] == 'https://domain.example/i.jpeg'"
+  ],
+  "author": {
+    "avatar": "https://domain.example/i.jpeg"
+  }
+}
diff --git a/tests/json/author-avatar-whitespace.json b/tests/json/author-avatar-whitespace.json
@@ -0,0 +1,11 @@
+{
+  "__description": "author.avatar: whitespace must be stripped",
+  "__tests": [
+    "feed['author_detail']['image'] == 'https://domain.example/i.jpeg'",
+    "feed['authors'][0]['image'] == 'https://domain.example/i.jpeg'"
+  ],
+  "author": {
+    "name": "abc",
+    "avatar": " https://domain.example/i.jpeg "
+  }
+}
diff --git a/tests/json/author-avatar.json b/tests/json/author-avatar.json
@@ -0,0 +1,14 @@
+{
+  "__description": "author.avatar: valid value",
+  "__tests": [
+    "len(feed['author_detail'].keys()) == 2",
+    "feed['author_detail']['image'] == 'https://domain.example/i.jpeg'",
+    "len(feed['authors']) == 1",
+    "len(feed['authors'][0].keys()) == 2",
+    "feed['authors'][0]['image'] == 'https://domain.example/i.jpeg'"
+  ],
+  "author": {
+    "name": "abc",
+    "avatar": "https://domain.example/i.jpeg"
+  }
+}
diff --git a/tests/json/author-empty.json b/tests/json/author-empty.json
@@ -0,0 +1,9 @@
+{
+  "__description": "author: ignored when the object is empty",
+  "__tests": [
+    "'author' not in feed",
+    "'author_detail' not in feed",
+    "'authors' not in feed"
+  ],
+  "author": {}
+}
diff --git a/tests/json/author-name-whitespace.json b/tests/json/author-name-whitespace.json
@@ -0,0 +1,11 @@
+{
+  "__description": "author.name: whitespace must be stripped",
+  "__tests": [
+    "feed['author'] == 'abc'",
+    "feed['author_detail']['name'] == 'abc'",
+    "feed['authors'][0]['name'] == 'abc'"
+  ],
+  "author": {
+    "name": " abc "
+  }
+}