Merge pull request #363 from kurtmckee/fix-json-parsing
Fix JSON feed parsing
kurtmckee committed Apr 13, 2023
2 parents 6d032b8 + 7a6f1f4 commit 859ac57
Showing 6 changed files with 97 additions and 68 deletions.
9 changes: 9 additions & 0 deletions changelog.d/20230413_090924_kurtmckee_fix_json_parsing.rst
@@ -0,0 +1,9 @@
+Fixed
+-----
+
+* Fix a bug that prevented JSON feeds from being parsed.
+
+  A comparison was failing due to incompatible types,
+  which allowed XML declarations to be added to JSON feeds.
+
+* Fall back to JSON feed parsing if the XML parsers completely fail.
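
As a quick smoke test of both bullets, a JSON Feed document should now survive a round trip through feedparser.parse(). The sketch below is hedged: the exact keys the JSON parser maps, and the version string it reports, are not shown in this diff.

import feedparser

# A minimal JSON Feed 1.1 document, hand-written for illustration.
json_feed = b"""
{
    "version": "https://jsonfeed.org/version/1.1",
    "title": "Example Feed",
    "items": [
        {"id": "1", "content_text": "Hello, world!"}
    ]
}
"""

result = feedparser.parse(json_feed)
print(result.bozo)               # expected to be falsy after this fix
print(result.feed.get("title"))  # expected: "Example Feed" (assumed mapping)
print(len(result.entries))       # expected: 1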
44 changes: 25 additions & 19 deletions feedparser/api.py
@@ -274,13 +274,14 @@ def _parse_file_inplace(
# because the SAX parser closes the file when done;
# we don't want that, since we might try again with the loose parser.

-    use_json_parser = result["content-type"] == "application/json"
-    use_strict_parser = result["encoding"] and True or False
+    use_json_parser = False
+    if result["content-type"] in {"application/json", "application/feed+json"}:
+        use_json_parser = True
+    use_strict_parser = bool(result["encoding"])

-    if not use_json_parser:
-        result["version"], stream_factory.prefix, entities = replace_doctype(
-            stream_factory.prefix
-        )
+    result["version"], stream_factory.prefix, entities = replace_doctype(
+        stream_factory.prefix
+    )

# Ensure that baseuri is an absolute URI using an acceptable URI scheme.
contentloc = result["headers"].get("content-location", "")
@@ -300,16 +301,7 @@ def _parse_file_inplace(

feed_parser: Union[JSONParser, StrictFeedParser, LooseFeedParser]

-    if use_json_parser:
-        result["version"] = None
-        feed_parser = JSONParser(baseuri, baselang, "utf-8")
-        try:
-            feed_parser.feed(stream_factory.get_file())
-        except Exception as e:
-            result["bozo"] = 1
-            result["bozo_exception"] = e
-
-    elif use_strict_parser:
+    if use_strict_parser and not use_json_parser:
# Initialize the SAX parser.
feed_parser = StrictFeedParser(baseuri, baselang, "utf-8")
feed_parser.resolve_relative_uris = resolve_relative_uris
@@ -339,9 +331,9 @@
result["bozo_exception"] = feed_parser.exc or e
use_strict_parser = False

-    # The loose XML parser will be tried if the JSON parser was not used,
-    # and if the strict XML parser was not used (or if it failed).
-    if not use_json_parser and not use_strict_parser:
+    # The loose XML parser will be tried if the strict XML parser was not used
+    # (or if it failed to parse the feed).
+    if not use_strict_parser and not use_json_parser:
feed_parser = LooseFeedParser(baseuri, baselang, "utf-8", entities)
feed_parser.resolve_relative_uris = resolve_relative_uris
feed_parser.sanitize_html = sanitize_html
@@ -362,6 +354,20 @@

feed_parser.feed(data)

+    # If parsing with the loose XML parser resulted in no information,
+    # flag that the JSON parser should be tried.
+    if not (feed_parser.entries or feed_parser.feeddata or feed_parser.version):
+        use_json_parser = True
+
+    if use_json_parser:
+        result["version"] = None
+        feed_parser = JSONParser(baseuri, baselang, "utf-8")
+        try:
+            feed_parser.feed(stream_factory.get_file())
+        except Exception as e:
+            result["bozo"] = 1
+            result["bozo_exception"] = e

result["feed"] = feed_parser.feeddata
result["entries"] = feed_parser.entries
result["version"] = result["version"] or feed_parser.version
71 changes: 37 additions & 34 deletions feedparser/encodings.py
@@ -26,10 +26,12 @@
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

+from __future__ import annotations
+
import codecs
import io
import re
-import typing as t
+import typing

try:
try:
@@ -47,6 +49,7 @@ def lazy_chardet_encoding(data):
from .exceptions import (
CharacterEncodingOverride,
CharacterEncodingUnknown,
+    FeedparserError,
NonXMLContentType,
)

@@ -58,7 +61,7 @@ def lazy_chardet_encoding(data):
UTF32BE_MARKER = b"\x00\x00\x00\x3C"
UTF32LE_MARKER = b"\x3C\x00\x00\x00"

ZERO_BYTES = "\x00\x00"
ZERO_BYTES = b"\x00\x00"

# Match the opening XML declaration.
# Example: <?xml version="1.0" encoding="utf-8"?>
@@ -69,7 +72,7 @@ def lazy_chardet_encoding(data):
RE_XML_PI_ENCODING = re.compile(rb'^<\?.*encoding=[\'"](.*?)[\'"].*\?>')


-def parse_content_type(line: str) -> t.Tuple[str, str]:
+def parse_content_type(line: str) -> tuple[str, str]:
"""Parse an HTTP Content-Type header.
The return value will be a tuple of strings:
@@ -93,11 +96,10 @@ def parse_content_type(line: str) -> t.Tuple[str, str]:
return mime_type, charset_value


-def convert_to_utf8(http_headers, data, result):
-    """Detect and convert the character encoding to UTF-8.
-
-    http_headers is a dictionary
-    data is a raw string (not Unicode)"""
+def convert_to_utf8(
+    http_headers: dict[str, str], data: bytes, result: dict[str, typing.Any]
+) -> bytes:
+    """Detect and convert the character encoding to UTF-8."""

# This is so much trickier than it sounds, it's not even funny.
# According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
@@ -136,9 +138,7 @@ def convert_to_utf8(http_headers, data, result):

# Of course, none of this guarantees that we will be able to parse the
# feed in the declared character encoding (assuming it was declared
-    # correctly, which many are not). iconv_codec can help a lot;
-    # you should definitely install it if you can.
-    # http://cjkpython.i18n.org/
+    # correctly, which many are not).

bom_encoding = ""
xml_encoding = ""
@@ -239,7 +239,7 @@ def convert_to_utf8(http_headers, data, result):
acceptable_content_type = 1
rfc3023_encoding = http_encoding or "us-ascii"
elif http_content_type in json_content_types or (
-        not http_content_type and data and data.lstrip()[0] == "{"
+        not http_content_type and data and data.lstrip().startswith(b"{")
):
http_content_type = json_content_types[0]
acceptable_content_type = 1
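
This one-line change is the incompatible-types comparison named in the changelog: in Python 3, indexing a bytes object yields an int, so the old test compared an int to a one-character str and could never be true, and JSON payloads without a Content-Type header were never recognized. A minimal demonstration:

# Why the old sniffing test always failed on bytes input.
data = b'  {"version": "https://jsonfeed.org/version/1.1"}'

print(data.lstrip()[0])                  # 123, an int: ord("{")
print(data.lstrip()[0] == "{")           # False: int compared to str
print(data.lstrip().startswith(b"{"))    # True: the fixed check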
@@ -264,7 +264,7 @@ def convert_to_utf8(http_headers, data, result):
# - bom_encoding is the encoding sniffed from the first 4 bytes of the XML data
# - rfc3023_encoding is the actual encoding, as per RFC 3023
# and a variety of other conflicting specifications
-    error = None
+    error: FeedparserError | None = None

if http_headers and (not acceptable_content_type):
if "content-type" in http_headers:
@@ -274,10 +274,10 @@ def convert_to_utf8(http_headers, data, result):
error = NonXMLContentType(msg)

# determine character encoding
-    known_encoding = 0
+    known_encoding = False
tried_encodings = []
# try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
-    for proposed_encoding in (
+    for encoding_to_try in (
rfc3023_encoding,
xml_encoding,
bom_encoding,
@@ -286,28 +286,31 @@
"windows-1252",
"iso-8859-2",
):
-        if callable(proposed_encoding):
-            proposed_encoding = proposed_encoding(data)
+        if callable(encoding_to_try):
+            proposed_encoding = encoding_to_try(data)
+        else:
+            proposed_encoding = encoding_to_try
if not proposed_encoding:
continue
if proposed_encoding in tried_encodings:
continue
tried_encodings.append(proposed_encoding)
try:
-            data = data.decode(proposed_encoding)
+            text = data.decode(proposed_encoding)
except (UnicodeDecodeError, LookupError):
-            pass
-        else:
-            known_encoding = 1
-            if not json:
-                # Update the encoding in the opening XML processing instruction.
-                new_declaration = """<?xml version='1.0' encoding='utf-8'?>"""
-                if RE_XML_DECLARATION.search(data):
-                    data = RE_XML_DECLARATION.sub(new_declaration, data)
-                else:
-                    data = new_declaration + "\n" + data
-            data = data.encode("utf-8")
-            break
+            continue
+
+        known_encoding = True
+        if not json:
+            # Update the encoding in the opening XML processing instruction.
+            new_declaration = """<?xml version='1.0' encoding='utf-8'?>"""
+            if RE_XML_DECLARATION.search(text):
+                text = RE_XML_DECLARATION.sub(new_declaration, text)
+            else:
+                text = new_declaration + "\n" + text
+        data = text.encode("utf-8")
+        break

# if still no luck, give up
if not known_encoding:
error = CharacterEncodingUnknown(
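
The reworked loop is easier to see in isolation: each candidate encoding is tried at most once, a failed decode now skips to the next candidate with continue instead of falling through an else block, and the first success re-encodes the decoded text as UTF-8 and breaks. A simplified, runnable sketch of that pattern, with chardet and the XML-declaration rewrite omitted:

# Simplified sketch of the try-each-encoding loop (illustrative only).
def to_utf8(data: bytes, candidates: list[str]) -> tuple[bytes, str | None]:
    tried = []
    for encoding in candidates:
        if not encoding or encoding in tried:
            continue
        tried.append(encoding)
        try:
            text = data.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            continue
        return text.encode("utf-8"), encoding
    return data, None  # nothing worked; the caller flags bozo


print(to_utf8("café".encode("windows-1252"), ["utf-8", "windows-1252"]))
# expected: (b'caf\xc3\xa9', 'windows-1252')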
@@ -348,7 +351,7 @@ def convert_file_to_utf8(
):
"""Like convert_to_utf8(), but for a stream.
-    Unlike convert_to_utf8(), do not read the the entire file in memory;
+    Unlike convert_to_utf8(), do not read the entire file in memory;
instead, return a text stream that decodes it on the fly.
This should consume significantly less memory,
because it avoids (repeatedly) converting the entire file contents
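
The doubled-word fix above lands in the docstring of the streaming counterpart to convert_to_utf8(). The lazy decoding it describes can be pictured with a plain io.TextIOWrapper; this is a simplification for intuition, not the library's actual mechanism:

import io

# Wrap a binary stream so bytes are decoded to text incrementally,
# rather than reading and converting the whole payload at once.
binary = io.BytesIO("café\n".encode("windows-1252") * 3)
for line in io.TextIOWrapper(binary, encoding="windows-1252"):
    print(line, end="")  # decoded chunk by chunk as the stream is read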
@@ -421,7 +424,7 @@ def convert_file_to_utf8(

def convert_file_prefix_to_utf8(
http_headers,
-    file: t.IO[bytes],
+    file: typing.IO[bytes],
result,
*,
prefix_len: int = CONVERT_FILE_PREFIX_LEN,
@@ -458,7 +461,7 @@ def convert_file_prefix_to_utf8(

prefix += byte

-    fake_result: t.Any = {}
+    fake_result: typing.Any = {}
converted_prefix = convert_to_utf8(http_headers, prefix, fake_result)

# an encoding was detected successfully, keep it
@@ -495,7 +498,7 @@ def key(candidate):
return converted_prefix


-def read_to_after_ascii_byte(file: t.IO[bytes], max_len: int) -> bytes:
+def read_to_after_ascii_byte(file: typing.IO[bytes], max_len: int) -> bytes:
offset = file.tell()
buffer = b""

35 changes: 25 additions & 10 deletions feedparser/sanitizer.py
@@ -25,6 +25,8 @@
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

+from __future__ import annotations
+
import re

from .html import BaseHTMLProcessor
@@ -914,20 +916,33 @@ def sanitize_html(html_source, encoding, _type):
RE_SAFE_ENTITY_PATTERN = re.compile(rb'\s+(\w+)\s+"(&#\w+;|[^&"]*)"')


-def replace_doctype(data):
-    """Strips and replaces the DOCTYPE, returns (rss_version, stripped_data)
-
-    rss_version may be 'rss091n' or None
-    stripped_data is the same XML document with a replaced DOCTYPE
+def replace_doctype(data: bytes) -> tuple[str | None, bytes, dict[str, str]]:
+    """Strip and replace the DOCTYPE.
+
+    One RSS format -- Netscape's RSS 0.91 -- is identified within the XML DOCTYPE.
+    Therefore, this function must identify that version while replacing the DOCTYPE.
+
+    As a convenience to the loose XML parser, entities are pre-computed and returned.
+
+    The tuple that is returned has the following values, in order:
+
+    1. The version extracted from the XML DOCTYPE.
+       The value will either be "rss091n" or None.
+    2. Binary XML content with a replaced DOCTYPE.
+    3. A dictionary of entities and replacements.
    """

+    # Verify this looks like an XML feed.
+    if not re.match(rb"^\s*<", data):
+        return None, data, {}

# Divide the document into two groups by finding the location
# of the first element that doesn't begin with '<?' or '<!'.
-    start = re.search(rb"<\w", data)
-    start = start and start.start() or -1
-    head, data = data[: start + 1], data[start + 1 :]
+    match = re.search(rb"<\w", data)
+    first_element = match.start() + 1 if match is not None else 0
+    head, data = data[:first_element], data[first_element:]

-    # Save and then remove all of the ENTITY declarations.
+    # Save, and then remove, any ENTITY declarations.
entity_results = RE_ENTITY_PATTERN.findall(head)
head = RE_ENTITY_PATTERN.sub(b"", head)

@@ -952,8 +967,8 @@ def replace_doctype(data):
data = RE_DOCTYPE_PATTERN.sub(replacement, head) + data

# Precompute the safe entities for the loose parser.
-    safe_entities = {
+    entities = {
k.decode("utf-8"): v.decode("utf-8")
for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement)
}
-    return version, data, safe_entities
+    return version, data, entities
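
Because api.py now calls replace_doctype() unconditionally, the new leading-"<" check is what keeps JSON payloads from having an XML declaration forced onto them. A hedged usage sketch of the contract the docstring describes; the printed values are expectations, not captured output:

from feedparser.sanitizer import replace_doctype

# XML input: the DOCTYPE (if any) is replaced and entities are extracted.
version, data, entities = replace_doctype(b"<?xml version='1.0'?><rss/>")
print(version, entities)  # expected: None {} for a plain RSS shell

# JSON input no longer matches rb"^\s*<", so it passes through untouched.
version, data, entities = replace_doctype(b'{"version": "..."}')
print(version, entities)  # expected: None {} with data returned unchanged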
4 changes: 0 additions & 4 deletions tests/test_json.py
@@ -7,10 +7,6 @@
paths = pathlib.Path("tests/json").rglob("*.json")


-# TODO: The JSON tests never executed in the old test harness.
-# Now that they are running, it is clear they never actually worked.
-# They must be fixed!
-@pytest.mark.xfail
@pytest.mark.parametrize("path", paths)
def test_json(path):
text = path.read_text()
Expand Down
2 changes: 1 addition & 1 deletion tox.ini
@@ -26,7 +26,7 @@ deps =
responses
chardet: chardet
commands =
-    coverage run -m pytest
+    coverage run -m pytest {posargs:}
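
With {posargs:} in place, anything given after "--" on the tox command line is forwarded to pytest, e.g. "tox -e py -- -k json" to run only the JSON tests (the environment name here is illustrative).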


[testenv:coverage_erase]
