Merge pull request #363 from kurtmckee/fix-json-parsing
Fix JSON feed parsing
kurtmckee committed Apr 13, 2023
2 parents 6d032b8 + 7a6f1f4 commit 859ac57
Showing 6 changed files with 97 additions and 68 deletions.
9 changes: 9 additions & 0 deletions changelog.d/20230413_090924_kurtmckee_fix_json_parsing.rst
@@ -0,0 +1,9 @@
+Fixed
+-----
+
+* Fix a bug that prevented JSON feeds from being parsed.
+
+  A comparison was failing due to incompatible types,
+  which allowed XML declarations to be added to JSON feeds.
+
+* Fall back to JSON feed parsing if the XML parsers completely fail.
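
As a quick smoke test of both bullets, a JSON Feed document should now survive a round trip through feedparser.parse(). The sketch below is hedged: the exact keys the JSON parser maps, and the version string it reports, are not shown in this diff.

import feedparser

# A minimal JSON Feed 1.1 document, hand-written for illustration.
json_feed = b"""
{
    "version": "https://jsonfeed.org/version/1.1",
    "title": "Example Feed",
    "items": [
        {"id": "1", "content_text": "Hello, world!"}
    ]
}
"""

result = feedparser.parse(json_feed)
print(result.bozo)               # expected to be falsy after this fix
print(result.feed.get("title"))  # expected: "Example Feed" (assumed mapping)
print(len(result.entries))       # expected: 1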
44 changes: 25 additions & 19 deletions feedparser/api.py
@@ -274,13 +274,14 @@ def _parse_file_inplace(
# because the SAX parser closes the file when done;
# we don't want that, since we might try again with the loose parser.

-    use_json_parser = result["content-type"] == "application/json"
-    use_strict_parser = result["encoding"] and True or False
+    use_json_parser = False
+    if result["content-type"] in {"application/json", "application/feed+json"}:
+        use_json_parser = True
+    use_strict_parser = bool(result["encoding"])

-    if not use_json_parser:
-        result["version"], stream_factory.prefix, entities = replace_doctype(
-            stream_factory.prefix
-        )
+    result["version"], stream_factory.prefix, entities = replace_doctype(
+        stream_factory.prefix
+    )

# Ensure that baseuri is an absolute URI using an acceptable URI scheme.
contentloc = result["headers"].get("content-location", "")
@@ -300,16 +301,7 @@ def _parse_file_inplace(

feed_parser: Union[JSONParser, StrictFeedParser, LooseFeedParser]

-    if use_json_parser:
-        result["version"] = None
-        feed_parser = JSONParser(baseuri, baselang, "utf-8")
-        try:
-            feed_parser.feed(stream_factory.get_file())
-        except Exception as e:
-            result["bozo"] = 1
-            result["bozo_exception"] = e
-
-    elif use_strict_parser:
+    if use_strict_parser and not use_json_parser:
# Initialize the SAX parser.
feed_parser = StrictFeedParser(baseuri, baselang, "utf-8")
feed_parser.resolve_relative_uris = resolve_relative_uris
@@ -339,9 +331,9 @@
result["bozo_exception"] = feed_parser.exc or e
use_strict_parser = False

-    # The loose XML parser will be tried if the JSON parser was not used,
-    # and if the strict XML parser was not used (or if it failed).
-    if not use_json_parser and not use_strict_parser:
+    # The loose XML parser will be tried if the strict XML parser was not used
+    # (or if it failed to parse the feed).
+    if not use_strict_parser and not use_json_parser:
feed_parser = LooseFeedParser(baseuri, baselang, "utf-8", entities)
feed_parser.resolve_relative_uris = resolve_relative_uris
feed_parser.sanitize_html = sanitize_html
@@ -362,6 +354,20 @@

feed_parser.feed(data)

+    # If parsing with the loose XML parser resulted in no information,
+    # flag that the JSON parser should be tried.
+    if not (feed_parser.entries or feed_parser.feeddata or feed_parser.version):
+        use_json_parser = True
+
+    if use_json_parser:
+        result["version"] = None
+        feed_parser = JSONParser(baseuri, baselang, "utf-8")
+        try:
+            feed_parser.feed(stream_factory.get_file())
+        except Exception as e:
+            result["bozo"] = 1
+            result["bozo_exception"] = e

result["feed"] = feed_parser.feeddata
result["entries"] = feed_parser.entries
result["version"] = result["version"] or feed_parser.version
71 changes: 37 additions & 34 deletions feedparser/encodings.py
@@ -26,10 +26,12 @@
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

+from __future__ import annotations
+
import codecs
import io
import re
-import typing as t
+import typing

try:
try:
@@ -47,6 +49,7 @@ def lazy_chardet_encoding(data):
from .exceptions import (
CharacterEncodingOverride,
CharacterEncodingUnknown,
+    FeedparserError,
NonXMLContentType,
)

@@ -58,7 +61,7 @@ def lazy_chardet_encoding(data):
UTF32BE_MARKER = b"\x00\x00\x00\x3C"
UTF32LE_MARKER = b"\x3C\x00\x00\x00"

ZERO_BYTES = "\x00\x00"
ZERO_BYTES = b"\x00\x00"

# Match the opening XML declaration.
# Example: <?xml version="1.0" encoding="utf-8"?>
@@ -69,7 +72,7 @@ def lazy_chardet_encoding(data):
RE_XML_PI_ENCODING = re.compile(rb'^<\?.*encoding=[\'"](.*?)[\'"].*\?>')


-def parse_content_type(line: str) -> t.Tuple[str, str]:
+def parse_content_type(line: str) -> tuple[str, str]:
"""Parse an HTTP Content-Type header.
The return value will be a tuple of strings:
@@ -93,11 +96,10 @@ def parse_content_type(line: str) -> t.Tuple[str, str]:
return mime_type, charset_value


-def convert_to_utf8(http_headers, data, result):
-    """Detect and convert the character encoding to UTF-8.
-
-    http_headers is a dictionary
-    data is a raw string (not Unicode)"""
+def convert_to_utf8(
+    http_headers: dict[str, str], data: bytes, result: dict[str, typing.Any]
+) -> bytes:
+    """Detect and convert the character encoding to UTF-8."""

# This is so much trickier than it sounds, it's not even funny.
# According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
@@ -136,9 +138,7 @@ def convert_to_utf8(http_headers, data, result):

# Of course, none of this guarantees that we will be able to parse the
# feed in the declared character encoding (assuming it was declared
-    # correctly, which many are not). iconv_codec can help a lot;
-    # you should definitely install it if you can.
-    # http://cjkpython.i18n.org/
+    # correctly, which many are not).

bom_encoding = ""
xml_encoding = ""
@@ -239,7 +239,7 @@ def convert_to_utf8(http_headers, data, result):
acceptable_content_type = 1
rfc3023_encoding = http_encoding or "us-ascii"
elif http_content_type in json_content_types or (
-        not http_content_type and data and data.lstrip()[0] == "{"
+        not http_content_type and data and data.lstrip().startswith(b"{")
):
http_content_type = json_content_types[0]
acceptable_content_type = 1
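
This one-line change is the incompatible-types comparison named in the changelog: in Python 3, indexing a bytes object yields an int, so the old test compared an int to a one-character str and could never be true, and JSON payloads without a Content-Type header were never recognized. A minimal demonstration:

# Why the old sniffing test always failed on bytes input.
data = b'  {"version": "https://jsonfeed.org/version/1.1"}'

print(data.lstrip()[0])                  # 123, an int: ord("{")
print(data.lstrip()[0] == "{")           # False: int compared to str
print(data.lstrip().startswith(b"{"))    # True: the fixed check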
@@ -264,7 +264,7 @@ def convert_to_utf8(http_headers, data, result):
# - bom_encoding is the encoding sniffed from the first 4 bytes of the XML data
# - rfc3023_encoding is the actual encoding, as per RFC 3023
# and a variety of other conflicting specifications
-    error = None
+    error: FeedparserError | None = None

if http_headers and (not acceptable_content_type):
if "content-type" in http_headers:
@@ -274,10 +274,10 @@ def convert_to_utf8(http_headers, data, result):
error = NonXMLContentType(msg)

# determine character encoding
-    known_encoding = 0
+    known_encoding = False
tried_encodings = []
# try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
-    for proposed_encoding in (
+    for encoding_to_try in (
rfc3023_encoding,
xml_encoding,
bom_encoding,
@@ -286,28 +286,31 @@
"windows-1252",
"iso-8859-2",
):
-        if callable(proposed_encoding):
-            proposed_encoding = proposed_encoding(data)
+        if callable(encoding_to_try):
+            proposed_encoding = encoding_to_try(data)
+        else:
+            proposed_encoding = encoding_to_try
if not proposed_encoding:
continue
if proposed_encoding in tried_encodings:
continue
tried_encodings.append(proposed_encoding)
try:
-            data = data.decode(proposed_encoding)
+            text = data.decode(proposed_encoding)
except (UnicodeDecodeError, LookupError):
-            pass
-        else:
-            known_encoding = 1
-            if not json:
-                # Update the encoding in the opening XML processing instruction.
-                new_declaration = """<?xml version='1.0' encoding='utf-8'?>"""
-                if RE_XML_DECLARATION.search(data):
-                    data = RE_XML_DECLARATION.sub(new_declaration, data)
-                else:
-                    data = new_declaration + "\n" + data
-            data = data.encode("utf-8")
-            break
+            continue
+
+        known_encoding = True
+        if not json:
+            # Update the encoding in the opening XML processing instruction.
+            new_declaration = """<?xml version='1.0' encoding='utf-8'?>"""
+            if RE_XML_DECLARATION.search(text):
+                text = RE_XML_DECLARATION.sub(new_declaration, text)
+            else:
+                text = new_declaration + "\n" + text
+        data = text.encode("utf-8")
+        break

# if still no luck, give up
if not known_encoding:
error = CharacterEncodingUnknown(
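
The reworked loop is easier to see in isolation: each candidate encoding is tried at most once, a failed decode now skips to the next candidate with continue instead of falling through an else block, and the first success re-encodes the decoded text as UTF-8 and breaks. A simplified, runnable sketch of that pattern, with chardet and the XML-declaration rewrite omitted:

# Simplified sketch of the try-each-encoding loop (illustrative only).
def to_utf8(data: bytes, candidates: list[str]) -> tuple[bytes, str | None]:
    tried = []
    for encoding in candidates:
        if not encoding or encoding in tried:
            continue
        tried.append(encoding)
        try:
            text = data.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            continue
        return text.encode("utf-8"), encoding
    return data, None  # nothing worked; the caller flags bozo


print(to_utf8("café".encode("windows-1252"), ["utf-8", "windows-1252"]))
# expected: (b'caf\xc3\xa9', 'windows-1252')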
@@ -348,7 +351,7 @@ def convert_file_to_utf8(
):
"""Like convert_to_utf8(), but for a stream.
-    Unlike convert_to_utf8(), do not read the the entire file in memory;
+    Unlike convert_to_utf8(), do not read the entire file in memory;
instead, return a text stream that decodes it on the fly.
This should consume significantly less memory,
because it avoids (repeatedly) converting the entire file contents
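
The doubled-word fix above lands in the docstring of the streaming counterpart to convert_to_utf8(). The lazy decoding it describes can be pictured with a plain io.TextIOWrapper; this is a simplification for intuition, not the library's actual mechanism:

import io

# Wrap a binary stream so bytes are decoded to text incrementally,
# rather than reading and converting the whole payload at once.
binary = io.BytesIO("café\n".encode("windows-1252") * 3)
for line in io.TextIOWrapper(binary, encoding="windows-1252"):
    print(line, end="")  # decoded chunk by chunk as the stream is read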
@@ -421,7 +424,7 @@ def convert_file_to_utf8(

def convert_file_prefix_to_utf8(
http_headers,
-    file: t.IO[bytes],
+    file: typing.IO[bytes],
result,
*,
prefix_len: int = CONVERT_FILE_PREFIX_LEN,
@@ -458,7 +461,7 @@ def convert_file_prefix_to_utf8(

prefix += byte

-    fake_result: t.Any = {}
+    fake_result: typing.Any = {}
converted_prefix = convert_to_utf8(http_headers, prefix, fake_result)

# an encoding was detected successfully, keep it
@@ -495,7 +498,7 @@ def key(candidate):
return converted_prefix


-def read_to_after_ascii_byte(file: t.IO[bytes], max_len: int) -> bytes:
+def read_to_after_ascii_byte(file: typing.IO[bytes], max_len: int) -> bytes:
offset = file.tell()
buffer = b""

35 changes: 25 additions & 10 deletions feedparser/sanitizer.py
@@ -25,6 +25,8 @@
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

+from __future__ import annotations
+
import re

from .html import BaseHTMLProcessor
@@ -914,20 +916,33 @@ def sanitize_html(html_source, encoding, _type):
RE_SAFE_ENTITY_PATTERN = re.compile(rb'\s+(\w+)\s+"(&#\w+;|[^&"]*)"')


-def replace_doctype(data):
-    """Strips and replaces the DOCTYPE, returns (rss_version, stripped_data)
-
-    rss_version may be 'rss091n' or None
-    stripped_data is the same XML document with a replaced DOCTYPE
+def replace_doctype(data: bytes) -> tuple[str | None, bytes, dict[str, str]]:
+    """Strip and replace the DOCTYPE.
+
+    One RSS format -- Netscape's RSS 0.91 -- is identified within the XML DOCTYPE.
+    Therefore, this function must identify that version while replacing the DOCTYPE.
+
+    As a convenience to the loose XML parser, entities are pre-computed and returned.
+
+    The tuple that is returned has the following values, in order:
+
+    1. The version extracted from the XML DOCTYPE.
+       The value will either be "rss091n" or None.
+    2. Binary XML content with a replaced DOCTYPE.
+    3. A dictionary of entities and replacements.
    """

+    # Verify this looks like an XML feed.
+    if not re.match(rb"^\s*<", data):
+        return None, data, {}

# Divide the document into two groups by finding the location
# of the first element that doesn't begin with '<?' or '<!'.
-    start = re.search(rb"<\w", data)
-    start = start and start.start() or -1
-    head, data = data[: start + 1], data[start + 1 :]
+    match = re.search(rb"<\w", data)
+    first_element = match.start() + 1 if match is not None else 0
+    head, data = data[:first_element], data[first_element:]

-    # Save and then remove all of the ENTITY declarations.
+    # Save, and then remove, any ENTITY declarations.
entity_results = RE_ENTITY_PATTERN.findall(head)
head = RE_ENTITY_PATTERN.sub(b"", head)

@@ -952,8 +967,8 @@ def replace_doctype(data):
data = RE_DOCTYPE_PATTERN.sub(replacement, head) + data

# Precompute the safe entities for the loose parser.
-    safe_entities = {
+    entities = {
k.decode("utf-8"): v.decode("utf-8")
for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement)
}
-    return version, data, safe_entities
+    return version, data, entities
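
Because api.py now calls replace_doctype() unconditionally, the new leading-"<" check is what keeps JSON payloads from having an XML declaration forced onto them. A hedged usage sketch of the contract the docstring describes; the printed values are expectations, not captured output:

from feedparser.sanitizer import replace_doctype

# XML input: the DOCTYPE (if any) is replaced and entities are extracted.
version, data, entities = replace_doctype(b"<?xml version='1.0'?><rss/>")
print(version, entities)  # expected: None {} for a plain RSS shell

# JSON input no longer matches rb"^\s*<", so it passes through untouched.
version, data, entities = replace_doctype(b'{"version": "..."}')
print(version, entities)  # expected: None {} with data returned unchanged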
4 changes: 0 additions & 4 deletions tests/test_json.py
@@ -7,10 +7,6 @@
paths = pathlib.Path("tests/json").rglob("*.json")


-# TODO: The JSON tests never executed in the old test harness.
-# Now that they are running, it is clear they never actually worked.
-# They must be fixed!
-@pytest.mark.xfail
@pytest.mark.parametrize("path", paths)
def test_json(path):
text = path.read_text()
Expand Down
2 changes: 1 addition & 1 deletion tox.ini
@@ -26,7 +26,7 @@ deps =
responses
chardet: chardet
commands =
-    coverage run -m pytest
+    coverage run -m pytest {posargs:}
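
With {posargs:} in place, anything given after "--" on the tox command line is forwarded to pytest, e.g. "tox -e py -- -k json" to run only the JSON tests (the environment name here is illustrative).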


[testenv:coverage_erase]
