Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
kurtmckee committed Apr 21, 2023
1 parent d8a1986 commit 7fa6924
Show file tree
Hide file tree
Showing 84 changed files with 964 additions and 41 deletions.
2 changes: 1 addition & 1 deletion .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@ indent_style = space
insert_final_newline = true
trim_trailing_whitespace = true

[{*.yaml,*.yml}]
[{*.yaml,*.yml,*.json}]
indent_size = 2
7 changes: 5 additions & 2 deletions feedparser/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,7 @@ def _parse_file_inplace(
try:
saxparser.parse(source)
except xml.sax.SAXException as e:
result["bozo"] = 1
result["bozo"] = True
result["bozo_exception"] = feed_parser.exc or e
use_strict_parser = False

Expand Down Expand Up @@ -358,14 +358,17 @@ def _parse_file_inplace(
# flag that the JSON parser should be tried.
if not (feed_parser.entries or feed_parser.feeddata or feed_parser.version):
use_json_parser = True
result["bozo"] = False
result.pop("bozo_exception", None)

if use_json_parser:
result["version"] = None
feed_parser = JSONParser(baseuri, baselang, "utf-8")
feed_parser.sanitize_html = sanitize_html
try:
feed_parser.feed(stream_factory.get_file())
except Exception as e:
result["bozo"] = 1
result["bozo"] = True
result["bozo_exception"] = e

result["feed"] = feed_parser.feeddata
Expand Down
200 changes: 168 additions & 32 deletions feedparser/parsers/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,20 +29,15 @@

from ..datetimes import _parse_date
from ..sanitizer import sanitize_html
from ..util import FeedParserDict
from ..util import FeedParserDict, looks_like_html

JSON_VERSIONS = {
"https://jsonfeed.org/version/1": "json1",
"https://jsonfeed.org/version/1.1": "json11",
}


class JSONParser:
VERSIONS = {
"https://jsonfeed.org/version/1": "json1",
"https://jsonfeed.org/version/1.1": "json11",
}
FEED_FIELDS = (
("title", "title"),
("icon", "image"),
("home_page_url", "link"),
("description", "description"),
)
ITEM_FIELDS = (
("title", "title"),
("id", "guid"),
Expand All @@ -55,6 +50,7 @@ def __init__(self, baseuri=None, baselang=None, encoding=None):
self.baseuri = baseuri or ""
self.lang = baselang or None
self.encoding = encoding or "utf-8" # character encoding
self.sanitize_html = False

self.version = None
self.feeddata = FeedParserDict()
Expand All @@ -64,20 +60,144 @@ def __init__(self, baseuri=None, baselang=None, encoding=None):
def feed(self, file):
data = json.load(file)

v = data.get("version", "")
# If the file parses as JSON, assume it's a JSON feed.
self.version = "json"
try:
self.version = self.VERSIONS[v]
except KeyError:
raise ValueError("Unrecognized JSONFeed version '%s'" % v)
self.version = JSON_VERSIONS[data["version"].strip()]
except (AttributeError, KeyError, TypeError):
pass

# Handle `title`, if it exists.
title = data.get("title")
if isinstance(title, str):
title = title.strip()
is_html = looks_like_html(title)
content_type = "text/html" if is_html else "text/plain"
if is_html and self.sanitize_html:
title = sanitize_html(title, encoding=None, _type=content_type)
self.feeddata["title"] = title
self.feeddata["title_detail"] = {
"value": title,
"type": content_type,
}

# Handle `description`, if it exists.
description = data.get("description")
if isinstance(description, str):
description = description.strip()
is_html = looks_like_html(description)
content_type = "text/html" if is_html else "text/plain"
if is_html and self.sanitize_html:
description = sanitize_html(
description, encoding=None, _type=content_type
)
self.feeddata["subtitle"] = description
self.feeddata["subtitle_detail"] = {
"value": description,
"type": content_type,
}

# Handle `feed_url`, if it exists.
feed_url = data.get("feed_url")
if isinstance(feed_url, str):
feed_url = feed_url.strip()
# The feed URL is also...sigh...the feed ID.
self.feeddata["id"] = feed_url
self.feeddata.setdefault("links", []).append(
{
"href": feed_url,
"rel": "self",
}
)
if "title" in self.feeddata:
self.feeddata["links"][-1]["title"] = self.feeddata["title"]

# Handle `home_page_url`, if it exists.
home_page_url = data.get("home_page_url")
if isinstance(home_page_url, str):
home_page_url = home_page_url.strip()
self.feeddata["link"] = home_page_url
self.feeddata.setdefault("links", []).append(
{
"href": home_page_url,
"rel": "alternate",
}
)

# Handle `icon`, if it exists.
icon = data.get("icon")
if isinstance(icon, str):
self.feeddata["image"] = {"href": icon.strip()}

for src, dst in self.FEED_FIELDS:
if src in data:
self.feeddata[dst] = data[src]
if "author" in data:
self.parse_author(data["author"], self.feeddata)
# TODO: hubs; expired has no RSS equivalent
# Handle `favicon`, if it exists.
favicon = data.get("favicon")
if isinstance(favicon, str):
self.feeddata["icon"] = favicon.strip()

self.entries = [self.parse_entry(e) for e in data["items"]]
# Handle `user_comment`, if it exists.
user_comment = data.get("user_comment")
if isinstance(user_comment, str):
user_comment = user_comment.strip()
is_html = looks_like_html(user_comment)
content_type = "text/html" if is_html else "text/plain"
if is_html and self.sanitize_html:
user_comment = sanitize_html(
user_comment, encoding=None, _type=content_type
)
self.feeddata["info"] = user_comment
self.feeddata["info_detail"] = {
"value": user_comment,
"type": content_type,
}

# Handle `next_url`, if it exists.
next_url = data.get("next_url")
if isinstance(next_url, str):
next_url = next_url.strip()
self.feeddata.setdefault("links", []).append(
{
"href": next_url,
"rel": "next",
}
)

# Handle `expired`, if it exists.
expired = data.get("expired", ...)
if expired is not ...:
# The spec claims that only boolean true means "finished".
self.feeddata["complete"] = expired is True

# Handle `hubs`, if it exists.
hubs = data.get("hubs", ...)
if hubs is not ...:
self.feeddata["hubs"] = []
if isinstance(hubs, list):
for hub in hubs:
if not isinstance(hub, dict):
continue
url = hub.get("url")
type_ = hub.get("type")
if not (isinstance(url, str) and isinstance(type_, str)):
continue
self.feeddata["hubs"].append(
{
"url": url.strip(),
"type": type_.strip(),
}
)

# TODO: TEST AUTHOR PARSING THOROUGHLY
# TODO: REFACTOR AUTHOR.NAME TESTS SO *THEY* TEST MISSING NAME KEYS.
author_singular = data.get("author")
if isinstance(author_singular, dict):
parsed_author = self._parse_author(author_singular)
if parsed_author:
self.feeddata["authors"] = [parsed_author]
self.feeddata["author_detail"] = parsed_author
if "name" in parsed_author:
self.feeddata["author"] = parsed_author["name"]

self.entries = [self.parse_entry(e) for e in data.get("items", ())]

def parse_entry(self, e):
entry = FeedParserDict()
Expand Down Expand Up @@ -107,23 +227,39 @@ def parse_entry(self, e):
entry["category"] = e["tags"]

if "author" in e:
self.parse_author(e["author"], entry)
self._parse_author(e["author"])

if "attachments" in e:
entry["enclosures"] = [self.parse_attachment(a) for a in e["attachments"]]

return entry

@staticmethod
def parse_author(parent, dest):
dest["author_detail"] = detail = FeedParserDict()
if "name" in parent:
dest["author"] = detail["name"] = parent["name"]
if "url" in parent:
if parent["url"].startswith("mailto:"):
detail["email"] = parent["url"][7:]
else:
detail["href"] = parent["url"]
def _parse_author(info: dict[str, str]) -> dict[str, str]:
    """Normalize a JSON Feed author object into feedparser's author keys.

    Only string values are used, and surrounding whitespace is stripped:

    *   ``name`` is copied to ``name``.
    *   ``url`` is copied to ``href``; a ``mailto:`` URL also yields ``email``.
    *   ``avatar`` is copied to ``image``.

    :param info: The decoded ``author`` value from the feed or from an item.
        Feed content is untrusted, so a non-object value is tolerated.
    :returns: A new dict containing only the keys that could be extracted.
        It is empty if *info* is not a JSON object or has no usable values.
    """

    # JSON allows any type here (null, list, string, number, ...).
    # Ignore such values instead of raising ``AttributeError`` on ``.get()``.
    if not isinstance(info, dict):
        return {}

    parsed_author: dict[str, str] = {}

    name = info.get("name")
    if isinstance(name, str):
        parsed_author["name"] = name.strip()

    url = info.get("url")
    if isinstance(url, str):
        url = url.strip()
        parsed_author["href"] = url
        # URLs can be email addresses.
        # However, only a "mailto:" URI supports options like:
        #
        #   mailto:user@domain.example?subject=Feed
        #
        # Caution is required when converting the URL to an email.
        # URI schemes are case-insensitive, so "MAILTO:" must match too.
        if url[:7].lower() == "mailto:":
            parsed_author["email"], _, _ = url[7:].partition("?")

    avatar = info.get("avatar")
    if isinstance(avatar, str):
        parsed_author["image"] = avatar.strip()

    return parsed_author

@staticmethod
def parse_attachment(attachment):
Expand Down
47 changes: 47 additions & 0 deletions tests/json/README.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
JSON feed tests
===============

The files in this directory exercise the JSON feed parser.


``*.json``
----------

Files in this directory contain JSON objects.
They will contain a ``__tests`` key with a list of expected conditions.
Each condition will be evaluated after parsing the feed.

For example:

.. code-block:: json

    {
      "__tests": [
        "version == 'json11'"
      ],
      "version": "https://jsonfeed.org/version/1.1"
    }

Files in this directory are automatically found, parsed, and tested.

When writing tests please consider the following:

* Test incorrect value types that do not match the JSON feed requirements.
For example, test null values where a string is required.

* Test incorrect value types that are not hashable in Python.
For example, test lists where a string is required.

* Test incorrect value types that are also iterable in Python.
For example, test a JSON object where a list is required.

* Write ``eval()`` strings using Unicode escape sequences as needed.
This forces the test and JSON feed content to agree.
For example,

.. code-block:: json

    {
      "__tests": ["title == '\ud83d\ude0e'"],
      "title": "😎"
    }
12 changes: 12 additions & 0 deletions tests/json/author-avatar-no-name.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"__description": "author.avatar: missing a `name` key",
"__tests": [
"len(feed['author_detail'].keys()) == 1",
"feed['author_detail']['image'] == 'https://domain.example/i.jpeg'",
"len(feed['authors'][0].keys()) == 1",
"feed['authors'][0]['image'] == 'https://domain.example/i.jpeg'"
],
"author": {
"avatar": "https://domain.example/i.jpeg"
}
}
11 changes: 11 additions & 0 deletions tests/json/author-avatar-whitespace.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"__description": "author.avatar: whitespace must be stripped",
"__tests": [
"feed['author_detail']['image'] == 'https://domain.example/i.jpeg'",
"feed['authors'][0]['image'] == 'https://domain.example/i.jpeg'"
],
"author": {
"name": "abc",
"avatar": " https://domain.example/i.jpeg "
}
}
14 changes: 14 additions & 0 deletions tests/json/author-avatar.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"__description": "author.avatar: valid value",
"__tests": [
"len(feed['author_detail'].keys()) == 2",
"feed['author_detail']['image'] == 'https://domain.example/i.jpeg'",
"len(feed['authors']) == 1",
"len(feed['authors'][0].keys()) == 2",
"feed['authors'][0]['image'] == 'https://domain.example/i.jpeg'"
],
"author": {
"name": "abc",
"avatar": "https://domain.example/i.jpeg"
}
}
9 changes: 9 additions & 0 deletions tests/json/author-empty.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"__description": "author: ignored when the object is empty",
"__tests": [
"'author' not in feed",
"'author_detail' not in feed",
"'authors' not in feed"
],
"author": {}
}
11 changes: 11 additions & 0 deletions tests/json/author-name-whitespace.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"__description": "author.name: whitespace must be stripped",
"__tests": [
"feed['author'] == 'abc'",
"feed['author_detail']['name'] == 'abc'",
"feed['authors'][0]['name'] == 'abc'"
],
"author": {
"name": " abc "
}
}

0 comments on commit 7fa6924

Please sign in to comment.