Skip to content

Commit

Permalink
Use CSafeLoader where available, begin configurable anchor handling
Browse files Browse the repository at this point in the history
  • Loading branch information
kislyuk committed Nov 29, 2021
1 parent 85b1152 commit a5d2c57
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 38 deletions.
2 changes: 1 addition & 1 deletion test/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def test_yq(self):
self.assertEqual(self.run_yq("- понедельник\n- вторник\n", ["-y", "."]), "- понедельник\n- вторник\n")

def test_yq_err(self):
err = ('yq: Error running jq: ScannerError: while scanning for the next token\nfound character \'%\' that '
err = ('yq: Error running jq: ScannerError: while scanning for the next token\nfound character that '
'cannot start any token\n in "<file>", line 1, column 3.')
self.run_yq("- %", ["."], expect_exit_codes={err, 2})

Expand Down
59 changes: 41 additions & 18 deletions yq/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@

# PYTHON_ARGCOMPLETE_OK

import os, sys, argparse, subprocess, json
from collections import OrderedDict
import os, sys, argparse, subprocess, json, io
from datetime import datetime, date, time

import yaml, argcomplete
Expand Down Expand Up @@ -132,9 +131,31 @@ def exit_handler(arg=None):
else:
yq(**yq_args)

def load_yaml_docs(in_stream, out_stream, jq, loader_class, max_expansion_factor, exit_func, prog):
    """Stream YAML documents from in_stream to out_stream as newline-separated JSON.

    Documents are serialized incrementally with JSONDateTimeEncoder. For each
    document, if the emitted JSON grows beyond max_expansion_factor times the
    size of that document's YAML source text, the jq subprocess (if any) is
    killed and exit_func is called — a guard against "billion laughs"-style
    YAML entity-expansion attacks. The loader is always disposed on exit.
    """
    loader = loader_class(in_stream)
    prev_end = 0
    try:
        while loader.check_node():
            node = loader.get_node()
            document = loader.construct_document(node)
            cur_end = node.end_mark.index
            # Allowed output budget is proportional to this document's source size.
            budget = (cur_end - prev_end) * max_expansion_factor
            written = 0
            for piece in JSONDateTimeEncoder().iterencode(document):
                written += len(piece)
                if written > budget:
                    if jq:
                        jq.kill()
                    exit_func("{}: Error: detected unsafe YAML entity expansion".format(prog))
                out_stream.write(piece)
            out_stream.write("\n")
            prev_end = cur_end
    finally:
        loader.dispose()

def yq(input_streams=None, output_stream=None, input_format="yaml", output_format="json",
program_name="yq", width=None, indentless_lists=False, xml_root=None, xml_dtd=False, xml_force_list=frozenset(),
explicit_start=False, explicit_end=False, jq_args=frozenset(), exit_func=None):
explicit_start=False, explicit_end=False, max_expansion_factor=1024, jq_args=frozenset(), exit_func=None):
if not input_streams:
input_streams = [sys.stdin]
if not output_stream:
Expand All @@ -161,23 +182,26 @@ def yq(input_streams=None, output_stream=None, input_format="yaml", output_forma
# subprocess.Popen._communicate, etc.)
# See https://stackoverflow.com/questions/375427/non-blocking-read-on-a-subprocess-pipe-in-python
use_annotations = True if output_format == "annotated_yaml" else False
input_docs = []
json_buffer = io.StringIO()
for input_stream in input_streams:
if input_format == "yaml":
loader = get_loader(use_annotations=use_annotations)
input_docs.extend(yaml.load_all(input_stream, Loader=loader))
loader_class = get_loader(use_annotations=use_annotations)
load_yaml_docs(in_stream=input_stream, out_stream=json_buffer, jq=None, loader_class=loader_class,
max_expansion_factor=max_expansion_factor, exit_func=exit_func, prog=program_name)
elif input_format == "xml":
import xmltodict
input_docs.append(xmltodict.parse(input_stream.read(), disable_entities=True,
force_list=xml_force_list))
doc = xmltodict.parse(input_stream.read(), disable_entities=True, force_list=xml_force_list)
json.dump(doc, json_buffer, cls=JSONDateTimeEncoder)
json_buffer.write("\n")
elif input_format == "toml":
import toml
input_docs.append(toml.load(input_stream))
doc = toml.load(input_stream)
json.dump(doc, json_buffer, cls=JSONDateTimeEncoder)
json_buffer.write("\n")
else:
raise Exception("Unknown input format")
input_payload = "\n".join(json.dumps(doc, cls=JSONDateTimeEncoder) for doc in input_docs)
jq_out, jq_err = jq.communicate(input_payload)
json_decoder = json.JSONDecoder(object_pairs_hook=OrderedDict)
jq_out, jq_err = jq.communicate(json_buffer.getvalue())
json_decoder = json.JSONDecoder()
if output_format == "yaml" or output_format == "annotated_yaml":
yaml.dump_all(decode_docs(jq_out, json_decoder), stream=output_stream,
Dumper=get_dumper(use_annotations=use_annotations, indentless=indentless_lists),
Expand All @@ -188,7 +212,7 @@ def yq(input_streams=None, output_stream=None, input_format="yaml", output_forma
for doc in decode_docs(jq_out, json_decoder):
if xml_root:
doc = {xml_root: doc}
elif not isinstance(doc, OrderedDict):
elif not isinstance(doc, dict):
msg = ("{}: Error converting JSON to XML: cannot represent non-object types at top level. "
"Use --xml-root=name to envelope your output with a root element.")
exit_func(msg.format(program_name))
Expand All @@ -205,17 +229,16 @@ def yq(input_streams=None, output_stream=None, input_format="yaml", output_forma
elif output_format == "toml":
import toml
for doc in decode_docs(jq_out, json_decoder):
if not isinstance(doc, OrderedDict):
if not isinstance(doc, dict):
msg = "{}: Error converting JSON to TOML: cannot represent non-object types at top level."
exit_func(msg.format(program_name))
toml.dump(doc, output_stream)
else:
if input_format == "yaml":
loader = get_loader(use_annotations=False)
loader_class = get_loader(use_annotations=False)
for input_stream in input_streams:
for doc in yaml.load_all(input_stream, Loader=loader):
json.dump(doc, jq.stdin, cls=JSONDateTimeEncoder)
jq.stdin.write("\n")
load_yaml_docs(in_stream=input_stream, out_stream=jq.stdin, jq=jq, loader_class=loader_class,
max_expansion_factor=max_expansion_factor, exit_func=exit_func, prog=program_name)
elif input_format == "xml":
import xmltodict
for input_stream in input_streams:
Expand Down
10 changes: 8 additions & 2 deletions yq/dumper.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import re
from collections import OrderedDict

import yaml
# try:
# from yaml import CSafeDumper as default_dumper
# except ImportError:
# from yaml import SafeDumper as default_dumper

from .loader import hash_key

Expand All @@ -19,6 +22,9 @@ def ignore_aliases(self, data):
yaml_item_annotation_re = re.compile(r"^__yq_(?P<type>tag|style)_(?P<key>\d+)_(?P<value>.+)__$")

def get_dumper(use_annotations=False, indentless=False):
# if not (use_annotations or indentless):
# return default_dumper

def represent_dict(dumper, data):
pairs, custom_styles, custom_tags = [], {}, {}
for k, v in data.items():
Expand Down Expand Up @@ -69,6 +75,6 @@ def represent_list(dumper, data):
return sequence

dumper = OrderedIndentlessDumper if indentless else OrderedDumper
dumper.add_representer(OrderedDict, represent_dict)
dumper.add_representer(dict, represent_dict)
dumper.add_representer(list, represent_list)
return dumper
51 changes: 34 additions & 17 deletions yq/loader.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,41 @@
from base64 import b64encode
from collections import OrderedDict
from hashlib import sha224

import yaml
# from yaml.tokens import AliasToken, ScalarToken
from yaml.tokens import AliasToken, AnchorToken, ScalarToken
try:
from yaml import CSafeLoader as default_loader
except ImportError:
from yaml import SafeLoader as default_loader

def hash_key(key):
    """Return a stable base64-encoded SHA-224 digest of *key* (str or bytes)."""
    raw = key.encode() if isinstance(key, str) else key
    return b64encode(sha224(raw).digest()).decode()

class OrderedLoader(yaml.SafeLoader):
def scan_anchor(self, token_class):
return self.scan_plain()

# def check_token(self, *choices):
# if choices == (AliasToken, ):
# return False
# if choices == (ScalarToken, ) and super().check_token(AliasToken):
# return True
# return super().check_token(*choices)
class CustomLoader(yaml.SafeLoader):
expand_aliases = False

def get_loader(use_annotations=False):
def fetch_alias(self):
if self.expand_aliases:
return super().fetch_alias()
self.save_possible_simple_key()
self.allow_simple_key = False
alias_token = self.scan_anchor(AliasToken)
# FIXME: turning alias into a string is not ideal, but probably the only reasonable solution
# FIXME: use magic tags (__yq_alias/__yq_anchor) to preserve with -Y
self.tokens.append(ScalarToken(value='*' + alias_token.value,
plain=True,
start_mark=alias_token.start_mark,
end_mark=alias_token.end_mark))

def fetch_anchor(self):
if self.expand_aliases:
return super().fetch_anchor()
self.save_possible_simple_key()
self.allow_simple_key = False
self.scan_anchor(AnchorToken)

def get_loader(use_annotations=False, expand_aliases=True):
def construct_sequence(loader, node):
annotations = []
for i, v_node in enumerate(node.value):
Expand Down Expand Up @@ -48,7 +64,7 @@ def construct_mapping(loader, node):
pairs.append(("__yq_style_{}__".format(hash_key(key)), v_node.style))
elif isinstance(v_node, (yaml.nodes.SequenceNode, yaml.nodes.MappingNode)) and v_node.flow_style is True:
pairs.append(("__yq_style_{}__".format(hash_key(key)), "flow"))
return OrderedDict(pairs)
return dict(pairs)

def parse_unknown_tags(loader, tag_suffix, node):
if isinstance(node, yaml.nodes.ScalarNode):
Expand All @@ -58,7 +74,8 @@ def parse_unknown_tags(loader, tag_suffix, node):
elif isinstance(node, yaml.nodes.MappingNode):
return construct_mapping(loader, node)

OrderedLoader.add_constructor(yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, construct_mapping)
OrderedLoader.add_constructor(yaml.resolver.BaseResolver.DEFAULT_SEQUENCE_TAG, construct_sequence)
OrderedLoader.add_multi_constructor('', parse_unknown_tags)
return OrderedLoader
loader_class = default_loader if expand_aliases else CustomLoader
loader_class.add_constructor(yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, construct_mapping)
loader_class.add_constructor(yaml.resolver.BaseResolver.DEFAULT_SEQUENCE_TAG, construct_sequence)
loader_class.add_multi_constructor('', parse_unknown_tags)
return loader_class

0 comments on commit a5d2c57

Please sign in to comment.