From a5d2c572ee69009380a54a67662cf781f3036450 Mon Sep 17 00:00:00 2001 From: Andrey Kislyuk Date: Sun, 28 Nov 2021 16:19:30 -0800 Subject: [PATCH] Use CSafeLoader where available, begin configurable anchor handling --- test/test.py | 2 +- yq/__init__.py | 59 +++++++++++++++++++++++++++++++++++--------------- yq/dumper.py | 10 +++++++-- yq/loader.py | 51 ++++++++++++++++++++++++++++--------------- 4 files changed, 84 insertions(+), 38 deletions(-) diff --git a/test/test.py b/test/test.py index f25dced..e8b3d25 100755 --- a/test/test.py +++ b/test/test.py @@ -62,7 +62,7 @@ def test_yq(self): self.assertEqual(self.run_yq("- понедельник\n- вторник\n", ["-y", "."]), "- понедельник\n- вторник\n") def test_yq_err(self): - err = ('yq: Error running jq: ScannerError: while scanning for the next token\nfound character \'%\' that ' + err = ('yq: Error running jq: ScannerError: while scanning for the next token\nfound character that ' 'cannot start any token\n in "<file>", line 1, column 3.') self.run_yq("- %", ["."], expect_exit_codes={err, 2}) diff --git a/yq/__init__.py b/yq/__init__.py index 91212d0..37e3ba9 100755 --- a/yq/__init__.py +++ b/yq/__init__.py @@ -7,8 +7,7 @@ # PYTHON_ARGCOMPLETE_OK -import os, sys, argparse, subprocess, json -from collections import OrderedDict +import os, sys, argparse, subprocess, json, io from datetime import datetime, date, time import yaml, argcomplete @@ -132,9 +131,31 @@ def exit_handler(arg=None): else: yq(**yq_args) +def load_yaml_docs(in_stream, out_stream, jq, loader_class, max_expansion_factor, exit_func, prog): + loader = loader_class(in_stream) + last_loader_pos = 0 + try: + while loader.check_node(): + node = loader.get_node() + doc = loader.construct_document(node) + loader_pos = node.end_mark.index + doc_len = loader_pos - last_loader_pos + doc_bytes_written = 0 + for chunk in JSONDateTimeEncoder().iterencode(doc): + doc_bytes_written += len(chunk) + if doc_bytes_written > doc_len * max_expansion_factor: + if jq: + jq.kill() + 
exit_func("{}: Error: detected unsafe YAML entity expansion".format(prog)) + out_stream.write(chunk) + out_stream.write("\n") + last_loader_pos = loader_pos + finally: + loader.dispose() + def yq(input_streams=None, output_stream=None, input_format="yaml", output_format="json", program_name="yq", width=None, indentless_lists=False, xml_root=None, xml_dtd=False, xml_force_list=frozenset(), - explicit_start=False, explicit_end=False, jq_args=frozenset(), exit_func=None): + explicit_start=False, explicit_end=False, max_expansion_factor=1024, jq_args=frozenset(), exit_func=None): if not input_streams: input_streams = [sys.stdin] if not output_stream: @@ -161,23 +182,26 @@ def yq(input_streams=None, output_stream=None, input_format="yaml", output_forma # subprocess.Popen._communicate, etc.) # See https://stackoverflow.com/questions/375427/non-blocking-read-on-a-subprocess-pipe-in-python use_annotations = True if output_format == "annotated_yaml" else False - input_docs = [] + json_buffer = io.StringIO() for input_stream in input_streams: if input_format == "yaml": - loader = get_loader(use_annotations=use_annotations) - input_docs.extend(yaml.load_all(input_stream, Loader=loader)) + loader_class = get_loader(use_annotations=use_annotations) + load_yaml_docs(in_stream=input_stream, out_stream=json_buffer, jq=None, loader_class=loader_class, + max_expansion_factor=max_expansion_factor, exit_func=exit_func, prog=program_name) elif input_format == "xml": import xmltodict - input_docs.append(xmltodict.parse(input_stream.read(), disable_entities=True, - force_list=xml_force_list)) + doc = xmltodict.parse(input_stream.read(), disable_entities=True, force_list=xml_force_list) + json.dump(doc, json_buffer, cls=JSONDateTimeEncoder) + json_buffer.write("\n") elif input_format == "toml": import toml - input_docs.append(toml.load(input_stream)) + doc = toml.load(input_stream) + json.dump(doc, json_buffer, cls=JSONDateTimeEncoder) + json_buffer.write("\n") else: raise 
Exception("Unknown input format") - input_payload = "\n".join(json.dumps(doc, cls=JSONDateTimeEncoder) for doc in input_docs) - jq_out, jq_err = jq.communicate(input_payload) - json_decoder = json.JSONDecoder(object_pairs_hook=OrderedDict) + jq_out, jq_err = jq.communicate(json_buffer.getvalue()) + json_decoder = json.JSONDecoder() if output_format == "yaml" or output_format == "annotated_yaml": yaml.dump_all(decode_docs(jq_out, json_decoder), stream=output_stream, Dumper=get_dumper(use_annotations=use_annotations, indentless=indentless_lists), @@ -188,7 +212,7 @@ def yq(input_streams=None, output_stream=None, input_format="yaml", output_forma for doc in decode_docs(jq_out, json_decoder): if xml_root: doc = {xml_root: doc} - elif not isinstance(doc, OrderedDict): + elif not isinstance(doc, dict): msg = ("{}: Error converting JSON to XML: cannot represent non-object types at top level. " "Use --xml-root=name to envelope your output with a root element.") exit_func(msg.format(program_name)) @@ -205,17 +229,16 @@ def yq(input_streams=None, output_stream=None, input_format="yaml", output_forma elif output_format == "toml": import toml for doc in decode_docs(jq_out, json_decoder): - if not isinstance(doc, OrderedDict): + if not isinstance(doc, dict): msg = "{}: Error converting JSON to TOML: cannot represent non-object types at top level." 
exit_func(msg.format(program_name)) toml.dump(doc, output_stream) else: if input_format == "yaml": - loader = get_loader(use_annotations=False) + loader_class = get_loader(use_annotations=False) for input_stream in input_streams: - for doc in yaml.load_all(input_stream, Loader=loader): - json.dump(doc, jq.stdin, cls=JSONDateTimeEncoder) - jq.stdin.write("\n") + load_yaml_docs(in_stream=input_stream, out_stream=jq.stdin, jq=jq, loader_class=loader_class, + max_expansion_factor=max_expansion_factor, exit_func=exit_func, prog=program_name) elif input_format == "xml": import xmltodict for input_stream in input_streams: diff --git a/yq/dumper.py b/yq/dumper.py index 307129a..f920193 100644 --- a/yq/dumper.py +++ b/yq/dumper.py @@ -1,7 +1,10 @@ import re -from collections import OrderedDict import yaml +# try: +# from yaml import CSafeDumper as default_dumper +# except ImportError: +# from yaml import SafeDumper as default_dumper from .loader import hash_key @@ -19,6 +22,9 @@ def ignore_aliases(self, data): yaml_item_annotation_re = re.compile(r"^__yq_(?P<type>tag|style)_(?P<key>\d+)_(?P<value>.+)__$") def get_dumper(use_annotations=False, indentless=False): + # if not (use_annotations or indentless): + # return default_dumper + def represent_dict(dumper, data): pairs, custom_styles, custom_tags = [], {}, {} for k, v in data.items(): @@ -69,6 +75,6 @@ def represent_list(dumper, data): return sequence dumper = OrderedIndentlessDumper if indentless else OrderedDumper - dumper.add_representer(OrderedDict, represent_dict) + dumper.add_representer(dict, represent_dict) dumper.add_representer(list, represent_list) return dumper diff --git a/yq/loader.py b/yq/loader.py index ea02f74..2fcf1d5 100644 --- a/yq/loader.py +++ b/yq/loader.py @@ -1,25 +1,41 @@ from base64 import b64encode -from collections import OrderedDict from hashlib import sha224 import yaml -# from yaml.tokens import AliasToken, ScalarToken +from yaml.tokens import AliasToken, AnchorToken, ScalarToken +try: + from yaml import 
CSafeLoader as default_loader +except ImportError: + from yaml import SafeLoader as default_loader def hash_key(key): return b64encode(sha224(key.encode() if isinstance(key, str) else key).digest()).decode() -class OrderedLoader(yaml.SafeLoader): - def scan_anchor(self, token_class): - return self.scan_plain() -# def check_token(self, *choices): -# if choices == (AliasToken, ): -# return False -# if choices == (ScalarToken, ) and super().check_token(AliasToken): -# return True -# return super().check_token(*choices) +class CustomLoader(yaml.SafeLoader): + expand_aliases = False -def get_loader(use_annotations=False): + def fetch_alias(self): + if self.expand_aliases: + return super().fetch_alias() + self.save_possible_simple_key() + self.allow_simple_key = False + alias_token = self.scan_anchor(AliasToken) + # FIXME: turning alias into a string is not ideal, but probably the only reasonable solution + # FIXME: use magic tags (__yq_alias/__yq_anchor) to preserve with -Y + self.tokens.append(ScalarToken(value='*' + alias_token.value, + plain=True, + start_mark=alias_token.start_mark, + end_mark=alias_token.end_mark)) + + def fetch_anchor(self): + if self.expand_aliases: + return super().fetch_anchor() + self.save_possible_simple_key() + self.allow_simple_key = False + self.scan_anchor(AnchorToken) + +def get_loader(use_annotations=False, expand_aliases=True): def construct_sequence(loader, node): annotations = [] for i, v_node in enumerate(node.value): @@ -48,7 +64,7 @@ def construct_mapping(loader, node): pairs.append(("__yq_style_{}__".format(hash_key(key)), v_node.style)) elif isinstance(v_node, (yaml.nodes.SequenceNode, yaml.nodes.MappingNode)) and v_node.flow_style is True: pairs.append(("__yq_style_{}__".format(hash_key(key)), "flow")) - return OrderedDict(pairs) + return dict(pairs) def parse_unknown_tags(loader, tag_suffix, node): if isinstance(node, yaml.nodes.ScalarNode): @@ -58,7 +74,8 @@ def parse_unknown_tags(loader, tag_suffix, node): elif 
isinstance(node, yaml.nodes.MappingNode): return construct_mapping(loader, node) - OrderedLoader.add_constructor(yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, construct_mapping) - OrderedLoader.add_constructor(yaml.resolver.BaseResolver.DEFAULT_SEQUENCE_TAG, construct_sequence) - OrderedLoader.add_multi_constructor('', parse_unknown_tags) - return OrderedLoader + loader_class = default_loader if expand_aliases else CustomLoader + loader_class.add_constructor(yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, construct_mapping) + loader_class.add_constructor(yaml.resolver.BaseResolver.DEFAULT_SEQUENCE_TAG, construct_sequence) + loader_class.add_multi_constructor('', parse_unknown_tags) + return loader_class