diff --git a/idr/.snapshots/TestReferenceTestTreeWithJSONify1 b/idr/.snapshots/TestReferenceTestTreeWithJSONify1 new file mode 100644 index 0000000..22aaf5d --- /dev/null +++ b/idr/.snapshots/TestReferenceTestTreeWithJSONify1 @@ -0,0 +1,140 @@ +{ + "Children": [ + { + "Children": [ + { + "Children": [ + { + "Children": null, + "Data": "data 11", + "FirstChild": null, + "FormatSpecific": null, + "LastChild": null, + "NextSibling": null, + "Parent": "(ElementNode grandChild11)", + "PrevSibling": null, + "Type": "TextNode" + } + ], + "Data": "grandChild11", + "FirstChild": "(TextNode 'data 11')", + "FormatSpecific": null, + "LastChild": "(TextNode 'data 11')", + "NextSibling": "(ElementNode grandChild12)", + "Parent": "(ElementNode child1)", + "PrevSibling": null, + "Type": "ElementNode" + }, + { + "Children": [ + { + "Children": null, + "Data": "data 12", + "FirstChild": null, + "FormatSpecific": null, + "LastChild": null, + "NextSibling": null, + "Parent": "(ElementNode grandChild12)", + "PrevSibling": null, + "Type": "TextNode" + } + ], + "Data": "grandChild12", + "FirstChild": "(TextNode 'data 12')", + "FormatSpecific": null, + "LastChild": "(TextNode 'data 12')", + "NextSibling": null, + "Parent": "(ElementNode child1)", + "PrevSibling": "(ElementNode grandChild11)", + "Type": "ElementNode" + } + ], + "Data": "child1", + "FirstChild": "(ElementNode grandChild11)", + "FormatSpecific": null, + "LastChild": "(ElementNode grandChild12)", + "NextSibling": "(ElementNode child2)", + "Parent": "(DocumentNode)", + "PrevSibling": null, + "Type": "ElementNode" + }, + { + "Children": [ + { + "Children": [ + { + "Children": null, + "Data": "data 21", + "FirstChild": null, + "FormatSpecific": null, + "LastChild": null, + "NextSibling": null, + "Parent": "(ElementNode grandChild21)", + "PrevSibling": null, + "Type": "TextNode" + } + ], + "Data": "grandChild21", + "FirstChild": "(TextNode 'data 21')", + "FormatSpecific": null, + "LastChild": "(TextNode 'data 21')", + 
"NextSibling": null, + "Parent": "(ElementNode child2)", + "PrevSibling": null, + "Type": "ElementNode" + } + ], + "Data": "child2", + "FirstChild": "(ElementNode grandChild21)", + "FormatSpecific": null, + "LastChild": "(ElementNode grandChild21)", + "NextSibling": "(ElementNode child3)", + "Parent": "(DocumentNode)", + "PrevSibling": "(ElementNode child1)", + "Type": "ElementNode" + }, + { + "Children": [ + { + "Children": [ + { + "Children": null, + "Data": "attr 31", + "FirstChild": null, + "FormatSpecific": null, + "LastChild": null, + "NextSibling": null, + "Parent": "(AttributeNode grandChild31)", + "PrevSibling": null, + "Type": "TextNode" + } + ], + "Data": "grandChild31", + "FirstChild": "(TextNode 'attr 31')", + "FormatSpecific": null, + "LastChild": "(TextNode 'attr 31')", + "NextSibling": null, + "Parent": "(ElementNode child3)", + "PrevSibling": null, + "Type": "AttributeNode" + } + ], + "Data": "child3", + "FirstChild": "(AttributeNode grandChild31)", + "FormatSpecific": null, + "LastChild": "(AttributeNode grandChild31)", + "NextSibling": null, + "Parent": "(DocumentNode)", + "PrevSibling": "(ElementNode child2)", + "Type": "ElementNode" + } + ], + "Data": "root", + "FirstChild": "(ElementNode child1)", + "FormatSpecific": null, + "LastChild": "(ElementNode child3)", + "NextSibling": null, + "Parent": null, + "PrevSibling": null, + "Type": "DocumentNode" +} diff --git a/idr/.snapshots/TestRemoveNodeAndSubTree-remove_a_node_who_is_its_parents_first_child_but_not_the_last b/idr/.snapshots/TestRemoveNodeAndSubTree-remove_a_node_who_is_its_parents_first_child_but_not_the_last new file mode 100644 index 0000000..9413b37 --- /dev/null +++ b/idr/.snapshots/TestRemoveNodeAndSubTree-remove_a_node_who_is_its_parents_first_child_but_not_the_last @@ -0,0 +1,82 @@ +{ + "Children": [ + { + "Children": [ + { + "Children": [ + { + "Children": null, + "Data": "data 21", + "FirstChild": null, + "FormatSpecific": null, + "LastChild": null, + "NextSibling": null, + 
"Parent": "(ElementNode grandChild21)", + "PrevSibling": null, + "Type": "TextNode" + } + ], + "Data": "grandChild21", + "FirstChild": "(TextNode 'data 21')", + "FormatSpecific": null, + "LastChild": "(TextNode 'data 21')", + "NextSibling": null, + "Parent": "(ElementNode child2)", + "PrevSibling": null, + "Type": "ElementNode" + } + ], + "Data": "child2", + "FirstChild": "(ElementNode grandChild21)", + "FormatSpecific": null, + "LastChild": "(ElementNode grandChild21)", + "NextSibling": "(ElementNode child3)", + "Parent": "(DocumentNode)", + "PrevSibling": null, + "Type": "ElementNode" + }, + { + "Children": [ + { + "Children": [ + { + "Children": null, + "Data": "attr 31", + "FirstChild": null, + "FormatSpecific": null, + "LastChild": null, + "NextSibling": null, + "Parent": "(AttributeNode grandChild31)", + "PrevSibling": null, + "Type": "TextNode" + } + ], + "Data": "grandChild31", + "FirstChild": "(TextNode 'attr 31')", + "FormatSpecific": null, + "LastChild": "(TextNode 'attr 31')", + "NextSibling": null, + "Parent": "(ElementNode child3)", + "PrevSibling": null, + "Type": "AttributeNode" + } + ], + "Data": "child3", + "FirstChild": "(AttributeNode grandChild31)", + "FormatSpecific": null, + "LastChild": "(AttributeNode grandChild31)", + "NextSibling": null, + "Parent": "(DocumentNode)", + "PrevSibling": "(ElementNode child2)", + "Type": "ElementNode" + } + ], + "Data": "root", + "FirstChild": "(ElementNode child2)", + "FormatSpecific": null, + "LastChild": "(ElementNode child3)", + "NextSibling": null, + "Parent": null, + "PrevSibling": null, + "Type": "DocumentNode" +} diff --git a/idr/.snapshots/TestRemoveNodeAndSubTree-remove_a_node_who_is_its_parents_last_child_but_not_the_first b/idr/.snapshots/TestRemoveNodeAndSubTree-remove_a_node_who_is_its_parents_last_child_but_not_the_first new file mode 100644 index 0000000..b84f0ef --- /dev/null +++ b/idr/.snapshots/TestRemoveNodeAndSubTree-remove_a_node_who_is_its_parents_last_child_but_not_the_first @@ -0,0 
+1,105 @@ +{ + "Children": [ + { + "Children": [ + { + "Children": [ + { + "Children": null, + "Data": "data 11", + "FirstChild": null, + "FormatSpecific": null, + "LastChild": null, + "NextSibling": null, + "Parent": "(ElementNode grandChild11)", + "PrevSibling": null, + "Type": "TextNode" + } + ], + "Data": "grandChild11", + "FirstChild": "(TextNode 'data 11')", + "FormatSpecific": null, + "LastChild": "(TextNode 'data 11')", + "NextSibling": "(ElementNode grandChild12)", + "Parent": "(ElementNode child1)", + "PrevSibling": null, + "Type": "ElementNode" + }, + { + "Children": [ + { + "Children": null, + "Data": "data 12", + "FirstChild": null, + "FormatSpecific": null, + "LastChild": null, + "NextSibling": null, + "Parent": "(ElementNode grandChild12)", + "PrevSibling": null, + "Type": "TextNode" + } + ], + "Data": "grandChild12", + "FirstChild": "(TextNode 'data 12')", + "FormatSpecific": null, + "LastChild": "(TextNode 'data 12')", + "NextSibling": null, + "Parent": "(ElementNode child1)", + "PrevSibling": "(ElementNode grandChild11)", + "Type": "ElementNode" + } + ], + "Data": "child1", + "FirstChild": "(ElementNode grandChild11)", + "FormatSpecific": null, + "LastChild": "(ElementNode grandChild12)", + "NextSibling": "(ElementNode child2)", + "Parent": "(DocumentNode)", + "PrevSibling": null, + "Type": "ElementNode" + }, + { + "Children": [ + { + "Children": [ + { + "Children": null, + "Data": "data 21", + "FirstChild": null, + "FormatSpecific": null, + "LastChild": null, + "NextSibling": null, + "Parent": "(ElementNode grandChild21)", + "PrevSibling": null, + "Type": "TextNode" + } + ], + "Data": "grandChild21", + "FirstChild": "(TextNode 'data 21')", + "FormatSpecific": null, + "LastChild": "(TextNode 'data 21')", + "NextSibling": null, + "Parent": "(ElementNode child2)", + "PrevSibling": null, + "Type": "ElementNode" + } + ], + "Data": "child2", + "FirstChild": "(ElementNode grandChild21)", + "FormatSpecific": null, + "LastChild": "(ElementNode 
grandChild21)", + "NextSibling": null, + "Parent": "(DocumentNode)", + "PrevSibling": "(ElementNode child1)", + "Type": "ElementNode" + } + ], + "Data": "root", + "FirstChild": "(ElementNode child1)", + "FormatSpecific": null, + "LastChild": "(ElementNode child2)", + "NextSibling": null, + "Parent": null, + "PrevSibling": null, + "Type": "DocumentNode" +} diff --git a/idr/.snapshots/TestRemoveNodeAndSubTree-remove_a_node_who_is_its_parents_middle_child_not_the_first_not_the_last b/idr/.snapshots/TestRemoveNodeAndSubTree-remove_a_node_who_is_its_parents_middle_child_not_the_first_not_the_last new file mode 100644 index 0000000..906adad --- /dev/null +++ b/idr/.snapshots/TestRemoveNodeAndSubTree-remove_a_node_who_is_its_parents_middle_child_not_the_first_not_the_last @@ -0,0 +1,105 @@ +{ + "Children": [ + { + "Children": [ + { + "Children": [ + { + "Children": null, + "Data": "data 11", + "FirstChild": null, + "FormatSpecific": null, + "LastChild": null, + "NextSibling": null, + "Parent": "(ElementNode grandChild11)", + "PrevSibling": null, + "Type": "TextNode" + } + ], + "Data": "grandChild11", + "FirstChild": "(TextNode 'data 11')", + "FormatSpecific": null, + "LastChild": "(TextNode 'data 11')", + "NextSibling": "(ElementNode grandChild12)", + "Parent": "(ElementNode child1)", + "PrevSibling": null, + "Type": "ElementNode" + }, + { + "Children": [ + { + "Children": null, + "Data": "data 12", + "FirstChild": null, + "FormatSpecific": null, + "LastChild": null, + "NextSibling": null, + "Parent": "(ElementNode grandChild12)", + "PrevSibling": null, + "Type": "TextNode" + } + ], + "Data": "grandChild12", + "FirstChild": "(TextNode 'data 12')", + "FormatSpecific": null, + "LastChild": "(TextNode 'data 12')", + "NextSibling": null, + "Parent": "(ElementNode child1)", + "PrevSibling": "(ElementNode grandChild11)", + "Type": "ElementNode" + } + ], + "Data": "child1", + "FirstChild": "(ElementNode grandChild11)", + "FormatSpecific": null, + "LastChild": "(ElementNode 
grandChild12)", + "NextSibling": "(ElementNode child3)", + "Parent": "(DocumentNode)", + "PrevSibling": null, + "Type": "ElementNode" + }, + { + "Children": [ + { + "Children": [ + { + "Children": null, + "Data": "attr 31", + "FirstChild": null, + "FormatSpecific": null, + "LastChild": null, + "NextSibling": null, + "Parent": "(AttributeNode grandChild31)", + "PrevSibling": null, + "Type": "TextNode" + } + ], + "Data": "grandChild31", + "FirstChild": "(TextNode 'attr 31')", + "FormatSpecific": null, + "LastChild": "(TextNode 'attr 31')", + "NextSibling": null, + "Parent": "(ElementNode child3)", + "PrevSibling": null, + "Type": "AttributeNode" + } + ], + "Data": "child3", + "FirstChild": "(AttributeNode grandChild31)", + "FormatSpecific": null, + "LastChild": "(AttributeNode grandChild31)", + "NextSibling": null, + "Parent": "(DocumentNode)", + "PrevSibling": "(ElementNode child1)", + "Type": "ElementNode" + } + ], + "Data": "root", + "FirstChild": "(ElementNode child1)", + "FormatSpecific": null, + "LastChild": "(ElementNode child3)", + "NextSibling": null, + "Parent": null, + "PrevSibling": null, + "Type": "DocumentNode" +} diff --git a/idr/.snapshots/TestRemoveNodeAndSubTree-remove_a_node_who_is_its_parents_only_child b/idr/.snapshots/TestRemoveNodeAndSubTree-remove_a_node_who_is_its_parents_only_child new file mode 100644 index 0000000..8b39d31 --- /dev/null +++ b/idr/.snapshots/TestRemoveNodeAndSubTree-remove_a_node_who_is_its_parents_only_child @@ -0,0 +1,116 @@ +{ + "Children": [ + { + "Children": [ + { + "Children": [ + { + "Children": null, + "Data": "data 11", + "FirstChild": null, + "FormatSpecific": null, + "LastChild": null, + "NextSibling": null, + "Parent": "(ElementNode grandChild11)", + "PrevSibling": null, + "Type": "TextNode" + } + ], + "Data": "grandChild11", + "FirstChild": "(TextNode 'data 11')", + "FormatSpecific": null, + "LastChild": "(TextNode 'data 11')", + "NextSibling": "(ElementNode grandChild12)", + "Parent": "(ElementNode child1)", + 
"PrevSibling": null, + "Type": "ElementNode" + }, + { + "Children": [ + { + "Children": null, + "Data": "data 12", + "FirstChild": null, + "FormatSpecific": null, + "LastChild": null, + "NextSibling": null, + "Parent": "(ElementNode grandChild12)", + "PrevSibling": null, + "Type": "TextNode" + } + ], + "Data": "grandChild12", + "FirstChild": "(TextNode 'data 12')", + "FormatSpecific": null, + "LastChild": "(TextNode 'data 12')", + "NextSibling": null, + "Parent": "(ElementNode child1)", + "PrevSibling": "(ElementNode grandChild11)", + "Type": "ElementNode" + } + ], + "Data": "child1", + "FirstChild": "(ElementNode grandChild11)", + "FormatSpecific": null, + "LastChild": "(ElementNode grandChild12)", + "NextSibling": "(ElementNode child2)", + "Parent": "(DocumentNode)", + "PrevSibling": null, + "Type": "ElementNode" + }, + { + "Children": null, + "Data": "child2", + "FirstChild": null, + "FormatSpecific": null, + "LastChild": null, + "NextSibling": "(ElementNode child3)", + "Parent": "(DocumentNode)", + "PrevSibling": "(ElementNode child1)", + "Type": "ElementNode" + }, + { + "Children": [ + { + "Children": [ + { + "Children": null, + "Data": "attr 31", + "FirstChild": null, + "FormatSpecific": null, + "LastChild": null, + "NextSibling": null, + "Parent": "(AttributeNode grandChild31)", + "PrevSibling": null, + "Type": "TextNode" + } + ], + "Data": "grandChild31", + "FirstChild": "(TextNode 'attr 31')", + "FormatSpecific": null, + "LastChild": "(TextNode 'attr 31')", + "NextSibling": null, + "Parent": "(ElementNode child3)", + "PrevSibling": null, + "Type": "AttributeNode" + } + ], + "Data": "child3", + "FirstChild": "(AttributeNode grandChild31)", + "FormatSpecific": null, + "LastChild": "(AttributeNode grandChild31)", + "NextSibling": null, + "Parent": "(DocumentNode)", + "PrevSibling": "(ElementNode child2)", + "Type": "ElementNode" + } + ], + "Data": "root", + "FirstChild": "(ElementNode child1)", + "FormatSpecific": null, + "LastChild": "(ElementNode child3)", 
+ "NextSibling": null, + "Parent": null, + "PrevSibling": null, + "Type": "DocumentNode" +} diff --git a/idr/.snapshots/TestRemoveNodeAndSubTree-remove_a_root_does_nothing b/idr/.snapshots/TestRemoveNodeAndSubTree-remove_a_root_does_nothing new file mode 100644 index 0000000..22aaf5d --- /dev/null +++ b/idr/.snapshots/TestRemoveNodeAndSubTree-remove_a_root_does_nothing @@ -0,0 +1,140 @@ +{ + "Children": [ + { + "Children": [ + { + "Children": [ + { + "Children": null, + "Data": "data 11", + "FirstChild": null, + "FormatSpecific": null, + "LastChild": null, + "NextSibling": null, + "Parent": "(ElementNode grandChild11)", + "PrevSibling": null, + "Type": "TextNode" + } + ], + "Data": "grandChild11", + "FirstChild": "(TextNode 'data 11')", + "FormatSpecific": null, + "LastChild": "(TextNode 'data 11')", + "NextSibling": "(ElementNode grandChild12)", + "Parent": "(ElementNode child1)", + "PrevSibling": null, + "Type": "ElementNode" + }, + { + "Children": [ + { + "Children": null, + "Data": "data 12", + "FirstChild": null, + "FormatSpecific": null, + "LastChild": null, + "NextSibling": null, + "Parent": "(ElementNode grandChild12)", + "PrevSibling": null, + "Type": "TextNode" + } + ], + "Data": "grandChild12", + "FirstChild": "(TextNode 'data 12')", + "FormatSpecific": null, + "LastChild": "(TextNode 'data 12')", + "NextSibling": null, + "Parent": "(ElementNode child1)", + "PrevSibling": "(ElementNode grandChild11)", + "Type": "ElementNode" + } + ], + "Data": "child1", + "FirstChild": "(ElementNode grandChild11)", + "FormatSpecific": null, + "LastChild": "(ElementNode grandChild12)", + "NextSibling": "(ElementNode child2)", + "Parent": "(DocumentNode)", + "PrevSibling": null, + "Type": "ElementNode" + }, + { + "Children": [ + { + "Children": [ + { + "Children": null, + "Data": "data 21", + "FirstChild": null, + "FormatSpecific": null, + "LastChild": null, + "NextSibling": null, + "Parent": "(ElementNode grandChild21)", + "PrevSibling": null, + "Type": "TextNode" + } + 
], + "Data": "grandChild21", + "FirstChild": "(TextNode 'data 21')", + "FormatSpecific": null, + "LastChild": "(TextNode 'data 21')", + "NextSibling": null, + "Parent": "(ElementNode child2)", + "PrevSibling": null, + "Type": "ElementNode" + } + ], + "Data": "child2", + "FirstChild": "(ElementNode grandChild21)", + "FormatSpecific": null, + "LastChild": "(ElementNode grandChild21)", + "NextSibling": "(ElementNode child3)", + "Parent": "(DocumentNode)", + "PrevSibling": "(ElementNode child1)", + "Type": "ElementNode" + }, + { + "Children": [ + { + "Children": [ + { + "Children": null, + "Data": "attr 31", + "FirstChild": null, + "FormatSpecific": null, + "LastChild": null, + "NextSibling": null, + "Parent": "(AttributeNode grandChild31)", + "PrevSibling": null, + "Type": "TextNode" + } + ], + "Data": "grandChild31", + "FirstChild": "(TextNode 'attr 31')", + "FormatSpecific": null, + "LastChild": "(TextNode 'attr 31')", + "NextSibling": null, + "Parent": "(ElementNode child3)", + "PrevSibling": null, + "Type": "AttributeNode" + } + ], + "Data": "child3", + "FirstChild": "(AttributeNode grandChild31)", + "FormatSpecific": null, + "LastChild": "(AttributeNode grandChild31)", + "NextSibling": null, + "Parent": "(DocumentNode)", + "PrevSibling": "(ElementNode child2)", + "Type": "ElementNode" + } + ], + "Data": "root", + "FirstChild": "(ElementNode child1)", + "FormatSpecific": null, + "LastChild": "(ElementNode child3)", + "NextSibling": null, + "Parent": null, + "PrevSibling": null, + "Type": "DocumentNode" +} diff --git a/idr/README.md b/idr/README.md new file mode 100644 index 0000000..0e5d126 --- /dev/null +++ b/idr/README.md @@ -0,0 +1,97 @@ +# IDR + +**IDR** == **I**ntermediate **D**ata **R**epresentation or **I**n-memory **D**ata **R**epresentation + +IDR is an intermediate data structure used by omniparser ingesters to store raw data read from various +formats of inputs, including CSV/txt/XML/EDI/JSON/etc, and then used by schema handlers to perform +transforms. 
It is flexible and versatile enough to represent all kinds of data formats supported (or to be +supported) by omniparser. + +*Credit:* The basic data structures and various operations and algorithms used by IDR are mostly +inherited/adapted from, modified based on, and inspired by works done in https://github.com/antchfx/xmlquery +and https://github.com/antchfx/xpath. Thank you very much! + +The basic building block of an IDR is a `Node` and an IDR is in fact a `Node` tree. Each `Node` has +two parts (see actual code [here](./node.go)): +``` +type Node struct { + Parent, FirstChild, LastChild, PrevSibling, NextSibling *Node + + Type NodeType + Data string + + FormatSpecific interface{} +} +``` +The first part of a `Node` contains the input format agnostic fields, such as tree pointers (like +`Parent`, `FirstChild`, etc), `Type` and `Data`, which we'll explain in more detail later. The second +part of a `Node` is a format specific data blob. The blob not only offers a place to store format specific +data, it also gives IDR code and algorithms a hint on what input format the `Node` is about. + +Below we'll go through each input format we support and show what its corresponding IDR looks like. + +## XML + +Since XML is the most complex input format we have for IDR, let's cover it first. + +Here is a simple XML (from [this sample](../samples/omniv2/xml/1_datetime_parse_and_format.input.xml)): +``` +<Root> + <JustData>2020/09/22</JustData> + <DateTimeWithNoTZ>09/22/2020 12:34:56</DateTimeWithNoTZ> +</Root> +``` +This is a simple XML blob with no non-default namespaces and with no attributes.
Its corresponding IDR +looks like this (with empty fields and tree pointers omitted for clarity): +``` +Node(Type: DocumentNode, FormatSpecific: XMLSpecific()) + Node(Type: ElementNode, Data: "Root", FormatSpecific: XMLSpecific()) + Node(Type: TextNode, Data: "\n", FormatSpecific: XMLSpecific()) + Node(Type: ElementNode, Data: "JustData", FormatSpecific: XMLSpecific()) + Node(Type: TextNode, Data: "2020/09/22", FormatSpecific: XMLSpecific()) + Node(Type: TextNode, Data: "\n", FormatSpecific: XMLSpecific()) + Node(Type: ElementNode, Data: "DateTimeWithNoTZ", FormatSpecific: XMLSpecific()) + Node(Type: TextNode, Data: "09/22/2020 12:34:56", FormatSpecific: XMLSpecific()) + Node(Type: TextNode, Data: "\n", FormatSpecific: XMLSpecific()) +``` +Most of the IDR is quite self-explanatory, but what about those `TextNode`'s with `\n` as `Data`? Turns +out [`xml.Decoder`](https://golang.org/pkg/encoding/xml/#Decoder) treats anything in between two XML +element nodes as text, as long as the two elements are not directly adjacent to each other. Since +there is a newline `'\n'` after the XML element `<Root>` and before `<JustData>`, the `'\n'` is captured +as a `TextNode`. + +Also note in this simple case, each `Node` has an empty but non-nil `FormatSpecific`, typed as +[`XMLSpecific`](./xmlnode.go). `XMLSpecific` contains XML namespace information for each node, +which we'll see in the [next example](../samples/omniv2/xml/2_multiple_objects.input.xml): +``` +<lb0:library xmlns:lb0="uri://something"> + <lb0:books> + <book title="Harry Potter and the Philosopher's Stone"> + <author>J. K. Rowling</author> + </book> + </lb0:books> +</lb0:library> +``` +In this example, we'll see how IDR deals with XML namespaces, as well as attributes.
+ +The IDR for the example above looks like the following (note those "dummy" text nodes sprinkled +in between element nodes are omitted here for clarity; also not including empty `XMLSpecific`): +``` +Node(Type: DocumentNode) + Node(Type: ElementNode, Data: "library", FormatSpecific: XMLSpecific(NamespacePrefix: "lb0", NamespaceURI: "uri://something")) + Node(Type: ElementNode, Data: "books", FormatSpecific: XMLSpecific(NamespacePrefix: "lb0", NamespaceURI: "uri://something")) + Node(Type: ElementNode, Data: "book") + Node(Type: AttributeNode, Data: "title") + Node(Type: TextNode, Data: "Harry Potter and the Philosopher's Stone") + Node(Type: ElementNode, Data: "author") + Node(Type: TextNode, Data: "J. K. Rowling") +``` +Both `Node`'s representing `<lb0:library>` and `<lb0:books>` include non-empty `XMLSpecific`'s which +contain their namespace prefixes and full URIs while their `Node.Data` fields contain the element names without +the namespace prefixes. + +Note XML attributes on elements are represented as `Node`'s as well, with `Type: AttributeNode` +specifically. If an attribute is namespace-prefixed, the `AttributeNode` typed `Node` will have a non-empty +`XMLSpecific` set as well. An attribute's actual value is placed as a `TextNode` underneath its `AttributeNode`. +`AttributeNode`'s are guaranteed to be placed before any other child nodes (`TextNode`, or `ElementNode`) +by IDR's XML reader. diff --git a/idr/marshal1.go b/idr/marshal1.go new file mode 100644 index 0000000..229e47c --- /dev/null +++ b/idr/marshal1.go @@ -0,0 +1,55 @@ +package idr + +import ( + "fmt" + + "github.com/jf-tech/go-corelib/jsons" + "github.com/jf-tech/go-corelib/strs" +) + +// j1NodePtrName returns a categorized name for a *Node pointer used in JSONify1.
+func j1NodePtrName(n *Node) *string { + if n == nil { + return nil + } + name := func(n *Node) string { + if IsXML(n) && XMLSpecificOf(n).NamespacePrefix != "" { + return XMLSpecificOf(n).NamespacePrefix + ":" + n.Data + } + return n.Data + } + switch n.Type { + case DocumentNode: + return strs.StrPtr(fmt.Sprintf("(%s)", n.Type)) + case ElementNode, AttributeNode: + return strs.StrPtr(fmt.Sprintf("(%s %s)", n.Type, name(n))) + case TextNode: + return strs.StrPtr(fmt.Sprintf("(%s '%s')", n.Type, n.Data)) + default: + return strs.StrPtr(fmt.Sprintf("(unknown '%s')", n.Data)) + } +} + +// j1NodeToInterface converts *Node into an interface{} suitable for json marshaling used in JSONify1. +func j1NodeToInterface(n *Node) interface{} { + m := make(map[string]interface{}) + m["Parent"] = j1NodePtrName(n.Parent) + m["FirstChild"] = j1NodePtrName(n.FirstChild) + m["LastChild"] = j1NodePtrName(n.LastChild) + m["PrevSibling"] = j1NodePtrName(n.PrevSibling) + m["NextSibling"] = j1NodePtrName(n.NextSibling) + m["Type"] = n.Type.String() + m["Data"] = n.Data + m["FormatSpecific"] = n.FormatSpecific + var children []interface{} + for child := n.FirstChild; child != nil; child = child.NextSibling { + children = append(children, j1NodeToInterface(child)) + } + m["Children"] = children + return m +} + +// JSONify1 json marshals a *Node verbatim. Mostly used in test for snapshotting. 
+func JSONify1(n *Node) string { + return jsons.BPM(j1NodeToInterface(n)) +} diff --git a/idr/marshal1_test.go b/idr/marshal1_test.go new file mode 100644 index 0000000..0c3dc63 --- /dev/null +++ b/idr/marshal1_test.go @@ -0,0 +1,31 @@ +package idr + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestJ1NodePtrName(t *testing.T) { + for _, test := range []struct { + name string + n *Node + expected string + }{ + {name: "nil", n: nil, expected: ""}, + {name: "root", n: CreateNode(DocumentNode, "test"), expected: "(DocumentNode)"}, + {name: "elem w/o ns", n: CreateNode(ElementNode, "A"), expected: "(ElementNode A)"}, + {name: "elem w/ ns", n: CreateXMLNode(ElementNode, "A", XMLSpecific{"ns", "uri://"}), expected: "(ElementNode ns:A)"}, + {name: "text", n: CreateNode(TextNode, "data"), expected: "(TextNode 'data')"}, + {name: "attr", n: CreateNode(AttributeNode, "attr"), expected: "(AttributeNode attr)"}, + {name: "unknown", n: CreateNode(NodeType(99999), "what"), expected: "(unknown 'what')"}, + } { + t.Run(test.name, func(t *testing.T) { + if test.expected == "" { + assert.Nil(t, j1NodePtrName(test.n)) + } else { + assert.Equal(t, test.expected, *j1NodePtrName(test.n)) + } + }) + } +} diff --git a/idr/node.go b/idr/node.go new file mode 100644 index 0000000..1d66b20 --- /dev/null +++ b/idr/node.go @@ -0,0 +1,124 @@ +package idr + +import ( + "fmt" + "strings" +) + +// NodeType is the type of a Node in an IDR. +type NodeType uint + +const ( + // DocumentNode is the type of the root Node in an IDR tree. + DocumentNode NodeType = iota + // ElementNode is the type of an element Node in an IDR tree. + ElementNode + // TextNode is the type of an text/data Node in an IDR tree. + TextNode + // AttributeNode is the type of an attribute Node in an IDR tree. + AttributeNode +) + +// String converts NodeType to a string. 
+func (nt NodeType) String() string { + switch nt { + case DocumentNode: + return "DocumentNode" + case ElementNode: + return "ElementNode" + case TextNode: + return "TextNode" + case AttributeNode: + return "AttributeNode" + default: + return fmt.Sprintf("(unknown NodeType: %d)", nt) + } +} + +// Node represents a node of element/data in an IDR (intermediate data representation) ingested and created +// by the omniparser. +// Credit: this is by and large a copy and some adaptation from +// https://github.com/antchfx/xmlquery/blob/master/node.go. The reasons we want to have our own struct: +// - more stability +// - one struct to represent XML/JSON/EDI/CSV/txt/etc. Vs antchfx's work have one struct (in each repo) +// for each format. +// - Node allocation recycling. +type Node struct { + Parent, FirstChild, LastChild, PrevSibling, NextSibling *Node + + Type NodeType + Data string + + FormatSpecific interface{} +} + +// CreateNode creates a generic *Node. +func CreateNode(ntype NodeType, data string) *Node { + return &Node{ + Type: ntype, + Data: data, + } +} + +// InnerText returns a Node's children's texts concatenated. +// Note (in an XML IDR tree) none of the AttributeNode's text will be included. +func (n *Node) InnerText() string { + var s strings.Builder + var captureText func(*Node) + captureText = func(a *Node) { + switch a.Type { + case TextNode: + s.WriteString(a.Data) + default: + for child := a.FirstChild; child != nil; child = child.NextSibling { + if child.Type != AttributeNode { + captureText(child) + } + } + } + } + captureText(n) + return s.String() +} + +// AddChild adds 'n' as the new last child to 'parent'. 
+func AddChild(parent, n *Node) { + n.Parent = parent + n.NextSibling = nil + if parent.FirstChild == nil { + parent.FirstChild = n + n.PrevSibling = nil + } else { + parent.LastChild.NextSibling = n + n.PrevSibling = parent.LastChild + } + parent.LastChild = n +} + +// RemoveFromTree removes a node and its subtree from an IDR +// tree it is in. If the node is the root of the tree, it's a no-op. +func RemoveFromTree(n *Node) { + if n.Parent == nil { + return + } + if n.Parent.FirstChild == n { + if n.Parent.LastChild == n { + n.Parent.FirstChild = nil + n.Parent.LastChild = nil + } else { + n.Parent.FirstChild = n.NextSibling + n.NextSibling.PrevSibling = nil + } + } else { + if n.Parent.LastChild == n { + n.Parent.LastChild = n.PrevSibling + n.PrevSibling.NextSibling = nil + } else { + n.PrevSibling.NextSibling = n.NextSibling + n.NextSibling.PrevSibling = n.PrevSibling + } + } + n.Parent = nil + n.PrevSibling = nil + n.NextSibling = nil +} diff --git a/idr/node_test.go b/idr/node_test.go new file mode 100644 index 0000000..4c3480e --- /dev/null +++ b/idr/node_test.go @@ -0,0 +1,171 @@ +package idr + +import ( + "testing" + + "github.com/bradleyjkemp/cupaloy" + "github.com/stretchr/testify/assert" +) + +func TestNodeType_String(t *testing.T) { + assert.Equal(t, "DocumentNode", DocumentNode.String()) + assert.Equal(t, "ElementNode", ElementNode.String()) + assert.Equal(t, "TextNode", TextNode.String()) + assert.Equal(t, "AttributeNode", AttributeNode.String()) + assert.Equal(t, "(unknown NodeType: 99)", NodeType(99).String()) +} + +func findRoot(n *Node) *Node { + for ; n != nil && n.Parent != nil; n = n.Parent { + } + return n +} + +func checkPointersInTree(t *testing.T, n *Node) { + if n == nil { + return + } + if n.FirstChild != nil { + assert.True(t, n == n.FirstChild.Parent) + } + if n.LastChild != nil { + assert.True(t, n == n.LastChild.Parent) + } + checkPointersInTree(t, n.FirstChild) + // There is no need to call checkPointersInTree(t, n.LastChild) + // 
because checkPointersInTree(t, n.FirstChild) will traverse all its + // siblings to the end, and if the last one isn't n.LastChild then it will fail. + parent := n.Parent // could be nil if n is the root of a tree. + // Verify the PrevSibling chain + cur, prev := n, n.PrevSibling + for ; prev != nil; cur, prev = prev, prev.PrevSibling { + assert.True(t, prev.Parent == parent) + assert.True(t, prev.NextSibling == cur) + } + assert.True(t, cur.PrevSibling == nil) + assert.True(t, parent == nil || parent.FirstChild == cur) + // Verify the NextSibling chain + cur, next := n, n.NextSibling + for ; next != nil; cur, next = next, next.NextSibling { + assert.True(t, next.Parent == parent) + assert.True(t, next.PrevSibling == cur) + } + assert.True(t, cur.NextSibling == nil) + assert.True(t, parent == nil || parent.LastChild == cur) +} + +type testTree struct { + // + // root + // child1 child2 child3 + // grandChild11E, grandchild12E grandChild21E grandChild31A + // grandChild11T, grandchild12T grandChild21T grandChild31T + root *Node + child1, child2, child3 *Node + grandChild11E, grandChild11T *Node + grandChild12E, grandChild12T *Node + grandChild21E, grandChild21T *Node + grandChild31A, grandChild31T *Node +} + +func newTestTree(t *testing.T) *testTree { + root := CreateNode(DocumentNode, "root") + child1 := CreateNode(ElementNode, "child1") + child2 := CreateNode(ElementNode, "child2") + child3 := CreateNode(ElementNode, "child3") + grandChild11E := CreateNode(ElementNode, "grandChild11") + grandChild11T := CreateNode(TextNode, "data 11") + grandChild12E := CreateNode(ElementNode, "grandChild12") + grandChild12T := CreateNode(TextNode, "data 12") + grandChild21E := CreateNode(ElementNode, "grandChild21") + grandChild21T := CreateNode(TextNode, "data 21") + grandChild31A := CreateNode(AttributeNode, "grandChild31") + grandChild31T := CreateNode(TextNode, "attr 31") + + AddChild(root, child1) + AddChild(root, child2) + AddChild(root, child3) + AddChild(child1, 
grandChild11E) + AddChild(child1, grandChild12E) + AddChild(child2, grandChild21E) + AddChild(child3, grandChild31A) + AddChild(grandChild11E, grandChild11T) + AddChild(grandChild12E, grandChild12T) + AddChild(grandChild21E, grandChild21T) + AddChild(grandChild31A, grandChild31T) + + checkPointersInTree(t, root) + checkPointersInTree(t, child1) + checkPointersInTree(t, child2) + checkPointersInTree(t, child3) + checkPointersInTree(t, grandChild11E) + checkPointersInTree(t, grandChild12E) + checkPointersInTree(t, grandChild21E) + checkPointersInTree(t, grandChild31A) + checkPointersInTree(t, grandChild11T) + checkPointersInTree(t, grandChild12T) + checkPointersInTree(t, grandChild21T) + checkPointersInTree(t, grandChild31T) + + return &testTree{ + root: root, + child1: child1, + child2: child2, + child3: child3, + grandChild11E: grandChild11E, + grandChild12E: grandChild12E, + grandChild21E: grandChild21E, + grandChild31A: grandChild31A, + grandChild11T: grandChild11T, + grandChild12T: grandChild12T, + grandChild21T: grandChild21T, + grandChild31T: grandChild31T, + } +} + +func TestReferenceTestTreeWithJSONify1(t *testing.T) { + cupaloy.SnapshotT(t, JSONify1(newTestTree(t).root)) +} + +func TestInnerText(t *testing.T) { + tt := newTestTree(t) + assert.Equal(t, tt.grandChild11T.Data+tt.grandChild12T.Data, tt.child1.InnerText()) + assert.Equal(t, tt.grandChild11T.Data+tt.grandChild12T.Data+tt.grandChild21T.Data, tt.root.InnerText()) +} + +func TestRemoveNodeAndSubTree(t *testing.T) { + t.Run("remove a node who is its parents only child", func(t *testing.T) { + tt := newTestTree(t) + RemoveFromTree(tt.grandChild21E) + checkPointersInTree(t, tt.root) + cupaloy.SnapshotT(t, JSONify1(tt.root)) + }) + + t.Run("remove a node who is its parents first child but not the last", func(t *testing.T) { + tt := newTestTree(t) + RemoveFromTree(tt.child1) + checkPointersInTree(t, tt.root) + cupaloy.SnapshotT(t, JSONify1(tt.root)) + }) + + t.Run("remove a node who is its parents middle 
child not the first not the last", func(t *testing.T) { + tt := newTestTree(t) + RemoveFromTree(tt.child2) + checkPointersInTree(t, tt.root) + cupaloy.SnapshotT(t, JSONify1(tt.root)) + }) + + t.Run("remove a node who is its parents last child but not the first", func(t *testing.T) { + tt := newTestTree(t) + RemoveFromTree(tt.child3) + checkPointersInTree(t, tt.root) + cupaloy.SnapshotT(t, JSONify1(tt.root)) + }) + + t.Run("remove a root does nothing", func(t *testing.T) { + tt := newTestTree(t) + RemoveFromTree(tt.root) + checkPointersInTree(t, tt.root) + cupaloy.SnapshotT(t, JSONify1(tt.root)) + }) +} diff --git a/idr/xmlnode.go b/idr/xmlnode.go new file mode 100644 index 0000000..08b0798 --- /dev/null +++ b/idr/xmlnode.go @@ -0,0 +1,29 @@ +package idr + +// XMLSpecific contains XML IDR Node specific information such as namespace. +type XMLSpecific struct { + NamespacePrefix string + NamespaceURI string +} + +// IsXML checks if a Node is of XML. +func IsXML(n *Node) bool { + _, ok := n.FormatSpecific.(XMLSpecific) + return ok +} + +// XMLSpecificOf returns the XMLSpecific field of a Node. +// Note if the Node isn't of XML, this function will panic. +func XMLSpecificOf(n *Node) XMLSpecific { + if !IsXML(n) { + panic("node is not XML") + } + return n.FormatSpecific.(XMLSpecific) +} + +// CreateXMLNode creates an XML Node. 
+func CreateXMLNode(ntype NodeType, data string, xmlSpecific XMLSpecific) *Node { + n := CreateNode(ntype, data) + n.FormatSpecific = xmlSpecific + return n +} diff --git a/idr/xmlnode_test.go b/idr/xmlnode_test.go new file mode 100644 index 0000000..9c9d5cb --- /dev/null +++ b/idr/xmlnode_test.go @@ -0,0 +1,26 @@ +package idr + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestIsXML(t *testing.T) { + assert.True(t, IsXML(CreateXMLNode(DocumentNode, "", XMLSpecific{}))) + assert.True(t, IsXML(CreateXMLNode(ElementNode, "A", XMLSpecific{NamespacePrefix: "ns", NamespaceURI: "uri://"}))) + assert.True(t, IsXML(CreateXMLNode(TextNode, "text", XMLSpecific{}))) + assert.True(t, IsXML(CreateXMLNode(AttributeNode, "A", XMLSpecific{}))) + assert.False(t, IsXML(CreateNode(ElementNode, "B"))) +} + +func TestXMLSpecificOf(t *testing.T) { + assert.Equal(t, XMLSpecific{}, XMLSpecificOf(CreateXMLNode(ElementNode, "A", XMLSpecific{}))) + assert.Equal(t, + XMLSpecific{NamespacePrefix: "ns", NamespaceURI: "uri"}, + XMLSpecificOf( + CreateXMLNode(ElementNode, "A", XMLSpecific{NamespacePrefix: "ns", NamespaceURI: "uri"}))) + assert.PanicsWithValue(t, "node is not XML", func() { + XMLSpecificOf(CreateNode(ElementNode, "A")) + }) +} diff --git a/samples/omniv2/xml/2_multiple_objects.input.xml b/samples/omniv2/xml/2_multiple_objects.input.xml index 1f47c0e..e43990e 100644 --- a/samples/omniv2/xml/2_multiple_objects.input.xml +++ b/samples/omniv2/xml/2_multiple_objects.input.xml @@ -1,6 +1,6 @@ - +
Scholastic Press
@@ -13,8 +13,8 @@ 1998
Harry Potter Collection
-
- +
+
Harper & Brothers
@@ -27,5 +27,5 @@ 1900 - +
diff --git a/samples/omniv2/xml/2_multiple_objects.schema.json b/samples/omniv2/xml/2_multiple_objects.schema.json index 3ab0c52..d0a6aa7 100644 --- a/samples/omniv2/xml/2_multiple_objects.schema.json +++ b/samples/omniv2/xml/2_multiple_objects.schema.json @@ -4,7 +4,7 @@ "file_format_type": "xml" }, "transform_declarations": { - "FINAL_OUTPUT": { "xpath": "lb0:library/books", "object": { + "FINAL_OUTPUT": { "xpath": "lb0:library/lb0:books", "object": { "authors": { "array": [ { "xpath": "book/author" } ] }, "book_titles": { "array": [ { "xpath": "book/@title" } ] }, "books": { "array": [ { "xpath": "book", "template": "book_template" } ] },