diff --git a/go.mod b/go.mod index 00acfae..77837a6 100644 --- a/go.mod +++ b/go.mod @@ -6,6 +6,7 @@ require ( github.com/antchfx/xmlquery v1.3.0 github.com/antchfx/xpath v1.1.10 github.com/bradleyjkemp/cupaloy v2.3.0+incompatible + github.com/google/uuid v1.1.2 github.com/hashicorp/golang-lru v0.5.4 github.com/jf-tech/iohelper v1.0.3 github.com/stretchr/testify v1.6.1 diff --git a/go.sum b/go.sum index 9c3cf29..472da1d 100644 --- a/go.sum +++ b/go.sum @@ -8,6 +8,8 @@ github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e h1:1r7pUrabqp18hOBcwBwiTsbnFeTZHV9eER/QT5JVZxY= github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/google/uuid v1.1.2 h1:EVhdT+1Kseyi1/pUmXKaFxYsDNy9RQYkMWRH68J/W7Y= +github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/hashicorp/golang-lru v0.5.4 h1:YDjusn29QI/Das2iO9M0BHnIbxPeyuCHsjMW+lJfyTc= github.com/hashicorp/golang-lru v0.5.4/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uGBxy+E8yxSoD4= github.com/jf-tech/iohelper v1.0.3 h1:304dQL6ZKJEmDkbIivqCGrhmJTi7k7+1e2CC+WiERS4= diff --git a/omniparser/schemaplugin/omni/v2/plugin.go b/omniparser/schemaplugin/omni/v2/plugin.go index fef6025..f1072cd 100644 --- a/omniparser/schemaplugin/omni/v2/plugin.go +++ b/omniparser/schemaplugin/omni/v2/plugin.go @@ -3,7 +3,6 @@ package omniv2 import ( "github.com/jf-tech/omniparser/omniparser/errs" "github.com/jf-tech/omniparser/omniparser/schemaplugin" - "github.com/jf-tech/omniparser/omniparser/schemaplugin/omni/v2/transform" ) const ( @@ -11,11 +10,9 @@ const ( fileFormatXML = "xml" ) +// ParseSchema parses, validates and creates an omni-schema based schema plugin. 
func ParseSchema(_ *schemaplugin.ParseSchemaCtx) (schemaplugin.Plugin, error) { return nil, errs.ErrSchemaNotSupported } -type omniSchema struct { - schemaplugin.Header - Decls map[string]*transform.Decl `json:"transform_declarations"` -} +type schemaPlugin struct{} diff --git a/omniparser/schemaplugin/omni/v2/transform/.snapshots/TestValidateTransformDeclarations-success b/omniparser/schemaplugin/omni/v2/transform/.snapshots/TestValidateTransformDeclarations-success new file mode 100644 index 0000000..5b33b60 --- /dev/null +++ b/omniparser/schemaplugin/omni/v2/transform/.snapshots/TestValidateTransformDeclarations-success @@ -0,0 +1,235 @@ +{ + "object": { + "field1": { + "const": "value1", + "fqdn": "FINAL_OUTPUT.field1", + "kind": "const", + "parent": "FINAL_OUTPUT" + }, + "field2": { + "xpath_dynamic": { + "custom_func": { + "name": "test_func", + "args": [ + { + "xpath": "W/X", + "fqdn": "FINAL_OUTPUT.field2.xpath_dynamic.custom_func(test_func).arg[1]", + "kind": "field", + "parent": "(nil)" + } + ], + "fqdn": "FINAL_OUTPUT.field2.xpath_dynamic.custom_func(test_func)" + }, + "fqdn": "FINAL_OUTPUT.field2.xpath_dynamic", + "kind": "custom_func", + "children": [ + "FINAL_OUTPUT.field2.xpath_dynamic.custom_func(test_func).arg[1]" + ], + "parent": "(nil)" + }, + "fqdn": "FINAL_OUTPUT.field2", + "kind": "field", + "parent": "FINAL_OUTPUT" + }, + "field3": { + "xpath": "E/F/G", + "object": { + "field4": { + "array": [ + { + "const": "value4", + "fqdn": "FINAL_OUTPUT.field3.field4.elem[1]", + "kind": "const", + "parent": "FINAL_OUTPUT.field3.field4" + }, + { + "xpath": "H/I/J", + "fqdn": "FINAL_OUTPUT.field3.field4.elem[2]", + "kind": "field", + "parent": "FINAL_OUTPUT.field3.field4" + }, + { + "xpath": "K/L/M", + "object": { + "field5": { + "xpath": "N/O/P", + "fqdn": "FINAL_OUTPUT.field3.field4.elem[3].field5", + "kind": "field", + "parent": "FINAL_OUTPUT.field3.field4.elem[3]" + } + }, + "fqdn": "FINAL_OUTPUT.field3.field4.elem[3]", + "kind": "object", + 
"children": [ + "FINAL_OUTPUT.field3.field4.elem[3].field5" + ], + "parent": "FINAL_OUTPUT.field3.field4" + }, + { + "xpath": "1/2/3", + "object": { + "field9": { + "xpath": "4/5/6", + "fqdn": "FINAL_OUTPUT.field3.field4.elem[4].field9", + "kind": "field", + "parent": "FINAL_OUTPUT.field3.field4.elem[4]" + } + }, + "fqdn": "FINAL_OUTPUT.field3.field4.elem[4]", + "kind": "object", + "children": [ + "FINAL_OUTPUT.field3.field4.elem[4].field9" + ], + "parent": "FINAL_OUTPUT.field3.field4" + } + ], + "fqdn": "FINAL_OUTPUT.field3.field4", + "kind": "array", + "children": [ + "FINAL_OUTPUT.field3.field4.elem[1]", + "FINAL_OUTPUT.field3.field4.elem[2]", + "FINAL_OUTPUT.field3.field4.elem[3]", + "FINAL_OUTPUT.field3.field4.elem[4]" + ], + "parent": "FINAL_OUTPUT.field3" + } + }, + "fqdn": "FINAL_OUTPUT.field3", + "kind": "object", + "children": [ + "FINAL_OUTPUT.field3.field4" + ], + "parent": "FINAL_OUTPUT" + }, + "field6": { + "custom_func": { + "name": "test_func", + "args": [ + { + "xpath": "Q/R/S", + "fqdn": "FINAL_OUTPUT.field6.custom_func(test_func).arg[1]", + "kind": "field", + "parent": "FINAL_OUTPUT.field6" + }, + { + "custom_func": { + "name": "test_func", + "args": [ + { + "xpath": "W/X", + "fqdn": "FINAL_OUTPUT.field6.custom_func(test_func).arg[2].custom_func(test_func).arg[1]", + "kind": "field", + "parent": "FINAL_OUTPUT.field6.custom_func(test_func).arg[2]" + } + ], + "fqdn": "FINAL_OUTPUT.field6.custom_func(test_func).arg[2].custom_func(test_func)" + }, + "fqdn": "FINAL_OUTPUT.field6.custom_func(test_func).arg[2]", + "kind": "custom_func", + "children": [ + "FINAL_OUTPUT.field6.custom_func(test_func).arg[2].custom_func(test_func).arg[1]" + ], + "parent": "FINAL_OUTPUT.field6" + } + ], + "fqdn": "FINAL_OUTPUT.field6.custom_func(test_func)" + }, + "fqdn": "FINAL_OUTPUT.field6", + "kind": "custom_func", + "children": [ + "FINAL_OUTPUT.field6.custom_func(test_func).arg[1]", + "FINAL_OUTPUT.field6.custom_func(test_func).arg[2]" + ], + "parent": "FINAL_OUTPUT" + 
}, + "field_10": { + "xpath_dynamic": { + "const": "X/Y/Z", + "fqdn": "FINAL_OUTPUT.field_10.xpath_dynamic", + "kind": "const", + "parent": "(nil)" + }, + "object": { + "field10": { + "const": "value10", + "fqdn": "FINAL_OUTPUT.field_10.field10", + "kind": "const", + "parent": "FINAL_OUTPUT.field_10" + } + }, + "fqdn": "FINAL_OUTPUT.field_10", + "kind": "object", + "children": [ + "FINAL_OUTPUT.field_10.field10" + ], + "parent": "FINAL_OUTPUT" + }, + "field_11": { + "array": [ + { + "xpath": "T/U/V", + "fqdn": "FINAL_OUTPUT.field_11.elem[1]", + "kind": "field", + "parent": "FINAL_OUTPUT.field_11" + } + ], + "fqdn": "FINAL_OUTPUT.field_11", + "kind": "array", + "children": [ + "FINAL_OUTPUT.field_11.elem[1]" + ], + "parent": "FINAL_OUTPUT" + }, + "field_12": { + "custom_func": { + "name": "test_func", + "args": [ + { + "xpath": "W/X", + "fqdn": "FINAL_OUTPUT.field_12.custom_func(test_func).arg[1]", + "kind": "field", + "parent": "FINAL_OUTPUT.field_12" + } + ], + "fqdn": "FINAL_OUTPUT.field_12.custom_func(test_func)" + }, + "fqdn": "FINAL_OUTPUT.field_12", + "kind": "custom_func", + "children": [ + "FINAL_OUTPUT.field_12.custom_func(test_func).arg[1]" + ], + "parent": "FINAL_OUTPUT" + }, + "field_9": { + "xpath": "1/2/3", + "object": { + "field9": { + "xpath": "4/5/6", + "fqdn": "FINAL_OUTPUT.field_9.field9", + "kind": "field", + "parent": "FINAL_OUTPUT.field_9" + } + }, + "fqdn": "FINAL_OUTPUT.field_9", + "kind": "object", + "children": [ + "FINAL_OUTPUT.field_9.field9" + ], + "parent": "FINAL_OUTPUT" + } + }, + "fqdn": "FINAL_OUTPUT", + "kind": "object", + "children": [ + "FINAL_OUTPUT.field1", + "FINAL_OUTPUT.field2", + "FINAL_OUTPUT.field3", + "FINAL_OUTPUT.field6", + "FINAL_OUTPUT.field_10", + "FINAL_OUTPUT.field_11", + "FINAL_OUTPUT.field_12", + "FINAL_OUTPUT.field_9" + ], + "parent": "(nil)" +} + diff --git a/omniparser/schemaplugin/omni/v2/transform/decl.go b/omniparser/schemaplugin/omni/v2/transform/decl.go index 279220f..c1ff057 100644 --- 
a/omniparser/schemaplugin/omni/v2/transform/decl.go +++ b/omniparser/schemaplugin/omni/v2/transform/decl.go @@ -6,9 +6,11 @@ import ( "github.com/jf-tech/omniparser/strs" ) +// Kind specifies the types of omni schema's input elements. type Kind string const ( + KindUnknown Kind = "unknown" KindConst Kind = "const" KindExternal Kind = "external" KindField Kind = "field" @@ -18,6 +20,7 @@ const ( KindTemplate Kind = "template" ) +// ResultType specifies the types of omni schema's output elements. type ResultType string const ( @@ -31,9 +34,12 @@ const ( ) const ( - FinalOutput = "FINAL_OUTPUT" + // finalOutput is the special name of a Decl that is designated for the output + // for an omni schema. + finalOutput = "FINAL_OUTPUT" ) +// CustomFuncDecl is the decl for a "custom_func". type CustomFuncDecl struct { Name string `json:"name,omitempty"` Args []*Decl `json:"args,omitempty"` @@ -41,6 +47,7 @@ type CustomFuncDecl struct { fqdn string // internal; never unmarshaled from a schema. } +// MarshalJSON is the custom JSON marshaler for CustomFuncDecl. func (d CustomFuncDecl) MarshalJSON() ([]byte, error) { type Alias CustomFuncDecl return json.Marshal(&struct { @@ -64,28 +71,30 @@ func (d *CustomFuncDecl) deepCopy() *CustomFuncDecl { return dest } -// This is the struct will be unmarshaled from `transform_declarations` section of an omni schema. +// Decl is the type for omni schema's `transform_declarations` declarations. type Decl struct { - // Applicable for KindConst. + // Const indicates the input element is a const. Const *string `json:"const,omitempty"` - // Applicable for KindExternal + // External indicates the input element is from an external property. External *string `json:"external,omitempty"` - // Applicable for KindField, KindObject, KindTemplate, KindCustomFunc + // XPath specifies an xpath for an input element. 
XPath *string `json:"xpath,omitempty"` - // Applicable for KindField, KindObject, KindTemplate, KindCustomFunc + // XPathDynamic specifies a dynamically constructed xpath for an input element. XPathDynamic *Decl `json:"xpath_dynamic,omitempty"` - // Applicable for KindCustomFunc. + // CustomFunc specifies the input element is a custom function. CustomFunc *CustomFuncDecl `json:"custom_func,omitempty"` - // Applicable for KindTemplate. + // Template specifies the input element is a template. Template *string `json:"template,omitempty"` - // Applicable for KindObject. + // Object specifies the input element is an object. Object map[string]*Decl `json:"object,omitempty"` - // Applicable for KindArray. + // Array specifies the input element is an array. Array []*Decl `json:"array,omitempty"` - // Applicable for KindConst, KindExternal, KindField or KindCustomFunc. - ResultType *ResultType `json:"result_type,omitempty"` - KeepLeadingTrailingSpace bool `json:"keep_leading_trailing_space,omitempty"` - KeepEmptyOrNull bool `json:"keep_empty_or_null,omitempty"` + // ResultType specifies the desired output type of an element. + ResultType *ResultType `json:"result_type,omitempty"` + // KeepLeadingTrailingSpace specifies space trimming in string value of the output element. + KeepLeadingTrailingSpace bool `json:"keep_leading_trailing_space,omitempty"` + // KeepEmptyOrNull specifies whether or not to keep an empty/null output. + KeepEmptyOrNull bool `json:"keep_empty_or_null,omitempty"` // Internal runtime fields that are not unmarshaled from a schema. fqdn string @@ -95,6 +104,7 @@ type Decl struct { parent *Decl } +// MarshalJSON is the custom JSON marshaler for Decl. 
func (d Decl) MarshalJSON() ([]byte, error) { emptyToNil := func(s string) string { return strs.FirstNonBlank(s, "(nil)") @@ -161,7 +171,8 @@ func (d *Decl) isXPathSet() bool { return d.XPath != nil || d.XPathDynamic != nil } -// Note only deep-copy all the public fields, those internal computed fields are not copied. +// Note only deep-copy all the public fields, those internal computed fields MUST not be copied: +// see explanation in validate.go's computeDeclHash(). func (d *Decl) deepCopy() *Decl { dest := &Decl{} dest.Const = strs.CopyStrPtr(d.Const) diff --git a/omniparser/schemaplugin/omni/v2/transform/validate.go b/omniparser/schemaplugin/omni/v2/transform/validate.go new file mode 100644 index 0000000..a9e0502 --- /dev/null +++ b/omniparser/schemaplugin/omni/v2/transform/validate.go @@ -0,0 +1,277 @@ +package transform + +import ( + "encoding/json" + "fmt" + "sort" + "strings" + + "github.com/google/uuid" + + "github.com/jf-tech/omniparser/omniparser/customfuncs" + "github.com/jf-tech/omniparser/strs" +) + +type validateCtx struct { + Decls map[string]*Decl `json:"transform_declarations"` + customFuncs customfuncs.CustomFuncs + declHashes map[string]string +} + +// ValidateTransformDeclarations validates `transform_declarations` section of an omni schema and returns +// the `FINAL_OUTPUT` corresponding Decl. +func ValidateTransformDeclarations(schemaContent []byte, customFuncs customfuncs.CustomFuncs) (*Decl, error) { + var ctx validateCtx + // We did json schema validation earlier, so this unmarshal guarantees to succeed. + _ = json.Unmarshal(schemaContent, &ctx) + ctx.customFuncs = customFuncs + ctx.declHashes = map[string]string{} + + // We did json schema validation earlier, so "FINAL_OUTPUT" must exist. 
+ finalOutputDecl, err := ctx.validateDecl(finalOutput, ctx.Decls[finalOutput], []string{finalOutput}) + if err != nil { + return nil, err + } + linkParent(finalOutputDecl) + return finalOutputDecl, nil +} + +// In order to detect circular template references (e.g. template A has a reference to template B which +// has a reference to C and C has one back to A), we need to keep a template reference stack, starting +// from the root template 'FINAL_OUTPUT'. Every time we see a template, we push its name onto the stack +// and check if it has appeared before or not. +func (ctx *validateCtx) validateDecl(fqdn string, decl *Decl, templateRefStack []string) (*Decl, error) { + err := ctx.validateXPath(fqdn, decl, templateRefStack) + if err != nil { + return nil, err + } + + decl.fqdn = fqdn + + decl.kind = detectKind(decl) + switch decl.kind { + case KindObject: + err := ctx.validateObject(fqdn, decl, templateRefStack) + if err != nil { + return nil, err + } + case KindArray: + err := ctx.validateArray(fqdn, decl, templateRefStack) + if err != nil { + return nil, err + } + case KindCustomFunc: + err := ctx.validateCustomFunc(fqdn, decl, templateRefStack) + if err != nil { + return nil, err + } + case KindTemplate: + decl, err = ctx.validateTemplate(fqdn, decl, templateRefStack) + if err != nil { + return nil, err + } + } + + decl.hash = computeDeclHash(decl, ctx.declHashes) + + return decl, nil +} + +func (ctx *validateCtx) validateXPath(fqdn string, decl *Decl, templateRefStack []string) error { + if decl.XPath != nil && decl.XPathDynamic != nil { + return fmt.Errorf("'%s' cannot set both 'xpath' and 'xpath_dynamic' at the same time", fqdn) + } + + // unlike `xpath` which is a constant string, `xpath_dynamic` value comes from the computation of + // regular decl and it can be of a const/field/custom_func/template/external, so we need to parse + // and validate the decl as well. 
+ if decl.XPathDynamic != nil { + var err error + decl.XPathDynamic, err = ctx.validateDecl( + strs.BuildFQDN(fqdn, "xpath_dynamic"), decl.XPathDynamic, templateRefStack) + if err != nil { + return err + } + if decl.XPathDynamic.resultType() != ResultTypeString { + return fmt.Errorf("expected 'result_type' '%s' for '%s', but got '%s'", + ResultTypeString, decl.XPathDynamic.fqdn, decl.XPathDynamic.resultType()) + } + if !decl.XPathDynamic.isPrimitiveKind() { + return fmt.Errorf("expected primitive decl kind for '%s', but got '%s'", + decl.XPathDynamic.fqdn, decl.XPathDynamic.kind) + } + } + + return nil +} + +func (ctx *validateCtx) validateObject(fqdn string, decl *Decl, templateRefStack []string) error { + for childName, childDecl := range decl.Object { + childDecl, err := ctx.validateDecl( + strs.BuildFQDN(fqdn, childName), childDecl, templateRefStack) + if err != nil { + return err + } + decl.Object[childName] = childDecl + decl.children = append(decl.children, childDecl) + } + // Sort the `children` array for unit test snapshot stability. + // Given this schema parsing is usually done rarely in any use cases, so this sorting for testing + // shouldn't incur any major latency penalty. + if len(decl.children) > 0 { + sort.Slice(decl.children, func(i, j int) bool { return decl.children[i].fqdn < decl.children[j].fqdn }) + } + return nil +} + +func (ctx *validateCtx) validateArray(fqdn string, decl *Decl, templateRefStack []string) error { + for i, childDecl := range decl.Array { + childDecl, err := ctx.validateDecl( + strs.BuildFQDN(fqdn, fmt.Sprintf("elem[%d]", i+1)), childDecl, templateRefStack) + if err != nil { + return err + } + decl.Array[i] = childDecl + decl.children = append(decl.children, childDecl) + } + // sort the `children` array for unit test snapshot stability. 
+ if len(decl.children) > 0 { + sort.Slice(decl.children, func(i, j int) bool { return decl.children[i].fqdn < decl.children[j].fqdn }) + } + return nil +} + +func (ctx *validateCtx) validateCustomFunc(fqdn string, decl *Decl, templateRefStack []string) error { + if _, found := ctx.customFuncs[decl.CustomFunc.Name]; !found { + return fmt.Errorf("unknown custom_func '%s' on '%s'", decl.CustomFunc.Name, fqdn) + } + + decl.CustomFunc.fqdn = strs.BuildFQDN(fqdn, fmt.Sprintf("custom_func(%s)", decl.CustomFunc.Name)) + for i := 0; i < len(decl.CustomFunc.Args); i++ { + argDecl, err := ctx.validateDecl( + strs.BuildFQDN(decl.CustomFunc.fqdn, fmt.Sprintf("arg[%d]", i+1)), + decl.CustomFunc.Args[i], + templateRefStack) + if err != nil { + return err + } + if argDecl.resultType() != ResultTypeString && argDecl.resultType() != ResultTypeArray { + return fmt.Errorf("expected 'result_type' '%s' or '%s' for '%s', but got '%s'", + ResultTypeString, ResultTypeArray, argDecl.fqdn, argDecl.resultType()) + } + if !argDecl.isPrimitiveKind() && argDecl.kind != KindArray { + return fmt.Errorf( + "expected primitive decl or array kind for '%s', but got '%s'", argDecl.fqdn, argDecl.kind) + } + decl.CustomFunc.Args[i] = argDecl + decl.children = append(decl.children, argDecl) + } + return nil +} + +func (ctx *validateCtx) validateTemplate(fqdn string, decl *Decl, templateRefStack []string) (*Decl, error) { + templateName := *decl.Template + templateDecl, found := ctx.Decls[templateName] + if !found { + return nil, fmt.Errorf( + "'%s' contains non-existing template reference '%s'", fqdn, templateName) + } + + // need to make a copy otherwise slice is passed by reference and append might alter + // the slice in place. 
+ templateRefStack = append(strs.CopySlice(templateRefStack), templateName) + if strs.HasDup(templateRefStack) { + return nil, fmt.Errorf("template circular dependency detected on '%s': %s", + fqdn, strings.Join( + strs.NoErrMapSlice(templateRefStack, func(s string) string { return "'" + s + "'" }), + "->")) + } + + // Make a copy in case the template is referenced in multiple places. + declNew := templateDecl.deepCopy() + // between the template site and the template itself, there can only be one decl with xpath/xpath_dynamic set. + if declNew.isXPathSet() && decl.isXPathSet() { + return nil, fmt.Errorf( + "cannot specify 'xpath' or 'xpath_dynamic' on both '%s' and the template '%s' it references", + fqdn, templateName) + } + if decl.isXPathSet() { + declNew.XPath = decl.XPath + declNew.XPathDynamic = decl.XPathDynamic + } + + return ctx.validateDecl(fqdn, declNew, templateRefStack) +} + +func detectKind(decl *Decl) Kind { + switch { + case decl.Const != nil: + return KindConst + case decl.External != nil: + return KindExternal + case decl.CustomFunc != nil: + return KindCustomFunc + case decl.Object != nil: + return KindObject + case decl.Array != nil: + return KindArray + case decl.Template != nil: + return KindTemplate + // `xpath` or `xpath_dynamic` if used alone indicates this element in the schema is a field. + case decl.XPath != nil || decl.XPathDynamic != nil: + return KindField + default: + return KindUnknown + } +} + +func computeDeclHash(decl *Decl, declHashes map[string]string) string { + // We'd like to create a stable encoding of a decl then we can use it to lookup + // in declHashes. If we find an existing entry, then use that entry's hash id as + // the decl's hash. If we don't then create a new hash id for it and save into + // declHashes. + // + // The key and difficulty is to have a STABLE encoding of a decl. Remember in + // golang, the order of enumerating a map is non-deterministic, that makes the + // problem somewhat hard. 
Luckily, we have json.Marshal, which according to golang + // sorts the map's keys. So using json.Marshal gives us a stable encoding of a + // struct. + // + // However, here is a problem. Decl has all the regular exported fields, as well + // as those unexported runtime computation fields (such as `fqdn`, `kind`, even + // `hash` itself). Now we're facing a dilemma: + // - if we stick with the standard/built-in json marshaler for Decl, then those + // "hidden" computation fields won't be marshaled and that's bad for our unit + // test snapshots, in which we really want to see and ensure those computation + // fields' correctness. + // - but if we do a custom json marshaler to include those computation fields, + // then here when we try to marshal a Decl into a string, that string would + // include those "hidden" fields. Guess what? Including `fqdn` in a Decl's + // encoding would make any two Decls unique no matter how identical they look, + // thus defeating the purpose of this decl hash computation. + // + // In golang we can't really switch json marshaler easily for a given type at + // runtime (otherwise it would be easy: in unit tests, use custom json marshaler + // to include those hidden fields; and in production code, use standard json + // marshaler to ignore them). Solution: + // - define the custom json marshaler for Decl to include those hidden fields + // so unit tests are happy. + // - here we first use deepCopy() to make a copy of the input Decl. Note + // deepCopy() only copies the public/exported fields. + // - then use json marshaler to encode the new Decl copy into a stable json str. 
+ b, _ := json.Marshal(decl.deepCopy()) + declJson := string(b) + if hash, found := declHashes[declJson]; found { + return hash + } + declHash := uuid.New().String() + declHashes[declJson] = declHash + return declHash +} + +func linkParent(decl *Decl) { + for _, child := range decl.children { + child.parent = decl + linkParent(child) + } +} diff --git a/omniparser/schemaplugin/omni/v2/transform/validate_test.go b/omniparser/schemaplugin/omni/v2/transform/validate_test.go new file mode 100644 index 0000000..0eaa557 --- /dev/null +++ b/omniparser/schemaplugin/omni/v2/transform/validate_test.go @@ -0,0 +1,333 @@ +package transform + +import ( + "testing" + + "github.com/bradleyjkemp/cupaloy" + "github.com/stretchr/testify/assert" + + "github.com/jf-tech/omniparser/jsons" + "github.com/jf-tech/omniparser/omniparser/customfuncs" + "github.com/jf-tech/omniparser/strs" + "github.com/jf-tech/omniparser/testlib" +) + +func TestValidateTransformDeclarations(t *testing.T) { + for _, test := range []struct { + name string + transformDeclarationsJson string + expectedErr string + }{ + { + name: "success", + transformDeclarationsJson: ` { + "transform_declarations": { + "FINAL_OUTPUT": { "object": { + "field1": { "const": "value1" }, + "field2": { "xpath_dynamic": { "template": "template12" } }, + "field3": { "xpath": "E/F/G", "object": { + "field4": { "array": [ + { "const": "value4" }, + { "xpath": "H/I/J" }, + { "xpath": "K/L/M", "object": { + "field5": { "xpath": "N/O/P" } + }}, + { "template": "template9" } + ]} + }}, + "field6": { + "custom_func": { + "name": "test_func", + "args": [ + { "xpath": "Q/R/S" }, + { "template": "template12" } + ] + } + }, + "field_9": { "template": "template9" }, + "field_10": { "xpath_dynamic": { "const": "X/Y/Z" }, "template": "template10" }, + "field_11": { "template": "template11" }, + "field_12": { "template": "template12" } + }}, + "template9": { "xpath": "1/2/3", "object": { + "field9": { "xpath": "4/5/6" } + }}, + "template10": { 
"object": { + "field10": { "const": "value10" } + }}, + "template11": { "array": [ + { "xpath": "T/U/V" } + ]}, + "template12": { "custom_func": { + "name": "test_func", + "args": [ { "xpath": "W/X" } ] + }} + } + }`, + expectedErr: "", + }, + { + name: "failure - xpath and xpath_dynamic specified at the same time", + transformDeclarationsJson: `{ + "transform_declarations": { + "FINAL_OUTPUT": { "object": { + "field1": { "xpath": "A/B/C", "xpath_dynamic": { "const": "E/F/G" } } + }} + } + }`, + expectedErr: "'FINAL_OUTPUT.field1' cannot set both 'xpath' and 'xpath_dynamic' at the same time", + }, + { + name: "failure - xpath_dynamic validate fails: custom_func non-existing", + transformDeclarationsJson: `{ + "transform_declarations": { + "FINAL_OUTPUT": { "object": { + "field1": { "xpath_dynamic": { "custom_func": { + "name": "non-existing", + "args": [] + }}} + }} + } + }`, + expectedErr: "unknown custom_func 'non-existing' on 'FINAL_OUTPUT.field1.xpath_dynamic'", + }, + { + name: "failure - xpath_dynamic result_type not string", + transformDeclarationsJson: `{ + "transform_declarations": { + "FINAL_OUTPUT": { "object": { + "field1": { "xpath_dynamic": { "const": "123", "result_type": "int" } } + }} + } + }`, + expectedErr: "expected 'result_type' 'string' for 'FINAL_OUTPUT.field1.xpath_dynamic', but got 'int'", + }, + { + name: "failure - xpath_dynamic kind not primitive", + transformDeclarationsJson: `{ + "transform_declarations": { + "FINAL_OUTPUT": { "object": { + "field1": { "xpath_dynamic": { "template": "template1" } } + }}, + "template1": { "result_type": "string", "object": { + "field2": { "const": "123" }, + "field3" : { "external": "something" } + }} + } + }`, + expectedErr: "expected primitive decl kind for 'FINAL_OUTPUT.field1.xpath_dynamic', but got 'object'", + }, + { + name: "failure - object template invalid", + transformDeclarationsJson: ` { + "transform_declarations": { + "FINAL_OUTPUT": { "object": { + "field_12": { "template": "template12" } 
+ }} + } + }`, + expectedErr: "'FINAL_OUTPUT.field_12' contains non-existing template reference 'template12'", + }, + { + name: "failure - array template invalid", + transformDeclarationsJson: ` { + "transform_declarations": { + "FINAL_OUTPUT": { "object": { + "field4": { "array": [ + { "template": "template12" } + ]} + }} + } + }`, + expectedErr: "'FINAL_OUTPUT.field4.elem\\[1\\]' contains non-existing template reference 'template12'", + }, + { + name: "failure - custom_func arg decl validation failure", + transformDeclarationsJson: `{ + "transform_declarations": { + "FINAL_OUTPUT": { "custom_func": { + "name": "test_func", + "args": [ { "template": "huh" } ] + }} + } + }`, + expectedErr: "'FINAL_OUTPUT.custom_func\\(test_func\\).arg\\[1\\]' contains non-existing template reference 'huh'", + }, + { + name: "failure - custom_func arg decl result_type not string or array", + transformDeclarationsJson: `{ + "transform_declarations": { + "FINAL_OUTPUT": { "custom_func": { + "name": "test_func", + "args": [ { "const": "true", "result_type": "boolean" } ] + }} + } + }`, + expectedErr: "expected 'result_type' 'string' or 'array' for 'FINAL_OUTPUT.custom_func\\(test_func\\).arg\\[1\\]', but got 'boolean'", + }, + { + name: "failure - custom_func arg decl kind not primitive", + transformDeclarationsJson: `{ + "transform_declarations": { + "FINAL_OUTPUT": { "custom_func": { + "name": "test_func", + "args": [ { "result_type": "string", "object": {} } ] + }} + } + }`, + expectedErr: "expected primitive decl or array kind for 'FINAL_OUTPUT.custom_func\\(test_func\\).arg\\[1\\]', but got 'object'", + }, + { + name: "failure - circular template ref", + transformDeclarationsJson: ` { + "transform_declarations": { + "FINAL_OUTPUT": { "object": { + "field_1": { "template": "template1" } + }}, + "template1": { "object": { + "field_2": { "template": "template2" } + }}, + "template2": { "object": { + "field_3": { "template": "template3" } + }}, + "template3": { "object": { + 
"field_circular": { "template": "template1" } + }} + } + }`, + expectedErr: "template circular dependency detected on 'FINAL_OUTPUT.field_1.field_2.field_3.field_circular': 'FINAL_OUTPUT'->'template1'->'template2'->'template3'->'template1'", + }, + { + name: "failure - xpath conflict for template reference", + transformDeclarationsJson: ` { + "transform_declarations": { + "FINAL_OUTPUT": { "object": { + "field_1": { "xpath": "abc", "template": "template1" } + }}, + "template1": { "xpath_dynamic": { "const": "efg" }, "object": { + "field_2": { "template": "template2" } + }} + } + }`, + expectedErr: "cannot specify 'xpath' or 'xpath_dynamic' on both 'FINAL_OUTPUT.field_1' and the template 'template1' it references", + }, + } { + t.Run(test.name, func(t *testing.T) { + finalOutputDecl, err := ValidateTransformDeclarations( + []byte(test.transformDeclarationsJson), + customfuncs.CustomFuncs{ + "test_func": func() {}, + }) + switch { + case strs.IsStrNonBlank(test.expectedErr): + assert.Error(t, err) + assert.Regexp(t, test.expectedErr, err.Error()) + assert.Nil(t, finalOutputDecl) + default: + assert.NoError(t, err) + cupaloy.SnapshotT(t, jsons.BPM(finalOutputDecl)) + } + }) + } +} + +func TestDetectKind(t *testing.T) { + for _, test := range []struct { + name string + decl *Decl + expectedKind Kind + }{ + { + name: "const", + decl: &Decl{Const: testlib.StrPtr("test")}, + expectedKind: KindConst, + }, + { + name: "external", + decl: &Decl{External: testlib.StrPtr("test")}, + expectedKind: KindExternal, + }, + { + name: "custom func", + decl: &Decl{CustomFunc: &CustomFuncDecl{Name: "test"}}, + expectedKind: KindCustomFunc, + }, + { + name: "object with empty map", + decl: &Decl{XPath: testlib.StrPtr("test"), Object: map[string]*Decl{}}, + expectedKind: KindObject, + }, + { + name: "object with non-empty map", + decl: &Decl{ + XPathDynamic: &Decl{}, + Object: map[string]*Decl{"a": {Const: testlib.StrPtr("test")}}, + }, + expectedKind: KindObject, + }, + { + name: 
"array", + decl: &Decl{ + Array: []*Decl{{Const: testlib.StrPtr("test")}}, + }, + expectedKind: KindArray, + }, + { + name: "template", + decl: &Decl{XPath: testlib.StrPtr("test"), Template: testlib.StrPtr("test")}, + expectedKind: KindTemplate, + }, + { + name: "field with xpath", + decl: &Decl{XPath: testlib.StrPtr("test")}, + expectedKind: KindField, + }, + { + name: "field with xpath_dynamic", + decl: &Decl{XPathDynamic: &Decl{}}, + expectedKind: KindField, + }, + { + name: "unknown", + decl: &Decl{}, + expectedKind: KindUnknown, + }, + } { + t.Run(test.name, func(t *testing.T) { + actualKind := detectKind(test.decl) + assert.Equal(t, test.expectedKind, actualKind) + }) + } +} + +func TestComputeDeclHash(t *testing.T) { + decl1 := &Decl{ + Object: map[string]*Decl{ + "field3": {Const: testlib.StrPtr("const"), kind: KindConst, fqdn: "root.field3", hash: "h3"}, + "field1": {External: testlib.StrPtr("external"), kind: KindExternal, fqdn: "root.field1", hash: "h1"}, + "field2": {Template: testlib.StrPtr("template"), kind: KindTemplate, fqdn: "root.field2", hash: "h2"}, + }, + kind: KindObject, + fqdn: "root", + hash: "h0", + } + decl1.children = []*Decl{decl1.Object["field3"], decl1.Object["field1"], decl1.Object["field2"]} + decl1.Object["field1"].parent = decl1 + decl1.Object["field2"].parent = decl1 + decl1.Object["field3"].parent = decl1 + + assert.Equal(t, "root", decl1.fqdn) + + declHashes := map[string]string{} + h0 := computeDeclHash(decl1, declHashes) + assert.Equal(t, 1, len(declHashes)) + + decl1Copy := decl1.deepCopy() + assert.Equal(t, "", decl1Copy.fqdn) + + h0prime := computeDeclHash(decl1Copy, declHashes) + assert.Equal(t, 1, len(declHashes)) + assert.Equal(t, h0, h0prime) + + assert.NotEqual(t, jsons.BPM(decl1), jsons.BPM(decl1Copy)) +} diff --git a/omniparser/schemaplugin/plugin.go b/omniparser/schemaplugin/plugin.go index e15048d..814de64 100644 --- a/omniparser/schemaplugin/plugin.go +++ b/omniparser/schemaplugin/plugin.go @@ -8,6 +8,7 @@ import 
( "github.com/jf-tech/omniparser/omniparser/transformctx" ) +// ParseSchemaCtx is a context object used by schema plugins during schema parsing. type ParseSchemaCtx struct { Name string Header Header diff --git a/strs/strs.go b/strs/strs.go index eec4e9c..ec60831 100644 --- a/strs/strs.go +++ b/strs/strs.go @@ -39,3 +39,72 @@ func CopyStrPtr(sp *string) *string { s := *sp return &s } + +const ( + // FQDNDelimiter is the default FQDN delimiter. + FQDNDelimiter = "." +) + +// BuildFQDN builds an FQDN from a slice of namelet strings. +func BuildFQDN(namelets ...string) string { + return BuildFQDN2(FQDNDelimiter, namelets...) +} + +// BuildFQDN2 builds an FQDN from a slice of namelet strings and a given delimiter. +func BuildFQDN2(delimiter string, namelets ...string) string { + return strings.Join(namelets, delimiter) +} + +// CopySlice copies a string slice. The returned slice is guaranteed to be a different +// slice (thus the name Copy) so modifying the src from the caller side won't affect +// the returned slice. +func CopySlice(src []string) []string { + return MergeSlices(src, nil) +} + +// MergeSlices returns a new slice with two input slice content merged together. The result +// is guaranteed to be a new slice thus modifying a or b from the caller side won't affect +// the returned slice. +func MergeSlices(a, b []string) []string { + return append(append([]string(nil), a...), b...) +} + +// HasDup detects whether there are duplicates existing in the src slice. +func HasDup(src []string) bool { + seen := map[string]bool{} + for _, v := range src { + if _, found := seen[v]; found { + return true + } + seen[v] = true + } + return false +} + +// MapSlice returns a new string slice whose element is transformed from input slice's +// corresponding element by a transform func. If any error occurs during any transform, +// returned slice will be nil together with the error. 
+func MapSlice(src []string, f func(string) (string, error)) ([]string, error) { + if len(src) == 0 { + return nil, nil + } + result := make([]string, len(src)) + for i := 0; i < len(src); i++ { + s, err := f(src[i]) + if err != nil { + return nil, err + } + result[i] = s + } + return result, nil +} + +// NoErrMapSlice returns a new string slice whose element is transformed from input slice's +// corresponding element by a transform func. The transform func must not fail and NoErrMapSlice +// guarantees to succeed. +func NoErrMapSlice(src []string, f func(string) string) []string { + result, _ := MapSlice(src, func(s string) (string, error) { + return f(s), nil + }) + return result +} diff --git a/strs/strs_test.go b/strs/strs_test.go index 193b0d9..e2981ec 100644 --- a/strs/strs_test.go +++ b/strs/strs_test.go @@ -1,6 +1,7 @@ package strs import ( + "errors" "fmt" "testing" @@ -63,3 +64,217 @@ func TestCopyStrPtr(t *testing.T) { assert.Equal(t, *src, *dst) assert.True(t, fmt.Sprintf("%p", src) != fmt.Sprintf("%p", dst)) } + +func TestBuildFQDN(t *testing.T) { + for _, test := range []struct { + name string + namelets []string + expected string + }{ + { + name: "nil", + namelets: nil, + expected: "", + }, + { + name: "empty", + namelets: []string{}, + expected: "", + }, + { + name: "single", + namelets: []string{"one"}, + expected: "one", + }, + { + name: "multiple", + namelets: []string{"one", "", "three", "four"}, + expected: "one..three.four", + }, + } { + t.Run(test.name, func(t *testing.T) { + assert.Equal(t, test.expected, BuildFQDN(test.namelets...)) + }) + } +} + +func TestCopySlice(t *testing.T) { + for _, test := range []struct { + name string + input []string + expectedOutput []string + }{ + { + name: "nil", + input: nil, + expectedOutput: nil, + }, + { + name: "empty slice", + input: []string{}, + expectedOutput: nil, + }, + { + name: "non-empty slice", + input: []string{"abc", ""}, + expectedOutput: []string{"abc", ""}, + }, + } { + t.Run(test.name, 
func(t *testing.T) { + cp := CopySlice(test.input) + // First make sure the copy contains what's expected. + assert.Equal(t, test.expectedOutput, cp) + if len(test.input) >= 2 { + // Second test if modifying the original won't affect the copy + // (that's what this copy func is all about) + test.input[0] = test.input[1] + assert.NotEqual(t, test.input, cp) + } + }) + } +} + +func TestMergeSlices(t *testing.T) { + for _, test := range []struct { + name string + slice1 []string + slice2 []string + expected []string + }{ + { + name: "both nil", + slice1: nil, + slice2: nil, + expected: nil, + }, + { + name: "1 nil, 2 not nil", + slice1: nil, + slice2: []string{"", "abc"}, + expected: []string{"", "abc"}, + }, + { + name: "1 not nil, 2 nil", + slice1: []string{"abc", ""}, + slice2: nil, + expected: []string{"abc", ""}, + }, + { + name: "both not nil", + slice1: []string{"abc", ""}, + slice2: []string{"", "abc"}, + expected: []string{"abc", "", "", "abc"}, + }, + } { + t.Run(test.name, func(t *testing.T) { + merged := MergeSlices(test.slice1, test.slice2) + // also very importantly to make sure the resulting merged is a new copy so modifying + // the input slices won't affect the merged slice. 
+ if len(test.slice1) > 0 { + test.slice1[0] = "modified" + } + if len(test.slice2) > 0 { + test.slice2[0] = "modified" + } + assert.Equal(t, test.expected, merged) + }) + } +} + +func TestHasDup(t *testing.T) { + for _, test := range []struct { + name string + input []string + expected bool + }{ + { + name: "nil", + input: nil, + expected: false, + }, + { + name: "empty slice", + input: []string{}, + expected: false, + }, + { + name: "non-empty slice with no dups", + input: []string{"abc", ""}, + expected: false, + }, + { + name: "non-empty slice with dups", + input: []string{"", "abc", ""}, + expected: true, + }, + } { + t.Run(test.name, func(t *testing.T) { + assert.Equal(t, test.expected, HasDup(test.input)) + }) + } +} + +func TestMapSlice(t *testing.T) { + t.Run("map error", func(t *testing.T) { + errorMap := func(_ string) (string, error) { + return "abc", errors.New("map error") + } + result, err := MapSlice([]string{"abc", ""}, errorMap) + assert.Error(t, err) + assert.Equal(t, "map error", err.Error()) + assert.Nil(t, result) + }) + + t.Run("map success", func(t *testing.T) { + input := []string{"abc", ""} + index := 0 + mirrorMap := func(_ string) (string, error) { + index++ + return input[len(input)-index], nil + } + result, err := MapSlice(input, mirrorMap) + assert.NoError(t, err) + assert.Equal(t, []string{"", "abc"}, result) + }) + + t.Run("map nil", func(t *testing.T) { + result, err := MapSlice(nil, func(s string) (string, error) { + return s + "...", nil + }) + assert.NoError(t, err) + assert.Nil(t, result) + }) +} + +func TestNoErrMapSlice(t *testing.T) { + for _, test := range []struct { + name string + input []string + expected []string + }{ + { + name: "nil", + input: nil, + expected: nil, + }, + { + name: "empty slice", + input: []string{}, + expected: nil, + }, + { + name: "non-empty slice", + input: []string{"abc", ""}, + expected: []string{"", "abc"}, + }, + } { + t.Run(test.name, func(t *testing.T) { + index := 0 + mirrorMap := func(s 
string) string { + index++ + return test.input[len(test.input)-index] + } + assert.Equal(t, test.expected, NoErrMapSlice(test.input, mirrorMap)) + }) + } +}