Adding Programmability doc. (#130)

And expose/export fileformat specific readers for non-omniparser usage.
jf-tech · Dec 12, 2020 · 6d22b60 · 6d22b60
1 parent f95defc
commit 6d22b60
Show file tree

Hide file tree

Showing 29 changed files with 452 additions and 175 deletions.
diff --git a/README.md b/README.md
@@ -22,8 +22,7 @@ is used, specially the all mighty `javascript` (and `javascript_with_context`).
 - [CSV Schema in Depth](./doc/csv_in_depth.md): everything about schemas for CSV input.
 - [Fixed-Length Schema in Depth](./doc/fixedlength_in_depth.md): everything about schemas for fixed-length (e.g. TXT)
 input
-- [JSON Schema in Depth](./doc/json_in_depth.md): everything about schemas for JSON input.
-- [XML Schema in Depth](./doc/xml_in_depth.md): everything about schemas for XML input.
+- [JSON/XML Schema in Depth](./doc/json_xml_in_depth.md): everything about schemas for JSON or XML input.
 - [EDI Schema in Depth](./doc/edi_in_depth.md): everything about schemas for EDI input.
 - [Programmability](./doc/programmability.md): Advanced techniques for using omniparser (or some of its components) in
 your code.

diff --git a/doc/gettingstarted.md b/doc/gettingstarted.md
@@ -714,6 +714,7 @@ for {
     if err == io.EOF {
         break
     }
+    if err != nil { ... }
     // output contains a []byte of the ingested and transformed record. 
 }
 ```
@@ -800,6 +801,7 @@ for {
     if err == io.EOF {
         break
     }
+    if err != nil { ... }
     // output contains a []byte of the ingested and transformed record. 
 }
 ```

diff --git a/doc/json_in_depth.md b/doc/json_in_depth.md
diff --git a/doc/json_xml_in_depth.md b/doc/json_xml_in_depth.md
@@ -0,0 +1,5 @@
+# JSON/XML Schema in "Depth" :blush:
+
+Omniparser schemas for JSON and XML inputs contain only two parts, `parser_settings` and
+`transform_declarations`, both of which we have covered in depth [here](./gettingstarted.md) and
+[here](./transforms.md).
diff --git a/doc/programmability.md b/doc/programmability.md
@@ -0,0 +1,254 @@
+* [Programmability of Omniparser](#programmability-of-omniparser)
+    * [Out\-of\-Box Basic Use Case](#out-of-box-basic-use-case)
+    * [Add A New custom\_func](#add-a-new-custom_func)
+    * [Add A New custom\_parse](#add-a-new-custom_parse)
+    * [Add A New File Format](#add-a-new-file-format)
+    * [Add A New Schema Handler](#add-a-new-schema-handler)
+    * [Put All Together](#put-all-together)
+    * [In Non\-Golang Environment](#in-non-golang-environment)
+* [Programmability of Some Components without Omniparser](#programmability-of-some-components-without-omniparser)
+    * [Functions](#functions)
+    * [IDR](#idr)
+    * [CSV Reader](#csv-reader)
+    * [Fixed\-Length Reader](#fixed-length-reader)
+    * [EDI Reader](#edi-reader)
+    * [JSON Reader](#json-reader)
+    * [XML Reader](#xml-reader)
+
+# Programmability of Omniparser
+
+There are many ways to use omniparser in your code/service/app programmatically.
+
+## Out-of-Box Basic Use Case
+
+This is covered in [Getting Started](./gettingstarted.md#using-omniparser-programmatically); it is copied
+here for completeness.
+```
+schema, err := omniparser.NewSchema("your schema name", strings.NewReader("your schema content"))
+if err != nil { ... }
+transform, err := schema.NewTransform("your input name", strings.NewReader("your input content"), &transformctx.Ctx{})
+if err != nil { ... }
+for {
+    output, err := transform.Read()
+    if err == io.EOF {
+        break
+    }
+    if err != nil { ... }
+    // output contains a []byte of the ingested and transformed record.
+}
+```
+Note this out-of-box omniparser setup contains only the `omni.2.1` schema handler, meaning only schemas
+whose `parser_settings.version` is `omni.2.1` are supported. The `omni.2.1` schema handler's supported file
+formats include: delimited (CSV, TSV, etc.), EDI, XML, JSON, fixed-length. The `omni.2.1` schema handler's
+supported built-in `custom_func`s are listed [here](./customfuncs.md).
+
+## Add A New `custom_func`
+
+If the built-in `custom_func`s aren't enough, you can add your own custom functions by
+[doing this](../extensions/omniv21/samples/customfileformats/jsonlog/sample_test.go) (note the linked
+sample does more than just adding a new `custom_func`):
+```
+schema, err := omniparser.NewSchema(
+    "your schema name",
+    strings.NewReader("your schema content"),
+    omniparser.Extension{
+        CreateSchemaHandler: omniv21.CreateSchemaHandler,
+        CustomFuncs: customfuncs.Merge(
+            customfuncs.CommonCustomFuncs,                // global custom_funcs
+            v21.OmniV21CustomFuncs,                       // omni.2.1 custom_funcs
+            customfuncs.CustomFuncs{
+                "normalize_severity": normalizeSeverity,  // <====== your own custom_funcs
+            })})
+if err != nil { ... }
+transform, err := schema.NewTransform("your input name", strings.NewReader("your input content"), &transformctx.Ctx{})
+if err != nil { ... }
+for {
+    output, err := transform.Read()
+    if err == io.EOF {
+        break
+    }
+    if err != nil { ... }
+    // output contains a []byte of the ingested and transformed record.
+}
+```
+
+Each `custom_func` must be a Golang function with the first param being `*transformctx.Ctx`. The remaining
+params can be of any type, as long as they match the types of data that are fed into the function
+in `transform_declarations`.
+
+## Add A New `custom_parse`
+
+There are several ways to customize transform logic, one of which is using the almighty `custom_func`
+`javascript` (or its sibling `javascript_with_context`), see details
+[here](./use_of_custom_funcs.md#javascript-and-javascript_with_context).
+
+However, we currently don't support multi-line javascript (yet), so complex transform logic written
+as single-line javascript is difficult to read and debug. Also, there are situations where schema
+writers want the following:
+- native Golang code transform logic
+- logging/stats
+- better/thorough test coverage
+- more complex operations like RPC calls, encryption, etc., which aren't really suited/possible for
+javascript to handle.
+
+`custom_parse` provides an in-code transform plugin mechanism. In addition to a number of built-in
+transforms, such as field, `const`, `external`, `object`, `template`, `array`, and `custom_func`,
+`custom_parse` allows schema writers to provide a Golang function that takes in the
+`*idr.Node` at the current IDR cursor (see more about IDR cursoring
+[here](./xpath.md#data-context-and-anchoring)), does whatever processing and transforms it sees
+fit, and returns the desired result to be embedded in place of the `custom_parse`.
+
+[This sample](../extensions/omniv21/samples/customparse/sample_test.go) gives a very detailed demo
+of how `custom_parse` works.
+
+## Add A New File Format
+
+While built-in `omni.2.1` schema handler already supports most popular file formats in a typical
+ETL pipeline, new file format(s) can be added into the schema handler, so it can ingest new formats
+while using the same extensible/capable transform (`transform_declarations`) logic.
+
+On a high level, a [`FileFormat`](../extensions/omniv21/fileformat/fileformat.go) is a component
+that knows how to ingest a data record, in streaming fashion, from a certain file format, and
+convert it into an `idr.Node` tree, for later processing and transform.
+
+Typically, a new [`FileFormat`](../extensions/omniv21/fileformat/fileformat.go) may require some
+additional information in a schema (usually in a `file_declaration` section), thus `omni.2.1` schema
+handler will give a new custom [`FileFormat`](../extensions/omniv21/fileformat/fileformat.go) a
+chance to validate a schema. Then the schema handler will ask
+the new [`FileFormat`](../extensions/omniv21/fileformat/fileformat.go) to create a format specific
+reader, whose job is to consume input stream, and convert each record into the IDR format.
+
+See [this example](../extensions/omniv21/samples/customfileformats) for how to add a new
+[`FileFormat`](../extensions/omniv21/fileformat/fileformat.go).
+
+## Add A New Schema Handler
+
+To complete omniparser's full extensibility picture, we allow adding completely new schema handlers,
+whether they're for major schema version upgrades that break backward-compatibility, or for brand-new
+parsing/transform paradigms. In fact, we utilize this customizability ourselves for
+integrating legacy omniparser schema support (schema versions that are older than `omni.2.1`
+and are not compatible with `omni.2.1`): take a glimpse at https://github.com/jf-tech/omniparserlegacy.
+
+## Put All Together
+
+The most canonical use case of omniparser would be a (micro)service that is part of a larger ETL
+pipeline that gets different input files/streams from different external integration influx points,
+performs schema driven (thus codeless) parsing and transform to process and standardize the inputs
+into internal formats for the later-stage loading (L) part of ETL.
+
+Because omniparser's parsing and transform is schema driven and involves little/no coding, it enables
+faster and at-scale ETL integration, possibly done by non-coding engineers or support staff:
+
+![](./resources/typical_omnipasrser_service.png)
+
+First in your service, there needs to be a schema cache component that loads and refreshes all the
+schemas from a schema repository (could be a REST API, or a database, or some storage). These schemas
+are parsed, validated (by [`omniparser.NewSchema`](../schema.go) calls) and cached.
+
+As different integration partners' input streams are coming in, the service will, based on some
+criteria, such as partner IDs, select which schema to use for a particular input. Once schema
+selection is completed, the service calls [`schema.NewTransform`](../schema.go) to create an
+instance of a transform operation for this particular input, performs the parsing and transform, and
+sends the standardized output into a later stage in the ETL pipeline.
+
+## In Non-Golang Environment
+
+Omniparser is currently only implemented in Golang (we do want to port it to other languages, at least
+Java, in the near future), so the only way to utilize it, if your service or environment is not in Golang,
+is to sidecar it, by either making it a standalone service or shell-exec'ing omniparser, both of which
+involve omniparser's CLI.
+
+Recall in [Getting Started](./gettingstarted.md#cli-command-line-interface) we demonstrated omniparser
+CLI's `transform` command. You can shell-exec it from your service. Keep in mind the following if you
+want to go down this path:
+- you will have to pre-compile the omniparser CLI binary (which needs to be platform/OS specific) and ship it with
+your service, and
+- you will need to copy down the input file locally in your service before invoking the CLI, and then
+intercept `stdout`/`stderr` from the CLI and its exit code in order to get the results.
+
+Omniparser CLI has another command `server`, which simply launches the CLI into an HTTP listening service
+that exposes a REST API:
+- `POST`
+- request `Content-Type`: `application/json`
+- request JSON:
+    ```
+    {
+        "schema": "... the schema content, required ...",
+        "input": "... the input to be parsed and transformed, required ...",
+        "properties": { ... JSON string map used for `external` transforms, optional ...}
+    }
+    ```
+Keep in mind the following if you want to go down this path:
+- you will need to host this CLI-turned omniparser service somewhere accessible to your service,
+- you lose the benefit of omniparser stream processing, which enables parsing infinitely large input,
+because now you need to send the input as a single string in the `input` field of the HTTP POST request.
+
+# Programmability of Some Components without Omniparser
+
+There are many components inside omniparser that can be useful in your code, even if you don't want to
+use omniparser as a whole for parsing and transforming input file/data. Here is a selected list of
+these components:
+
+## Functions
+
+- [`DateTimeToRFC3339()`, `DateTimeLayoutToRFC3339()`, `DateTimeToEpoch()`, `EpochToDateTimeRFC3339()`](../customfuncs/datetime.go)
+
+    Parsing and formatting date/time stamps isn't trivial at all, especially when time zones are
+    involved. These functions can be used independent of omniparser and are very useful when your
+    Golang code deals with date/time a lot.
+
+- [`JavaScript()`](../extensions/omniv21/customfuncs/javascript.go):
+
+  Omniparser uses github.com/dop251/goja as the native Golang javascript engine. Yes, you can directly
+  use `goja`, but you'll have to deal with performance related vm caching, and error handling. Instead,
+  you can directly use the `JavaScript` function.
+
+## IDR
+
+We have an in-depth [doc](./idr.md) talking about IDR, which proves to be really useful in many document
+parsing situations, even outside of omniparser realm. This `idr` package contains the IDR node/tree
+definitions, creation, caching, recycling and releasing mechanisms, serialization helpers, XPath
+assisted navigation and querying, and two powerful stream readers for JSON and XML inputs.
+
+Particularly, the [JSON](../idr/jsonreader.go)/[XML](../idr/xmlreader.go) readers are two powerful
+parsers, capable of ingesting JSON/XML data in streaming fashion assisted by XPath style target
+filtering, thus enabling processing arbitrarily large inputs.
+
+## CSV Reader
+
+Use [`NewReader()`](../extensions/omniv21/fileformat/csv/reader.go) to create a CSV reader that does
+- header column validation
+- header/data row jumping
+- XPath based data row filtering
+- Mis-escaped quote replacement
+- Context-aware error message
+
+For more reader specific settings/configurations, check
+[CSV in Depth](./csv_in_depth.md#csv-file_declaration) page.
+
+## Fixed-Length Reader
+
+Use [`NewReader()`](../extensions/omniv21/fileformat/fixedlength/reader.go) to create a fixed-length
+reader that does
+- row based or header/footer based envelope parsing
+- XPath based data row filtering
+- Context-aware error message
+
+For more reader specific settings/configurations, check
+[Fixed-Length in Depth](./fixedlength_in_depth.md) page.
+
+## EDI Reader
+
+Use [`NewReader()`](../extensions/omniv21/fileformat/edi/reader.go) to create an EDI reader that does
+- segment min/max validation
+- XPath based data row filtering
+- Context-aware error message
+
+Future TO-DO: create a version of non-validating EDI reader for users who are only interested in
+getting the raw segment data, without any validation.
+
+## JSON Reader
+See [IDR](#idr) notes about the JSON/XML readers above.
+
+## XML Reader
+See [IDR](#idr) notes about the JSON/XML readers above.
diff --git a/doc/resources/typical_omnipasrser_service.png b/doc/resources/typical_omnipasrser_service.png
diff --git a/doc/xml_in_depth.md b/doc/xml_in_depth.md
diff --git a/extensions/omniv21/fileformat/csv/decl.go b/extensions/omniv21/fileformat/csv/decl.go
@@ -4,22 +4,24 @@ import (
 	"github.com/jf-tech/go-corelib/strs"
 )
 
-type column struct {
+// Column is a CSV column.
+type Column struct {
 	Name string `json:"name"`
 	// If the CSV column 'name' contains characters (such as space, or special letters) that are
 	// not suitable for *idr.Node construction and xpath query, this gives schema writer an
 	// alternate way to name/label the column. Optional.
 	Alias *string `json:"alias"`
 }
 
-func (c column) name() string {
+func (c Column) name() string {
 	return strs.StrPtrOrElse(c.Alias, c.Name)
 }
 
-type fileDecl struct {
+// FileDecl describes CSV specific schema settings for omniparser reader.
+type FileDecl struct {
 	Delimiter           string   `json:"delimiter"`
 	ReplaceDoubleQuotes bool     `json:"replace_double_quotes"`
 	HeaderRowIndex      *int     `json:"header_row_index"`
 	DataRowIndex        int      `json:"data_row_index"`
-	Columns             []column `json:"columns"`
+	Columns             []Column `json:"columns"`
 }
diff --git a/extensions/omniv21/fileformat/csv/decl_test.go b/extensions/omniv21/fileformat/csv/decl_test.go
@@ -8,6 +8,6 @@ import (
 )
 
 func TestColumnName(t *testing.T) {
-	assert.Equal(t, "name", column{Name: "name"}.name())
-	assert.Equal(t, "alias", column{Name: "name", Alias: strs.StrPtr("alias")}.name())
+	assert.Equal(t, "name", Column{Name: "name"}.name())
+	assert.Equal(t, "alias", Column{Name: "name", Alias: strs.StrPtr("alias")}.name())
 }
diff --git a/extensions/omniv21/fileformat/csv/format.go b/extensions/omniv21/fileformat/csv/format.go
@@ -30,7 +30,7 @@ func NewCSVFileFormat(schemaName string) fileformat.FileFormat {
 }
 
 type csvFormatRuntime struct {
-	Decl  *fileDecl `json:"file_declaration"`
+	Decl  *FileDecl `json:"file_declaration"`
 	XPath string
 }
 
@@ -65,7 +65,7 @@ func (f *csvFileFormat) ValidateSchema(
 	return &runtime, nil
 }
 
-func (f *csvFileFormat) validateFileDecl(decl *fileDecl) error {
+func (f *csvFileFormat) validateFileDecl(decl *FileDecl) error {
 	// If header_row_index is specified, then it must be < data_row_index
 	if decl.HeaderRowIndex != nil && *decl.HeaderRowIndex >= decl.DataRowIndex {
 		return f.FmtErr(
@@ -78,7 +78,7 @@ func (f *csvFileFormat) validateFileDecl(decl *fileDecl) error {
 	return nil
 }
 
-func (f *csvFileFormat) validateColumns(columns []column) error {
+func (f *csvFileFormat) validateColumns(columns []Column) error {
 	namesSeen := map[string]bool{}
 	aliasesSeen := map[string]bool{}
 	for _, column := range columns {

diff --git a/extensions/omniv21/fileformat/csv/format_test.go b/extensions/omniv21/fileformat/csv/format_test.go
@@ -164,11 +164,11 @@ func TestCreateFormatReader(t *testing.T) {
 				lf("x|y")+
 				lf("4|5|6")),
 		&csvFormatRuntime{
-			Decl: &fileDecl{
+			Decl: &FileDecl{
 				Delimiter:      "|",
 				HeaderRowIndex: testlib.IntPtr(1),
 				DataRowIndex:   2,
-				Columns:        []column{{Name: "A"}, {Name: "B"}, {Name: "C"}},
+				Columns:        []Column{{Name: "A"}, {Name: "B"}, {Name: "C"}},
 			},
 			XPath: ".[A != 'x']",
 		})

diff --git a/extensions/omniv21/fileformat/csv/reader.go b/extensions/omniv21/fileformat/csv/reader.go
@@ -32,7 +32,7 @@ func IsErrInvalidHeader(err error) bool {
 
 type reader struct {
 	inputName     string
-	decl          *fileDecl
+	decl          *FileDecl
 	xpath         *xpath.Expr
 	r             *ios.LineNumReportingCsvReader
 	headerChecked bool
@@ -144,7 +144,7 @@ func (r *reader) fmtErrStr(format string, args ...interface{}) string {
 }
 
 // NewReader creates an FormatReader for CSV file format.
-func NewReader(inputName string, r io.Reader, decl *fileDecl, xpathStr string) (*reader, error) {
+func NewReader(inputName string, r io.Reader, decl *FileDecl, xpathStr string) (*reader, error) {
 	var expr *xpath.Expr
 	var err error
 	xpathStr = strings.TrimSpace(xpathStr)