diff --git a/go.mod b/go.mod index eb0a22b..a44a1cc 100644 --- a/go.mod +++ b/go.mod @@ -8,4 +8,5 @@ require ( github.com/bradleyjkemp/cupaloy v2.3.0+incompatible github.com/hashicorp/golang-lru v0.5.4 github.com/stretchr/testify v1.6.1 + golang.org/x/text v0.3.0 ) diff --git a/omniparser/parser.go b/omniparser/parser.go new file mode 100644 index 0000000..59cdd9e --- /dev/null +++ b/omniparser/parser.go @@ -0,0 +1,25 @@ +package omniparser + +import ( + "io" + + "github.com/jf-tech/omniparser/omniparser/schemaPlugin" + "github.com/jf-tech/omniparser/omniparser/transformCtx" +) + +// Parser is an interface that represents an instance of omniparser. +// One instance of Parser is associated with one and only one schema. +// The instance of Parser can be reused for parsing and transforming +// multiple input files/streams, as long as they are all intended for the +// same schema. +// Each parsing/transform, however, needs a separate instance of +// TransformOp. TransformOp must not be shared and reused across different +// input files/streams. +// While the same instance of Parser can be shared across multiple threads, +// TransformOp is not multi-thread safe. All operations on it must be done +// within the same goroutine. 
+type Parser interface { + GetTransformOp(name string, input io.Reader, ctx *transformCtx.Ctx) (TransformOp, error) + SchemaHeader() schemaPlugin.Header + SchemaRawContent() string +} diff --git a/omniparser/schemaPlugin/.snapshots/TestSupportedEncodingMappingsDump b/omniparser/schemaPlugin/.snapshots/TestSupportedEncodingMappingsDump new file mode 100644 index 0000000..f79b4d6 --- /dev/null +++ b/omniparser/schemaPlugin/.snapshots/TestSupportedEncodingMappingsDump @@ -0,0 +1,6 @@ +[ + "iso-8859-1", + "utf-8", + "windows-1252" +] + diff --git a/omniparser/schemaPlugin/header.go b/omniparser/schemaPlugin/header.go new file mode 100644 index 0000000..e3e3600 --- /dev/null +++ b/omniparser/schemaPlugin/header.go @@ -0,0 +1,50 @@ +package schemaPlugin + +import ( + "io" + + "golang.org/x/text/encoding/charmap" + + "github.com/jf-tech/omniparser/strutil" +) + +// ParserSettings defines the common header (and its JSON format) for all schemas across all schema plugins. +// It contains vital information about what schema plugin a schema wants to use, and what file format the +// input stream is of (e.g. fixed-length txt, CSV/TSV, XML, JSON, EDI, etc). +// Also optionally, it specifies the expected encoding scheme for the input streams this schema is used +// for. +type ParserSettings struct { + Version string `json:"version,omitempty"` + FileFormatType string `json:"file_format_type,omitempty"` + Encoding *string `json:"encoding,omitempty"` +} + +const ( + // EncodingUTF8 is the UTF-8 (golang's default) encoding scheme. + EncodingUTF8 = "utf-8" + // EncodingISO8859_1 is the ISO 8859-1 encoding. + EncodingISO8859_1 = "iso-8859-1" + // EncodingWindows1252 is the Windows 1252 encoding. + EncodingWindows1252 = "windows-1252" +) + +type encodingMappingFunc func(reader io.Reader) io.Reader + +// SupportedEncodingMappings maps a supported encoding name to a func that wraps an input stream reader +// with encoding specific translation. 
+var SupportedEncodingMappings = map[string]encodingMappingFunc{ + EncodingUTF8: func(r io.Reader) io.Reader { return r }, + EncodingISO8859_1: func(r io.Reader) io.Reader { return charmap.ISO8859_1.NewDecoder().Reader(r) }, + EncodingWindows1252: func(r io.Reader) io.Reader { return charmap.Windows1252.NewDecoder().Reader(r) }, +} + +// GetEncoding returns the encoding of the schema. If no encoding is specified in the schema, which is +// the most common default case, it assumes the input stream will be in UTF-8. +func (p ParserSettings) GetEncoding() string { + return strutil.StrPtrOrElse(p.Encoding, EncodingUTF8) +} + +// Header contains the common ParserSettings for all schemas. +type Header struct { + ParserSettings ParserSettings `json:"parser_settings,omitempty"` +} diff --git a/omniparser/schemaPlugin/header_test.go b/omniparser/schemaPlugin/header_test.go new file mode 100644 index 0000000..18ed65a --- /dev/null +++ b/omniparser/schemaPlugin/header_test.go @@ -0,0 +1,46 @@ +package schemaPlugin + +import ( + "io/ioutil" + "sort" + "strings" + "testing" + + "github.com/bradleyjkemp/cupaloy" + "github.com/stretchr/testify/assert" + + "github.com/jf-tech/omniparser/jsonutil" + "github.com/jf-tech/omniparser/testutil" +) + +func TestSupportedEncodingMappingsDump(t *testing.T) { + var supported []string + for k, _ := range SupportedEncodingMappings { + supported = append(supported, k) + } + sort.Strings(supported) + cupaloy.SnapshotT(t, jsonutil.BPM(supported)) +} + +func TestSupportedEncodingMappings(t *testing.T) { + for encoding, mappingFn := range SupportedEncodingMappings { + t.Run(encoding, func(t *testing.T) { + actual, err := ioutil.ReadAll(mappingFn(strings.NewReader("test"))) + assert.NoError(t, err) + assert.Equal(t, []byte("test"), actual) + }) + } +} + +func TestGetEncoding(t *testing.T) { + assert.Equal( + t, EncodingUTF8, (ParserSettings{Encoding: testutil.StrPtr(EncodingUTF8)}).GetEncoding()) + assert.Equal( + t, EncodingISO8859_1, 
+ (ParserSettings{Encoding: testutil.StrPtr(EncodingISO8859_1)}).GetEncoding()) + assert.Equal( + t, EncodingWindows1252, (ParserSettings{Encoding: testutil.StrPtr(EncodingWindows1252)}).GetEncoding()) + assert.Equal( + t, EncodingUTF8, (ParserSettings{}).GetEncoding()) + assert.Equal( + t, "whatever", (ParserSettings{Encoding: testutil.StrPtr("whatever")}).GetEncoding()) +} diff --git a/omniparser/transformCtx/ctx.go b/omniparser/transformCtx/ctx.go new file mode 100644 index 0000000..ede77d2 --- /dev/null +++ b/omniparser/transformCtx/ctx.go @@ -0,0 +1,5 @@ +package transformCtx + +// Ctx contains the context object used throughout the lifespan of a TransformOp action. +type Ctx struct { +} diff --git a/omniparser/transformOp.go b/omniparser/transformOp.go new file mode 100644 index 0000000..dc0ad89 --- /dev/null +++ b/omniparser/transformOp.go @@ -0,0 +1,13 @@ +package omniparser + +// TransformOp is an interface that represents one input stream parsing/transform operation. +// Instance of TransformOp must not be shared and reused among different input streams. +// Instance of TransformOp should not be used across multiple goroutines. +type TransformOp interface { + // Next indicates whether the parsing/transform operation is completed or not. + Next() bool + // Read returns a JSON byte slice representing one parsing/transform result. + Read() ([]byte, error) + // Parser returns the Parser from which this TransformOp is created. 
+ Parser() Parser +} diff --git a/strutil/strutil.go b/strutil/strutil.go new file mode 100644 index 0000000..78330ba --- /dev/null +++ b/strutil/strutil.go @@ -0,0 +1,8 @@ +package strutil + +func StrPtrOrElse(sp *string, orElse string) string { + if sp != nil { + return *sp + } + return orElse +} diff --git a/strutil/strutil_test.go b/strutil/strutil_test.go new file mode 100644 index 0000000..e3bd632 --- /dev/null +++ b/strutil/strutil_test.go @@ -0,0 +1,14 @@ +package strutil + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/jf-tech/omniparser/testutil" +) + +func TestStrPtrOrElse(t *testing.T) { + assert.Equal(t, "this", StrPtrOrElse(testutil.StrPtr("this"), "that")) + assert.Equal(t, "that", StrPtrOrElse(nil, "that")) +} diff --git a/testutil/testutil.go b/testutil/testutil.go new file mode 100644 index 0000000..9e44c48 --- /dev/null +++ b/testutil/testutil.go @@ -0,0 +1,13 @@ +package testutil + +// IntPtr returns an int pointer with a given value. +// Test cases needing an inline int pointer declaration can use this. +func IntPtr(n int) *int { + return &n +} + +// StrPtr returns a string pointer with a given value. +// Test cases needing an inline string pointer declaration can use this. +func StrPtr(s string) *string { + return &s +} diff --git a/testutil/testutil_test.go b/testutil/testutil_test.go new file mode 100644 index 0000000..2a7eb20 --- /dev/null +++ b/testutil/testutil_test.go @@ -0,0 +1,19 @@ +package testutil + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestIntPtr(t *testing.T) { + np := IntPtr(31415926) + assert.NotNil(t, np) + assert.Equal(t, 31415926, *np) +} + +func TestStrPtr(t *testing.T) { + sp := StrPtr("pi") + assert.NotNil(t, sp) + assert.Equal(t, "pi", *sp) +}