forked from dgraph-io/tokenizer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pipeline.go
110 lines (90 loc) · 2.2 KB
/
pipeline.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
package tokenizer
import (
"github.com/pkg/errors"
"strings"
)
/*
normalizer : s -> (s|e)
splitter : s -> ([]s | e)
tokenizer : (s|[]s) -> ([]s | e)
*/
// Input is what a pipeline Step consumes: either a single string or a
// slice of strings. Implementations that hold only one form may panic
// on the accessor for the other (see SS.String and Err).
type Input interface {
String() string
Strings() []string
}
// Result is what a pipeline Step produces: an Input plus an error
// accessor, so the next step can consume it and the pipeline driver
// can check for failure.
type Result interface {
Input
Err() error
}
// S is a single string that satisfies both Input and Result.
// It never carries an error.
type S string

// String returns the underlying string.
func (s S) String() string { return string(s) }

// Strings wraps the string in a one-element slice.
func (s S) Strings() []string {
	return []string{s.String()}
}

// Err always reports success: a single string is never an error state.
func (s S) Err() error { return nil }
// SS is a slice of strings that satisfies both Input and Result.
// An empty slice is treated as an error state by Err.
type SS []string

// String panics: a string slice has no single-string form.
func (s SS) String() string { panic("Cannot return string") }

// Strings returns the underlying slice.
func (s SS) Strings() []string { return []string(s) }

// Err reports an error when the slice is empty, nil otherwise.
func (s SS) Err() error {
	if len(s) > 0 {
		return nil
	}
	return errors.Errorf("Empty string slice")
}
// Err wraps an error so it satisfies the Result interface; the
// string accessors panic because an error has no string payload.
type Err struct{ error }

// String panics: an Err carries no string.
func (e Err) String() string { panic("Err is not a string") }

// Strings panics: an Err carries no string slice.
func (e Err) Strings() []string { panic("Err is not a string slice") }

// Err exposes the wrapped error.
func (e Err) Err() error { return e.error }
// Step is one stage of a pipeline: it transforms an Input into a
// Result. Implementations in this file: Normalizer, Tokenizer,
// Split, and Transform.
type Step interface {
Do(Input) Result
}
// Pipeline composes steps into a single function from a string to a
// slice of strings. The input string is wrapped as S and fed through
// each step in order; each step consumes the previous step's output.
// The first step that reports an error aborts the pipeline, with the
// error wrapped to identify the failing step's index. With zero steps
// the input passes through unchanged as a one-element slice.
func Pipeline(steps ...Step) func(a string) ([]string, error) {
	return func(a string) ([]string, error) {
		var in Input = S(a)
		for i, step := range steps {
			out := step.Do(in)
			if err := out.Err(); err != nil {
				return nil, errors.Wrapf(err, "in Step %d of pipeline", i)
			}
			in = out
		}
		// Returning via `in` (rather than the last Result) avoids the
		// nil-interface dereference the previous code hit when steps
		// was empty, and the post-loop Err() re-check was redundant:
		// every iteration already checks before advancing.
		return in.Strings(), nil
	}
}
// some steps in the pipeline
// Do normalizes the input's single string via n.Norm, returning the
// normalized string as S or the failure wrapped as Err.
func (n *Normalizer) Do(a Input) Result {
	normalized, err := n.Norm(a.String())
	if err != nil {
		return Err{err}
	}
	return S(normalized)
}
// Do tokenizes every string in the input and concatenates all tokens
// into one SS result. The first tokenization failure aborts and is
// returned wrapped as Err.
func (t *Tokenizer) Do(a Input) Result {
	var tokens SS
	for _, str := range a.Strings() {
		toks, err := t.Tokenize(str)
		if err != nil {
			return Err{errors.Wrap(err, "in tokenizer.Do()")}
		}
		tokens = append(tokens, toks...)
	}
	return tokens
}
// Split is a splitting function adapted into a pipeline Step: it
// breaks one string into many.
type Split func(a string) []string

// Do applies the split function to the input's single string and
// wraps the pieces as an SS result.
func (s Split) Do(a Input) Result {
	pieces := s(a.String())
	return SS(pieces)
}
// BySpace cuts a string on single space characters. Consecutive
// spaces produce empty elements, matching strings.Split semantics
// (SplitN with a negative count returns all substrings).
func BySpace(a string) []string {
	return strings.SplitN(a, " ", -1)
}
// Transform is a string-to-string function adapted into a pipeline
// Step.
type Transform func(a string) string

// Do applies the transform to the input's single string and wraps
// the result as S.
func (t Transform) Do(a Input) Result {
	transformed := t(a.String())
	return S(transformed)
}