Skip to content

Commit

Permalink
added malayalam tokenizer plugin
Browse files Browse the repository at this point in the history
  • Loading branch information
joicemjoseph committed Oct 1, 2020
1 parent f4d87da commit 51967c4
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 0 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ build:
build-tokenizers:
# Compile the Kannada and Malayalam tokenizers.
go build -ldflags="-s -w" -buildmode=plugin -o kannada.tk tokenizers/kannada/kannada.go
go build -ldflags="-s -w" -buildmode=plugin -o malayalam.tk tokenizers/malayalam/malayalam.go

# pack-releases runs stuffbin packing on a given list of
# binaries. This is used with goreleaser for packing
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ require (
github.com/lib/pq v1.2.0
github.com/mattn/go-sqlite3 v1.10.0 // indirect
github.com/spf13/pflag v1.0.3
gitlab.com/joice/mlphone-go v0.0.0-20201001084309-2bb02984eed8
google.golang.org/appengine v1.4.0 // indirect
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect
gopkg.in/volatiletech/null.v6 v6.0.0-20170828023728-0bef4e07ae1b
Expand Down
6 changes: 6 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnIn
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.6.1 h1:hDPOHmpOpP40lSULcqw7IrRb/u7w6RpDC9399XyoNd0=
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
gitlab.com/joice/mlphone-go v0.0.0-20201001084309-2bb02984eed8 h1:+5m7ptsr40FZTM+5KeDanmT47Mf/tkIwGGlaej39KC4=
gitlab.com/joice/mlphone-go v0.0.0-20201001084309-2bb02984eed8/go.mod h1:5Dd7/l9PpLOInmiSIbpTn2DT6zOz+cSA6csICfPtVxU=
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
google.golang.org/appengine v1.4.0 h1:/wp5JvzpHIxhs/dumFmF7BXTf3Z+dd4uXta4kVyO508=
Expand All @@ -58,3 +62,5 @@ gopkg.in/volatiletech/null.v6 v6.0.0-20170828023728-0bef4e07ae1b h1:P+3+n9hUbqSD
gopkg.in/volatiletech/null.v6 v6.0.0-20170828023728-0bef4e07ae1b/go.mod h1:0LRKfykySnChgQpG3Qpk+bkZFWazQ+MMfc5oldQCwnY=
gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
41 changes: 41 additions & 0 deletions tokenizers/malayalam/malayalam.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package malayalam

import (
"fmt"

"github.com/knadh/dictmaker/search"
"gitlab.com/joice/mlphone-go"
)

// Malayalam is the Malayalam tokenizer. It wraps an mlphone encoder, which
// Tokenize uses to turn an input string into phonetic keys.
type Malayalam struct {
// ph is the mlphone encoder that Tokenize calls Encode on.
ph *mlphone.MLPhone
}

// New builds a Malayalam tokenizer backed by a freshly created mlphone
// encoder and hands it back as a search.Tokenizer. The returned error is
// always nil.
func New() (search.Tokenizer, error) {
	tk := &Malayalam{ph: mlphone.New()}
	return tk, nil
}

// ID returns the unique identifier of the Malayalam tokenizer.
//
// This previously returned "kannada" (a copy-paste leftover), which made the
// plugin indistinguishable from the Kannada tokenizer.
func (*Malayalam) ID() string {
	return "malayalam"
}

// Name returns the human-readable name of the Malayalam tokenizer.
//
// This previously returned "Kannada" (a copy-paste leftover).
func (*Malayalam) Name() string {
	return "Malayalam"
}

// Tokenize encodes a Malayalam string with mlphone and combines the three
// resulting keys into a Postgres tsquery string of the form
// "key2 | (key1 & key0) " (note the trailing space). If the first key
// returned by the encoder is empty, an empty string is returned instead.
func (ml *Malayalam) Tokenize(in string) string {
	// Query template: match the third key alone, or the second and first
	// keys together.
	const queryFmt = "%s | (%s & %s) "

	k0, k1, k2 := ml.ph.Encode(in)
	if len(k0) == 0 {
		// Nothing usable came out of the encoder.
		return ""
	}
	return fmt.Sprintf(queryFmt, k2, k1, k0)
}

0 comments on commit 51967c4

Please sign in to comment.