-
Notifications
You must be signed in to change notification settings - Fork 43
/
indicphone.go
86 lines (70 loc) · 1.82 KB
/
indicphone.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
package indicphone
import (
"errors"
"fmt"
"slices"
"strings"
"github.com/knadh/dictpress/internal/data"
"github.com/knadh/knphone"
"gitlab.com/joice/mlphone-go"
)
// IndicPhone is a phonetic tokenizer that generates phonetic tokens for
// Indian languages. It is similar to Metaphone for English.
//
// It bundles one encoder per supported language; the methods on this type
// choose between them based on the language name they are given.
type IndicPhone struct {
// kn is the encoder used when the language is "kannada".
kn *knphone.KNphone
// ml is the encoder used when the language is "malayalam".
ml *mlphone.MLPhone
}
// New returns an IndicPhone with both its Kannada (knphone) and Malayalam
// (mlphone) encoders initialised.
func New() *IndicPhone {
	ip := new(IndicPhone)
	ip.kn = knphone.New()
	ip.ml = mlphone.New()

	return ip
}
// ToTokens phonetically encodes every space-separated chunk of s with the
// encoder for lang ("kannada" or "malayalam") and returns the result as an
// array of tsvector tokens, as formatted by data.TokensToTSVector.
// eg: [KRM0 KRM] or [KRM:2 KRM:1] with weights.
//
// Each encoded chunk contributes its three keys with weights 3, 2 and 1
// respectively. Chunks for which the encoder returns an empty first key are
// skipped. An error is returned if lang is neither "kannada" nor "malayalam".
func (ip *IndicPhone) ToTokens(s string, lang string) ([]string, error) {
	// Resolve the encoder once up front rather than on every chunk.
	var encode func(string) (string, string, string)
	switch lang {
	case "kannada":
		encode = ip.kn.Encode
	case "malayalam":
		encode = ip.ml.Encode
	default:
		return nil, errors.New("unknown language to tokenize")
	}

	words := strings.Split(s, " ")

	// Every word yields at most three weighted tokens.
	out := make([]data.Token, 0, 3*len(words))
	for _, w := range words {
		k0, k1, k2 := encode(w)
		if k0 == "" {
			// Nothing encodable in this chunk (eg: consecutive spaces).
			continue
		}

		out = append(out, data.Token{Token: k0, Weight: 3})
		out = append(out, data.Token{Token: k1, Weight: 2})
		out = append(out, data.Token{Token: k2, Weight: 1})
	}

	return data.TokensToTSVector(out), nil
}
// ToQuery phonetically encodes s with the encoder for lang ("kannada" or
// "malayalam") and returns a Postgres tsquery string built from the
// resulting keys.
//
// An empty query is returned when the encoder produces an empty first key
// for s. When every neighbouring pair of keys differs, the query is
// "key2 | key1"; otherwise it is key2 alone. An error is returned if lang is
// neither "kannada" nor "malayalam", matching the behaviour of ToTokens.
func (ip *IndicPhone) ToQuery(s string, lang string) (string, error) {
	var key0, key1, key2 string

	switch lang {
	case "kannada":
		key0, key1, key2 = ip.kn.Encode(s)
	case "malayalam":
		key0, key1, key2 = ip.ml.Encode(s)
	default:
		// Report unsupported languages instead of silently returning an
		// empty query, which is indistinguishable from "nothing to search".
		return "", errors.New("unknown language to tokenize")
	}

	if key0 == "" {
		return "", nil
	}

	// slices.Compact only drops adjacent duplicates, so three survivors
	// means key2 != key1 and key1 != key0.
	tokens := slices.Compact([]string{key2, key1, key0})
	if len(tokens) == 3 {
		return fmt.Sprintf("%s | %s", key2, key1), nil
	}

	// With any adjacent duplicate, the first surviving element is key2.
	return tokens[0], nil
}