forked from dgraph-io/tokenizer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tokenizer.go
151 lines (137 loc) · 2.83 KB
/
tokenizer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
package tokenizer
import (
"github.com/go-nlp/bpe"
"github.com/pkg/errors"
)
// Tokenizer performs BPE (byte-pair encoding) tokenization using a
// trained bpe.Encoder. The zero value is not usable; construct with
// NewTokenizer.
//
// A Tokenizer is NOT safe for concurrent use: Tokenize mutates the
// shared pairbuf and wordbuf scratch buffers.
type Tokenizer struct {
	ranks map[bpe.Pair]int // pair -> merge priority (index in enc.Pairs; lower merges first)
	enc bpe.Encoder // the trained encoder; Replacements maps pair -> merged rune
	lut map[rune]bpe.Pair // inverse of enc.Replacements: merged rune -> original pair
	// buffers reused across Tokenize calls to avoid per-call allocations
	pairbuf []bpe.Pair
	wordbuf []rune
}
// NewTokenizer builds a Tokenizer from a trained BPE encoder. It
// precomputes the pair-rank table (position in enc.Pairs, lower =
// higher merge priority) and the reverse lookup from replacement rune
// back to its pair, and allocates the reusable scratch buffers.
func NewTokenizer(enc bpe.Encoder) *Tokenizer {
	// Pre-size both maps: the final lengths are known up front.
	ranks := make(map[bpe.Pair]int, len(enc.Pairs))
	for i, p := range enc.Pairs {
		ranks[p] = i
	}
	lut := make(map[rune]bpe.Pair, len(enc.Replacements))
	for k, v := range enc.Replacements {
		lut[v] = k // invert: replacement rune -> original pair
	}
	return &Tokenizer{
		ranks: ranks,
		enc: enc,
		lut: lut,
		pairbuf: make([]bpe.Pair, 0, 256),
		wordbuf: make([]rune, 0, 512),
	}
}
// Tokenize splits a into BPE tokens. It repeatedly finds the adjacent
// pair with the best (lowest) merge rank and replaces every occurrence
// with its single replacement rune, until no known pair remains or the
// word has collapsed to one rune, then expands each surviving rune back
// to its surface string. Tokens that are a single space are dropped.
//
// Returns an error only if a ranked bigram has no entry in
// enc.Replacements (an inconsistent encoder).
//
// Not safe for concurrent use: it mutates t.pairbuf and t.wordbuf.
func (t *Tokenizer) Tokenize(a string) ([]string, error) {
	t.wordbuf = t.wordbuf[:0]
	pairs := bpe.PairsWithReuse(a, t.pairbuf)
	if len(pairs) == 0 {
		// Input too short to form a pair: it is its own single token.
		return []string{a}, nil
	}
	w := []rune(a)
	newWord := t.wordbuf
	for {
		// Choose the remaining pair with the highest merge priority.
		bigram, ok := t.minRank(pairs)
		if !ok {
			break // no remaining pair is in the vocabulary; merging is done
		}
		fst := bigram.Fst()
		snd := bigram.Snd()
		// Rebuild w into newWord, merging every (fst, snd) occurrence.
		for i := 0; i < len(w); {
			j, ok := index(w, i, fst)
			if !ok {
				// No further fst in w: copy the tail verbatim and stop.
				newWord = append(newWord, w[i:]...)
				break
			} else {
				// Copy everything before the candidate position.
				newWord = append(newWord, w[i:j]...)
				i = j
			}
			if w[i] == fst && i < len(w)-1 && w[i+1] == snd {
				// Full bigram match: emit its replacement rune instead.
				replacement, ok := t.enc.Replacements[bigram]
				if !ok {
					return nil, errors.Errorf("Cannot find replacement for the bigram %v", bigram)
				}
				newWord = append(newWord, replacement)
				i += 2
			} else {
				// fst without snd following: keep the rune unchanged.
				newWord = append(newWord, w[i])
				i++
			}
		}
		// Adopt the merged result in place (newWord is never longer than
		// w, since merges only shrink the word) and reset the scratch.
		copy(w, newWord)
		w = w[:len(newWord)]
		newWord = newWord[:0]
		if len(w) == 1 {
			break // a single rune can form no more pairs
		}
		pairs = bpe.PairsRunesWithReuse(w, t.pairbuf)
	}
	newWord = newWord[:0] // reuse the buffer
	var tokens []string // don't preallocate this. it's faster to let Go handle appends than pre-empting at this point
	for _, r := range w {
		// Expand each (possibly merged) rune back to its surface form.
		newWord = t.untokenize(r, newWord)
		if len(newWord) == 0 {
			continue
		}
		s := string(newWord)
		if s == " " {
			// Bare spaces act as separators, not tokens.
			continue
		}
		tokens = append(tokens, s)
		newWord = newWord[:0]
	}
	return tokens, nil
}
// Untokenize reconstructs the original text from a slice of tokens by
// expanding every merged rune back to the runes it stands for and
// concatenating the results.
func (t *Tokenizer) Untokenize(a []string) string {
	var out []rune
	var scratch []rune // reused expansion buffer
	for _, tok := range a {
		for _, r := range tok {
			scratch = t.untokenize(r, scratch[:0])
			out = append(out, scratch...)
		}
	}
	return string(out)
}
// untokenize appends the fully expanded surface form of a to buf and
// returns the extended buffer. A rune that stands for a merged pair is
// expanded recursively on both halves; any other rune is appended as-is.
func (t *Tokenizer) untokenize(a rune, buf []rune) []rune {
	pair, merged := t.lut[a]
	if !merged {
		return append(buf, a)
	}
	buf = t.untokenize(pair.Fst(), buf)
	return t.untokenize(pair.Snd(), buf)
}
// minRank returns the pair in ps with the lowest merge rank (the merge
// learned earliest, i.e. the highest priority). The second result is
// false when none of ps appears in the rank table.
func (t *Tokenizer) minRank(ps []bpe.Pair) (bpe.Pair, bool) {
	var best bpe.Pair
	found := false
	bestRank := len(t.ranks) + 1 // sentinel: larger than any real rank
	for _, p := range ps {
		if r, known := t.ranks[p]; known && r < bestRank {
			bestRank = r
			best = p
			found = true
		}
	}
	return best, found
}
// UTIL

// index returns the position of the first occurrence of the rune of in
// rs at or after start, and whether one was found. A start at or past
// the end of rs yields (-1, false).
func index(rs []rune, start int, of rune) (int, bool) {
	if start >= len(rs) {
		return -1, false
	}
	for i := start; i < len(rs); i++ {
		if rs[i] == of {
			return i, true
		}
	}
	return -1, false
}