Skip to content

Commit

Permalink
Add an option to omit BOS/EOS tokens from outputs
Browse files Browse the repository at this point in the history
  • Loading branch information
ikawaha committed Sep 30, 2020
1 parent 1dc2193 commit 99a93dc
Show file tree
Hide file tree
Showing 5 changed files with 189 additions and 139 deletions.
9 changes: 1 addition & 8 deletions README.md
Expand Up @@ -62,7 +62,7 @@ import (
)

func main() {
t, err := tokenizer.New(ipa.Dict())
t, err := tokenizer.New(ipa.Dict(), tokenizer.OmitBosEos())
if err != nil {
panic(err)
}
Expand All @@ -75,11 +75,6 @@ func main() {
fmt.Println("---tokenize---")
tokens := t.Tokenize("すもももももももものうち")
for _, token := range tokens {
if token.Class == tokenizer.DUMMY {
// BOS: Begin Of Sentence, EOS: End Of Sentence.
fmt.Printf("%s\n", token.Surface)
continue
}
features := strings.Join(token.Features(), ",")
fmt.Printf("%s\t%v\n", token.Surface, features)
}
Expand All @@ -92,15 +87,13 @@ output:
---wakati---
[すもも も もも も もも の うち]
---tokenize---
BOS
すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
も 助詞,係助詞,*,*,*,*,も,モ,モ
もも 名詞,一般,*,*,*,*,もも,モモ,モモ
も 助詞,係助詞,*,*,*,*,も,モ,モ
もも 名詞,一般,*,*,*,*,もも,モモ,モモ
の 助詞,連体化,*,*,*,*,の,ノ,ノ
うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
EOS
```

# Commands
Expand Down
33 changes: 8 additions & 25 deletions tokenizer/tokenizer.go
Expand Up @@ -38,35 +38,15 @@ const (
Search
// Extended is the experimental tokenize mode.
Extended
// BosEosID means the beginning a sentence or the end of a sentence.
// BosEosID means the beginning of a sentence (BOS) or the end of a sentence (EOS).
BosEosID = lattice.BosEosID
)

// Option represents an option for the tokenizer.
type Option func(*Tokenizer) error

// Nop represents a no operation option.
func Nop() Option {
return func(t *Tokenizer) error {
return nil
}
}

// UserDict is a tokenizer option to set a user dictionary.
func UserDict(d *dict.UserDict) Option {
return func(t *Tokenizer) error {
if d == nil {
return errors.New("empty user dictionary")
}
t.userDict = d
return nil
}
}

// Tokenizer represents morphological analyzer.
type Tokenizer struct {
dict *dict.Dict // system dictionary
userDict *dict.UserDict // user dictionary
dict *dict.Dict // system dictionary
userDict *dict.UserDict // user dictionary
omitBosEos bool // omit BOS/EOS
}

// New creates a tokenizer.
Expand Down Expand Up @@ -120,6 +100,9 @@ func (t Tokenizer) Analyze(input string, mode TokenizeMode) (tokens []Token) {
tokens = make([]Token, 0, size)
for i := range la.Output {
n := la.Output[size-1-i]
if t.omitBosEos && n.ID == BosEosID {
continue
}
tok := Token{
ID: n.ID,
Class: TokenClass(n.Class),
Expand All @@ -129,7 +112,7 @@ func (t Tokenizer) Analyze(input string, mode TokenizeMode) (tokens []Token) {
dict: t.dict,
udict: t.userDict,
}
if tok.ID == lattice.BosEosID {
if tok.ID == BosEosID {
if i == 0 {
tok.Surface = "BOS"
} else {
Expand Down
36 changes: 36 additions & 0 deletions tokenizer/tokenizer_option.go
@@ -0,0 +1,36 @@
package tokenizer

import (
"errors"

"github.com/ikawaha/kagome-dict/dict"
)

// Option represents a configuration option for the tokenizer.
// Options are passed to New and may return an error to abort construction.
type Option func(*Tokenizer) error

// Nop returns a tokenizer option that performs no operation.
func Nop() Option {
	return func(*Tokenizer) error { return nil }
}

// UserDict is a tokenizer option to set a user dictionary.
// It returns an error if the given dictionary is nil.
func UserDict(d *dict.UserDict) Option {
	return func(t *Tokenizer) error {
		if d == nil {
			return errors.New("empty user dictionary")
		}
		t.userDict = d
		return nil
	}
}

// OmitBosEos returns a tokenizer option that drops the BOS/EOS
// (begin/end of sentence) dummy tokens from analysis output.
func OmitBosEos() Option {
	return func(tkz *Tokenizer) error {
		tkz.omitBosEos = true
		return nil
	}
}
143 changes: 143 additions & 0 deletions tokenizer/tokenizer_option_test.go
@@ -0,0 +1,143 @@
package tokenizer

import (
"testing"

"github.com/ikawaha/kagome-dict/dict"
"github.com/ikawaha/kagome/v2/tokenizer/lattice"
)

const (
	// testUserDictPath is the sample user dictionary fixture used by the
	// user-dictionary tests in this file.
	testUserDictPath = "../_sample/userdict.txt"
)

// Test_AnalyzeWithUserDict checks that Normal-mode analysis resolves a
// user-dictionary entry to a single USER-class token bracketed by BOS/EOS.
func Test_AnalyzeWithUserDict(t *testing.T) {
	sysDict, err := dict.LoadDictFile(testDictPath)
	if err != nil {
		t.Fatalf("unexpected error, %v", err)
	}
	userDict, err := dict.NewUserDict(testUserDictPath)
	if err != nil {
		t.Fatalf("unexpected error, %v", err)
	}
	tkz, err := New(sysDict, UserDict(userDict))
	if err != nil {
		t.Fatalf("unexpected error, %v", err)
	}
	tokens := tkz.Analyze("関西国際空港", Normal)
	want := []Token{
		{ID: -1, Surface: "BOS"},
		{ID: 2, Surface: "関西国際空港", Start: 0, End: 6, Class: TokenClass(lattice.USER)},
		{ID: -1, Surface: "EOS", Start: 6, End: 6},
	}
	if len(tokens) != len(want) {
		t.Fatalf("got %v, expected %v", tokens, want)
	}
	for i := range tokens {
		got, exp := tokens[i], want[i]
		// Compare field by field: Token also carries dictionary pointers
		// that are irrelevant to this test.
		if got.ID != exp.ID || got.Class != exp.Class || got.Start != exp.Start || got.End != exp.End || got.Surface != exp.Surface {
			t.Errorf("got %v, expected %v", got, exp)
		}
	}
}

// Test_AnalyzeWithSearchModeWithUserDict checks that Search-mode analysis
// resolves a user-dictionary entry to a single USER-class token bracketed
// by BOS/EOS.
func Test_AnalyzeWithSearchModeWithUserDict(t *testing.T) {
	sysDict, err := dict.LoadDictFile(testDictPath)
	if err != nil {
		t.Fatalf("unexpected error, %v", err)
	}
	userDict, err := dict.NewUserDict(testUserDictPath)
	if err != nil {
		t.Fatalf("unexpected error, %v", err)
	}
	tkz, err := New(sysDict, UserDict(userDict))
	if err != nil {
		t.Fatalf("unexpected error, %v", err)
	}
	tokens := tkz.Analyze("関西国際空港", Search)
	want := []Token{
		{ID: -1, Surface: "BOS"},
		{ID: 2, Surface: "関西国際空港", Start: 0, End: 6, Class: TokenClass(lattice.USER)},
		{ID: -1, Surface: "EOS", Start: 6, End: 6},
	}
	if len(tokens) != len(want) {
		t.Fatalf("got %v, expected %v", tokens, want)
	}
	for i := range tokens {
		got, exp := tokens[i], want[i]
		// Compare field by field: Token also carries dictionary pointers
		// that are irrelevant to this test.
		if got.ID != exp.ID || got.Class != exp.Class || got.Start != exp.Start || got.End != exp.End || got.Surface != exp.Surface {
			t.Errorf("got %v, expected %v", got, exp)
		}
	}
}

// Test_AnalyzeWithExtendedModeWithUserDict checks that Extended-mode
// analysis resolves a user-dictionary entry to a single USER-class token
// bracketed by BOS/EOS.
func Test_AnalyzeWithExtendedModeWithUserDict(t *testing.T) {
	sysDict, err := dict.LoadDictFile(testDictPath)
	if err != nil {
		t.Fatalf("unexpected error, %v", err)
	}
	userDict, err := dict.NewUserDict(testUserDictPath)
	if err != nil {
		t.Fatalf("unexpected error, %v", err)
	}
	tkz, err := New(sysDict, UserDict(userDict))
	if err != nil {
		t.Fatalf("unexpected error, %v", err)
	}
	tokens := tkz.Analyze("関西国際空港", Extended)
	want := []Token{
		{ID: -1, Surface: "BOS"},
		{ID: 2, Surface: "関西国際空港", Start: 0, End: 6, Class: TokenClass(lattice.USER)},
		{ID: -1, Surface: "EOS", Start: 6, End: 6},
	}
	if len(tokens) != len(want) {
		t.Fatalf("got %v, expected %v", tokens, want)
	}
	for i := range tokens {
		got, exp := tokens[i], want[i]
		// Compare field by field: Token also carries dictionary pointers
		// that are irrelevant to this test.
		if got.ID != exp.ID || got.Class != exp.Class || got.Start != exp.Start || got.End != exp.End || got.Surface != exp.Surface {
			t.Errorf("got %v, expected %v", got, exp)
		}
	}
}

// TestTokenizer_Analyze_OmitBOSEOS checks that the OmitBosEos option
// removes the BOS/EOS dummy tokens, leaving only real tokens in the output.
func TestTokenizer_Analyze_OmitBOSEOS(t *testing.T) {
	sysDict, err := dict.LoadDictFile(testDictPath)
	if err != nil {
		t.Fatalf("unexpected error, %v", err)
	}
	tkz, err := New(sysDict, OmitBosEos())
	if err != nil {
		t.Fatalf("unexpected error, %v", err)
	}
	tokens := tkz.Analyze("関西国際空港", Normal)
	want := []Token{
		{ID: 372978, Surface: "関西国際空港", Start: 0, End: 6, Class: TokenClass(lattice.KNOWN)},
	}
	if len(tokens) != len(want) {
		t.Fatalf("got %v, expected %v", tokens, want)
	}
	for i := range tokens {
		got, exp := tokens[i], want[i]
		// Compare field by field: Token also carries dictionary pointers
		// that are irrelevant to this test.
		if got.ID != exp.ID || got.Class != exp.Class || got.Start != exp.Start || got.End != exp.End || got.Surface != exp.Surface {
			t.Errorf("got %v, expected %v", got, exp)
		}
	}
}

0 comments on commit 99a93dc

Please sign in to comment.