Skip to content

Commit

Permalink
Add an option to omit BOS/EOS tokens from outputs
Browse files Browse the repository at this point in the history
  • Loading branch information
ikawaha committed Sep 30, 2020
1 parent 1dc2193 commit 99a93dc
Show file tree
Hide file tree
Showing 5 changed files with 189 additions and 139 deletions.
9 changes: 1 addition & 8 deletions README.md
Expand Up @@ -62,7 +62,7 @@ import (
)

func main() {
t, err := tokenizer.New(ipa.Dict())
t, err := tokenizer.New(ipa.Dict(), tokenizer.OmitBosEos())
if err != nil {
panic(err)
}
Expand All @@ -75,11 +75,6 @@ func main() {
fmt.Println("---tokenize---")
tokens := t.Tokenize("すもももももももものうち")
for _, token := range tokens {
if token.Class == tokenizer.DUMMY {
// BOS: Begin Of Sentence, EOS: End Of Sentence.
fmt.Printf("%s\n", token.Surface)
continue
}
features := strings.Join(token.Features(), ",")
fmt.Printf("%s\t%v\n", token.Surface, features)
}
Expand All @@ -92,15 +87,13 @@ output:
---wakati---
[すもも も もも も もも の うち]
---tokenize---
BOS
すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
も 助詞,係助詞,*,*,*,*,も,モ,モ
もも 名詞,一般,*,*,*,*,もも,モモ,モモ
も 助詞,係助詞,*,*,*,*,も,モ,モ
もも 名詞,一般,*,*,*,*,もも,モモ,モモ
の 助詞,連体化,*,*,*,*,の,ノ,ノ
うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
EOS
```

# Commands
Expand Down
33 changes: 8 additions & 25 deletions tokenizer/tokenizer.go
Expand Up @@ -38,35 +38,15 @@ const (
Search
// Extended is the experimental tokenize mode.
Extended
// BosEosID means the beginning a sentence or the end of a sentence.
// BosEosID means the beginning of a sentence (BOS) or the end of a sentence (EOS).
BosEosID = lattice.BosEosID
)

// Option represents an option for the tokenizer.
type Option func(*Tokenizer) error

// Nop represents a no operation option.
func Nop() Option {
return func(t *Tokenizer) error {
return nil
}
}

// UserDict is a tokenizer option to set a user dictionary.
func UserDict(d *dict.UserDict) Option {
return func(t *Tokenizer) error {
if d == nil {
return errors.New("empty user dictionary")
}
t.userDict = d
return nil
}
}

// Tokenizer represents morphological analyzer.
type Tokenizer struct {
dict *dict.Dict // system dictionary
userDict *dict.UserDict // user dictionary
dict *dict.Dict // system dictionary
userDict *dict.UserDict // user dictionary
omitBosEos bool // omit BOS/EOS
}

// New creates a tokenizer.
Expand Down Expand Up @@ -120,6 +100,9 @@ func (t Tokenizer) Analyze(input string, mode TokenizeMode) (tokens []Token) {
tokens = make([]Token, 0, size)
for i := range la.Output {
n := la.Output[size-1-i]
if t.omitBosEos && n.ID == BosEosID {
continue
}
tok := Token{
ID: n.ID,
Class: TokenClass(n.Class),
Expand All @@ -129,7 +112,7 @@ func (t Tokenizer) Analyze(input string, mode TokenizeMode) (tokens []Token) {
dict: t.dict,
udict: t.userDict,
}
if tok.ID == lattice.BosEosID {
if tok.ID == BosEosID {
if i == 0 {
tok.Surface = "BOS"
} else {
Expand Down
36 changes: 36 additions & 0 deletions tokenizer/tokenizer_option.go
@@ -0,0 +1,36 @@
package tokenizer

import (
"errors"

"github.com/ikawaha/kagome-dict/dict"
)

// Option represents a configuration option for the tokenizer.
// Options are passed to New and may return an error to abort construction.
type Option func(*Tokenizer) error

// Nop returns a tokenizer option that performs no operation.
func Nop() Option {
	return func(*Tokenizer) error { return nil }
}

// UserDict is a tokenizer option to set a user dictionary.
// It returns an error if the given dictionary is nil.
func UserDict(d *dict.UserDict) Option {
	return func(t *Tokenizer) error {
		if d == nil {
			return errors.New("empty user dictionary")
		}
		t.userDict = d
		return nil
	}
}

// OmitBosEos returns a tokenizer option that drops the BOS/EOS
// (begin/end of sentence) dummy tokens from analysis output.
func OmitBosEos() Option {
	return func(tkz *Tokenizer) error {
		tkz.omitBosEos = true
		return nil
	}
}
143 changes: 143 additions & 0 deletions tokenizer/tokenizer_option_test.go
@@ -0,0 +1,143 @@
package tokenizer

import (
"testing"

"github.com/ikawaha/kagome-dict/dict"
"github.com/ikawaha/kagome/v2/tokenizer/lattice"
)

const (
	// testUserDictPath is the sample user dictionary fixture used by the
	// user-dictionary tests in this file.
	testUserDictPath = "../_sample/userdict.txt"
)

// Test_AnalyzeWithUserDict checks that Normal-mode analysis resolves a
// user-dictionary entry to a single USER-class token bracketed by BOS/EOS.
func Test_AnalyzeWithUserDict(t *testing.T) {
	sysDict, err := dict.LoadDictFile(testDictPath)
	if err != nil {
		t.Fatalf("unexpected error, %v", err)
	}
	userDict, err := dict.NewUserDict(testUserDictPath)
	if err != nil {
		t.Fatalf("unexpected error, %v", err)
	}
	tkz, err := New(sysDict, UserDict(userDict))
	if err != nil {
		t.Fatalf("unexpected error, %v", err)
	}
	tokens := tkz.Analyze("関西国際空港", Normal)
	want := []Token{
		{ID: -1, Surface: "BOS"},
		{ID: 2, Surface: "関西国際空港", Start: 0, End: 6, Class: TokenClass(lattice.USER)},
		{ID: -1, Surface: "EOS", Start: 6, End: 6},
	}
	if len(tokens) != len(want) {
		t.Fatalf("got %v, expected %v", tokens, want)
	}
	for i := range tokens {
		got, exp := tokens[i], want[i]
		// Compare field by field: Token also carries dictionary pointers
		// that are irrelevant to this test.
		if got.ID != exp.ID || got.Class != exp.Class || got.Start != exp.Start || got.End != exp.End || got.Surface != exp.Surface {
			t.Errorf("got %v, expected %v", got, exp)
		}
	}
}

// Test_AnalyzeWithSearchModeWithUserDict checks that Search-mode analysis
// resolves a user-dictionary entry to a single USER-class token bracketed
// by BOS/EOS.
func Test_AnalyzeWithSearchModeWithUserDict(t *testing.T) {
	sysDict, err := dict.LoadDictFile(testDictPath)
	if err != nil {
		t.Fatalf("unexpected error, %v", err)
	}
	userDict, err := dict.NewUserDict(testUserDictPath)
	if err != nil {
		t.Fatalf("unexpected error, %v", err)
	}
	tkz, err := New(sysDict, UserDict(userDict))
	if err != nil {
		t.Fatalf("unexpected error, %v", err)
	}
	tokens := tkz.Analyze("関西国際空港", Search)
	want := []Token{
		{ID: -1, Surface: "BOS"},
		{ID: 2, Surface: "関西国際空港", Start: 0, End: 6, Class: TokenClass(lattice.USER)},
		{ID: -1, Surface: "EOS", Start: 6, End: 6},
	}
	if len(tokens) != len(want) {
		t.Fatalf("got %v, expected %v", tokens, want)
	}
	for i := range tokens {
		got, exp := tokens[i], want[i]
		// Compare field by field: Token also carries dictionary pointers
		// that are irrelevant to this test.
		if got.ID != exp.ID || got.Class != exp.Class || got.Start != exp.Start || got.End != exp.End || got.Surface != exp.Surface {
			t.Errorf("got %v, expected %v", got, exp)
		}
	}
}

// Test_AnalyzeWithExtendedModeWithUserDict checks that Extended-mode
// analysis resolves a user-dictionary entry to a single USER-class token
// bracketed by BOS/EOS.
func Test_AnalyzeWithExtendedModeWithUserDict(t *testing.T) {
	sysDict, err := dict.LoadDictFile(testDictPath)
	if err != nil {
		t.Fatalf("unexpected error, %v", err)
	}
	userDict, err := dict.NewUserDict(testUserDictPath)
	if err != nil {
		t.Fatalf("unexpected error, %v", err)
	}
	tkz, err := New(sysDict, UserDict(userDict))
	if err != nil {
		t.Fatalf("unexpected error, %v", err)
	}
	tokens := tkz.Analyze("関西国際空港", Extended)
	want := []Token{
		{ID: -1, Surface: "BOS"},
		{ID: 2, Surface: "関西国際空港", Start: 0, End: 6, Class: TokenClass(lattice.USER)},
		{ID: -1, Surface: "EOS", Start: 6, End: 6},
	}
	if len(tokens) != len(want) {
		t.Fatalf("got %v, expected %v", tokens, want)
	}
	for i := range tokens {
		got, exp := tokens[i], want[i]
		// Compare field by field: Token also carries dictionary pointers
		// that are irrelevant to this test.
		if got.ID != exp.ID || got.Class != exp.Class || got.Start != exp.Start || got.End != exp.End || got.Surface != exp.Surface {
			t.Errorf("got %v, expected %v", got, exp)
		}
	}
}

// TestTokenizer_Analyze_OmitBOSEOS checks that the OmitBosEos option
// removes the BOS/EOS dummy tokens, leaving only real tokens in the output.
func TestTokenizer_Analyze_OmitBOSEOS(t *testing.T) {
	sysDict, err := dict.LoadDictFile(testDictPath)
	if err != nil {
		t.Fatalf("unexpected error, %v", err)
	}
	tkz, err := New(sysDict, OmitBosEos())
	if err != nil {
		t.Fatalf("unexpected error, %v", err)
	}
	tokens := tkz.Analyze("関西国際空港", Normal)
	want := []Token{
		{ID: 372978, Surface: "関西国際空港", Start: 0, End: 6, Class: TokenClass(lattice.KNOWN)},
	}
	if len(tokens) != len(want) {
		t.Fatalf("got %v, expected %v", tokens, want)
	}
	for i := range tokens {
		got, exp := tokens[i], want[i]
		// Compare field by field: Token also carries dictionary pointers
		// that are irrelevant to this test.
		if got.ID != exp.ID || got.Class != exp.Class || got.Start != exp.Start || got.End != exp.End || got.Surface != exp.Surface {
			t.Errorf("got %v, expected %v", got, exp)
		}
	}
}

0 comments on commit 99a93dc

Please sign in to comment.