Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
189 additions
and
139 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
package tokenizer | ||
|
||
import ( | ||
"errors" | ||
|
||
"github.com/ikawaha/kagome-dict/dict" | ||
) | ||
|
||
// Option represents an option for the tokenizer. | ||
type Option func(*Tokenizer) error | ||
|
||
// Nop represents a no operation option. | ||
func Nop() Option { | ||
return func(t *Tokenizer) error { | ||
return nil | ||
} | ||
} | ||
|
||
// UserDict is a tokenizer option to sets a user dictionary. | ||
func UserDict(d *dict.UserDict) Option { | ||
return func(t *Tokenizer) error { | ||
if d == nil { | ||
return errors.New("empty user dictionary") | ||
} | ||
t.userDict = d | ||
return nil | ||
} | ||
} | ||
|
||
// OmitBosEos is a tokenizer option to omit BOS/EOS from output tokens. | ||
func OmitBosEos() Option { | ||
return func(t *Tokenizer) error { | ||
t.omitBosEos = true | ||
return nil | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
package tokenizer | ||
|
||
import ( | ||
"testing" | ||
|
||
"github.com/ikawaha/kagome-dict/dict" | ||
"github.com/ikawaha/kagome/v2/tokenizer/lattice" | ||
) | ||
|
||
// testUserDictPath is the fixture user dictionary loaded by the
// user-dictionary tests in this file.
const (
	testUserDictPath = "../_sample/userdict.txt"
)
|
||
func Test_AnalyzeWithUserDict(t *testing.T) { | ||
d, err := dict.LoadDictFile(testDictPath) | ||
if err != nil { | ||
t.Fatalf("unexpected error, %v", err) | ||
} | ||
udict, err := dict.NewUserDict(testUserDictPath) | ||
if err != nil { | ||
t.Fatalf("unexpected error, %v", err) | ||
} | ||
tnz, err := New(d, UserDict(udict)) | ||
if err != nil { | ||
t.Fatalf("unexpected error, %v", err) | ||
} | ||
tokens := tnz.Analyze("関西国際空港", Normal) | ||
expected := []Token{ | ||
{ID: -1, Surface: "BOS"}, | ||
{ID: 2, Surface: "関西国際空港", Start: 0, End: 6, Class: TokenClass(lattice.USER)}, | ||
{ID: -1, Surface: "EOS", Start: 6, End: 6}, | ||
} | ||
if len(tokens) != len(expected) { | ||
t.Fatalf("got %v, expected %v", tokens, expected) | ||
} | ||
for i, tok := range tokens { | ||
if tok.ID != expected[i].ID || | ||
tok.Class != expected[i].Class || | ||
tok.Start != expected[i].Start || | ||
tok.End != expected[i].End || | ||
tok.Surface != expected[i].Surface { | ||
t.Errorf("got %v, expected %v", tok, expected[i]) | ||
} | ||
} | ||
|
||
} | ||
|
||
func Test_AnalyzeWithSearchModeWithUserDict(t *testing.T) { | ||
d, err := dict.LoadDictFile(testDictPath) | ||
if err != nil { | ||
t.Fatalf("unexpected error, %v", err) | ||
} | ||
udict, err := dict.NewUserDict(testUserDictPath) | ||
if err != nil { | ||
t.Fatalf("unexpected error, %v", err) | ||
} | ||
tnz, err := New(d, UserDict(udict)) | ||
if err != nil { | ||
t.Fatalf("unexpected error, %v", err) | ||
} | ||
|
||
tokens := tnz.Analyze("関西国際空港", Search) | ||
expected := []Token{ | ||
{ID: -1, Surface: "BOS"}, | ||
{ID: 2, Surface: "関西国際空港", Start: 0, End: 6, Class: TokenClass(lattice.USER)}, | ||
{ID: -1, Surface: "EOS", Start: 6, End: 6}, | ||
} | ||
if len(tokens) != len(expected) { | ||
t.Fatalf("got %v, expected %v", tokens, expected) | ||
} | ||
for i, tok := range tokens { | ||
if tok.ID != expected[i].ID || | ||
tok.Class != expected[i].Class || | ||
tok.Start != expected[i].Start || | ||
tok.End != expected[i].End || | ||
tok.Surface != expected[i].Surface { | ||
t.Errorf("got %v, expected %v", tok, expected[i]) | ||
} | ||
} | ||
|
||
} | ||
|
||
func Test_AnalyzeWithExtendedModeWithUserDict(t *testing.T) { | ||
d, err := dict.LoadDictFile(testDictPath) | ||
if err != nil { | ||
t.Fatalf("unexpected error, %v", err) | ||
} | ||
udict, err := dict.NewUserDict(testUserDictPath) | ||
if err != nil { | ||
t.Fatalf("unexpected error, %v", err) | ||
} | ||
tnz, err := New(d, UserDict(udict)) | ||
if err != nil { | ||
t.Fatalf("unexpected error, %v", err) | ||
} | ||
|
||
tokens := tnz.Analyze("関西国際空港", Extended) | ||
expected := []Token{ | ||
{ID: -1, Surface: "BOS"}, | ||
{ID: 2, Surface: "関西国際空港", Start: 0, End: 6, Class: TokenClass(lattice.USER)}, | ||
{ID: -1, Surface: "EOS", Start: 6, End: 6}, | ||
} | ||
if len(tokens) != len(expected) { | ||
t.Fatalf("got %v, expected %v", tokens, expected) | ||
} | ||
for i, tok := range tokens { | ||
if tok.ID != expected[i].ID || | ||
tok.Class != expected[i].Class || | ||
tok.Start != expected[i].Start || | ||
tok.End != expected[i].End || | ||
tok.Surface != expected[i].Surface { | ||
t.Errorf("got %v, expected %v", tok, expected[i]) | ||
} | ||
} | ||
|
||
} | ||
|
||
func TestTokenizer_Analyze_OmitBOSEOS(t *testing.T) { | ||
d, err := dict.LoadDictFile(testDictPath) | ||
if err != nil { | ||
t.Fatalf("unexpected error, %v", err) | ||
} | ||
tnz, err := New(d, OmitBosEos()) | ||
if err != nil { | ||
t.Fatalf("unexpected error, %v", err) | ||
} | ||
tokens := tnz.Analyze("関西国際空港", Normal) | ||
expected := []Token{ | ||
{ID: 372978, Surface: "関西国際空港", Start: 0, End: 6, Class: TokenClass(lattice.KNOWN)}, | ||
} | ||
if len(tokens) != len(expected) { | ||
t.Fatalf("got %v, expected %v", tokens, expected) | ||
} | ||
for i, tok := range tokens { | ||
if tok.ID != expected[i].ID || | ||
tok.Class != expected[i].Class || | ||
tok.Start != expected[i].Start || | ||
tok.End != expected[i].End || | ||
tok.Surface != expected[i].Surface { | ||
t.Errorf("got %v, expected %v", tok, expected[i]) | ||
} | ||
} | ||
} |
Oops, something went wrong.