/
builder.go
146 lines (130 loc) · 3.61 KB
/
builder.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
package builder
import (
"fmt"
"path/filepath"
"sort"
"strconv"
"github.com/ikawaha/kagome-dict/dict"
"golang.org/x/text/encoding"
)
// MaxInt16 is the largest value an int16 can hold (32767). It is the upper
// bound for values that are stored in int16 fields of a dictionary entry.
const MaxInt16 = 0x7fff
// Config represents the configuration of dictionary builder.
//
// The unexported fields are set through NewConfig; the exported *FileName
// fields may be overridden afterwards to point at differently named
// definition files.
type Config struct {
// paths lists the dictionary sources. Build reads morph CSV records from
// every entry and looks up the matrix, char and unk definition files
// under paths[0].
paths []string
// recordInfo describes the morph CSV record layout (column count and the
// indexes of the surface, IDs, weight, POS and other contents) and carries
// the contents meta copied into the built dictionary.
recordInfo *MorphRecordInfo
// unkInfo is handed to the unk definition file parser.
unkInfo *UnkRecordInfo
// enc is the encoding passed to the parsers for the CSV files under
// paths[0] and for the unk definition file.
enc encoding.Encoding
// MatrixDefFileName is the name of the matrix definition file under
// paths[0]. NewConfig sets it to "matrix.def".
MatrixDefFileName string
// CharDefFileName is the name of the character class definition file under
// paths[0]. NewConfig sets it to "char.def".
CharDefFileName string
// UnkDefFileName is the name of the unk definition file under paths[0].
// NewConfig sets it to "unk.def".
UnkDefFileName string
}
// NewConfig returns a builder configuration whose source list is path
// followed by every entry of other, in order. enc, info and unk are stored
// as given, and the definition file names are initialised to their defaults
// ("matrix.def", "char.def" and "unk.def").
func NewConfig(path string, other []string, enc encoding.Encoding, info *MorphRecordInfo, unk *UnkRecordInfo) *Config {
	// Build a fresh slice so the configuration never shares a backing array
	// with the caller's other slice.
	sources := make([]string, 0, 1+len(other))
	sources = append(sources, path)
	sources = append(sources, other...)

	cfg := new(Config)
	cfg.paths = sources
	cfg.recordInfo = info
	cfg.unkInfo = unk
	cfg.enc = enc

	// default def file names
	cfg.MatrixDefFileName = "matrix.def"
	cfg.CharDefFileName = "char.def"
	cfg.UnkDefFileName = "unk.def"
	return cfg
}
// Build builds a dictionary from the sources described by c.
//
// The first entry of c.paths is the main source: its morph CSV files are
// parsed with c.enc, and the matrix, char and unk definition files are looked
// up under it. The CSV files of every further path are parsed with a nil
// encoding. All records are merged and sorted before the dictionary tables
// are filled.
//
// Build returns an error when c is nil, has no path or no record info, when
// any source file fails to parse, when a left/right ID or weight is not a
// number, or when one of these values does not fit the connection matrix or
// an int16.
func Build(c *Config) (*dict.Dict, error) {
	if c == nil {
		return nil, fmt.Errorf("empty config")
	}
	if len(c.paths) == 0 {
		return nil, fmt.Errorf("empty path")
	}
	// Every step below dereferences the record info; fail with an error
	// instead of a nil pointer panic.
	if c.recordInfo == nil {
		return nil, fmt.Errorf("empty record info")
	}
	// minInt16 is the smallest value an int16 can hold.
	const minInt16 = -MaxInt16 - 1
	info := c.recordInfo
	baseDir := c.paths[0]

	// Morph CSV
	var records Records
	for i, v := range c.paths {
		// Only the main source is read with the configured encoding; the
		// others get a nil encoding (presumably meaning "no transcoding" —
		// confirm against parseCSVFiles).
		var enc encoding.Encoding
		if i == 0 {
			enc = c.enc
		}
		rec, err := parseCSVFiles(v, enc, info.ColSize)
		if err != nil {
			return nil, err
		}
		records = append(records, rec...)
	}
	sort.Sort(records)

	ret := dict.Dict{
		Morphs: make([]dict.Morph, 0, len(records)),
		POSTable: dict.POSTable{
			POSs: make([]dict.POS, 0, len(records)),
		},
		ContentsMeta: info.Meta,
		Contents:     make([][]string, 0, len(records)),
	}

	// ConnectionTable
	matrix, err := parseMatrixDefFile(filepath.Join(baseDir, c.MatrixDefFileName))
	if err != nil {
		return nil, err
	}
	ret.Connection.Row = matrix.rowSize
	ret.Connection.Col = matrix.colSize
	ret.Connection.Vec = matrix.vec

	// Words
	keywords := make([]string, 0, len(records))
	posMap := make(dict.POSMap)
	for _, rec := range records {
		keywords = append(keywords, rec[info.SurfaceIndex])

		l, err := strconv.Atoi(rec[info.LeftIDIndex])
		if err != nil {
			return nil, err
		}
		if l < 0 {
			return nil, fmt.Errorf("morph left ID %d < 0, record: %v", l, rec)
		}
		if l >= int(matrix.colSize) || l > MaxInt16 {
			return nil, fmt.Errorf("morph left ID %d > %d, record: %v", l, MaxInt16, rec)
		}

		r, err := strconv.Atoi(rec[info.RightIDIndex])
		if err != nil {
			return nil, err
		}
		if r < 0 {
			return nil, fmt.Errorf("morph right ID %d < 0, record: %v", r, rec)
		}
		if r >= int(matrix.rowSize) || r > MaxInt16 {
			return nil, fmt.Errorf("morph right ID %d > %d, record: %v", r, MaxInt16, rec)
		}

		w, err := strconv.Atoi(rec[info.WeightIndex])
		if err != nil {
			return nil, err
		}
		// Weights may be negative, but must still fit into an int16 on both
		// ends; otherwise the conversion below would silently wrap around.
		if w > MaxInt16 {
			return nil, fmt.Errorf("morph weight %d > %d, record: %v", w, MaxInt16, rec)
		}
		if w < minInt16 {
			return nil, fmt.Errorf("morph weight %d < %d, record: %v", w, minInt16, rec)
		}

		m := dict.Morph{LeftID: int16(l), RightID: int16(r), Weight: int16(w)}
		ret.Morphs = append(ret.Morphs, m)
		ret.POSTable.POSs = append(ret.POSTable.POSs, posMap.Add(
			rec[info.POSStartIndex:info.OtherContentsStartIndex]),
		)
		ret.Contents = append(ret.Contents, rec[info.OtherContentsStartIndex:])
	}
	ret.POSTable.NameList = posMap.List()

	// Index
	index, err := dict.BuildIndexTable(keywords)
	if err != nil {
		return nil, err
	}
	ret.Index = index

	// CharDef
	def, err := parseCharClassDefFile(filepath.Join(baseDir, c.CharDefFileName))
	if err != nil {
		return nil, err
	}
	ret.CharClass = def.charClass
	ret.CharCategory = def.charCategory
	ret.InvokeList = def.invokeMap
	ret.GroupList = def.groupMap

	// Unk
	unk, err := parseUnkDefFile(filepath.Join(baseDir, c.UnkDefFileName), c.enc, c.unkInfo, ret.CharClass)
	if err != nil {
		return nil, fmt.Errorf("unk file parse error, %w", err)
	}
	ret.UnkDict = *unk
	return &ret, nil
}