-
Notifications
You must be signed in to change notification settings - Fork 44
/
juman_format.cc
176 lines (152 loc) · 4.45 KB
/
juman_format.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
//
// Created by Arseny Tolmachev on 2017/03/09.
//
#include "juman_format.h"
#include "core/analysis/charlattice.h"
namespace jumanpp {
namespace jumandic {
namespace output {
/// Renders the top-1 path of an analysis into `printer` using the JUMAN
/// line-oriented output format.
///
/// The printer is reset first, so any previously formatted result is dropped.
/// When `comment` is non-empty it is emitted as a leading `# <comment>` line.
/// For every boundary on the top-1 path, the first node is passed to
/// formatOne() with `first == true` and every further node of the same
/// boundary with `first == false` (formatOne() prefixes those lines with
/// `@ `). The output is terminated by an `EOS` line.
///
/// @param analysis analyzer holding the result to be rendered
/// @param comment  optional comment line content; skipped when empty
/// @return Status::Ok() on success. Failures of `analysisResult.reset()` and
///         `fillTop1()` go through JPP_RETURN_IF_ERROR (presumably returned
///         to the caller as-is; the macro is not defined in this file).
///         Status::InvalidState() is returned when a boundary has no nodes,
///         when a node cannot be loaded, or when formatOne() reports failure.
Status JumanFormat::format(const core::analysis::Analyzer& analysis,
                           StringPiece comment) {
  printer.reset();
  JPP_RETURN_IF_ERROR(analysisResult.reset(analysis));
  JPP_RETURN_IF_ERROR(analysisResult.fillTop1(&top1));

  auto& outMgr = analysis.output();

  if (!comment.empty()) {
    printer << "# " << comment << '\n';
  }

  while (top1.nextBoundary()) {
    if (top1.remainingNodesInChunk() <= 0) {
      return Status::InvalidState() << "no nodes in chunk";
    }

    core::analysis::ConnectionPtr connPtr;
    if (!top1.nextNode(&connPtr)) {
      return Status::InvalidState() << "failed to load a node";
    }

    // formatOne() signals failure through its bool result; do not let a
    // failed node lookup pass silently and produce truncated output that
    // still ends with EOS and an Ok status.
    if (!formatOne(outMgr, connPtr, true)) {
      return Status::InvalidState() << "failed to format a node";
    }

    // Remaining nodes of the same boundary are alternatives of the first one.
    while (top1.nextNode(&connPtr)) {
      if (!formatOne(outMgr, connPtr, false)) {
        return Status::InvalidState() << "failed to format a node";
      }
    }
  }

  printer << "EOS\n";
  return Status::Ok();
}
namespace {

/// Escapes a field that consists of exactly one whitespace character.
///
/// A lone tab is replaced by the two characters `\t`; a lone ASCII space is
/// replaced by a backslash followed by U+2423 (OPEN BOX). Every other input,
/// including longer strings that merely contain whitespace, is returned
/// unchanged.
///
/// NOTE(review): presumably this keeps the space-separated output columns
/// parseable when a token itself is a space or a tab — confirm with callers.
StringPiece escapeForJumanOutput(StringPiece in) {
  if (in.size() != 1) {
    return in;
  }

  const auto only = in[0];
  if (only == '\t') {
    return StringPiece("\\t");
  }
  if (only == ' ') {
    return StringPiece("\\␣");
  }
  return in;
}

}  // namespace
/// Writes the `非標準表記:` feature followed by one letter per modifier flag
/// that is set in `featureVal`.
///
/// `featureVal` is reinterpreted as a charlattice `Modifiers` value and each
/// known flag is tested with `ExistFlag`. Letters are emitted in the fixed
/// order of the table below; when no flag matches, only the prefix is written.
///
/// @param p          printer that receives the text
/// @param featureVal integer encoding of the modifier flags
void formatNormalizedFeature(util::io::FastPrinter& p, i32 featureVal) {
  namespace cl = core::analysis::charlattice;
  using Mod = cl::Modifiers;

  struct FlagLetter {
    Mod flag;
    char letter;
  };

  // The order of the rows defines the order of the letters in the output.
  static const FlagLetter kFlagLetters[] = {
      {Mod::REPLACE, 'R'},
      {Mod::REPLACE_SMALLKANA, 's'},
      {Mod::REPLACE_PROLONG, 'p'},
      {Mod::REPLACE_EROW_WITH_E, 'e'},
      {Mod::DELETE, 'D'},
      {Mod::DELETE_PROLONG, 'P'},
      {Mod::DELETE_SMALLKANA, 'S'},
      {Mod::DELETE_HASTSUON, 'H'},
      {Mod::DELETE_LAST, 'L'},
  };

  const auto flags = static_cast<Mod>(featureVal);

  p << "非標準表記:";
  for (const auto& row : kFlagLetters) {
    if (cl::ExistFlag(flags, row.flag)) {
      p << row.letter;
    }
  }
}
/// Appends one output line per entry that `walker` yields for a lattice node.
///
/// Each line has the shape
///   surface reading baseform pos posId subpos subposId
///   conjType conjTypeId conjForm conjFormId features
/// where the features column is either `NIL` or a double-quoted,
/// space-separated list. Lines are prefixed with `@ ` whenever `first` is
/// false; after the first emitted line `first` is cleared, so all further
/// lines of the same node carry the prefix as well.
///
/// @param om    output manager used to position `walker` and to read
///              placeholder values of special entries
/// @param ptr   connection whose `boundary` / `right` identify the node
/// @param first true if the first emitted line must not get the `@ ` prefix
/// @return true after all entries were written. The `om.locate()` result is
///         checked with JPP_RET_CHECK (presumably returns false from this
///         function on failure — the macro is not defined in this file).
bool JumanFormat::formatOne(const core::analysis::OutputManager& om,
                            const core::analysis::ConnectionPtr& ptr,
                            bool first) {
  core::analysis::LatticeNodePtr target{ptr.boundary, ptr.right};
  JPP_RET_CHECK(om.locate(target, &walker));

  // Only the very first line may go without the "@ " prefix, hence `first`
  // is cleared after every iteration.
  for (; walker.next(); first = false) {
    if (!first) {
      printer << "@ ";
    }

    auto rawFields = walker.features();
    // conjForm and conjType are reversed
    JumandicPosId dicId{rawFields[1], rawFields[2], rawFields[4],
                        rawFields[3]};
    auto jumanId = idResolver.dicToJuman(dicId);

    // String columns; one field lookup per statement, in output order.
    const auto surface = escapeForJumanOutput(flds.surface[walker]);
    printer << surface << " ";
    const auto reading = escapeForJumanOutput(flds.reading[walker]);
    printer << reading << " ";
    const auto baseform = escapeForJumanOutput(flds.baseform[walker]);
    printer << baseform << " ";

    // Part-of-speech columns: name (or "*" when empty) followed by its id.
    printer << ifEmpty(flds.pos[walker], "*") << " ";
    printer << jumanId.pos << " ";
    printer << ifEmpty(flds.subpos[walker], "*") << " ";
    printer << jumanId.subpos << " ";
    printer << ifEmpty(flds.conjType[walker], "*") << " ";
    printer << jumanId.conjType << " ";
    printer << ifEmpty(flds.conjForm[walker], "*") << " ";
    printer << jumanId.conjForm << " ";

    auto featureList = flds.features[walker];
    auto canonicForm = flds.canonicForm[walker];
    auto entry = walker.eptr();

    const bool noFeatures =
        !entry.isSpecial() && !featureList.hasNext() && canonicForm.empty();

    if (noFeatures) {
      printer << "NIL";
    } else {
      printer << '"';

      // Tracks whether something was already written inside the quotes, so
      // that the normalization feature can be separated with a space.
      bool wroteAny = false;

      if (!canonicForm.empty()) {
        printer << "代表表記:" << canonicForm;
        if (featureList.hasNext()) {
          printer << " ";
        }
        wroteAny = true;
      }

      while (featureList.next()) {
        wroteAny = true;
        printer << featureList.key();
        if (featureList.hasValue()) {
          printer << ':' << featureList.value();
        }
        if (featureList.hasNext()) {
          printer << " ";
        }
      }

      if (entry.isSpecial()) {
        auto normalizedFlags =
            om.valueOfUnkPlaceholder(entry, jumandic::NormalizedPlaceholderIdx);
        // A zero placeholder value produces no normalization feature.
        if (normalizedFlags != 0) {
          if (wroteAny) {
            printer << " ";
          }
          formatNormalizedFeature(printer, normalizedFlags);
        }
      }

      printer << '"';
    }

    printer << "\n";
  }

  return true;
}
/// Creates a formatter and asks the printer to reserve 16 KiB up front.
JumanFormat::JumanFormat() : analysisResult() {
  constexpr int kInitialPrinterCapacity = 16 * 1024;  // 16k
  printer.reserve(kInitialPrinterCapacity);
}
} // namespace output
} // namespace jumandic
} // namespace jumanpp