-
Notifications
You must be signed in to change notification settings - Fork 1.4k
/
Language.java
481 lines (432 loc) · 15 KB
/
Language.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
/* LanguageTool, a natural language style checker
* Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool;
import org.jetbrains.annotations.Nullable;
import org.languagetool.chunking.Chunker;
import org.languagetool.databroker.ResourceDataBroker;
import org.languagetool.language.Contributor;
import org.languagetool.languagemodel.LanguageModel;
import org.languagetool.rules.Rule;
import org.languagetool.rules.patterns.*;
import org.languagetool.synthesis.Synthesizer;
import org.languagetool.tagging.Tagger;
import org.languagetool.tagging.disambiguation.Disambiguator;
import org.languagetool.tagging.disambiguation.xx.DemoDisambiguator;
import org.languagetool.tagging.xx.DemoTagger;
import org.languagetool.tokenizers.SentenceTokenizer;
import org.languagetool.tokenizers.SimpleSentenceTokenizer;
import org.languagetool.tokenizers.Tokenizer;
import org.languagetool.tokenizers.WordTokenizer;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;
import java.util.regex.Pattern;
/**
 * Base class for any supported language (English, German, etc). Language classes
 * are detected at runtime by searching the classpath for files named
 * {@code META-INF/org/languagetool/language-module.properties}. Those file(s)
 * need to contain a key {@code languageClasses} which specifies the fully qualified
 * class name(s), e.g. {@code org.languagetool.language.English}. Use commas to specify
 * more than one class.
 *
 * <p>Sub classes should typically use lazy init for anything that's costly to set up.
 * This improves start up time for the LanguageTool stand-alone version.
 *
 * <p>Most getters in this class return a default (a demo implementation, an empty list
 * or {@code null}) that sub classes are expected to override where the language
 * offers something more specific.
 */
public abstract class Language {

  // Shared default implementations, returned by the getters below unless a sub class overrides them.
  private static final Disambiguator DEMO_DISAMBIGUATOR = new DemoDisambiguator();
  private static final Tagger DEMO_TAGGER = new DemoTagger();
  private static final SentenceTokenizer SENTENCE_TOKENIZER = new SimpleSentenceTokenizer();
  private static final WordTokenizer WORD_TOKENIZER = new WordTokenizer();
  // The pattern is constant, so it is compiled once for all languages instead of once per instance.
  private static final Pattern IGNORED_CHARACTERS_REGEX = Pattern.compile("[\u00AD]"); // soft hyphen

  private final UnifierConfiguration unifierConfig = new UnifierConfiguration();
  private final UnifierConfiguration disambiguationUnifierConfig = new UnifierConfiguration();

  // Lazily filled cache for getPatternRules(); only accessed from that synchronized method.
  private List<AbstractPatternRule> patternRules;

  /**
   * Get this language's character code, e.g. <code>en</code> for English.
   * For most languages this is a two-letter code according to ISO 639-1,
   * but for those languages that don't have a two-letter code, a three-letter
   * code according to ISO 639-2 is returned.
   * The country parameter (e.g. "US"), if any, is not returned.
   * @since 3.6
   */
  public abstract String getShortCode();

  /**
   * Get this language's name in English, e.g. <code>English</code> or
   * <code>German (Germany)</code>.
   * @return language name
   */
  public abstract String getName();

  /**
   * Get this language's country options, e.g. <code>US</code> (as in <code>en-US</code>) or
   * <code>PL</code> (as in <code>pl-PL</code>).
   * @return String[] - array of country options for the language.
   */
  public abstract String[] getCountries();

  /**
   * Get the name(s) of the maintainer(s) for this language or <code>null</code>.
   */
  @Nullable
  public abstract Contributor[] getMaintainers();

  /**
   * Get the rules classes that should run for texts in this language.
   * @since 1.4 (signature modified in 2.7)
   */
  public abstract List<Rule> getRelevantRules(ResourceBundle messages) throws IOException;

  // -------------------------------------------------------------------------

  /**
   * Get this language's variant, e.g. <code>valencia</code> (as in <code>ca-ES-valencia</code>)
   * or <code>null</code>.
   * Attention: not to be confused with "country" option
   * @return variant for the language or {@code null}
   * @since 2.3
   */
  @Nullable
  public String getVariant() {
    return null;
  }

  /**
   * Get enabled rules different from the default ones for this language variant.
   *
   * @return enabled rules for the language variant.
   * @since 2.4
   */
  public List<String> getDefaultEnabledRulesForVariant() {
    return Collections.emptyList();
  }

  /**
   * Get disabled rules different from the default ones for this language variant.
   *
   * @return disabled rules for the language variant.
   * @since 2.4
   */
  public List<String> getDefaultDisabledRulesForVariant() {
    return Collections.emptyList();
  }

  /**
   * @param indexDir directory with a '3grams' sub directory which contains a Lucene index with 3gram occurrence counts
   * @return a LanguageModel or {@code null} if this language doesn't support one
   * @since 2.7
   */
  @Nullable
  public LanguageModel getLanguageModel(File indexDir) throws IOException {
    return null;
  }

  /**
   * Get a list of rules that require a {@link LanguageModel}. Returns an empty list for
   * languages that don't have such rules.
   * @since 2.7
   */
  public List<Rule> getRelevantLanguageModelRules(ResourceBundle messages, LanguageModel languageModel) throws IOException {
    return Collections.emptyList();
  }

  /**
   * Get this language's Java locale, not considering the country code.
   */
  public Locale getLocale() {
    return new Locale(getShortCode());
  }

  /**
   * Get this language's Java locale, considering language code, country code (if any)
   * and variant (if any). If the language has more than one country, the first one is used;
   * if it has none, this is the same as {@link #getLocale()}.
   * @since 2.1
   */
  public Locale getLocaleWithCountryAndVariant() {
    String[] countries = getCountries();
    if (countries.length == 0) {
      return getLocale();
    }
    String variant = getVariant();
    if (variant != null) {
      return new Locale(getShortCode(), countries[0], variant);
    }
    return new Locale(getShortCode(), countries[0]);
  }

  /**
   * Get the location of the rule file(s) in a form like {@code /org/languagetool/rules/de/grammar.xml},
   * i.e. a path in the classpath. The language's general rule file always comes first; the
   * rule file specific to the country/variant is appended only if the data broker reports
   * that it exists.
   */
  public List<String> getRuleFileNames() {
    List<String> ruleFiles = new ArrayList<>();
    ResourceDataBroker dataBroker = JLanguageTool.getDataBroker();
    ruleFiles.add(dataBroker.getRulesDir()
            + "/" + getShortCode() + "/" + JLanguageTool.PATTERN_FILE);
    if (getShortCodeWithCountryAndVariant().length() > 2) {
      String fileName = getShortCode() + "/"
              + getShortCodeWithCountryAndVariant()
              + "/" + JLanguageTool.PATTERN_FILE;
      if (dataBroker.ruleFileExists(fileName)) {
        ruleFiles.add(dataBroker.getRulesDir() + "/" + fileName);
      }
    }
    return ruleFiles;
  }

  /**
   * Languages that have country variants need to overwrite this to select their most common variant.
   * @return default country variant or {@code null}
   * @since 1.8
   */
  @Nullable
  public Language getDefaultLanguageVariant() {
    return null;
  }

  /**
   * Get this language's part-of-speech disambiguator implementation.
   */
  public Disambiguator getDisambiguator() {
    return DEMO_DISAMBIGUATOR;
  }

  /**
   * Get this language's part-of-speech tagger implementation. The tagger must not
   * be {@code null}, but it can be a trivial pseudo-tagger that only assigns {@code null} tags.
   */
  public Tagger getTagger() {
    return DEMO_TAGGER;
  }

  /**
   * Get this language's sentence tokenizer implementation.
   */
  public SentenceTokenizer getSentenceTokenizer() {
    return SENTENCE_TOKENIZER;
  }

  /**
   * Get this language's word tokenizer implementation.
   */
  public Tokenizer getWordTokenizer() {
    return WORD_TOKENIZER;
  }

  /**
   * Get this language's chunker implementation or {@code null}.
   * @since 2.3
   */
  @Nullable
  public Chunker getChunker() {
    return null;
  }

  /**
   * Get this language's post-disambiguation chunker implementation or {@code null}.
   * @since 2.9
   */
  @Nullable
  public Chunker getPostDisambiguationChunker() {
    return null;
  }

  /**
   * Get this language's part-of-speech synthesizer implementation or {@code null}.
   */
  @Nullable
  public Synthesizer getSynthesizer() {
    return null;
  }

  /**
   * Get this language's feature unifier. A new unifier is created from the
   * unifier configuration on every call.
   * @return Feature unifier for analyzed tokens.
   */
  public Unifier getUnifier() {
    return unifierConfig.createUnifier();
  }

  /**
   * Get this language's feature unifier used for disambiguation.
   * Note: it might be different from the normal rule unifier.
   * A new unifier is created from the disambiguation unifier configuration on every call.
   * @return Feature unifier for analyzed tokens.
   */
  public Unifier getDisambiguationUnifier() {
    return disambiguationUnifierConfig.createUnifier();
  }

  /**
   * Get the configuration that {@link #getUnifier()} creates its unifiers from.
   * @since 2.3
   */
  public UnifierConfiguration getUnifierConfiguration() {
    return unifierConfig;
  }

  /**
   * Get the configuration that {@link #getDisambiguationUnifier()} creates its unifiers from.
   * @since 2.3
   */
  public UnifierConfiguration getDisambiguationUnifierConfiguration() {
    return disambiguationUnifierConfig;
  }

  /**
   * Get the name of the language translated to the current locale,
   * if available. Otherwise, get the untranslated name.
   * The bundle is first searched for the code with country and variant,
   * then for the plain language code.
   */
  public final String getTranslatedName(ResourceBundle messages) {
    try {
      return messages.getString(getShortCodeWithCountryAndVariant());
    } catch (MissingResourceException e) {
      try {
        return messages.getString(getShortCode());
      } catch (MissingResourceException e1) {
        return getName();
      }
    }
  }

  /**
   * Get the short name of the language with country and variant (if any), if it is
   * a single-country language. For generic language classes, get only a two- or
   * three-character code.
   * @since 3.6
   */
  public final String getShortCodeWithCountryAndVariant() {
    String name = getShortCode();
    if (getCountries().length == 1 && !name.contains("-x-")) {   // e.g. "de-DE-x-simple-language"
      name += "-" + getCountries()[0];
      if (getVariant() != null) {   // e.g. "ca-ES-valencia"
        name += "-" + getVariant();
      }
    }
    return name;
  }

  /**
   * Get the pattern rules as defined in the files returned by {@link #getRuleFileNames()}.
   * The rules are loaded on first use and cached afterwards. The cache is only filled once
   * all files have been loaded successfully, so a failed load is retried on the next call
   * instead of leaving a partial rule set behind.
   * @return an unmodifiable list of rules, empty if there are no rule files
   * @throws IOException if a rule file cannot be opened or read
   * @since 2.7
   */
  protected synchronized List<AbstractPatternRule> getPatternRules() throws IOException {
    // use lazy loading to speed up start of stand-alone LT, where all the languages get initialized:
    if (patternRules == null) {
      List<AbstractPatternRule> rules = new ArrayList<>();
      PatternRuleLoader ruleLoader = new PatternRuleLoader();
      for (String fileName : getRuleFileNames()) {
        try (InputStream is = openRuleFile(fileName)) {
          rules.addAll(ruleLoader.getRules(is, fileName));
        }
      }
      // assign only after the loop: an exception above must not leave a half-filled cache
      patternRules = Collections.unmodifiableList(rules);
    }
    return patternRules;
  }

  /**
   * Open a rule file, first as a classpath resource and, if there is no such
   * resource, as a file in the file system.
   */
  private InputStream openRuleFile(String fileName) throws IOException {
    InputStream is = getClass().getResourceAsStream(fileName);
    if (is == null) {   // files loaded via the dialog
      return new FileInputStream(fileName);
    }
    return is;
  }

  @Override
  public final String toString() {
    return getName();
  }

  /**
   * Whether this is a country variant of another language, i.e. whether it doesn't
   * directly extend {@link Language}, but a subclass of {@link Language}.
   * @since 1.8
   */
  public final boolean isVariant() {
    for (Language language : Languages.get()) {
      // languages with the same code as this one are not considered
      boolean skip = language.getShortCodeWithCountryAndVariant().equals(getShortCodeWithCountryAndVariant());
      if (!skip && language.getClass().isAssignableFrom(getClass())) {
        return true;
      }
    }
    return false;
  }

  /**
   * Whether this class has at least one subclass that implements variants of this language.
   * @since 1.8
   */
  public final boolean hasVariant() {
    for (Language language : Languages.get()) {
      // languages with the same code as this one are not considered
      boolean skip = language.getShortCodeWithCountryAndVariant().equals(getShortCodeWithCountryAndVariant());
      if (!skip && getClass().isAssignableFrom(language.getClass())) {
        return true;
      }
    }
    return false;
  }

  /**
   * For internal use only. Overwritten to return {@code true} for languages that
   * have been loaded from an external file after start up.
   */
  public boolean isExternal() {
    return false;
  }

  /**
   * Return true if this is the same language as the given one, considering country
   * variants only if set for both languages. For example: en = en, en = en-GB, en-GB = en-GB,
   * but en-US != en-GB
   * @since 1.8
   */
  public boolean equalsConsiderVariantsIfSpecified(Language otherLanguage) {
    if (getShortCode().equals(otherLanguage.getShortCode())) {
      boolean thisHasCountry = hasCountry();
      boolean otherHasCountry = otherLanguage.hasCountry();
      return !(thisHasCountry && otherHasCountry) ||
              getShortCodeWithCountryAndVariant().equals(otherLanguage.getShortCodeWithCountryAndVariant());
    } else {
      return false;
    }
  }

  /** Whether this language is specific to exactly one country. */
  private boolean hasCountry() {
    return getCountries().length == 1;
  }

  /**
   * @return Return compiled regular expression to ignore inside tokens
   * @since 2.9
   */
  public Pattern getIgnoredCharactersRegex() {
    return IGNORED_CHARACTERS_REGEX;
  }

  /**
   * Information about whether the support for this language in LanguageTool is actively maintained.
   * If not, the user interface might show a warning.
   * @since 3.3
   */
  public LanguageMaintainedState getMaintainedState() {
    return LanguageMaintainedState.LookingForNewMaintainer;
  }

  /**
   * True if language should be hidden on GUI (i.e. en, de, pt,
   * instead of en-US, de-DE, pt-PT)
   * @since 3.3
   */
  public boolean isHiddenFromGui() {
    return hasVariant() && !isVariant() && !isTheDefaultVariant();
  }

  /** Whether this object's class is the class of its own default language variant. */
  private boolean isTheDefaultVariant() {
    if (getDefaultLanguageVariant() != null) {
      return getClass().equals(getDefaultLanguageVariant().getClass());
    }
    return false;
  }

  /**
   * Returns a priority for Rule or Category Id (default: 0).
   * Positive integers have higher priority.
   * Negative integers have lower priority.
   * @since 3.6
   */
  public int getPriorityForId(String id) {
    return 0;
  }

  /**
   * Considers languages as equal if they are of the same class and their language code,
   * including the country and variant codes, are equal.
   */
  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (o == null || getClass() != o.getClass()) {
      return false;
    }
    Language other = (Language) o;
    return Objects.equals(getShortCodeWithCountryAndVariant(), other.getShortCodeWithCountryAndVariant());
  }

  @Override
  public int hashCode() {
    return getShortCodeWithCountryAndVariant().hashCode();
  }

}