diff --git a/languagetool-commandline/src/main/java/org/languagetool/commandline/CommandLineTools.java b/languagetool-commandline/src/main/java/org/languagetool/commandline/CommandLineTools.java
index 06e1a09151591..0461c54ad5f5c 100644
--- a/languagetool-commandline/src/main/java/org/languagetool/commandline/CommandLineTools.java
+++ b/languagetool-commandline/src/main/java/org/languagetool/commandline/CommandLineTools.java
@@ -328,7 +328,7 @@ public static void profileRulesOnText(String contents,
float timeInSeconds = time / 1000.0f;
float sentencesPerSecond = sentences.size() / timeInSeconds;
System.out.printf(Locale.ENGLISH,
- "%-40s%10d%10d%10d%15.1f\n", rule.getId(),
+ "%-40s%10d%10d%10d%15.1f\n", rule.getFullId(),
time, sentences.size(), matchCount, sentencesPerSecond);
}
diff --git a/languagetool-core/pom.xml b/languagetool-core/pom.xml
index 4147a06b55924..958f606ffc8f9 100644
--- a/languagetool-core/pom.xml
+++ b/languagetool-core/pom.xml
@@ -208,8 +208,8 @@
annotations
- org.jetbrains.intellij.deps
- trove4j
+ it.unimi.dsi
+ fastutil-core
org.json
diff --git a/languagetool-core/src/main/java/org/languagetool/AnalyzedTokenReadings.java b/languagetool-core/src/main/java/org/languagetool/AnalyzedTokenReadings.java
index fd65a110e4019..08ebae5fb9103 100644
--- a/languagetool-core/src/main/java/org/languagetool/AnalyzedTokenReadings.java
+++ b/languagetool-core/src/main/java/org/languagetool/AnalyzedTokenReadings.java
@@ -106,23 +106,23 @@ public AnalyzedTokenReadings(List tokens, int startPos) {
public AnalyzedTokenReadings(AnalyzedTokenReadings oldAtr, List newReadings, String ruleApplied) {
this(newReadings, oldAtr.getStartPos());
if (oldAtr.isSentenceEnd()) {
- this.setSentEnd();
+ setSentEnd();
}
if (oldAtr.isParagraphEnd()) {
- this.setParagraphEnd();
+ setParagraphEnd();
}
- this.setWhitespaceBefore(oldAtr.getWhitespaceBefore());
- this.setChunkTags(oldAtr.getChunkTags());
+ setWhitespaceBefore(oldAtr.getWhitespaceBefore());
+ setChunkTags(oldAtr.getChunkTags());
if (oldAtr.isImmunized()) {
- this.immunize(oldAtr.getImmunizationSourceLine());
+ immunize(oldAtr.getImmunizationSourceLine());
}
if (oldAtr.isIgnoredBySpeller()) {
- this.ignoreSpelling();
+ ignoreSpelling();
}
if (oldAtr.hasTypographicApostrophe()) {
- this.setTypographicApostrophe();
+ setTypographicApostrophe();
}
- this.setHistoricalAnnotations(oldAtr.getHistoricalAnnotations());
+ setHistoricalAnnotations(oldAtr.getHistoricalAnnotations());
addHistoricalAnnotations(oldAtr.toString(), ruleApplied);
}
@@ -147,14 +147,12 @@ public AnalyzedToken getAnalyzedToken(int idx) {
* @param posTag POS tag to look for
*/
public boolean hasPosTag(String posTag) {
- boolean found = false;
for (AnalyzedToken reading : anTokReadings) {
- found = posTag.equals(reading.getPOSTag());
- if (found) {
- break;
+ if (posTag.equals(reading.getPOSTag())) {
+ return true;
}
}
- return found;
+ return false;
}
/**
@@ -162,14 +160,12 @@ public boolean hasPosTag(String posTag) {
* @param posTag POS tag and lemma to look for
*/
public boolean hasPosTagAndLemma(String posTag, String lemma) {
- boolean found = false;
for (AnalyzedToken reading : anTokReadings) {
- found = posTag.equals(reading.getPOSTag()) && lemma.equals(reading.getLemma());
- if (found) {
- break;
+ if (posTag.equals(reading.getPOSTag()) && lemma.equals(reading.getLemma())) {
+ return true;
}
}
- return found;
+ return false;
}
/**
@@ -187,11 +183,8 @@ public boolean hasReading() {
public boolean hasLemma(String lemma) {
boolean found = false;
for (AnalyzedToken reading : anTokReadings) {
- if (reading.getLemma() != null) {
- found = lemma.equals(reading.getLemma());
- if (found) {
- break;
- }
+ if (reading.getLemma() != null && lemma.equals(reading.getLemma())) {
+ return true;
}
}
return found;
@@ -202,16 +195,14 @@ public boolean hasLemma(String lemma) {
* @param lemmas lemmas to look for
*/
public boolean hasAnyLemma(String... lemmas) {
- boolean found = false;
for(String lemma : lemmas) {
for (AnalyzedToken reading : anTokReadings) {
- found = lemma.equals(reading.getLemma());
- if (found) {
- return found;
+ if (lemma.equals(reading.getLemma())) {
+ return true;
}
}
}
- return found;
+ return false;
}
/**
@@ -221,16 +212,12 @@ public boolean hasAnyLemma(String... lemmas) {
* @since 1.8
*/
public boolean hasPartialPosTag(String posTag) {
- boolean found = false;
for (AnalyzedToken reading : anTokReadings) {
- if (reading.getPOSTag() != null) {
- found = reading.getPOSTag().contains(posTag);
- if (found) {
- break;
- }
+ if (reading.getPOSTag() != null && reading.getPOSTag().contains(posTag)) {
+ return true;
}
}
- return found;
+ return false;
}
/**
@@ -255,67 +242,57 @@ public boolean hasAnyPartialPosTag(String... posTags) {
* @since 4.0
*/
public boolean hasPosTagStartingWith(String posTag) {
- boolean found = false;
for (AnalyzedToken reading : anTokReadings) {
- if (reading.getPOSTag() != null) {
- found = reading.getPOSTag().startsWith(posTag);
- if (found) {
- break;
- }
+ if (reading.getPOSTag() != null && reading.getPOSTag().startsWith(posTag)) {
+ return true;
}
}
- return found;
+ return false;
}
/**
* Checks if at least one of the readings matches a given POS tag regex.
- *
* @param posTagRegex POS tag regular expression to look for
* @since 2.9
*/
public boolean matchesPosTagRegex(String posTagRegex) {
Pattern pattern = Pattern.compile(posTagRegex);
- boolean found = false;
+ return matchesPosTagRegex(pattern);
+ }
+
+ /**
+ * Checks if at least one of the readings matches a given POS tag pattern.
+ * @since 6.4
+ */
+ public boolean matchesPosTagRegex(Pattern pattern) {
for (AnalyzedToken reading : anTokReadings) {
- if (reading.getPOSTag() != null) {
- found = pattern.matcher(reading.getPOSTag()).matches();
- if (found) {
- break;
- }
+ if (reading.getPOSTag() != null && pattern.matcher(reading.getPOSTag()).matches()) {
+ return true;
}
}
- return found;
+ return false;
}
-
+
public boolean matchesChunkRegex(String chunkRegex) {
Pattern pattern = Pattern.compile(chunkRegex);
- boolean found = false;
- for ( ChunkTag chunk : getChunkTags()) {
- if (chunk != null) {
- found = pattern.matcher(chunk.getChunkTag()).matches();
- if (found) {
- break;
- }
+ for (ChunkTag chunk : getChunkTags()) {
+ if (chunk != null && pattern.matcher(chunk.getChunkTag()).matches()) {
+ return true;
}
}
- return found;
+ return false;
}
/**
* Returns the first reading that matches a given POS tag regex.
- *
* @param posTagRegex POS tag regular expression to look for
* @since 5.5
*/
public AnalyzedToken readingWithTagRegex(String posTagRegex) {
Pattern pattern = Pattern.compile(posTagRegex);
- boolean found = false;
for (AnalyzedToken reading : anTokReadings) {
- if (reading.getPOSTag() != null) {
- found = pattern.matcher(reading.getPOSTag()).matches();
- if (found) {
- return reading;
- }
+ if (reading.getPOSTag() != null && pattern.matcher(reading.getPOSTag()).matches()) {
+ return reading;
}
}
return null;
@@ -326,13 +303,9 @@ public AnalyzedToken readingWithTagRegex(String posTagRegex) {
* @since 5.8
*/
public AnalyzedToken readingWithLemma(String lemma) {
- boolean found;
for (AnalyzedToken reading : anTokReadings) {
- if (reading.getLemma() != null) {
- found = reading.getLemma().equals(lemma);
- if (found) {
- return reading;
- }
+ if (reading.getLemma() != null && reading.getLemma().equals(lemma)) {
+ return reading;
}
}
return null;
@@ -619,16 +592,17 @@ public String getHistoricalAnnotations() {
* @param historicalAnnotations the historicalAnnotations to set
*/
private void setHistoricalAnnotations(String historicalAnnotations) {
- this.historicalAnnotations = historicalAnnotations;
+ if (GlobalConfig.isVerbose()) {
+ this.historicalAnnotations = historicalAnnotations;
+ }
}
private void addHistoricalAnnotations(String oldValue, String ruleApplied) {
- if (!ruleApplied.isEmpty()) {
+ if (!ruleApplied.isEmpty() && GlobalConfig.isVerbose()) {
this.historicalAnnotations = this.getHistoricalAnnotations() + "\n" + ruleApplied + ": " + oldValue + " -> "
+ this;
}
}
-
/**
* @since 2.3
diff --git a/languagetool-core/src/main/java/org/languagetool/CheckResults.java b/languagetool-core/src/main/java/org/languagetool/CheckResults.java
index 7afff5bd6f855..606ca6643bf57 100644
--- a/languagetool-core/src/main/java/org/languagetool/CheckResults.java
+++ b/languagetool-core/src/main/java/org/languagetool/CheckResults.java
@@ -18,46 +18,48 @@
*/
package org.languagetool;
+import lombok.Getter;
import org.jetbrains.annotations.NotNull;
import org.languagetool.rules.RuleMatch;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Objects;
+import java.util.*;
+import java.util.stream.Collectors;
/**
* @since 5.3
*/
public class CheckResults {
+ @Getter
private List ruleMatches;
- private List ignoredRanges;
+ @Getter
+ private final List ignoredRanges;
+ @Getter
+ private final List extendedSentenceRanges;
private final List sentenceRanges = new ArrayList<>();
- public CheckResults(List ruleMatches, List ignoredRanges) {
- this.ruleMatches = Objects.requireNonNull(ruleMatches);
- this.ignoredRanges = Objects.requireNonNull(ignoredRanges);
- }
- public List getIgnoredRanges() {
- return ignoredRanges;
+ public CheckResults(List ruleMatches, List ignoredRanges) {
+ this(ruleMatches, ignoredRanges, Collections.emptyList());
}
- public List getRuleMatches() {
- return ruleMatches;
+ public CheckResults(List ruleMatches, List ignoredRanges, List extendedSentenceRanges) {
+ this.ruleMatches = Objects.requireNonNull(ruleMatches);
+ this.ignoredRanges = Objects.requireNonNull(ignoredRanges);
+ this.extendedSentenceRanges = Objects.requireNonNull(extendedSentenceRanges.stream().sorted().collect(Collectors.toList()));
+    //TODO: use this later, when we are sure the sentenceRanges (from extendedSentenceRange) are correct.
+    // Right now the sentenceRanges are calculated differently from those in extendedSentenceRange.
+ // extendedSentenceRanges.forEach(extendedSentenceRange -> this.sentenceRanges.add(new SentenceRange(extendedSentenceRange.getFromPos(), extendedSentenceRange.getToPos())));
}
@NotNull
public List getSentenceRanges() {
- return sentenceRanges;
+ return Collections.unmodifiableList(this.sentenceRanges);
}
+
public void addSentenceRanges(List sentenceRanges) {
this.sentenceRanges.addAll(sentenceRanges);
}
-
- public void setIgnoredRanges(List ignoredRanges) {
- this.ignoredRanges = Objects.requireNonNull(ignoredRanges);
- }
public void setRuleMatches(List ruleMatches) {
this.ruleMatches = Objects.requireNonNull(ruleMatches);
diff --git a/languagetool-core/src/main/java/org/languagetool/DynamicLanguage.java b/languagetool-core/src/main/java/org/languagetool/DynamicLanguage.java
index 548de1083c89a..8b742f86b379e 100644
--- a/languagetool-core/src/main/java/org/languagetool/DynamicLanguage.java
+++ b/languagetool-core/src/main/java/org/languagetool/DynamicLanguage.java
@@ -25,9 +25,12 @@
import java.util.Collections;
import java.util.List;
import java.util.Objects;
+import java.util.regex.Pattern;
abstract class DynamicLanguage extends Language {
+ private static final Pattern DASH = Pattern.compile("-.*");
+
protected final String name;
protected final String code;
protected final File dictPath;
@@ -40,7 +43,7 @@ abstract class DynamicLanguage extends Language {
@Override
public String getShortCode() {
- return code.replaceFirst("-.*", "");
+ return DASH.matcher(code).replaceFirst("");
}
@Override
diff --git a/languagetool-core/src/main/java/org/languagetool/ExtendedSentenceRange.java b/languagetool-core/src/main/java/org/languagetool/ExtendedSentenceRange.java
new file mode 100644
index 0000000000000..a41ea88600969
--- /dev/null
+++ b/languagetool-core/src/main/java/org/languagetool/ExtendedSentenceRange.java
@@ -0,0 +1,76 @@
+/*
+ * LanguageTool, a natural language style checker
+ * Copyright (c) 2023. Stefan Viol (https://stevio.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+package org.languagetool;
+
+import lombok.Getter;
+import org.jetbrains.annotations.NotNull;
+
+import java.util.*;
+
+public final class ExtendedSentenceRange implements Comparable {
+
+ @Getter
+ private final int fromPos;
+ @Getter
+ private final int toPos;
+ @Getter
+  private final Map languageConfidenceRates; // languageCode -> 0-1 confidence rate from LanguageDetectionService
+
+ ExtendedSentenceRange(int fromPos, int toPos, String languageCode) {
+ this(fromPos, toPos, Collections.singletonMap(languageCode, 1.0f));
+ }
+
+ ExtendedSentenceRange(int fromPos, int toPos, @NotNull Map languageConfidenceRates) {
+ this.fromPos = fromPos;
+ this.toPos = toPos;
+ this.languageConfidenceRates = new LinkedHashMap<>(languageConfidenceRates);
+ }
+
+ public void updateLanguageConfidenceRates(@NotNull Map languageConfidenceRates) {
+ this.languageConfidenceRates.clear();
+ this.languageConfidenceRates.putAll(languageConfidenceRates);
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (o == null || getClass() != o.getClass()) return false;
+ ExtendedSentenceRange extendedSentenceRange = (ExtendedSentenceRange) o;
+ return fromPos == extendedSentenceRange.fromPos && toPos == extendedSentenceRange.toPos;
+ }
+
+ @Override
+ public int hashCode() {
+ int result = fromPos;
+ result = 31 * result + toPos;
+ return result;
+ }
+
+ @Override
+ public String toString() {
+ return fromPos + "-" + toPos + ":" + languageConfidenceRates;
+ }
+
+ @Override
+ public int compareTo(@NotNull ExtendedSentenceRange o) {
+ return Integer.compare(this.fromPos, o.fromPos);
+ }
+}
diff --git a/languagetool-core/src/main/java/org/languagetool/GlobalConfig.java b/languagetool-core/src/main/java/org/languagetool/GlobalConfig.java
index 0389d676cdea2..0833525d82eca 100644
--- a/languagetool-core/src/main/java/org/languagetool/GlobalConfig.java
+++ b/languagetool-core/src/main/java/org/languagetool/GlobalConfig.java
@@ -34,6 +34,19 @@ public class GlobalConfig {
private File beolingusFile;
private String nerUrl;
+ private static boolean verbose = false;
+
+ /**
+ * @return whether we need to track additional information like e.g. the disambiguation log to show in verbose mode
+ */
+ public static boolean isVerbose() {
+ return verbose;
+ }
+
+ public static void setVerbose(boolean verbose) {
+ GlobalConfig.verbose = verbose;
+ }
+
public void setGrammalecteServer(String serverUrl) {
grammalecteServer = serverUrl;
}
@@ -79,6 +92,7 @@ public String getNerUrl() {
return nerUrl;
}
+
@Override
public boolean equals(Object o) {
if (this == o) return true;
diff --git a/languagetool-core/src/main/java/org/languagetool/JLanguageTool.java b/languagetool-core/src/main/java/org/languagetool/JLanguageTool.java
index 8f82e614d62f6..3d03bead69102 100644
--- a/languagetool-core/src/main/java/org/languagetool/JLanguageTool.java
+++ b/languagetool-core/src/main/java/org/languagetool/JLanguageTool.java
@@ -74,10 +74,12 @@
* @see MultiThreadedJLanguageTool
*/
public class JLanguageTool {
+
private static final Logger logger = LoggerFactory.getLogger(JLanguageTool.class);
+ private static final Pattern ZERO_WIDTH_NBSP = Pattern.compile("(?<=\uFEFF)|(?=\uFEFF)");
/** LanguageTool version as a string like {@code 2.3} or {@code 2.4-SNAPSHOT}. */
- public static final String VERSION = "6.3-SNAPSHOT";
+ public static final String VERSION = "6.4-SNAPSHOT";
/** LanguageTool build date and time like {@code 2013-10-17 16:10} or {@code null} if not run from JAR. */
@Nullable public static final String BUILD_DATE = getBuildDate();
/**
@@ -518,6 +520,7 @@ private List getAllBuiltinRules(Language language, ResourceBundle messages
*/
public void setOutput(PrintStream printStream) {
this.printStream = printStream;
+ GlobalConfig.setVerbose(printStream != null);
}
/**
@@ -749,7 +752,7 @@ public void disableCategory(CategoryId id) {
*
* @param id the id of the category to check - no error will be thrown if the id does not exist
* @return true if this category is explicitly disabled.
- * @see #disableCategory(org.languagetool.rules.CategoryId)
+ * @see #disableCategory(CategoryId)
* @since 3.5
*/
public boolean isCategoryDisabled(CategoryId id) {
@@ -787,7 +790,7 @@ public void enableRule(String ruleId) {
* Enable all rules of the given category so the check methods like {@link #check(String)} will use it.
* This will not throw an exception if the given rule id doesn't exist.
*
- * @see #disableCategory(org.languagetool.rules.CategoryId)
+ * @see #disableCategory(CategoryId)
* @since 3.3
*/
public void enableRuleCategory(CategoryId id) {
@@ -927,7 +930,8 @@ public CheckResults check2(AnnotatedText annotatedText, boolean tokenizeText, Pa
List sentences = getSentences(annotatedText, tokenizeText);
List analyzedSentences = analyzeSentences(sentences);
CheckResults checkResults = checkInternal(annotatedText, paraMode, listener, mode, level, toneTags, textSessionID, sentences, analyzedSentences);
- checkResults.addSentenceRanges(SentenceRange.getRangesFromSentences(annotatedText, sentences));
+ List sentenceRanges = SentenceRange.getRangesFromSentences(annotatedText, sentences);
+ checkResults.addSentenceRanges(sentenceRanges);
return checkResults;
}
@@ -951,7 +955,7 @@ private AnnotatedText cleanText(AnnotatedText annotatedText) {
if (part.getType() == TextPart.Type.TEXT) {
String byteOrderMark = "\uFEFF"; // BOM or zero-width non-breaking space
// split by byteOrderMark and let the delimiter also be part of the array
- String[] split = part.getPart().split("(?<=\uFEFF)|(?=\uFEFF)");
+ String[] split = ZERO_WIDTH_NBSP.split(part.getPart());
for (String text : split) {
if ("\uFEFF".equals(text)) {
atb.addMarkup(byteOrderMark);
@@ -1055,8 +1059,7 @@ protected CheckResults checkInternal(AnnotatedText annotatedText, ParagraphHandl
} catch (Exception e) {
throw new RuntimeException(e);
}
-
- return new CheckResults(ruleMatches, res.getIgnoredRanges());
+ return new CheckResults(ruleMatches, res.getIgnoredRanges(), res.getExtendedSentenceRanges());
}
private List filterMatches(AnnotatedText annotatedText, RuleSet rules, List ruleMatches) {
@@ -1445,12 +1448,12 @@ private List checkAnalyzedSentence(ParagraphHandling paraMode, List 0 && errorsPerWord > maxErrorsPerWordRate && wordCounter > 25) {
errorRateLog.forEach(e -> logger.info(LoggingTools.BAD_REQUEST, e));
- logger.info(LoggingTools.BAD_REQUEST, "ErrorRateTooHigh is reached by a single sentence after rule: " + rule.getFullId() +
- " the whole text contains " + wordCounter + " words " +
- " this sentence has " + sentenceMatches.size() + " matches");
- throw new ErrorRateTooHighException("ErrorRateTooHigh is reached by a single sentence after rule: " + rule.getFullId() +
- " the whole text contains " + wordCounter + " words" +
- " this sentence has " + sentenceMatches.size() + " matches");
+            logger.info(LoggingTools.BAD_REQUEST, "ErrorRateTooHigh is reached by a single sentence after rule: " + rule.getFullId() + ". " +
+              "The whole text contains " + wordCounter + " words " +
+              "and this sentence has " + sentenceMatches.size() + " matches.");
+            throw new ErrorRateTooHighException("ErrorRateTooHigh is reached by a single sentence after rule: " + rule.getFullId() + ". " +
+              "The whole text contains " + wordCounter + " words " +
+              "and this sentence has " + sentenceMatches.size() + " matches.");
}
}
}
@@ -1936,15 +1939,18 @@ class TextCheckCallable implements Callable {
public CheckResults call() throws Exception {
List ruleMatches = new ArrayList<>();
List ignoreRanges = new ArrayList<>();
+ List extendedSentenceRanges = new ArrayList<>();
if (mode == Mode.ALL) {
ruleMatches.addAll(getTextLevelRuleMatches());
CheckResults otherRuleMatches = getOtherRuleMatches(toneTags);
ruleMatches.addAll(otherRuleMatches.getRuleMatches());
ignoreRanges.addAll(otherRuleMatches.getIgnoredRanges());
+ extendedSentenceRanges.addAll(otherRuleMatches.getExtendedSentenceRanges());
} else if (mode == Mode.ALL_BUT_TEXTLEVEL_ONLY) {
CheckResults otherRuleMatches = getOtherRuleMatches(toneTags);
ruleMatches.addAll(otherRuleMatches.getRuleMatches());
ignoreRanges.addAll(otherRuleMatches.getIgnoredRanges());
+ extendedSentenceRanges.addAll(otherRuleMatches.getExtendedSentenceRanges());
} else if (mode == Mode.TEXTLEVEL_ONLY) {
ruleMatches.addAll(getTextLevelRuleMatches());
} else {
@@ -1952,7 +1958,7 @@ public CheckResults call() throws Exception {
}
// can't call applyCustomRuleFilters here, done in performCheck ->
// should run just once w/ complete list of matches
- return new CheckResults(ruleMatches, ignoreRanges);
+ return new CheckResults(ruleMatches, ignoreRanges, extendedSentenceRanges);
}
private List getTextLevelRuleMatches() throws IOException {
@@ -1969,15 +1975,21 @@ private List getTextLevelRuleMatches() throws IOException {
RuleMatch[] matches = ((TextLevelRule) rule).match(analyzedSentences, annotatedText);
List adaptedMatches = new ArrayList<>();
for (RuleMatch match : matches) {
- LineColumnPosition from = findLineColumn(match.getFromPos());
- LineColumnPosition to = findLineColumn(match.getToPos());
+ LineColumnPosition from;
+ LineColumnPosition to;
+ try {
+ from = findLineColumn(match.getFromPos());
+ to = findLineColumn(match.getToPos());
+ } catch (RuntimeException e) {
+ throw new RuntimeException("Getting line/column positions failed for match " + match + " Sentence: " + match.getSentence().getText(), e);
+ }
int newFromPos;
int newToPos;
try {
newFromPos = annotatedText.getOriginalTextPositionFor(match.getFromPos(), false);
newToPos = annotatedText.getOriginalTextPositionFor(match.getToPos() - 1, true) + 1;
} catch (RuntimeException e) {
- throw new RuntimeException("Getting positions failed for match " + match, e);
+ throw new RuntimeException("Getting positions failed for match " + match + " Sentence: " + match.getSentence().getText(), e);
}
RuleMatch newMatch = new RuleMatch(match);
newMatch.setOffsetPosition(newFromPos, newToPos);
@@ -2001,7 +2013,9 @@ private List getTextLevelRuleMatches() throws IOException {
private CheckResults getOtherRuleMatches(Set toneTags) {
List ruleMatches = new ArrayList<>();
- List ignoreRanges = new ArrayList<>();
+ List ignoreRanges = new ArrayList<>(); //TODO: remove later
+ List extendedSentenceRanges = new ArrayList<>();
+
int textWordCounter = sentences.stream().map(sentenceData -> sentenceData.wordCount).reduce(0, Integer::sum);
int wordCounter = 0;
float tmpErrorsPerWord = 0.0f;
@@ -2009,6 +2023,8 @@ private CheckResults getOtherRuleMatches(Set toneTags) {
for (int i = 0, sentencesSize = sentences.size(); i < sentencesSize; i++) {
SentenceData sentence = sentences.get(i);
wordCounter += sentence.wordCount;
+ ExtendedSentenceRange extendedSentenceRange = new ExtendedSentenceRange(sentence.startOffset, sentence.startOffset + sentence.text.trim().length(), language.getShortCode());
+ extendedSentenceRanges.add(extendedSentenceRange);
try {
//comment in to trigger an exception via input text:
//if (analyzedSentence.getText().contains("fakecrash")) {
@@ -2023,7 +2039,6 @@ private CheckResults getOtherRuleMatches(Set toneTags) {
sentenceMatches = cache.getIfPresent(cacheKey);
}
if (sentenceMatches == null) {
-
List rules = new ArrayList<>(this.rules.rulesForSentence(sentence.analyzed));
rules.addAll(userConfig.getRules());
sentenceMatches = checkAnalyzedSentence(paraMode, rules, sentence.analyzed, checkRemoteRules, textWordCounter);
@@ -2037,11 +2052,13 @@ private CheckResults getOtherRuleMatches(Set toneTags) {
}
for (RuleMatch elem : sentenceMatches) {
RuleMatch thisMatch = adjustRuleMatchPos(elem, sentence.startOffset, sentence.startColumn, sentence.startLine, sentence.text, annotatedText);
- if (elem.getErrorLimitLang() != null) {
- Range ignoreRange = new Range(sentence.startOffset, sentence.startOffset + sentence.text.length(), elem.getErrorLimitLang());
+ if (!elem.getNewLanguageMatches().isEmpty()) {
+ //TODO: remove after the addon is updated
+ Range ignoreRange = new Range(sentence.startOffset, sentence.startOffset + sentence.text.length(), elem.getNewLanguageMatches().entrySet().iterator().next().getKey());
if (!ignoreRanges.contains(ignoreRange)) {
ignoreRanges.add(ignoreRange);
}
+ extendedSentenceRange.updateLanguageConfidenceRates(elem.getNewLanguageMatches());
}
ruleMatches.add(thisMatch);
if (listener != null) {
@@ -2072,7 +2089,7 @@ private CheckResults getOtherRuleMatches(Set toneTags) {
+ StringUtils.abbreviate(sentence.analyzed.toTextString(), 500) + "", e);
}
}
- return new CheckResults(ruleMatches, ignoreRanges);
+ return new CheckResults(ruleMatches, ignoreRanges, extendedSentenceRanges);
}
private LineColumnPosition findLineColumn(int offset) {
diff --git a/languagetool-core/src/main/java/org/languagetool/Language.java b/languagetool-core/src/main/java/org/languagetool/Language.java
index d63394fb94675..46bb9c46ebc87 100644
--- a/languagetool-core/src/main/java/org/languagetool/Language.java
+++ b/languagetool-core/src/main/java/org/languagetool/Language.java
@@ -32,6 +32,7 @@
import org.languagetool.rules.patterns.Unifier;
import org.languagetool.rules.patterns.UnifierConfiguration;
import org.languagetool.rules.spelling.SpellingCheckRule;
+import org.languagetool.rules.spelling.multitoken.MultitokenSpeller;
import org.languagetool.synthesis.Synthesizer;
import org.languagetool.tagging.Tagger;
import org.languagetool.tagging.disambiguation.Disambiguator;
@@ -52,6 +53,8 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import static java.util.regex.Pattern.*;
+
/**
* Base class for any supported language (English, German, etc). Language classes
* are detected at runtime by searching the classpath for files named
@@ -70,16 +73,31 @@ public abstract class Language {
private static final Tagger DEMO_TAGGER = new DemoTagger();
private static final SentenceTokenizer SENTENCE_TOKENIZER = new SimpleSentenceTokenizer();
private static final WordTokenizer WORD_TOKENIZER = new WordTokenizer();
- private static final Pattern INSIDE_SUGGESTION = Pattern.compile("(.+?)");
- private static final Pattern APOSTROPHE = Pattern.compile("([\\p{L}\\d-])'([\\p{L}«])",
- Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
+ private static final Pattern INSIDE_SUGGESTION = compile("(.+?)");
+ private static final Pattern APOSTROPHE = compile("([\\p{L}\\d-])'([\\p{L}«])",
+ CASE_INSENSITIVE | UNICODE_CASE);
+
+ private static final Pattern SUGGESTION_OPEN_TAG = compile("");
+ private static final Pattern SUGGESTION_CLOSE_TAG = compile("");
+
+ private static final Pattern ELLIPSIS = compile("\\.\\.\\.");
+ private static final Pattern NBSPACE1 = compile("\\b([a-zA-Z]\\.) ([a-zA-Z]\\.)");
+ private static final Pattern NBSPACE2 = compile("\\b([a-zA-Z]\\.) ");
private static final Map, JLanguageTool> languagetoolInstances = new ConcurrentHashMap<>();
+ private static final Pattern SINGLE_QUOTE_PATTERN = compile("'");
+ private static final Pattern QUOTED_CHAR_PATTERN = compile(" '(.)'");
+ private static final Pattern TYPOGRAPHY_PATTERN_1 = compile("([\\u202f\\u00a0 «\"\\(])'");
+ private static final Pattern TYPOGRAPHY_PATTERN_2 = compile("'([\u202f\u00a0 !\\?,\\.;:\"\\)])");
+ private static final Pattern TYPOGRAPHY_PATTERN_3 = compile("‘s\\b([^’])");
+ private static final Pattern DOUBLE_QUOTE_PATTERN = compile("\"");
+ private static final Pattern TYPOGRAPHY_PATTERN_4 = compile("([ \\(])\"");
+ private static final Pattern TYPOGRAPHY_PATTERN_5 = compile("\"([\\u202f\\u00a0 !\\?,\\.;:\\)])");
private final UnifierConfiguration unifierConfig = new UnifierConfiguration();
private final UnifierConfiguration disambiguationUnifierConfig = new UnifierConfiguration();
- private final Pattern ignoredCharactersRegex = Pattern.compile("[\u00AD]"); // soft hyphen
+ private final Pattern ignoredCharactersRegex = compile("[\u00AD]"); // soft hyphen
private List patternRules;
private final AtomicBoolean noLmWarningPrinted = new AtomicBoolean();
@@ -92,6 +110,11 @@ public abstract class Language {
private Chunker postDisambiguationChunker;
private Synthesizer synthesizer;
+ private String shortCodeWithCountryAndVariant;
+
+ protected Language() {
+ }
+
/**
* Get this language's character code, e.g. en
for English.
* For most languages this is a two-letter code according to ISO 639-1,
@@ -213,7 +236,7 @@ public List getRelevantLanguageModelCapableRules(ResourceBundle messages,
}
/**
- * For rules that depend on a remote server; based on {@link org.languagetool.rules.RemoteRule}
+ * For rules that depend on a remote server; based on {@link RemoteRule}
* will be executed asynchronously, with timeout, retries, etc. as configured
* Can return non-remote rules (e.g. if configuration missing, or for A/B tests), will be executed normally
*/
@@ -243,7 +266,7 @@ public List getRelevantRemoteRules(ResourceBundle messageBundle, List", getOpeningDoubleQuote()).replaceAll("", getClosingDoubleQuote());
+ return SUGGESTION_CLOSE_TAG.matcher(
+ SUGGESTION_OPEN_TAG.matcher(input).replaceAll(getOpeningDoubleQuote())
+ ).replaceAll(getClosingDoubleQuote());
}
String output = input;
@@ -862,49 +903,51 @@ public String toAdvancedTypography(String input) {
while (m.find(offset)) {
String group = m.group(1);
preservedStrings.add(group);
- output = output.replaceFirst("" + Pattern.quote(group) + "", "\\\\" + String.valueOf(countPreserved));
+ output = output.replaceFirst("" + quote(group) + "", "\\\\" + countPreserved);
countPreserved++;
offset = m.end();
}
// Ellipsis (for all languages?)
- output = output.replaceAll("\\.\\.\\.", "…");
+ output = ELLIPSIS.matcher(output).replaceAll("…");
// non-breaking space
- output = output.replaceAll("\\b([a-zA-Z]\\.) ([a-zA-Z]\\.)", "$1\u00a0$2");
- output = output.replaceAll("\\b([a-zA-Z]\\.) ", "$1\u00a0");
+ output = NBSPACE1.matcher(output).replaceAll("$1\u00a0$2");
+ output = NBSPACE2.matcher(output).replaceAll("$1\u00a0");
Matcher matcher = APOSTROPHE.matcher(output);
output = matcher.replaceAll("$1’$2");
// single quotes
if (output.startsWith("'")) {
- output = output.replaceFirst("'", getOpeningSingleQuote());
+ output = SINGLE_QUOTE_PATTERN.matcher(output).replaceFirst(getOpeningSingleQuote());
}
if (output.endsWith("'")) {
output = output.substring(0, output.length() - 1 ) + getClosingSingleQuote();
}
- output = output.replaceAll(" '(.)'", " " + getOpeningSingleQuote()+"$1"+getClosingSingleQuote()); //exception single character
- output = output.replaceAll("([\\u202f\\u00a0 «\"\\(])'", "$1" + getOpeningSingleQuote());
- output = output.replaceAll("'([\u202f\u00a0 !\\?,\\.;:\"\\)])", getClosingSingleQuote() + "$1");
- output = output.replaceAll("‘s\\b([^’])", "’s$1"); // exception genitive
+ output = QUOTED_CHAR_PATTERN.matcher(output).replaceAll(" " + getOpeningSingleQuote() + "$1" + getClosingSingleQuote()); //exception single character
+ output = TYPOGRAPHY_PATTERN_1.matcher(output).replaceAll("$1" + getOpeningSingleQuote());
+ output = TYPOGRAPHY_PATTERN_2.matcher(output).replaceAll(getClosingSingleQuote() + "$1");
+ output = TYPOGRAPHY_PATTERN_3.matcher(output).replaceAll("’s$1"); // exception genitive
// double quotes
if (output.startsWith("\"")) {
- output = output.replaceFirst("\"", getOpeningDoubleQuote());
+ output = DOUBLE_QUOTE_PATTERN.matcher(output).replaceFirst(getOpeningDoubleQuote());
}
if (output.endsWith("\"")) {
output = output.substring(0, output.length() - 1 ) + getClosingDoubleQuote();
}
- output = output.replaceAll("([ \\(])\"", "$1" + getOpeningDoubleQuote());
- output = output.replaceAll("\"([\\u202f\\u00a0 !\\?,\\.;:\\)])", getClosingDoubleQuote() + "$1");
+ output = TYPOGRAPHY_PATTERN_4.matcher(output).replaceAll("$1" + getOpeningDoubleQuote());
+ output = TYPOGRAPHY_PATTERN_5.matcher(output).replaceAll(getClosingDoubleQuote() + "$1");
//restore suggestions
for (int i = 0; i < preservedStrings.size(); i++) {
output = output.replaceFirst("\\\\" + i, getOpeningDoubleQuote() + Matcher.quoteReplacement(preservedStrings.get(i)) + getClosingDoubleQuote() );
}
-
- return output.replaceAll("", getOpeningDoubleQuote()).replaceAll("", getClosingDoubleQuote());
+
+ return SUGGESTION_CLOSE_TAG.matcher(
+ SUGGESTION_OPEN_TAG.matcher(output).replaceAll(getOpeningDoubleQuote())
+ ).replaceAll(getClosingDoubleQuote());
}
/**
@@ -955,6 +998,10 @@ public RuleMatch adjustMatch(RuleMatch rm, List features) {
return rm;
}
+ public String prepareLineForSpeller(String s) {
+ return s;
+ }
+
/**
* This function is called by JLanguageTool before CleanOverlappingFilter removes overlapping ruleMatches
*
@@ -966,4 +1013,8 @@ public RuleMatch adjustMatch(RuleMatch rm, List features) {
public List mergeSuggestions(List ruleMatches, AnnotatedText text, Set enabledRules) {
return ruleMatches;
}
+
+ public MultitokenSpeller getMultitokenSpeller() {
+ return null;
+ }
}
diff --git a/languagetool-core/src/main/java/org/languagetool/Languages.java b/languagetool-core/src/main/java/org/languagetool/Languages.java
index 4567384a3996a..f321658e8e211 100644
--- a/languagetool-core/src/main/java/org/languagetool/Languages.java
+++ b/languagetool-core/src/main/java/org/languagetool/Languages.java
@@ -44,7 +44,8 @@ public final class Languages {
private static final List dynLanguages = new ArrayList<>();
- private static List languages;
+ private static List staticAndDynamicLanguages;
+ private static List staticAndDynamicLanguagesImmutable;
private Languages() {
}
@@ -62,6 +63,7 @@ public static Language addLanguage(String name, String code, File dictPath) {
throw new RuntimeException("Please specify a dictPath that ends in '.dict' (Morfologik binary dictionary) or '.dic' (Hunspell dictionary): " + dictPath);
}
dynLanguages.add(lang);
+ staticAndDynamicLanguages.add(lang);
return lang;
}
@@ -89,14 +91,15 @@ public static List get() {
* @return an unmodifiable list
*/
public static List getWithDemoLanguage() {
- return Collections.unmodifiableList(getStaticAndDynamicLanguages());
+ return getStaticAndDynamicLanguages();
}
private static List getStaticAndDynamicLanguages() {
- if (languages == null) {
- languages = getAllLanguages();
+ if (staticAndDynamicLanguages == null) {
+ staticAndDynamicLanguages = new ArrayList<>(getAllLanguages());
+ staticAndDynamicLanguagesImmutable = Collections.unmodifiableList(staticAndDynamicLanguages);
}
- return Stream.concat(languages.stream(), dynLanguages.stream()).collect(Collectors.toList());
+ return staticAndDynamicLanguagesImmutable;
}
private static List getAllLanguages() {
@@ -178,6 +181,7 @@ public static Language getOrAddLanguageByClassName(String className) {
Constructor> constructor = aClass.getConstructor();
Language language = (Language) constructor.newInstance();
dynLanguages.add(language);
+ staticAndDynamicLanguages.add(language);
return language;
} catch (ClassNotFoundException e) {
throw new RuntimeException("Class '" + className + " could not be found in classpath", e);
diff --git a/languagetool-core/src/main/java/org/languagetool/Range.java b/languagetool-core/src/main/java/org/languagetool/Range.java
index 77dc5672b8489..d990c0d4dc029 100644
--- a/languagetool-core/src/main/java/org/languagetool/Range.java
+++ b/languagetool-core/src/main/java/org/languagetool/Range.java
@@ -18,8 +18,6 @@
*/
package org.languagetool;
-import lombok.Getter;
-
import java.util.Objects;
/**
diff --git a/languagetool-core/src/main/java/org/languagetool/SentenceRange.java b/languagetool-core/src/main/java/org/languagetool/SentenceRange.java
index 970ec3ec28193..aa45fdfc0dc28 100644
--- a/languagetool-core/src/main/java/org/languagetool/SentenceRange.java
+++ b/languagetool-core/src/main/java/org/languagetool/SentenceRange.java
@@ -18,17 +18,22 @@
*/
package org.languagetool;
+import org.jetbrains.annotations.NotNull;
import org.languagetool.markup.AnnotatedText;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
+import java.util.regex.Pattern;
/**
* A range in a text that makes up a sentence.
* @since 5.8
*/
-public class SentenceRange {
+public class SentenceRange implements Comparable<SentenceRange> {
+
+ private static final Pattern BEGINS_WITH_SPACE = Pattern.compile("^\\s*");
+ private static final Pattern ENDS_WITH_SPACE = Pattern.compile("\\s+$");
private final int fromPos;
private final int toPos;
@@ -41,7 +46,8 @@ public class SentenceRange {
public static List getRangesFromSentences(AnnotatedText annotatedText, List sentences) {
List sentenceRanges = new ArrayList<>();
int pos = 0;
- int diff = annotatedText.getTextWithMarkup().length() - annotatedText.getPlainText().length();
+ int markupTextLength = annotatedText.getTextWithMarkup().length();
+ int diff = markupTextLength - annotatedText.getPlainText().length();
for (String sentence : sentences) {
if (sentence.trim().isEmpty()) {
//No content no sentence
@@ -49,18 +55,18 @@ public static List getRangesFromSentences(AnnotatedText annotated
continue;
}
//trim whitespaces
- String sentenceNoBeginnWhitespace = sentence.replaceFirst("^\\s*", "");
- String sentenceNoEndWhitespace = sentence.replaceFirst("\\s++$", "");
+ String sentenceNoBeginWhitespace = BEGINS_WITH_SPACE.matcher(sentence).replaceFirst("");
+ String sentenceNoEndWhitespace = ENDS_WITH_SPACE.matcher(sentence).replaceFirst("");
//Get position without tailing and leading whitespace
- int fromPos = pos + (sentence.length() - sentenceNoBeginnWhitespace.length());
+ int fromPos = pos + (sentence.length() - sentenceNoBeginWhitespace.length());
int toPos = pos + sentenceNoEndWhitespace.length();
int fromPosOrig = fromPos + diff;
int toPosOrig = toPos + diff;
- if (fromPosOrig != annotatedText.getTextWithMarkup().length()) {
+ if (fromPosOrig != markupTextLength) {
fromPosOrig = annotatedText.getOriginalTextPositionFor(fromPos, false);
}
- if (toPosOrig != annotatedText.getTextWithMarkup().length()) {
+ if (toPosOrig != markupTextLength) {
toPosOrig = annotatedText.getOriginalTextPositionFor(toPos, true);
}
sentenceRanges.add(new SentenceRange(fromPosOrig, toPosOrig));
@@ -94,4 +100,9 @@ public boolean equals(Object o) {
public int hashCode() {
return Objects.hash(fromPos, toPos);
}
+
+ @Override
+ public int compareTo(@NotNull SentenceRange o) {
+ return Integer.compare(this.fromPos, o.fromPos);
+ }
}
diff --git a/languagetool-core/src/main/java/org/languagetool/XMLValidator.java b/languagetool-core/src/main/java/org/languagetool/XMLValidator.java
index 9905c6810f39b..3a111796e62ef 100644
--- a/languagetool-core/src/main/java/org/languagetool/XMLValidator.java
+++ b/languagetool-core/src/main/java/org/languagetool/XMLValidator.java
@@ -36,8 +36,6 @@
import java.io.*;
import java.net.URL;
import java.nio.charset.StandardCharsets;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
/**
* Validate XML files with a given DTD or XML Schema (XSD).
@@ -50,26 +48,6 @@ public XMLValidator() {
Tools.setPasswordAuthenticator();
}
- /**
- * Check some limits of our simplified XML output.
- */
- public void checkSimpleXMLString(String xml) throws IOException {
- Pattern pattern = Pattern.compile("()", Pattern.DOTALL|Pattern.MULTILINE);
- Matcher matcher = pattern.matcher(xml);
- int pos = 0;
- while (matcher.find(pos)) {
- String errorElement = matcher.group();
- pos = matcher.end();
- if (errorElement.contains("\n") || errorElement.contains("\r")) {
- throw new IOException(" may not contain line breaks");
- }
- char beforeError = xml.charAt(matcher.start()-1);
- if (beforeError != '\n' && beforeError != '\r') {
- throw new IOException("Each must start on a new line");
- }
- }
- }
-
/**
* Validate XML with the given DTD. Throws exception on error.
*/
diff --git a/languagetool-core/src/main/java/org/languagetool/language/identifier/DefaultLanguageIdentifier.java b/languagetool-core/src/main/java/org/languagetool/language/identifier/DefaultLanguageIdentifier.java
index 7509c962a9599..0f2c559bb6a02 100644
--- a/languagetool-core/src/main/java/org/languagetool/language/identifier/DefaultLanguageIdentifier.java
+++ b/languagetool-core/src/main/java/org/languagetool/language/identifier/DefaultLanguageIdentifier.java
@@ -26,6 +26,7 @@
import com.optimaize.langdetect.text.RemoveMinorityScriptsTextFilter;
import com.optimaize.langdetect.text.TextObjectFactory;
import com.optimaize.langdetect.text.TextObjectFactoryBuilder;
+import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.jetbrains.annotations.TestOnly;
import org.languagetool.DetectedLanguage;
@@ -145,7 +146,7 @@ public void setFastTextDetector(FastTextDetector fastTextDetector) {
public AtomicInteger getFasttextInitCounter() {
return fasttextInitCounter;
}
-
+
/**
* @since 5.2
*/
@@ -228,20 +229,26 @@ public DetectedLanguage detectLanguage(String cleanText, List noopLangsT
@Nullable
@Override
public DetectedLanguage detectLanguage(String cleanText, List noopLangsTmp, List preferredLangsTmp, boolean limitOnPreferredLangs) {
+ List detectedLanguageScores = getDetectedLanguageScores(cleanText, noopLangsTmp, preferredLangsTmp, limitOnPreferredLangs, 1);
+ return detectedLanguageScores.stream().findFirst().orElse(null);
+ }
+
+ @NotNull
+ @Override
+ public List getDetectedLanguageScores(String cleanText, List noopLangsTmp, List preferredLangsTmp, boolean limitOnPreferredLangs, int count) {
String text = cleanText;
ParsedLanguageLists parsedLanguageLists = prepareDetectLanguage(text, noopLangsTmp, preferredLangsTmp);
if (parsedLanguageLists == null) {
- return new DetectedLanguage(null, new NoopLanguage());
+ return Collections.singletonList(new DetectedLanguage(null, new NoopLanguage()));
}
List additionalLangs = parsedLanguageLists.getAdditionalLangs();
List preferredLangs = parsedLanguageLists.getPreferredLangs();
- Map.Entry result = null;
+ Map scores = null;
boolean fasttextFailed = false;
String source = "";
if (fastTextDetector != null || ngram != null) {
try {
- Map scores;
boolean usingFastText = false;
if ((text.length() <= SHORT_ALGO_THRESHOLD || fastTextDetector == null) && ngram != null) {
scores = ngram.detectLanguages(text.trim(), additionalLangs);
@@ -251,13 +258,13 @@ public DetectedLanguage detectLanguage(String cleanText, List noopLangsT
scores = fastTextDetector.runFasttext(text, additionalLangs);
source += "fasttext";
}
- result = getHighestScoringResult(scores);
/*if (result.getValue().floatValue() < THRESHOLD) {
System.out.println("FastText below threshold: " + result.getValue().floatValue() + " for " + cleanText.length() + " chars");
} else {
System.out.println("FastText above threshold: " + result.getValue().floatValue() + " for " + cleanText.length() + " chars");
}*/
- if ((usingFastText && result.getValue().floatValue() < FASTTEXT_CONFIDENCE_THRESHOLD) || result.getKey().equals("zz")) {
+ Map.Entry fasttextHighestScoringResult = getHighestScoringResult(scores);
+ if ((usingFastText && fasttextHighestScoringResult.getValue().floatValue() < FASTTEXT_CONFIDENCE_THRESHOLD) || fasttextHighestScoringResult.getKey().equals("zz")) {
//System.out.println(cleanText + " ->" + result.getValue().floatValue() + " " + result.getKey());
Map lang2Count = COMMON_WORDS_LANG_IDENTIFIER.getKnownWordsPerLanguage(text);
Set baseLangAlreadyHandled = new HashSet<>();
@@ -276,32 +283,19 @@ public DetectedLanguage detectLanguage(String cleanText, List noopLangsT
}
}
source += "+commonwords";
- result = getHighestScoringResult(scores);
}
if (preferredLangs.contains("no") && !preferredLangs.contains("da")) {
// Special case, as Norwegian easily gets detected as Danish (https://github.com/languagetool-org/languagetool/issues/5520).
scores.keySet().removeIf(k -> k.equals("da"));
- result = getHighestScoringResult(scores);
}
if (!preferredLangs.isEmpty() && (text.length() <= CONSIDER_ONLY_PREFERRED_THRESHOLD || limitOnPreferredLangs)) {
- //System.out.println("remove? " + preferredLangs + " <-> " + scores);
boolean wasRemoved = scores.keySet().removeIf(k -> !preferredLangs.contains(k));
if (wasRemoved && scores.isEmpty() && limitOnPreferredLangs) {
//TODO: just to see how often we would return no results because of that parameter -> remove later
logger.warn("No language detected for text after remove all not preferred languages from score.");
}
- //System.out.println("-> " + b + " ==> " + scores);
- result = getHighestScoringResult(scores);
- //add login was wäre wenn ansonsten hier so lassen
source += "+prefLang(forced: " + limitOnPreferredLangs + ")";
}
- // Calculate a trivial confidence value because fasttext's confidence is often
- // wrong for short cleanText (e.g. 0.99 for a test that's misclassified). Don't
- // use 1.0 because we can never be totally sure...
- double newScore = 0.99 / (30.0 / Math.min(text.length(), 30));
- //System.out.println("fasttext : " + result);
- //System.out.println("newScore : " + newScore);
- result = new AbstractMap.SimpleImmutableEntry<>(result.getKey(), newScore);
} catch (FastTextDetector.FastTextException e) {
if (e.isDisabled()) {
fasttextFailed = true;
@@ -318,22 +312,50 @@ public DetectedLanguage detectLanguage(String cleanText, List noopLangsT
if (fastTextDetector == null && ngram == null || fasttextFailed) { // no else, value can change in if clause
text = textObjectFactory.forText(text).toString();
source +="+fallback";
- result = detectLanguageCode(text, preferredLangs, limitOnPreferredLangs);
- if (additionalLangs.size() > 0) {
- logger.warn("Cannot consider noopLanguages because not in fastText mode: " + additionalLangs);
+ if (scores == null) {
+ scores = new HashMap<>();
+ }
+ Map.Entry localResult = detectLanguageCode(text, preferredLangs, limitOnPreferredLangs);
+ if (localResult != null) {
+ scores.put(localResult.getKey(), localResult.getValue());
+ }
+ if (!additionalLangs.isEmpty()) {
+ logger.warn("Cannot consider noopLanguages because not in fastText mode: {}", additionalLangs);
}
}
- if (result != null && result.getKey() != null && LanguageIdentifierService.INSTANCE.canLanguageBeDetected(result.getKey(), additionalLangs)) {
- return new DetectedLanguage(null,
- Languages.getLanguageForShortCode(result.getKey(), additionalLangs),
- result.getValue().floatValue(), source);
+
+ List detectedLanguages = new LinkedList<>();
+ if (count > 1) {
+ Map orderedScores = getOrderedScores(scores, count);
+ for (Map.Entry entry : orderedScores.entrySet()) {
+ if (entry.getKey() != null && LanguageIdentifierService.INSTANCE.canLanguageBeDetected(entry.getKey(), additionalLangs)) {
+ float rate = Math.round(entry.getValue() * 100.0) / 100.0f; // Convert to a non-scientific float and potentially round down
+ detectedLanguages.add(new DetectedLanguage(null, Languages.getLanguageForShortCode(entry.getKey(), additionalLangs), rate, source));
+ }
+ }
} else {
- if (preferredLangs.size() > 0 && Languages.isLanguageSupported(preferredLangs.get(0))) {
- source += "+fallbackToPrefLang";
- return new DetectedLanguage(null, Languages.getLanguageForShortCode(preferredLangs.get(0)), 0.1f, source);
+ Map.Entry highestScoringResult = getHighestScoringResult(scores);
+ if (highestScoringResult.getKey() != null && LanguageIdentifierService.INSTANCE.canLanguageBeDetected(highestScoringResult.getKey(), additionalLangs)) {
+ float newScore;
+ if (source.contains("fasttext")) {
+ // Calculate a trivial confidence value because fasttext's confidence is often
+ // wrong for short cleanText (e.g. 0.99 for a test that's misclassified). Don't
+ // use 1.0 because we can never be totally sure...
+ newScore = (float) (0.99/ (30.0 / Math.min(text.length(), 30)));
+ } else {
+ newScore = highestScoringResult.getValue().floatValue();
+ }
+ detectedLanguages.add(new DetectedLanguage(null, Languages.getLanguageForShortCode(highestScoringResult.getKey(), additionalLangs), newScore, source));
}
- return null;
}
+ if (detectedLanguages.isEmpty() && !preferredLangs.isEmpty() &&
+ preferredLangs.get(0) != null &&
+ !preferredLangs.get(0).trim().isEmpty() &&
+ Languages.isLanguageSupported(preferredLangs.get(0))) {
+ source += "+fallbackToPrefLang";
+ detectedLanguages.add(new DetectedLanguage(null, Languages.getLanguageForShortCode(preferredLangs.get(0)), 0.1f, source));
+ }
+ return detectedLanguages;
}
private void reinitFasttextAfterFailure(Exception e) {
diff --git a/languagetool-core/src/main/java/org/languagetool/language/identifier/LanguageIdentifier.java b/languagetool-core/src/main/java/org/languagetool/language/identifier/LanguageIdentifier.java
index d4d4d5c27ad2e..64d02784f94df 100644
--- a/languagetool-core/src/main/java/org/languagetool/language/identifier/LanguageIdentifier.java
+++ b/languagetool-core/src/main/java/org/languagetool/language/identifier/LanguageIdentifier.java
@@ -22,6 +22,7 @@
import com.optimaize.langdetect.text.TextFilter;
import lombok.Getter;
+import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.languagetool.DetectedLanguage;
import org.languagetool.Language;
@@ -38,6 +39,7 @@ public abstract class LanguageIdentifier {
private static final Pattern MAIL_REGEX = Pattern.compile("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+");
private static final Pattern SIGNATURE = Pattern.compile("\n-- \n.*", Pattern.DOTALL);
private static final Pattern MENTION = Pattern.compile("@[A-Za-z0-9_]+");
+ private static final Pattern NBSP_INVIS_SEPARATOR = Pattern.compile("[\uFEFF\u2063]+");
protected static final float SCORE_THRESHOLD = 0.85f;
protected static final int CONSIDER_ONLY_PREFERRED_THRESHOLD = 50;
protected static final List NON_LATIN_CHARS_LANGUAGES = Arrays.asList("ar", "fa", "ru", "uk", "be", "zh", "ja", "km", "ta", "el", "hi", "mr", "th", "he", "ko");
@@ -79,6 +81,9 @@ public LanguageIdentifier(int maxLength) {
@Nullable
public abstract DetectedLanguage detectLanguage(String cleanText, List noopLangsTmp, List preferredLangsTmp, boolean limitOnPreferredLangs);
+ @NotNull
+ public abstract List getDetectedLanguageScores(String cleanText, List noopLangsTmp, List preferredLangsTmp, boolean limitOnPreferredLangs, int count);
+
/**
* @param cleanText a cleanText as returned by {@link #cleanAndShortenText(String)}
* @return language or {@code null} if language could not be identified
@@ -92,7 +97,7 @@ public LanguageIdentifier(int maxLength) {
*/
public String cleanAndShortenText(String text) {
String shortText = text.length() > maxLength ? text.substring(0, maxLength) : text;
- shortText = shortText.replaceAll("[\uFEFF\u2063]+", " "); // used by the browser add-on to filter HTML etc. (_ignoreText() in validator.js)
+ shortText = NBSP_INVIS_SEPARATOR.matcher(shortText).replaceAll(" "); // used by the browser add-on to filter HTML etc. (_ignoreText() in validator.js)
shortText = REMOVE_URL_FILTER.filter(shortText);
shortText = REMOVE_EMAIL_SIGNATURE_FILTER.filter(shortText);
shortText = REMOVE_MENTION_FILTER.filter(shortText);
@@ -138,6 +143,16 @@ protected Map.Entry getHighestScoringResult(Map
return new AbstractMap.SimpleImmutableEntry<>(result, max);
}
+ protected Map getOrderedScores(Map scores, int count) {
+ ArrayList> entries = new ArrayList<>(scores.entrySet());
+ entries.sort(Map.Entry.comparingByValue(Collections.reverseOrder()));
+ Map sortedScores = new LinkedHashMap<>();
+ for (int i = 0; i < entries.size() && i < count; i++) {
+ sortedScores.put(entries.get(i).getKey(), entries.get(i).getValue());
+ }
+ return sortedScores;
+ }
+
protected static class ParsedLanguageLists {
@Getter
private final List additionalLangs = new ArrayList<>();
diff --git a/languagetool-core/src/main/java/org/languagetool/language/identifier/SimpleLanguageIdentifier.java b/languagetool-core/src/main/java/org/languagetool/language/identifier/SimpleLanguageIdentifier.java
index 49a359d22ed1f..2204524341227 100644
--- a/languagetool-core/src/main/java/org/languagetool/language/identifier/SimpleLanguageIdentifier.java
+++ b/languagetool-core/src/main/java/org/languagetool/language/identifier/SimpleLanguageIdentifier.java
@@ -1,17 +1,17 @@
/*
- * LanguageTool, a natural language style checker
+ * LanguageTool, a natural language style checker
* Copyright (c) 2022. Stefan Viol (https://stevio.de)
- *
+ *
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
- *
+ *
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
- *
+ *
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
@@ -21,6 +21,7 @@
package org.languagetool.language.identifier;
import lombok.extern.slf4j.Slf4j;
+import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.languagetool.DetectedLanguage;
import org.languagetool.JLanguageTool;
@@ -31,12 +32,15 @@
import java.io.IOException;
import java.util.*;
+import java.util.regex.Pattern;
import static org.languagetool.JLanguageTool.getDataBroker;
@Slf4j
public class SimpleLanguageIdentifier extends LanguageIdentifier {
+ private static final Pattern WHITESPACE = Pattern.compile("\\s+");
+
private final Map spellingCheckRules = new HashMap<>();
public SimpleLanguageIdentifier() {
@@ -86,7 +90,7 @@ public DetectedLanguage detectLanguage(String cleanText, List noopLangsT
List additionalLangs = parsedLanguageLists.getAdditionalLangs();
List preferredLangs = parsedLanguageLists.getPreferredLangs();
- String[] words = cleanText.split("\\s+");
+ String[] words = WHITESPACE.split(cleanText);
List dominantLangCodes = UNICODE_BASED_LANG_IDENTIFIER.getDominantLangCodes(cleanText);
Map scores = new HashMap<>();
String detectionSource = "spellchecker";
@@ -167,6 +171,12 @@ public DetectedLanguage detectLanguage(String cleanText, List noopLangsT
return this.detectLanguage(cleanText, noopLangsTmp, preferredLangsTmp);
}
+ @NotNull
+ @Override
+ public List getDetectedLanguageScores(String cleanText, List noopLangsTmp, List preferredLangsTmp, boolean limitOnPreferredLangs, int count) {
+ return Collections.singletonList(this.detectLanguage(cleanText, noopLangsTmp, preferredLangsTmp, limitOnPreferredLangs));
+ }
+
@Nullable
@Override
public Language detectLanguage(String cleanText) {
diff --git a/languagetool-core/src/main/java/org/languagetool/language/identifier/detector/CommonWordsDetector.java b/languagetool-core/src/main/java/org/languagetool/language/identifier/detector/CommonWordsDetector.java
index 72778913b2c8e..037a69560d289 100644
--- a/languagetool-core/src/main/java/org/languagetool/language/identifier/detector/CommonWordsDetector.java
+++ b/languagetool-core/src/main/java/org/languagetool/language/identifier/detector/CommonWordsDetector.java
@@ -43,7 +43,10 @@ public class CommonWordsDetector {
private final static Pattern notSpanishPattern = Pattern.compile("^[lmndts]['’].*$|^.*(ns|[áéó].i[oa]s?)$|^.*(ss|[çàèòïâêôãõìù]|l·l).*$");
private final static Pattern notCatalanPattern = Pattern.compile("^.*([áéó].i[oa]s?|d[oa]s)$|^.*[áâêôãõìùñ].*$");
private final static Pattern portuguesePattern = Pattern.compile("^.*([áó]ri[oa]|ério)s?$"); // éria can be French
-
+ private static final Pattern PUNCT_PATTERN = Pattern.compile("[(),.:;!?„“\"¡¿\\s\\[\\]{}-«»”]");
+ private static final Pattern CHARS_PATTERN = Pattern.compile("\\p{L}+$");
+ private static final Pattern SPACE_OR_HYPHEN_PATTERN = Pattern.compile("[ -]");
+
public CommonWordsDetector() throws IOException {
synchronized (word2langs) {
if (word2langs.isEmpty()) {
@@ -103,14 +106,14 @@ public CommonWordsDetector() throws IOException {
public Map getKnownWordsPerLanguage(String text) {
Map result = new HashMap<>();
- String auxText = text.replaceAll("[(),.:;!?„“\"¡¿\\s\\[\\]{}-«»”]", " ");
+ String auxText = PUNCT_PATTERN.matcher(text).replaceAll(" ");
if (!auxText.endsWith(" ") && StringUtils.countMatches(auxText, " ") > 0) {
// last word might not be finished yet, so ignore
- auxText = auxText.replaceFirst("\\p{L}+$", "");
+ auxText = CHARS_PATTERN.matcher(auxText).replaceFirst("");
}
// Proper per-language tokenizing might help, but then the common_words.txt
// will also need to be tokenized the same way. Also, this is quite fast.
- String[] words = auxText.split("[ -]");
+ String[] words = SPACE_OR_HYPHEN_PATTERN.split(auxText);
for (String word : words) {
if (numberPattern.matcher(word).matches()) {
continue;
diff --git a/languagetool-core/src/main/java/org/languagetool/language/identifier/detector/FastTextDetector.java b/languagetool-core/src/main/java/org/languagetool/language/identifier/detector/FastTextDetector.java
index 5c394a9510a5f..8014824f79d49 100644
--- a/languagetool-core/src/main/java/org/languagetool/language/identifier/detector/FastTextDetector.java
+++ b/languagetool-core/src/main/java/org/languagetool/language/identifier/detector/FastTextDetector.java
@@ -26,6 +26,7 @@
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
+import java.util.regex.Pattern;
/**
* @since 5.0
@@ -35,6 +36,7 @@ public class FastTextDetector {
private static final Logger logger = LoggerFactory.getLogger(FastTextDetector.class);
private static final int K_HIGHEST_SCORES = 5;
private static final int BUFFER_SIZE = 4096;
+ private static final Pattern WHITESPACE = Pattern.compile("\\s+");
private Process fasttextProcess;
private Reader fasttextIn;
@@ -81,7 +83,7 @@ private void init() throws IOException{
}
public Map runFasttext(String text, List additionalLanguageCodes) throws IOException {
- String joined = text.replace("\n", " ").toLowerCase(Locale.ROOT);
+ String joined = text.replace('\n', ' ').toLowerCase(Locale.ROOT);
char[] cbuf = new char[BUFFER_SIZE];
synchronized (this) {
fasttextOut.write(joined + System.lineSeparator());
@@ -109,7 +111,7 @@ public Map runFasttext(String text, List additionalLangu
@NotNull
Map parseBuffer(String buffer, List additionalLanguageCodes) {
- String[] values = buffer.trim().split("\\s+");
+ String[] values = WHITESPACE.split(buffer.trim());
if (!buffer.startsWith("__label__")) {
throw new FastTextException("FastText output is expected to start with '__label__': ''" + buffer + "'", true);
}
diff --git a/languagetool-core/src/main/java/org/languagetool/language/identifier/detector/NGramDetector.java b/languagetool-core/src/main/java/org/languagetool/language/identifier/detector/NGramDetector.java
index 9c480557ee496..f76d019a0fe24 100644
--- a/languagetool-core/src/main/java/org/languagetool/language/identifier/detector/NGramDetector.java
+++ b/languagetool-core/src/main/java/org/languagetool/language/identifier/detector/NGramDetector.java
@@ -25,15 +25,27 @@
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.*;
+import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.zip.ZipFile;
import static java.lang.StrictMath.log;
import static java.lang.StrictMath.min;
+import static java.util.regex.Pattern.compile;
public class NGramDetector {
- private final static double EPSILON = 1e-4;
+ private static final double EPSILON = 1e-4;
+ private static final Pattern DIGITS = compile("\\d+");
+ private static final Pattern KOREAN = compile("[\\uac00-\\ud7a3]");
+ private static final Pattern JAPANESE = compile("[\\u3040-\\u30ff]");
+ private static final Pattern CHINESE = compile("[\\u4e00-\\u9FFF]");
+ private static final Pattern KHMER = compile("[\\u1780-\\u17FF]");
+ private static final Pattern TAGALOG = compile("[\\u1700-\\u171F]");
+ private static final Pattern ARMENIAN = compile("[\\u0530-\\u058F]");
+ private static final Pattern GREEK = compile("[\\u0370-\\u03FF]");
+ private static final Pattern TAMIL = compile("[\\u0B80-\\u0BFF]");
+ private static final Pattern WHITESPACE = compile("\\s+");
private final Map vocab;
private final List codes; // Elem format = {Name, 2-code (or "NULL"), 3-code}
@@ -143,7 +155,7 @@ private List readLines(String path) {
while ((line = br.readLine()) != null) {
result.add(line);
}
- } catch(java.io.IOException e) {
+ } catch(IOException e) {
throw new RuntimeException(e);
}
return result;
@@ -175,21 +187,20 @@ private List encode(String text) {
text = text.substring(0, maxLength);
}
text = Normalizer.normalize(text, Normalizer.Form.NFKC).toLowerCase();
- text = text.replaceAll("\\d+", "");
- text = text.replaceAll("[\\uac00-\\ud7a3]", ""); // Korean
- text = text.replaceAll("[\\u3040-\\u30ff]", ""); // Japanese
- text = text.replaceAll("[\\u4e00-\\u9FFF]", ""); // Chinese
- text = text.replaceAll("[\\u1780-\\u17FF]", ""); // Khmer
- text = text.replaceAll("[\\u1700-\\u171F]", ""); // Tagalog
- text = text.replaceAll("[\\u0530-\\u058F]", ""); // Armenian
- text = text.replaceAll("[\\u0370-\\u03FF]", ""); // Greek
- text = text.replaceAll("[\\u0B80-\\u0BFF]", ""); // Tamil
- text = text.replaceAll("\\s+", "▁");
+ text = DIGITS.matcher(text).replaceAll("");
+ text = KOREAN.matcher(text).replaceAll("");
+ text = JAPANESE.matcher(text).replaceAll("");
+ text = CHINESE.matcher(text).replaceAll("");
+ text = KHMER.matcher(text).replaceAll("");
+ text = TAGALOG.matcher(text).replaceAll("");
+ text = ARMENIAN.matcher(text).replaceAll("");
+ text = GREEK.matcher(text).replaceAll("");
+ text = TAMIL.matcher(text).replaceAll("");
+ text = WHITESPACE.matcher(text).replaceAll("▁");
if (text.length() == 0) {
return result;
}
text = "▁" + text;
-
int cur = 0;
while (cur < text.length()) {
int tok = 0;
diff --git a/languagetool-core/src/main/java/org/languagetool/rules/AbstractCheckCaseRule.java b/languagetool-core/src/main/java/org/languagetool/rules/AbstractCheckCaseRule.java
index 3bbdf0d11e395..3b9c70f1e6ee2 100644
--- a/languagetool-core/src/main/java/org/languagetool/rules/AbstractCheckCaseRule.java
+++ b/languagetool-core/src/main/java/org/languagetool/rules/AbstractCheckCaseRule.java
@@ -34,6 +34,8 @@
*/
public abstract class AbstractCheckCaseRule extends AbstractSimpleReplaceRule2 {
private final Language language;
+ private boolean ignoreShortUppercaseWords = true;
+ private static final int MAX_LENGTH_SHORT_WORDS = 4;
public AbstractCheckCaseRule(ResourceBundle messages, Language language) {
super(messages, language);
@@ -96,7 +98,15 @@ public RuleMatch[] match(AnalyzedSentence sentence) {
break;
}
if (originalPhrase.equals(originalPhrase.toUpperCase())) {
- continue;
+ if (ignoreShortUppercaseWords) {
+ continue;
+ } else {
+ if ( originalPhrase.length() <= MAX_LENGTH_SHORT_WORDS ){
+ // correct uppercase words of max X characters
+ } else{
+ continue;
+ }
+ }
}
if (correctPhrase != null && !correctPhrase.equals(originalPhrase)) {
RuleMatch ruleMatch;
@@ -131,6 +141,14 @@ public RuleMatch[] match(AnalyzedSentence sentence) {
return toRuleMatchArray(ruleMatches);
}
+ protected boolean isIgnoreShortUppercaseWords() {
+ return ignoreShortUppercaseWords;
+ }
+
+ protected void setIgnoreShortUppercaseWords(boolean value) {
+ ignoreShortUppercaseWords = value;
+ }
+
private boolean isPunctuationStart(String word) {
return StringUtils.getDigits(word).length() > 0 // e.g. postal codes
|| StringTools.isPunctuationMark(word) || StringTools.isNotWordCharacter(word);
diff --git a/languagetool-core/src/main/java/org/languagetool/rules/AbstractCompoundRule.java b/languagetool-core/src/main/java/org/languagetool/rules/AbstractCompoundRule.java
index fb11027baec10..f6dbf7c7e1027 100644
--- a/languagetool-core/src/main/java/org/languagetool/rules/AbstractCompoundRule.java
+++ b/languagetool-core/src/main/java/org/languagetool/rules/AbstractCompoundRule.java
@@ -30,6 +30,7 @@
import java.io.IOException;
import java.util.*;
+import java.util.regex.Pattern;
import java.util.stream.Stream;
/**
@@ -41,6 +42,11 @@ public abstract class AbstractCompoundRule extends Rule {
static final int MAX_TERMS = 5;
+ private static final Pattern WHITESPACE_DASH = Pattern.compile(" - ", Pattern.LITERAL);
+ private static final Pattern WHITESPACE = Pattern.compile("\\s+");
+ private static final Pattern DIGIT = Pattern.compile("\\d+");
+ private static final Pattern DASHES = Pattern.compile("--+");
+
private final String withHyphenMessage;
private final String withoutHyphenMessage;
private final String withOrWithoutHyphenMessage;
@@ -135,7 +141,7 @@ public RuleMatch[] match(AnalyzedSentence sentence) throws IOException {
containsDigits = true;
}
if (getCompoundRuleData().getIncorrectCompounds().contains(stringToCheck) ||
- (containsDigits && getCompoundRuleData().getIncorrectCompounds().contains(digitsRegexp = stringToCheck.replaceAll("\\d+", "\\\\d+")))) {
+ (containsDigits && getCompoundRuleData().getIncorrectCompounds().contains(digitsRegexp = DIGIT.matcher(stringToCheck).replaceAll("\\\\d+")))) {
AnalyzedTokenReadings atr = stringToToken.get(stringToCheck);
String msg = null;
List replacement = new ArrayList<>();
@@ -192,9 +198,9 @@ public RuleMatch[] match(AnalyzedSentence sentence) throws IOException {
}
protected List filterReplacements(List replacements, String original) throws IOException {
- List newReplacements = new ArrayList();
+ List newReplacements = new ArrayList<>();
for (String replacement : replacements) {
- String newReplacement = replacement.replaceAll("\\-\\-+", "-");
+ String newReplacement = DASHES.matcher(replacement).replaceAll("-");
if (!newReplacement.equals(original) && isCorrectSpell(newReplacement)) {
newReplacements.add(newReplacement);
}
@@ -234,9 +240,9 @@ private Map getStringToTokenMap(Queue getWordsToCheck();
protected abstract Synthesizer getSynthesizer();
@@ -60,9 +65,8 @@ public String getId() {
return ruleId;
}
- private String ruleId;
-
- private Language language;
+ private final String ruleId;
+ private final Language language;
@Override
public abstract String getDescription();
@@ -115,7 +119,7 @@ public RuleMatch[] match(List sentences) throws IOException {
boolean isAllUppercase = StringTools.isAllUppercase(token);
i++;
boolean isException = token.isEmpty() || isException(tokens, i, sentStart, isCapitalized, isAllUppercase);
- if (sentStart && !token.isEmpty() && !token.matches("\\p{P}")) {
+ if (sentStart && !token.isEmpty() && !PUNCT_PATTERN.matcher(token).matches()) {
sentStart = false;
}
if (isException) {
@@ -176,18 +180,16 @@ public RuleMatch[] match(List sentences) throws IOException {
return toRuleMatchArray(matches);
}
- private static final String FILE_ENCODING = "utf-8";
-
protected static Map loadWords(String path) {
- final InputStream inputStream = JLanguageTool.getDataBroker().getFromRulesDirAsStream(path);
- final Map map = new HashMap<>();
+ InputStream inputStream = JLanguageTool.getDataBroker().getFromRulesDirAsStream(path);
+ Map map = new HashMap<>();
try (Scanner scanner = new Scanner(inputStream, FILE_ENCODING)) {
while (scanner.hasNextLine()) {
- final String line = scanner.nextLine().replaceFirst("#.*", "").trim();
+ String line = HASH_PATTERN.matcher(scanner.nextLine()).replaceFirst("").trim();
if (line.isEmpty()) {
continue;
}
- final String[] mainParts = line.split("=");
+ String[] mainParts = line.split("=");
String[] parts = null;
String postag = null;
String chunk = null;
@@ -239,6 +241,5 @@ protected static Map loadWords(String path) {
}
return map;
}
-
}
diff --git a/languagetool-core/src/main/java/org/languagetool/rules/AbstractSpecificCaseRule.java b/languagetool-core/src/main/java/org/languagetool/rules/AbstractSpecificCaseRule.java
index 49ab6e11ac31f..72e81637827b0 100644
--- a/languagetool-core/src/main/java/org/languagetool/rules/AbstractSpecificCaseRule.java
+++ b/languagetool-core/src/main/java/org/languagetool/rules/AbstractSpecificCaseRule.java
@@ -18,20 +18,15 @@
*/
package org.languagetool.rules;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Map;
-import java.util.ResourceBundle;
-import java.util.Set;
-
+import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.rules.spelling.CachingWordListLoader;
import org.languagetool.tools.StringTools;
-import gnu.trove.THashMap;
-import gnu.trove.THashSet;
+import java.util.*;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
/**
* A rule that matches words which need a specific upper/lowercase spelling.
@@ -41,14 +36,13 @@ public abstract class AbstractSpecificCaseRule extends Rule {
// a map that has as keys the special case phrases into lowercase
// and as values the special case phrases properly spelled:
- private static final Map lcToProperSpelling = new THashMap<>();
- // the phrases that will be detected by the rule:
- private static Set phrases;
+ // one for each subclass
+ private static final ConcurrentMap> lcToProperSpelling = new ConcurrentHashMap<>();
private static int maxLen;
// used to speed up the server as the phrases are loaded in every initialization:
protected final CachingWordListLoader phrasesListLoader = new CachingWordListLoader();
-
+
/**
* The constructor of the abstract class AbstractSpecificCaseRule
* @param messages the messages to apply the rule
@@ -59,12 +53,12 @@ public AbstractSpecificCaseRule(ResourceBundle messages) {
setLocQualityIssueType(ITSIssueType.Misspelling);
loadPhrases();
}
-
+
/**
* @return the path to the txt file that contains the phrases for the rule
*/
public abstract String getPhrasesPath();
-
+
/**
* @return the message that will be shown if the words of the
* wrongly capitalized phrase must begin with capital
@@ -72,39 +66,35 @@ public AbstractSpecificCaseRule(ResourceBundle messages) {
public String getInitialCapitalMessage() {
return "The initials of the particular phrase must be capitals.";
}
-
+
/**
* @return the message that will be shown if the wrongly capitalized phrase
- * must not be written with capital initials
+ * must not be written with capital initials
* (another special kind of capitalization)
*/
- public String getOtherCapitalizationMessage() {
+ public String getOtherCapitalizationMessage() {
return "The particular expression should follow the suggested capitalization.";
}
-
+
public String getShortMessage() {
return "Special capitalization";
}
-
+
/**
* Initializes the phrases that will be detected from the rule by the given path
*/
- private void loadPhrases() {
- List l = new ArrayList<>();
- List lines = phrasesListLoader.loadWords(getPhrasesPath());
- for (String line : lines) {
- int parts = line.split(" ").length;
- maxLen = Math.max(parts, maxLen);
- l.add(line.trim());
- }
- phrases = new THashSet<>(l);
- initializeLcToProperSpellingMap();
- }
-
- synchronized static private void initializeLcToProperSpellingMap() {
- for (String phrase : phrases) {
- lcToProperSpelling.put(phrase.toLowerCase(), phrase);
- }
+ private synchronized void loadPhrases() {
+ lcToProperSpelling.computeIfAbsent(this.getClass(), (clazz) -> {
+ Map properSpelling = new Object2ObjectOpenHashMap<>();
+ List lines = phrasesListLoader.loadWords(getPhrasesPath());
+ for (String line : lines) {
+ int parts = line.split(" ").length;
+ maxLen = Math.max(parts, maxLen);
+ String phrase = line.trim();
+ properSpelling.put(phrase.toLowerCase(), phrase);
+ }
+ return properSpelling;
+ });
}
@Override
@@ -121,6 +111,7 @@ public String getDescription() {
public RuleMatch[] match(AnalyzedSentence sentence) {
List matches = new ArrayList<>();
AnalyzedTokenReadings[] tokens = sentence.getTokensWithoutWhitespace();
+ Map properSpellingMap = lcToProperSpelling.get(this.getClass());
for (int i = 0; i < tokens.length; i++) {
List l = new ArrayList<>();
int j = 0;
@@ -129,7 +120,7 @@ public RuleMatch[] match(AnalyzedSentence sentence) {
j++;
String phrase = String.join(" ", l);
String lcPhrase = phrase.toLowerCase();
- String properSpelling = lcToProperSpelling.get(lcPhrase);
+ String properSpelling = properSpellingMap.get(lcPhrase);
if (properSpelling != null && !StringTools.isAllUppercase(phrase) && !phrase.equals(properSpelling)) {
if (i > 0 && tokens[i-1].isSentenceStart() && !StringTools.startsWithUppercase(properSpelling)) {
// avoid suggesting e.g. "vitamin C" at sentence start:
@@ -141,7 +132,7 @@ public RuleMatch[] match(AnalyzedSentence sentence) {
} else {
msg = getOtherCapitalizationMessage();
}
- RuleMatch match = new RuleMatch(this, sentence, tokens[i].getStartPos(),
+ RuleMatch match = new RuleMatch(this, sentence, tokens[i].getStartPos(),
tokens[i+j-1].getEndPos(), msg, getShortMessage());
match.setSuggestedReplacement(properSpelling);
matches.add(match);
@@ -154,7 +145,7 @@ public RuleMatch[] match(AnalyzedSentence sentence) {
/**
* Checks if all the words in the given string begin with a capital letter
* @param s the string to check
- * @return true
if all the words within the given string
+ * @return true
if all the words within the given string
* begin with capital letter, else false
*/
private boolean allWordsUppercase(String s) {
diff --git a/languagetool-core/src/main/java/org/languagetool/rules/AbstractStyleTooOftenUsedWordRule.java b/languagetool-core/src/main/java/org/languagetool/rules/AbstractStyleTooOftenUsedWordRule.java
index aa8cd7e6f600b..ead6d4817fcec 100644
--- a/languagetool-core/src/main/java/org/languagetool/rules/AbstractStyleTooOftenUsedWordRule.java
+++ b/languagetool-core/src/main/java/org/languagetool/rules/AbstractStyleTooOftenUsedWordRule.java
@@ -26,10 +26,7 @@
import java.util.ResourceBundle;
import java.util.regex.Pattern;
-import org.languagetool.AnalyzedSentence;
-import org.languagetool.AnalyzedTokenReadings;
-import org.languagetool.Language;
-import org.languagetool.UserConfig;
+import org.languagetool.*;
import org.languagetool.rules.Category.Location;
/**
@@ -46,11 +43,9 @@ public abstract class AbstractStyleTooOftenUsedWordRule extends TextLevelRule {
private final int minPercent;
private final int defaultMinPercent;
+ private final Map wordMap = new HashMap<>();
+
private boolean withoutDirectSpeech = false;
-
- private int numWords;
-
- private Map wordMap = new HashMap<>();
public AbstractStyleTooOftenUsedWordRule(ResourceBundle messages, Language lang, UserConfig userConfig, int minPercent) {
this(messages, lang, userConfig, minPercent, DEFAULT_ACTIVATION);
@@ -119,11 +114,6 @@ public int getMinConfigurableValue() {
return 1;
}
- @Override
- public int getMaxConfigurableValue() {
- return 100;
- }
-
public Map getWordMap() {
return wordMap;
}
@@ -170,7 +160,7 @@ private void FillWordMap(List sentences) {
*/
private List getTooOftenUsedWords() {
List words = new ArrayList<>();
- numWords = 0;
+ int numWords = 0;
for (String word : wordMap.keySet()) {
numWords += wordMap.get(word);
}
@@ -234,5 +224,16 @@ public RuleMatch[] match(List sentences) throws IOException {
public int minToCheckParagraph() {
return -1;
}
-
+
+ protected String getLemmaForPosTagStartsWith(String startPos, AnalyzedTokenReadings token) {
+ List readings = token.getReadings();
+ for (AnalyzedToken reading : readings) {
+ String posTag = reading.getPOSTag();
+ if (posTag != null && posTag.startsWith(startPos)) {
+ return reading.getLemma();
+ }
+ }
+ return null;
+ }
+
}
diff --git a/languagetool-core/src/main/java/org/languagetool/rules/AbstractTextToNumberFilter.java b/languagetool-core/src/main/java/org/languagetool/rules/AbstractTextToNumberFilter.java
index f6d8762e25781..20083de77f8f0 100644
--- a/languagetool-core/src/main/java/org/languagetool/rules/AbstractTextToNumberFilter.java
+++ b/languagetool-core/src/main/java/org/languagetool/rules/AbstractTextToNumberFilter.java
@@ -29,8 +29,8 @@
public abstract class AbstractTextToNumberFilter extends RuleFilter {
- protected static Map numbers = new HashMap();
- protected static Map multipliers = new HashMap();
+ protected static Map numbers = new HashMap<>();
+ protected static Map multipliers = new HashMap<>();
@Override
public RuleMatch acceptRuleMatch(RuleMatch match, Map arguments, int patternTokenPos,
diff --git a/languagetool-core/src/main/java/org/languagetool/rules/CleanOverlappingFilter.java b/languagetool-core/src/main/java/org/languagetool/rules/CleanOverlappingFilter.java
index b3cb95631d917..a70a3d2e5f8d0 100644
--- a/languagetool-core/src/main/java/org/languagetool/rules/CleanOverlappingFilter.java
+++ b/languagetool-core/src/main/java/org/languagetool/rules/CleanOverlappingFilter.java
@@ -56,19 +56,31 @@ public final List filter(List ruleMatches) {
throw new IllegalArgumentException(
"The list of rule matches is not ordered. Make sure it is sorted by start position.");
}
- // juxtaposed errors adding a comma in the same place
- boolean isJuxtaposedComma = false;
- if (ruleMatch.getFromPos() == prevRuleMatch.getToPos()
- && ruleMatch.getSuggestedReplacements().size() > 0
- && prevRuleMatch.getSuggestedReplacements().size() > 0) {
+
+ boolean isDuplicateSuggestion = false;
+ if (ruleMatch.getSuggestedReplacements().size() > 0
+ && prevRuleMatch.getSuggestedReplacements().size() > 0) {
String suggestion = ruleMatch.getSuggestedReplacements().get(0);
String prevSuggestion = prevRuleMatch.getSuggestedReplacements().get(0);
- if (prevSuggestion.endsWith(",") && suggestion.startsWith(", ")) {
- isJuxtaposedComma = true;
+ // juxtaposed errors adding a comma in the same place
+ if (ruleMatch.getFromPos() == prevRuleMatch.getToPos()) {
+ if (prevSuggestion.endsWith(",") && suggestion.startsWith(", ")) {
+ isDuplicateSuggestion = true;
+ }
+ }
+ // adjacent suggestions that appear to repeat a word (second word of previous suggestion == first word of current)
+ if (suggestion.indexOf(" ") > 0 && prevSuggestion.indexOf(" ") > 0
+ && ruleMatch.getFromPos() == prevRuleMatch.getToPos() + 1) {
+ String parts[] = suggestion.split(" ");
+ String partsPrev[] = prevSuggestion.split(" ");
+ if (partsPrev.length > 1 && parts.length > 1 && partsPrev[1].equals(parts[0])) {
+ isDuplicateSuggestion = true;
+ }
}
}
+
// no overlapping (juxtaposed errors are not removed)
- if (ruleMatch.getFromPos() >= prevRuleMatch.getToPos() && !isJuxtaposedComma) {
+ if (ruleMatch.getFromPos() >= prevRuleMatch.getToPos() && !isDuplicateSuggestion) {
cleanList.add(prevRuleMatch);
prevRuleMatch = ruleMatch;
continue;
diff --git a/languagetool-core/src/main/java/org/languagetool/rules/CompoundRuleData.java b/languagetool-core/src/main/java/org/languagetool/rules/CompoundRuleData.java
index 7aa4435017dea..6e273a92b6e0f 100644
--- a/languagetool-core/src/main/java/org/languagetool/rules/CompoundRuleData.java
+++ b/languagetool-core/src/main/java/org/languagetool/rules/CompoundRuleData.java
@@ -18,7 +18,7 @@
*/
package org.languagetool.rules;
-import gnu.trove.THashSet;
+import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
import org.languagetool.JLanguageTool;
import java.io.IOException;
@@ -31,10 +31,10 @@
*/
public class CompoundRuleData {
- private final Set incorrectCompounds = new THashSet<>();
- private final Set joinedSuggestion = new THashSet<>();
- private final Set joinedLowerCaseSuggestion = new THashSet<>();
- private final Set dashSuggestion = new THashSet<>();
+ private final Set incorrectCompounds = new ObjectOpenHashSet<>();
+ private final Set joinedSuggestion = new ObjectOpenHashSet<>();
+ private final Set joinedLowerCaseSuggestion = new ObjectOpenHashSet<>();
+ private final Set dashSuggestion = new ObjectOpenHashSet<>();
private final LineExpander expander;
public CompoundRuleData(String path) {
diff --git a/languagetool-core/src/main/java/org/languagetool/rules/GRPCRule.java b/languagetool-core/src/main/java/org/languagetool/rules/GRPCRule.java
index b6307bf4dc374..00aeac70e02f9 100644
--- a/languagetool-core/src/main/java/org/languagetool/rules/GRPCRule.java
+++ b/languagetool-core/src/main/java/org/languagetool/rules/GRPCRule.java
@@ -37,6 +37,7 @@
import java.util.concurrent.TimeoutException;
import java.util.function.BiFunction;
import java.util.function.Function;
+import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@@ -58,6 +59,7 @@
import org.languagetool.rules.ml.MLServerGrpc.MLServerFutureStub;
import org.languagetool.rules.ml.MLServerProto;
import org.languagetool.rules.ml.MLServerProto.MatchResponse;
+import org.languagetool.tools.StringTools;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -91,11 +93,12 @@ public abstract class GRPCRule extends RemoteRule {
private static final Logger logger = LoggerFactory.getLogger(GRPCRule.class);
private static final int DEFAULT_BATCH_SIZE = 8;
- public static final String WHITESPACE_REGEX = "[\u00a0\u202f\ufeff\ufffd]";
+ public static final Pattern WHITESPACE_REGEX = Pattern.compile("[\u00a0\u202f\ufeff\ufffd]");
private static final String DEFAULT_DESCRIPTION = "INTERNAL - dynamically loaded rule supported by remote server";
+ /* TODO: Delete this temporary fix; it only exists to speed up execution for overly long sentences */
- public static String cleanID(String id) {
- return id.replaceAll("[^a-zA-Z0-9_]", "_").toUpperCase();
+ public static String cleanID(String id, Language lang) {
+ return StringTools.toId(id, lang);
}
/**
* Internal rule to create rule matches with IDs based on Match Sub-IDs
@@ -104,11 +107,11 @@ public static class GRPCSubRule extends Rule {
private final String matchId;
private final String description;
- GRPCSubRule(String ruleId, String subId, String description) {
+ GRPCSubRule(String ruleId, String subId, String description, Language lang) {
if (subId != null && !subId.trim().isEmpty()) {
- this.matchId = cleanID(ruleId) + "_" + cleanID(subId);
+ this.matchId = cleanID(ruleId, lang) + "_" + cleanID(subId, lang);
} else {
- this.matchId = cleanID(ruleId);
+ this.matchId = cleanID(ruleId, lang);
}
this.description = description;
}
@@ -175,7 +178,7 @@ private void shutdown() {
}
}
}
-
+
private static final LoadingCache servers =
CacheBuilder.newBuilder().build(CacheLoader.from(serviceConfiguration -> {
if (serviceConfiguration == null) {
@@ -194,11 +197,13 @@ private void shutdown() {
private final Connection conn;
private final int batchSize;
-
private final boolean sendAnalyzedData;
+ private int maxSentenceLength;
public GRPCRule(Language language, ResourceBundle messages, RemoteRuleConfig config, boolean inputLogging) {
super(language, messages, config, inputLogging);
+
+ this.maxSentenceLength = Integer.parseInt(config.getOptions().getOrDefault("maxSentenceLength", String.valueOf(Integer.MAX_VALUE)));
sendAnalyzedData = config.getOptions()
.getOrDefault("analyzed", "false")
.equalsIgnoreCase("true");
@@ -241,34 +246,38 @@ public AnalyzedMLRuleRequest(List requests,
@Override
protected RemoteRule.RemoteRequest prepareRequest(List sentences, @Nullable Long textSessionId) {
List ids = Collections.emptyList();
+ // TODO: temporary fix to avoid sending overly long sentences to the server
+ List filteredSentences = sentences.stream()
+ .filter(s -> s.getText().length() <= maxSentenceLength)
+ .collect(Collectors.toList());
+
if (textSessionId != null) {
- ids = Collections.nCopies(sentences.size(), textSessionId);
+ ids = Collections.nCopies(filteredSentences.size(), textSessionId);
}
-
if (sendAnalyzedData) {
List requests = new ArrayList<>();
- for (int offset = 0; offset < sentences.size(); offset += batchSize) {
+ for (int offset = 0; offset < filteredSentences.size(); offset += batchSize) {
MLServerProto.AnalyzedMatchRequest req = MLServerProto.AnalyzedMatchRequest.newBuilder()
- .addAllSentences(sentences
- .subList(offset, Math.min(sentences.size(), offset + batchSize))
+ .addAllSentences(filteredSentences
+ .subList(offset, Math.min(filteredSentences.size(), offset + batchSize))
.stream().map(GRPCUtils::toGRPC).collect(Collectors.toList()))
.setInputLogging(inputLogging)
.addAllTextSessionID(textSessionId != null ?
- ids.subList(offset, Math.min(sentences.size(), offset + batchSize))
+ ids.subList(offset, Math.min(filteredSentences.size(), offset + batchSize))
: Collections.emptyList())
.build();
requests.add(req);
}
- return new AnalyzedMLRuleRequest(requests, sentences);
+ return new AnalyzedMLRuleRequest(requests, filteredSentences);
} else {
List requests = new ArrayList<>();
- for (int offset = 0; offset < sentences.size(); offset += batchSize) {
- List text = sentences.stream().map(AnalyzedSentence::getText).map(s -> {
+ for (int offset = 0; offset < filteredSentences.size(); offset += batchSize) {
+ List text = filteredSentences.stream().map(AnalyzedSentence::getText).map(s -> {
if (whitespaceNormalisation) {
// non-breaking space can be treated as normal space
- return s.replaceAll(WHITESPACE_REGEX, " ");
+ return WHITESPACE_REGEX.matcher(s).replaceAll(" ");
} else {
return s;
}
@@ -283,9 +292,9 @@ protected RemoteRule.RemoteRequest prepareRequest(List sentenc
requests.add(req);
}
if (requests.size() > 1) {
- logger.debug("Split {} sentences into {} requests for {}", sentences.size(), requests.size(), getId());
+ logger.debug("Split {} sentences into {} requests for {}", filteredSentences.size(), requests.size(), getId());
}
- return new MLRuleRequest(requests, sentences, textSessionId);
+ return new MLRuleRequest(requests, filteredSentences, textSessionId);
}
}
@@ -362,14 +371,14 @@ protected Callable executeRequest(RemoteRequest requestArg, lo
private List getRuleMatches(List sentences, List responses) {
BiFunction> createMatch = (matchList, sentence) -> matchList.getMatchesList().stream().map(match -> {
- String description = match.getRuleDescription();
- if (description == null || description.isEmpty()) {
- description = this.getDescription();
+ String description = match.getRuleDescription();
if (description == null || description.isEmpty()) {
- throw new RuntimeException("Missing description for rule with ID " + match.getId() + "_" + match.getSubId());
+ description = this.getDescription();
+ if (description == null || description.isEmpty()) {
+ throw new RuntimeException("Missing description for rule with ID " + match.getId() + "_" + match.getSubId());
+ }
}
- }
- GRPCSubRule subRule = new GRPCSubRule(match.getId(), match.getSubId(), description);
+ GRPCSubRule subRule = new GRPCSubRule(match.getId(), match.getSubId(), description, ruleLanguage);
String message = match.getMatchDescription();
String shortMessage = match.getMatchShortDescription();
if (message == null || message.isEmpty()) {
diff --git a/languagetool-core/src/main/java/org/languagetool/rules/RemoteRule.java b/languagetool-core/src/main/java/org/languagetool/rules/RemoteRule.java
index fa2f763be02f8..2e2f6dc344a8b 100644
--- a/languagetool-core/src/main/java/org/languagetool/rules/RemoteRule.java
+++ b/languagetool-core/src/main/java/org/languagetool/rules/RemoteRule.java
@@ -69,7 +69,7 @@ public RemoteRule(Language language, ResourceBundle messages, RemoteRuleConfig c
super(messages);
serviceConfiguration = config;
this.ruleLanguage = language;
- this.lt = new JLanguageTool(ruleLanguage);
+ this.lt = this.ruleLanguage.createDefaultJLanguageTool();
this.inputLogging = inputLogging;
if (ruleId == null) { // allow both providing rule ID in constructor or overriding getId
ruleId = getId();
diff --git a/languagetool-core/src/main/java/org/languagetool/rules/RemoteRuleFilters.java b/languagetool-core/src/main/java/org/languagetool/rules/RemoteRuleFilters.java
index b8d0a33f59b46..7cdbdfdfb978a 100644
--- a/languagetool-core/src/main/java/org/languagetool/rules/RemoteRuleFilters.java
+++ b/languagetool-core/src/main/java/org/languagetool/rules/RemoteRuleFilters.java
@@ -41,6 +41,7 @@
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.ExecutionException;
+import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@@ -55,9 +56,10 @@ public final class RemoteRuleFilters {
public static final String RULE_FILE = "remote-rule-filters.xml";
- private static final LoadingCache>> rules =
+ private static final LoadingCache>>> rules =
CacheBuilder.newBuilder()
- .build(CacheLoader.from(RemoteRuleFilters::load));
+ .build(CacheLoader.from((lang) -> compilePatterns(RemoteRuleFilters.load(lang))));
+
private RemoteRuleFilters() {
}
@@ -69,8 +71,8 @@ public static List filterMatches(@NotNull Language lang, @NotNull Ana
}
// load all relevant filters for given matches
Set matchIds = matches.stream().map(m -> m.getRule().getId()).collect(Collectors.toSet());
- List filters = rules.get(lang).entrySet().stream()
- .filter(e -> matchIds.stream().anyMatch(id -> id.matches(e.getKey())))
+ List filters = rules.get(lang).stream()
+ .filter(e -> matchIds.stream().anyMatch(id -> e.getKey().matcher(id).matches()))
.flatMap(e -> e.getValue().stream())
.collect(Collectors.toList());
@@ -184,7 +186,7 @@ public static void main(String[] args) throws Exception {
}
static Map> load(Language lang) {
- JLanguageTool lt = new JLanguageTool(lang);
+ JLanguageTool lt = lang.createDefaultJLanguageTool();
ResourceDataBroker dataBroker = JLanguageTool.getDataBroker();
String filename = dataBroker.getRulesDir() + "/" + getFilename(lang);
try {
@@ -199,6 +201,18 @@ static Map> load(Language lang) {
}
}
+ static List>> compilePatterns(Map> rules) {
+ List>> result = new ArrayList<>(rules.size());
+ // we treat rule ids in this file as regexes over rule IDs of matches
+ // compile them once here and then reuse
+ rules.forEach((ruleId, ruleList) -> {
+ Pattern key = Pattern.compile(ruleId);
+ result.add(new AbstractMap.SimpleImmutableEntry<>(key, ruleList));
+ });
+ return result;
+ }
+
+
@NotNull
static String getFilename(Language lang) {
// we don't support language variants in AI rules / remote rule filters at the moment;
diff --git a/languagetool-core/src/main/java/org/languagetool/rules/RuleMatch.java b/languagetool-core/src/main/java/org/languagetool/rules/RuleMatch.java
index e92bca9991e07..38a9dad170e91 100644
--- a/languagetool-core/src/main/java/org/languagetool/rules/RuleMatch.java
+++ b/languagetool-core/src/main/java/org/languagetool/rules/RuleMatch.java
@@ -25,8 +25,6 @@
import org.jetbrains.annotations.Nullable;
import org.languagetool.AnalyzedSentence;
import org.languagetool.ApiCleanupNeeded;
-import org.languagetool.Language;
-import org.languagetool.rules.patterns.AbstractPatternRule;
import org.languagetool.rules.patterns.PatternRule;
import org.languagetool.rules.patterns.PatternRuleMatcher;
import org.languagetool.tools.StringTools;
@@ -34,7 +32,6 @@
import java.net.URL;
import java.util.*;
import java.util.function.Supplier;
-import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
@@ -69,7 +66,7 @@ public class RuleMatch implements Comparable {
private Type type = Type.Other;
private SortedMap features = Collections.emptySortedMap();
private boolean autoCorrect = false;
- private String errorLimitLang;
+ private Map newLanguageMatches = new LinkedHashMap<>();
private String specificRuleId = "";
@@ -602,22 +599,21 @@ public int hashCode() {
/**
* The language that the text might be in if the error limit has been reached.
*
- * @since 5.3
+ * @since 6.4
*/
- @Nullable
- public String getErrorLimitLang() {
- return errorLimitLang;
+ public Map getNewLanguageMatches() {
+ return newLanguageMatches;
}
/**
* Call if the error limit is reached for this sentence. The caller will then get text ranges for the
* sentence and can ignore errors there. Note: will not have an effect for text-level rules.
*
- * @param langCode the language this could be instead
- * @since 5.3
+ * @param newLanguageMatches a map of possible languages this could be instead
+ * @since 6.4
*/
- public void setErrorLimitLang(String langCode) {
- this.errorLimitLang = langCode;
+ public void setNewLanguageMatches(Map newLanguageMatches) {
+ this.newLanguageMatches = newLanguageMatches;
}
/**
diff --git a/languagetool-core/src/main/java/org/languagetool/rules/SuggestionFilter.java b/languagetool-core/src/main/java/org/languagetool/rules/SuggestionFilter.java
index 28e20311a5035..8b4dc8a314cf2 100644
--- a/languagetool-core/src/main/java/org/languagetool/rules/SuggestionFilter.java
+++ b/languagetool-core/src/main/java/org/languagetool/rules/SuggestionFilter.java
@@ -39,7 +39,7 @@ public class SuggestionFilter {
public SuggestionFilter(Rule rule, Language lang) {
this.rule = Objects.requireNonNull(rule);
- this.lt = new JLanguageTool(lang);
+ this.lt = lang.createDefaultJLanguageTool();
}
public List filter(List replacements, String template) {
diff --git a/languagetool-core/src/main/java/org/languagetool/rules/AbstractSuppressIfAnyRuleMatchesFilter.java b/languagetool-core/src/main/java/org/languagetool/rules/SuppressIfAnyRuleMatchesFilter.java
similarity index 70%
rename from languagetool-core/src/main/java/org/languagetool/rules/AbstractSuppressIfAnyRuleMatchesFilter.java
rename to languagetool-core/src/main/java/org/languagetool/rules/SuppressIfAnyRuleMatchesFilter.java
index 08878b7abda74..4b7571d84eaf3 100644
--- a/languagetool-core/src/main/java/org/languagetool/rules/AbstractSuppressIfAnyRuleMatchesFilter.java
+++ b/languagetool-core/src/main/java/org/languagetool/rules/SuppressIfAnyRuleMatchesFilter.java
@@ -23,38 +23,40 @@
import java.util.List;
import java.util.Map;
+import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
+import org.languagetool.rules.patterns.PatternRule;
import org.languagetool.rules.patterns.RuleFilter;
-public abstract class AbstractSuppressIfAnyRuleMatchesFilter extends RuleFilter {
+public class SuppressIfAnyRuleMatchesFilter extends RuleFilter {
+ /*
+ * Suppress the match if the new suggestion creates any new match with the rule IDs provided
+ */
@Override
public RuleMatch acceptRuleMatch(RuleMatch match, Map arguments, int patternTokenPos,
AnalyzedTokenReadings[] patternTokens) throws IOException {
-// if (match.getSentence().getText().contains("t'ho has cregut")) {
-// int ii=0;
-// ii++;
-// }
List ruleIDs = Arrays.asList(getRequired("ruleIDs", arguments).split(","));
- JLanguageTool lt = getJLanguageTool();
+ JLanguageTool lt = ((PatternRule) match.getRule()).getLanguage().createDefaultJLanguageTool();
String sentence = match.getSentence().getText();
for (String replacement : match.getSuggestedReplacements()) {
String newSentence = sentence.substring(0, match.getFromPos()) + replacement
+ sentence.substring(match.getToPos());
- List matches = lt.check(newSentence);
- for (RuleMatch m : matches) {
- if (ruleIDs.contains(m.getRule().getId())) {
- if ((m.getToPos() >= match.getFromPos() && m.getToPos() <= match.getToPos())
+ AnalyzedSentence analyzedSentence = lt.analyzeText(newSentence).get(0);
+ for (Rule r: lt.getAllActiveRules()) {
+ if (ruleIDs.contains(r.getId())) {
+ RuleMatch matches[] = r.match(analyzedSentence);
+ for (RuleMatch m : matches) {
+ if ((m.getToPos() >= match.getFromPos() && m.getToPos() <= match.getToPos())
|| (match.getToPos() >= m.getFromPos() && match.getToPos() <= m.getToPos())) {
- return null;
+ return null;
+ }
}
}
}
}
return match;
}
-
- protected abstract JLanguageTool getJLanguageTool();
}
diff --git a/languagetool-core/src/main/java/org/languagetool/rules/UnderlineSpacesFilter.java b/languagetool-core/src/main/java/org/languagetool/rules/UnderlineSpacesFilter.java
new file mode 100644
index 0000000000000..99d838a6174d8
--- /dev/null
+++ b/languagetool-core/src/main/java/org/languagetool/rules/UnderlineSpacesFilter.java
@@ -0,0 +1,51 @@
+/* LanguageTool, a natural language style checker
+ * Copyright (C) 2023 Daniel Naber (http://www.danielnaber.de)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+package org.languagetool.rules;
+
+import org.languagetool.AnalyzedTokenReadings;
+import org.languagetool.rules.patterns.RuleFilter;
+import org.languagetool.tools.StringTools;
+
+import java.io.IOException;
+import java.util.Map;
+
+public class UnderlineSpacesFilter extends RuleFilter {
+
+ /*
+ * Underline the whitespaces before and/or after the marker in the pattern
+ */
+ public RuleMatch acceptRuleMatch(RuleMatch match, Map arguments, int patternTokenPos,
+ AnalyzedTokenReadings[] patternTokens) throws IOException {
+ String underlineSpaces = getRequired("underlineSpaces", arguments); // before/after/both
+ String sentence = match.getSentence().getText();
+ if (underlineSpaces.equals("before") || underlineSpaces.equals("both")) {
+ if (match.getFromPos() - 1 >= 0
+ && StringTools.isWhitespace(sentence.substring(match.getFromPos() - 1, match.getFromPos()))) {
+ match.setOffsetPosition(match.getFromPos() - 1, match.getToPos());
+ }
+ }
+ if (underlineSpaces.equals("after") || underlineSpaces.equals("both")) {
+ if (match.getToPos() + 1 < sentence.length()
+ && StringTools.isWhitespace(sentence.substring(match.getToPos(), match.getToPos() + 1))) {
+ match.setOffsetPosition(match.getFromPos(), match.getToPos() + 1);
+ }
+ }
+ return match;
+ }
+}
diff --git a/languagetool-core/src/main/java/org/languagetool/rules/UppercaseSentenceStartRule.java b/languagetool-core/src/main/java/org/languagetool/rules/UppercaseSentenceStartRule.java
index a2d515c72241f..16c0a25dd0bc5 100644
--- a/languagetool-core/src/main/java/org/languagetool/rules/UppercaseSentenceStartRule.java
+++ b/languagetool-core/src/main/java/org/languagetool/rules/UppercaseSentenceStartRule.java
@@ -31,6 +31,8 @@
import org.languagetool.tokenizers.WordTokenizer;
import org.languagetool.tools.StringTools;
+import static java.util.regex.Pattern.compile;
+
/**
* Checks that a sentence starts with an uppercase letter.
*
@@ -39,9 +41,10 @@
public class UppercaseSentenceStartRule extends TextLevelRule {
private static final Pattern NUMERALS_EN =
- Pattern.compile("[a-z]|(m{0,4}(c[md]|d?c{0,3})(x[cl]|l?x{0,3})(i[xv]|v?i{0,3}))$");
- private static final Pattern WHITESPACE_OR_QUOTE = Pattern.compile("[ \"'„«»‘’“”\\n]"); //only ending quote is necessary?
- private static final Pattern SENTENCE_END1 = Pattern.compile("[.?!…]|");
+ compile("[a-z]|(m{0,4}(c[md]|d?c{0,3})(x[cl]|l?x{0,3})(i[xv]|v?i{0,3}))$");
+ private static final Pattern CONTAINS_DIGIT = compile(".*\\d.*");
+ private static final Pattern WHITESPACE_OR_QUOTE = compile("[ \"'„«»‘’“”\\n]"); //only ending quote is necessary?
+ private static final Pattern SENTENCE_END1 = compile("[.?!…]|");
private static final Set EXCEPTIONS = new HashSet<>(Arrays.asList(
"n", // n/a
"w", // w/o
@@ -51,6 +54,8 @@ public class UppercaseSentenceStartRule extends TextLevelRule {
"cc", // cc @daniel => "Cc @daniel" is strange
"pH"
));
+ private static final Pattern DIGIT_DOT = compile("\\d+\\. .*");
+ private static final Pattern LINEBREAK_DIGIT_DOT = compile(".*\n\\d+\\. ");
private final Language language;
@@ -127,8 +132,9 @@ public RuleMatch[] match(List sentences) throws IOException {
matchTokenPos = 3;
}
- if( isException(tokens, matchTokenPos) )
+ if (isException(tokens, matchTokenPos)) {
return toRuleMatchArray(ruleMatches);
+ }
String checkToken = firstToken;
if (thirdToken != null) {
@@ -147,6 +153,9 @@ public RuleMatch[] match(List sentences) throws IOException {
if (lastParagraphString.equals(",") || lastParagraphString.equals(";")) {
preventError = true;
}
+ if (CONTAINS_DIGIT.matcher(tokens[matchTokenPos].getToken()).matches()) {
+ preventError = true;
+ }
if (!SENTENCE_END1.matcher(lastParagraphString).matches() && !isSentenceEnd(lastToken)) {
preventError = true;
}
@@ -164,7 +173,7 @@ public RuleMatch[] match(List sentences) throws IOException {
}
if (isPrevSentenceNumberedList || isUrl(checkToken) || isEMail(checkToken) || firstTokenObj.isImmunized()
- || tokens[matchTokenPos].hasPosTag("_IS_URL")) {
+ || tokens[matchTokenPos].hasPosTag("_IS_URL")) {
preventError = true;
}
@@ -184,7 +193,7 @@ public RuleMatch[] match(List sentences) throws IOException {
// work around that here so the items don't create an error when starting lowercase:
// 1. item one
// 2. item two
- isPrevSentenceNumberedList = sentence.getText().matches("\\d+\\. .*") || sentence.getText().matches(".*\n\\d+\\. ");
+ isPrevSentenceNumberedList = DIGIT_DOT.matcher(sentence.getText()).matches() || LINEBREAK_DIGIT_DOT.matcher(sentence.getText()).matches();
}
return toRuleMatchArray(ruleMatches);
}
diff --git a/languagetool-core/src/main/java/org/languagetool/rules/WordCoherencyDataLoader.java b/languagetool-core/src/main/java/org/languagetool/rules/WordCoherencyDataLoader.java
index a44200347e9eb..992e3c1f40e96 100644
--- a/languagetool-core/src/main/java/org/languagetool/rules/WordCoherencyDataLoader.java
+++ b/languagetool-core/src/main/java/org/languagetool/rules/WordCoherencyDataLoader.java
@@ -18,7 +18,8 @@
*/
package org.languagetool.rules;
-import gnu.trove.THashMap;
+import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
+import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
import org.languagetool.JLanguageTool;
import java.io.*;
@@ -36,7 +37,7 @@ public class WordCoherencyDataLoader {
public Map> loadWords(String path) {
InputStream stream = JLanguageTool.getDataBroker().getFromRulesDirAsStream(path);
- Map> map = new THashMap<>();
+ Map> map = new Object2ObjectOpenHashMap<>();
try (
InputStreamReader reader = new InputStreamReader(stream, StandardCharsets.UTF_8);
BufferedReader br = new BufferedReader(reader)
@@ -53,12 +54,12 @@ public Map> loadWords(String path) {
if(map.containsKey(parts[0])) {
map.get(parts[0]).add(parts[1]);
} else {
- map.put(parts[0], Stream.of(parts[1]).collect(Collectors.toSet()));
+ map.put(parts[0], Stream.of(parts[1]).collect(Collectors.toCollection(ObjectOpenHashSet::new)));
}
if(map.containsKey(parts[1])) {
map.get(parts[1]).add(parts[0]);
} else {
- map.put(parts[1], Stream.of(parts[0]).collect(Collectors.toSet()));
+ map.put(parts[1], Stream.of(parts[0]).collect(Collectors.toCollection(ObjectOpenHashSet::new)));
}
}
} catch (IOException e) {
diff --git a/languagetool-core/src/main/java/org/languagetool/rules/ngrams/ConfusionProbabilityRule.java b/languagetool-core/src/main/java/org/languagetool/rules/ngrams/ConfusionProbabilityRule.java
index 84748be5373d9..6c68a1d2b4aab 100644
--- a/languagetool-core/src/main/java/org/languagetool/rules/ngrams/ConfusionProbabilityRule.java
+++ b/languagetool-core/src/main/java/org/languagetool/rules/ngrams/ConfusionProbabilityRule.java
@@ -55,8 +55,8 @@ public abstract class ConfusionProbabilityRule extends Rule {
public static final float MIN_COVERAGE = 0.5f;
// the minimum value the more probable variant needs to have to be considered:
private static final double MIN_PROB = 0.0; // try values > 0 to avoid false alarms
-
private static final boolean DEBUG = false; // also see DEBUG in BaseLanguageModel.java
+ private static final Pattern REAL_WORD = Pattern.compile("\\p{L}+");
// Speed up the server use case, where rules get initialized for every call:
private static final LoadingCache>> confSetCache = CacheBuilder.newBuilder()
@@ -214,7 +214,7 @@ private String cleanId(String id) {
}
private boolean isRealWord(String token) {
- return token.matches("[\\p{L}]+");
+ return REAL_WORD.matcher(token).matches();
}
private boolean isLocalException(AnalyzedSentence sentence, GoogleToken googleToken) {
diff --git a/languagetool-core/src/main/java/org/languagetool/rules/patterns/PatternRuleMatcher.java b/languagetool-core/src/main/java/org/languagetool/rules/patterns/PatternRuleMatcher.java
index b657ab234976e..4c3912211a3b6 100644
--- a/languagetool-core/src/main/java/org/languagetool/rules/patterns/PatternRuleMatcher.java
+++ b/languagetool-core/src/main/java/org/languagetool/rules/patterns/PatternRuleMatcher.java
@@ -48,6 +48,11 @@ final public class PatternRuleMatcher extends AbstractPatternRulePerformer imple
.compile(RuleMatch.SUGGESTION_START_TAG + PatternRuleHandler.PLEASE_SPELL_ME
+ allowedChars + "(\\(" + allowedChars + "\\)|" + MISTAKE + ")" + allowedChars
+ RuleMatch.SUGGESTION_END_TAG);
+ private static final Pattern SINGLE_QUOTE = Pattern.compile("'");
+ private static final Pattern WHITESPACE_OR_PUNCT = Pattern.compile("[\\s,:;.!?].*");
+ private static final Pattern TAG_AND_PLEASE_SPELL_ME = Pattern.compile(RuleMatch.SUGGESTION_START_TAG + PatternRuleHandler.PLEASE_SPELL_ME);
+ private static final Pattern PLEASE_SPELL_ME_PATTERN = Pattern.compile(PatternRuleHandler.PLEASE_SPELL_ME);
+ private static final Pattern MISTAKE_PATTERN = Pattern.compile(MISTAKE);
private final boolean useList;
//private final Integer slowMatchThreshold;
@@ -144,7 +149,7 @@ private RuleMatch createRuleMatch(int[] tokenPositions,
boolean isInputAllUppercase = StringTools.isAllUppercase(inputTokens);
// one-character words (A, J', L') are not enough to consider it an all-uppercase word
boolean isAllUppercase = isInputAllUppercase &&
- (firstMatchTokenObj.getToken().replaceAll("'", "").length() > 1 || lastMatchToken > idx)
+ (SINGLE_QUOTE.matcher(firstMatchTokenObj.getToken()).replaceAll("").length() > 1 || lastMatchToken > idx)
&& matchPreservesCase(rule.getSuggestionMatches(), rule.getMessage())
&& matchPreservesCase(rule.getSuggestionMatchesOutMsg(), rule.getSuggestionsOutMsg());
isAllUppercase = isAllUppercase && rule.isAdjustSuggestionCase();
@@ -179,7 +184,8 @@ && matchPreservesCase(rule.getSuggestionMatches(), rule.getMessage())
// then do not create the rule match
if (!(errMessage.contains(PatternRuleHandler.PLEASE_SPELL_ME) && !errMessage.contains(RuleMatch.SUGGESTION_START_TAG)
&& !suggestionsOutMsg.contains(RuleMatch.SUGGESTION_START_TAG))) {
- String clearMsg = errMessage.replaceAll(PatternRuleHandler.PLEASE_SPELL_ME, "").replaceAll(MISTAKE, "");
+ String clearMsg = PLEASE_SPELL_ME_PATTERN.matcher(errMessage).replaceAll("");
+ clearMsg = MISTAKE_PATTERN.matcher(clearMsg).replaceAll("");
RuleMatch ruleMatch = new RuleMatch(rule, sentence, fromPos, toPos, tokens[firstMatchToken].getStartPos(), tokens[lastMatchToken].getEndPos(),
clearMsg, shortErrMessage, startsWithUppercase, isAllUppercase, suggestionsOutMsg, true);
ruleMatch.setType(rule.getType());
@@ -343,7 +349,7 @@ private String formatMatches(AnalyzedTokenReadings[] tokenReadings,
private static String concatWithoutExtraSpace(String leftSide, String rightSide) {
// can't do \\p{Punct} as it catches \2 placeholder
- if (leftSide.endsWith(" ") && rightSide.matches("[\\s,:;.!?].*")) {
+ if (leftSide.endsWith(" ") && WHITESPACE_OR_PUNCT.matcher(rightSide).matches()) {
return leftSide.substring(0, leftSide.length()-1) + rightSide;
}
if (leftSide.endsWith("suggestion>") && rightSide.startsWith(" ")) {
@@ -359,7 +365,7 @@ private static String removeSuppressMisspelled(String s) {
Matcher matcher = SUGGESTION_PATTERN_SUPPRESS.matcher(result);
result = matcher.replaceAll("");
// remove the remaining tags in suggestions but not in the message
- result = result.replaceAll(RuleMatch.SUGGESTION_START_TAG + PatternRuleHandler.PLEASE_SPELL_ME, RuleMatch.SUGGESTION_START_TAG);
+ result = TAG_AND_PLEASE_SPELL_ME.matcher(result).replaceAll(RuleMatch.SUGGESTION_START_TAG);
return result;
}
diff --git a/languagetool-core/src/main/java/org/languagetool/rules/patterns/RuleFilterEvaluator.java b/languagetool-core/src/main/java/org/languagetool/rules/patterns/RuleFilterEvaluator.java
index 54c88713081c8..c2dc0e4c3b37e 100644
--- a/languagetool-core/src/main/java/org/languagetool/rules/patterns/RuleFilterEvaluator.java
+++ b/languagetool-core/src/main/java/org/languagetool/rules/patterns/RuleFilterEvaluator.java
@@ -24,6 +24,7 @@
import java.io.IOException;
import java.util.*;
+import java.util.regex.Pattern;
/**
* Evaluates a {@link RuleFilter}.
@@ -31,6 +32,8 @@
*/
public class RuleFilterEvaluator {
+ private static final Pattern WHITESPACE = Pattern.compile("\\s+");
+
private final RuleFilter filter;
public RuleFilterEvaluator(RuleFilter filter) {
@@ -48,7 +51,7 @@ public RuleMatch runFilter(String filterArgs, RuleMatch ruleMatch, AnalyzedToken
*/
public Map getResolvedArguments(String filterArgs, AnalyzedTokenReadings[] patternTokens, int patternTokenPos, List tokenPositions) {
Map result = new HashMap<>();
- String[] arguments = filterArgs.split("\\s+");
+ String[] arguments = WHITESPACE.split(filterArgs);
for (String arg : arguments) {
int delimPos = arg.indexOf(':');
if (delimPos == -1) {
diff --git a/languagetool-core/src/main/java/org/languagetool/rules/spelling/ForeignLanguageChecker.java b/languagetool-core/src/main/java/org/languagetool/rules/spelling/ForeignLanguageChecker.java
index 00432dab9bfe0..956cfbc542a10 100644
--- a/languagetool-core/src/main/java/org/languagetool/rules/spelling/ForeignLanguageChecker.java
+++ b/languagetool-core/src/main/java/org/languagetool/rules/spelling/ForeignLanguageChecker.java
@@ -21,6 +21,7 @@
package org.languagetool.rules.spelling;
import lombok.extern.slf4j.Slf4j;
+import org.jetbrains.annotations.NotNull;
import org.languagetool.DetectedLanguage;
import org.languagetool.Language;
import org.languagetool.language.identifier.LanguageIdentifier;
@@ -28,20 +29,24 @@
import java.io.IOException;
import java.util.Collections;
+import java.util.LinkedHashMap;
import java.util.List;
+import java.util.Map;
@Slf4j
public class ForeignLanguageChecker {
private static final float ERROR_THRESHOLD = 0.45f;
private static final int MIN_SENTENCE_THRESHOLD = 3;
+ private static final int MAX_SCORING_LANGUAGES = 5;
public static final String NO_FOREIGN_LANG_DETECTED = "NO_FOREIGN_LANG_DETECTED";
+
private final String languageShortCode;
private final String sentence;
private final long sentenceLength;
private final List preferredLanguages;
-
+
public ForeignLanguageChecker(String languageShortCode, String sentence, Long sentenceLength, List preferredLanguages) {
this.languageShortCode = languageShortCode;
this.sentence = sentence;
@@ -49,30 +54,38 @@ public ForeignLanguageChecker(String languageShortCode, String sentence, Long se
this.preferredLanguages = Collections.unmodifiableList(preferredLanguages);
}
- public String check(int matchesSoFar) throws IOException {
+ @NotNull
+ public Map check(int matchesSoFar) throws IOException {
float errorRatio = (float) matchesSoFar / sentenceLength;
if (sentenceLength >= MIN_SENTENCE_THRESHOLD && errorRatio >= ERROR_THRESHOLD) {
LanguageIdentifier langIdent = LanguageIdentifierService.INSTANCE.getInitialized();
if (langIdent != null) {
- DetectedLanguage langDetectResults = langIdent.detectLanguage(sentence, Collections.emptyList(), preferredLanguages);
//for now, we just use the result if also in preferredLanguages to prevent false positive
- if (langDetectResults != null) {
- Language detectedLanguage = langDetectResults.getDetectedLanguage();
- if (detectedLanguage != null && !detectedLanguage.getShortCode().equals(languageShortCode) && preferredLanguages.contains(detectedLanguage.getShortCode())) {
+ List