Skip to content

Commit

Permalink
performance: make sure the compound data is loaded only once
Browse files Browse the repository at this point in the history
  • Loading branch information
danielnaber committed Apr 4, 2015
1 parent ccdaad5 commit 4d0d957
Show file tree
Hide file tree
Showing 14 changed files with 224 additions and 107 deletions.
Expand Up @@ -21,13 +21,9 @@
import org.languagetool.AnalyzedSentence;
import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
import org.languagetool.tools.StringTools;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.*;
import java.util.concurrent.ArrayBlockingQueue;

Expand All @@ -38,11 +34,7 @@
*/
public abstract class AbstractCompoundRule extends Rule {

private static final int MAX_TERMS = 5;

private final Set<String> incorrectCompounds = new HashSet<>();
private final Set<String> noDashSuggestion = new HashSet<>();
private final Set<String> onlyDashSuggestion = new HashSet<>();
static final int MAX_TERMS = 5;

private final String withHyphenMessage;
private final String withoutHyphenMessage;
Expand All @@ -56,26 +48,21 @@ public abstract class AbstractCompoundRule extends Rule {
@Override
public abstract String getDescription();

/** @since 3.0 */
protected abstract CompoundRuleData getCompoundRuleData();

/**
* @since 2.8
* @since 3.0
*/
public AbstractCompoundRule(ResourceBundle messages, List<String> fileNames,
public AbstractCompoundRule(ResourceBundle messages,
String withHyphenMessage, String withoutHyphenMessage, String withOrWithoutHyphenMessage) throws IOException {
super.setCategory(new Category(messages.getString("category_misc")));
for (String fileName : fileNames) {
loadCompoundFile(fileName, "UTF-8");
}
this.withHyphenMessage = withHyphenMessage;
this.withoutHyphenMessage = withoutHyphenMessage;
this.withOrWithoutHyphenMessage = withOrWithoutHyphenMessage;
setLocQualityIssueType(ITSIssueType.Misspelling);
}

public AbstractCompoundRule(final ResourceBundle messages, final String fileName,
final String withHyphenMessage, final String withoutHyphenMessage, final String withOrWithoutHyphenMessage) throws IOException {
this(messages, Collections.singletonList(fileName), withHyphenMessage, withoutHyphenMessage, withOrWithoutHyphenMessage);
}

public void setShort(final String shortDescription) {
shortDesc = shortDescription;
}
Expand Down Expand Up @@ -124,15 +111,15 @@ public RuleMatch[] match(final AnalyzedSentence sentence) {
for (int k = stringsToCheck.size()-1; k >= 0; k--) {
final String stringToCheck = stringsToCheck.get(k);
final String origStringToCheck = origStringsToCheck.get(k);
if (incorrectCompounds.contains(stringToCheck)) {
if (getCompoundRuleData().getIncorrectCompounds().contains(stringToCheck)) {
final AnalyzedTokenReadings atr = stringToToken.get(stringToCheck);
String msg = null;
final List<String> replacement = new ArrayList<>();
if (!noDashSuggestion.contains(stringToCheck)) {
if (!getCompoundRuleData().getNoDashSuggestion().contains(stringToCheck)) {
replacement.add(origStringToCheck.replace(' ', '-'));
msg = withHyphenMessage;
}
if (isNotAllUppercase(origStringToCheck) && !onlyDashSuggestion.contains(stringToCheck)) {
if (isNotAllUppercase(origStringToCheck) && !getCompoundRuleData().getOnlyDashSuggestion().contains(stringToCheck)) {
replacement.add(mergeCompound(origStringToCheck));
msg = withoutHyphenMessage;
}
Expand Down Expand Up @@ -231,45 +218,6 @@ private void addToQueue(final AnalyzedTokenReadings token, final Queue<AnalyzedT
}
}

private void loadCompoundFile(final String fileName, final String encoding) throws IOException {
InputStream stream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(fileName);
try (
InputStreamReader reader = new InputStreamReader(stream, encoding);
BufferedReader br = new BufferedReader(reader)
) {
String line;
while ((line = br.readLine()) != null) {
if (line.length() < 1 || line.charAt(0) == '#') {
continue; // ignore comments
}
// the set contains the incorrect spellings, i.e. the ones without hyphen
line = line.replace('-', ' ');
final String[] parts = line.split(" ");
if (parts.length > MAX_TERMS) {
throw new IOException("Too many compound parts in file " + fileName + ": " + line + ", maximum allowed: " + MAX_TERMS);
}
if (parts.length == 1) {
throw new IOException("Not a compound in file " + fileName + ": " + line);
}
if (line.endsWith("+")) {
line = removeLastCharacter(line);
noDashSuggestion.add(line.toLowerCase());
} else if (line.endsWith("*")) {
line = removeLastCharacter(line);
onlyDashSuggestion.add(line.toLowerCase());
}
if (incorrectCompounds.contains(line.toLowerCase())) {
throw new RuntimeException("Duplicated word in file " + fileName + ": " + line);
}
incorrectCompounds.add(line.toLowerCase());
}
}
}

private String removeLastCharacter(String str) {
return str.substring(0, str.length() - 1);
}

@Override
public void reset() {
}
Expand Down
@@ -0,0 +1,107 @@
/* LanguageTool, a natural language style checker
* Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.rules;

import org.languagetool.JLanguageTool;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

/**
 * Data about words that are compounds and should thus not be written
 * as separate words.
 *
 * <p>The data is read from one or more resource files when the object is
 * constructed. In these files, empty lines and lines starting with {@code #}
 * are skipped. Every other line is one compound whose parts are separated by
 * a space or a hyphen; a trailing {@code +} puts the entry into the
 * "no dash suggestion" set and a trailing {@code *} puts it into the
 * "only dash suggestion" set. All entries are stored lowercased, with hyphens
 * replaced by spaces.
 *
 * <p>The sets are only filled by the constructors; the getters hand out
 * read-only views.
 *
 * @since 3.0
 */
public class CompoundRuleData {

  private final Set<String> incorrectCompounds = new HashSet<>();
  private final Set<String> noDashSuggestion = new HashSet<>();
  private final Set<String> onlyDashSuggestion = new HashSet<>();

  // Read-only views are created once instead of on every getter call, as the
  // getters are called repeatedly while matching a sentence.
  private final Set<String> incorrectCompoundsView = Collections.unmodifiableSet(incorrectCompounds);
  private final Set<String> noDashSuggestionView = Collections.unmodifiableSet(noDashSuggestion);
  private final Set<String> onlyDashSuggestionView = Collections.unmodifiableSet(onlyDashSuggestion);

  /**
   * Loads the compound data from a single resource file.
   *
   * @param path path of the compound file, resolved via the resource directory
   * @throws RuntimeException if the file cannot be read or has invalid content
   */
  public CompoundRuleData(String path) {
    this(new String[] {path});
  }

  /**
   * Loads the compound data from all given resource files into the same sets.
   *
   * @param paths paths of the compound files, resolved via the resource directory
   * @throws RuntimeException if a file cannot be read, has invalid content, or
   *         contains an entry that was already loaded
   */
  public CompoundRuleData(String... paths) {
    for (String path : paths) {
      try {
        loadCompoundFile(path);
      } catch (IOException e) {
        throw new RuntimeException("Could not load compound data from " + path, e);
      }
    }
  }

  /** Returns a read-only view of all loaded compounds in their space-separated, lowercase form. */
  Set<String> getIncorrectCompounds() {
    return incorrectCompoundsView;
  }

  /** Returns a read-only view of the entries that were marked with a trailing {@code +}. */
  Set<String> getNoDashSuggestion() {
    return noDashSuggestionView;
  }

  /** Returns a read-only view of the entries that were marked with a trailing {@code *}. */
  Set<String> getOnlyDashSuggestion() {
    return onlyDashSuggestionView;
  }

  /**
   * Reads one compound file and adds its entries to the sets.
   *
   * @param path path of the compound file, resolved via the resource directory
   * @throws IOException if reading fails, or if a line has only one part or
   *         more than {@code AbstractCompoundRule.MAX_TERMS} parts
   * @throws RuntimeException if an entry is already known
   */
  private void loadCompoundFile(final String path) throws IOException {
    // The stream is part of the try-with-resources so it is closed even if
    // setting up the readers fails.
    try (
      InputStream stream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(path);
      InputStreamReader reader = new InputStreamReader(stream, StandardCharsets.UTF_8);
      BufferedReader br = new BufferedReader(reader)
    ) {
      String line;
      while ((line = br.readLine()) != null) {
        if (line.length() < 1 || line.charAt(0) == '#') {
          continue;  // ignore empty lines and comments
        }
        // the set contains the incorrect spellings, i.e. the ones without hyphen
        line = line.replace('-', ' ');
        final String[] parts = line.split(" ");
        if (parts.length > AbstractCompoundRule.MAX_TERMS) {
          throw new IOException("Too many compound parts in file " + path + ": " + line + ", maximum allowed: " + AbstractCompoundRule.MAX_TERMS);
        }
        if (parts.length == 1) {
          throw new IOException("Not a compound in file " + path + ": " + line);
        }
        // A trailing marker selects an additional set and is not part of the entry itself.
        Set<String> markerSet = null;
        if (line.endsWith("+")) {
          markerSet = noDashSuggestion;
        } else if (line.endsWith("*")) {
          markerSet = onlyDashSuggestion;
        }
        if (markerSet != null) {
          line = removeLastCharacter(line);
        }
        final String lowercaseLine = line.toLowerCase();
        if (markerSet != null) {
          markerSet.add(lowercaseLine);
        }
        // add() returns false if the entry was already present
        if (!incorrectCompounds.add(lowercaseLine)) {
          throw new RuntimeException("Duplicated word in file " + path + ": " + line);
        }
      }
    }
  }

  /** Returns {@code str} without its last character; {@code str} must not be empty. */
  private String removeLastCharacter(String str) {
    return str.substring(0, str.length() - 1);
  }

}
Expand Up @@ -19,11 +19,10 @@
package org.languagetool.rules.de;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.ResourceBundle;

import org.languagetool.rules.AbstractCompoundRule;
import org.languagetool.rules.CompoundRuleData;
import org.languagetool.rules.Example;

/**
Expand All @@ -34,10 +33,10 @@
*/
public class CompoundRule extends AbstractCompoundRule {

private static final List<String> FILE_NAMES = Arrays.asList("/de/compounds.txt", "/de/compound-cities.txt");
private static final CompoundRuleData compoundData = new CompoundRuleData("/de/compounds.txt", "/de/compound-cities.txt");

public CompoundRule(final ResourceBundle messages) throws IOException {
super(messages, FILE_NAMES,
super(messages,
"Dieses Wort wird mit Bindestrich geschrieben.",
"Dieses Wort wird zusammengeschrieben.",
"Dieses Wort wird zusammen oder mit Bindestrich geschrieben.");
Expand All @@ -46,7 +45,6 @@ public CompoundRule(final ResourceBundle messages) throws IOException {
Example.fixed("Wenn es schlimmer wird, solltest Du zum <marker>HNO-Arzt</marker> gehen."));
}


@Override
public String getId() {
return "DE_COMPOUNDS";
Expand All @@ -56,4 +54,9 @@ public String getId() {
public String getDescription() {
return "Zusammenschreibung von Wörtern, z.B. 'CD-ROM' statt 'CD ROM'";
}

@Override
protected CompoundRuleData getCompoundRuleData() {
return compoundData;
}
}
Expand Up @@ -22,25 +22,27 @@
import java.util.ResourceBundle;

import org.languagetool.rules.AbstractCompoundRule;
import org.languagetool.rules.CompoundRuleData;
import org.languagetool.rules.Example;

/**
* Checks that compounds (if in the list) are not written as separate words.
*/
public class CompoundRule extends AbstractCompoundRule {

private static final String FILE_NAME = "/en/compounds.txt";

// static to make sure this gets loaded only once:
private static final CompoundRuleData compoundData = new CompoundRuleData("/en/compounds.txt");

public CompoundRule(final ResourceBundle messages) throws IOException {
super(messages, FILE_NAME,
super(messages,
"This word is normally spelled with hyphen.",
"This word is normally spelled as one.",
"This expression is normally spelled as one or with hyphen.");
super.setShort("Hyphenation problem");
addExamplePair(Example.wrong("I now have a <marker>part time</marker> job."),
Example.fixed("I now have a <marker>part-time</marker> job."));
}

@Override
public String getId() {
return "EN_COMPOUNDS";
Expand All @@ -49,6 +51,11 @@ public String getId() {
@Override
public String getDescription() {
return "Hyphenated words, e.g., 'case-sensitive' instead of 'case sensitive'";
}
}

@Override
protected CompoundRuleData getCompoundRuleData() {
return compoundData;
}

}
Expand Up @@ -22,17 +22,18 @@
import java.util.ResourceBundle;

import org.languagetool.rules.AbstractCompoundRule;
import org.languagetool.rules.CompoundRuleData;
import org.languagetool.rules.Example;

/**
* Checks that compounds (if in the list) are not written as separate words.
*/
public class CompoundRule extends AbstractCompoundRule {

private static final String FILE_NAME = "/fr/compounds.txt";
private static final CompoundRuleData compoundData = new CompoundRuleData("/fr/compounds.txt");

public CompoundRule(final ResourceBundle messages) throws IOException {
super(messages, FILE_NAME,
super(messages,
"Écrivez avec un trait d’union.",
"Écrivez avec un mot seul sans espace ni trait d’union.",
"Écrivez avec un mot seul ou avec trait d’union.");
Expand All @@ -51,4 +52,9 @@ public String getDescription() {
return "Mots avec trait d’union";
}

@Override
protected CompoundRuleData getCompoundRuleData() {
return compoundData;
}

}
Expand Up @@ -19,6 +19,7 @@
package org.languagetool.rules.nl;

import org.languagetool.rules.AbstractCompoundRule;
import org.languagetool.rules.CompoundRuleData;

import java.io.IOException;
import java.util.ResourceBundle;
Expand All @@ -28,10 +29,10 @@
*/
public class CompoundRule extends AbstractCompoundRule {

private static final String FILE_NAME = "/nl/compounds.txt";
private static final CompoundRuleData compoundData = new CompoundRuleData("/nl/compounds.txt");

public CompoundRule(final ResourceBundle messages) throws IOException {
super(messages, FILE_NAME,
super(messages,
"Hier wordt een koppelteken verwacht.",
"Dit woord hoort waarschijnlijk aaneengeschreven.",
"Deze uitdrukking hoort mogelijk aan elkaar, eventueel met een koppelteken.");
Expand All @@ -48,4 +49,9 @@ public String getDescription() {
return "Woorden die aaneen horen met koppeltekens, bijvoorbeeld 'zee-egel' i.p.v. 'zee egel'.\n";
}

@Override
protected CompoundRuleData getCompoundRuleData() {
return compoundData;
}

}

0 comments on commit 4d0d957

Please sign in to comment.