Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
performance: make sure the compound data is loaded only once
- Loading branch information
1 parent
ccdaad5
commit 4d0d957
Showing
14 changed files
with
224 additions
and
107 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
107 changes: 107 additions & 0 deletions
107
languagetool-core/src/main/java/org/languagetool/rules/CompoundRuleData.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
/* LanguageTool, a natural language style checker | ||
* Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de) | ||
* | ||
* This library is free software; you can redistribute it and/or | ||
* modify it under the terms of the GNU Lesser General Public | ||
* License as published by the Free Software Foundation; either | ||
* version 2.1 of the License, or (at your option) any later version. | ||
* | ||
* This library is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
* Lesser General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU Lesser General Public | ||
* License along with this library; if not, write to the Free Software | ||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 | ||
* USA | ||
*/ | ||
package org.languagetool.rules; | ||
|
||
import org.languagetool.JLanguageTool; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.io.InputStreamReader; | ||
import java.util.Collections; | ||
import java.util.HashSet; | ||
import java.util.Set; | ||
|
||
/** | ||
* Data about words that are compounds and should thus not be written | ||
* as separate words. | ||
* @since 3.0 | ||
*/ | ||
public class CompoundRuleData { | ||
|
||
private final Set<String> incorrectCompounds = new HashSet<>(); | ||
private final Set<String> noDashSuggestion = new HashSet<>(); | ||
private final Set<String> onlyDashSuggestion = new HashSet<>(); | ||
|
||
public CompoundRuleData(String path) { | ||
this(new String[] {path}); | ||
} | ||
|
||
public CompoundRuleData(String... paths) { | ||
for (String path : paths) { | ||
try { | ||
loadCompoundFile(path); | ||
} catch (IOException e) { | ||
throw new RuntimeException("Could not load compound data from " + path, e); | ||
} | ||
} | ||
} | ||
|
||
Set<String> getIncorrectCompounds() { | ||
return Collections.unmodifiableSet(incorrectCompounds); | ||
} | ||
|
||
Set<String> getNoDashSuggestion() { | ||
return Collections.unmodifiableSet(noDashSuggestion); | ||
} | ||
|
||
Set<String> getOnlyDashSuggestion() { | ||
return Collections.unmodifiableSet(onlyDashSuggestion); | ||
} | ||
|
||
private void loadCompoundFile(final String path) throws IOException { | ||
InputStream stream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(path); | ||
try ( | ||
InputStreamReader reader = new InputStreamReader(stream, "utf-8"); | ||
BufferedReader br = new BufferedReader(reader) | ||
) { | ||
String line; | ||
while ((line = br.readLine()) != null) { | ||
if (line.length() < 1 || line.charAt(0) == '#') { | ||
continue; // ignore comments | ||
} | ||
// the set contains the incorrect spellings, i.e. the ones without hyphen | ||
line = line.replace('-', ' '); | ||
final String[] parts = line.split(" "); | ||
if (parts.length > AbstractCompoundRule.MAX_TERMS) { | ||
throw new IOException("Too many compound parts in file " + path + ": " + line + ", maximum allowed: " + AbstractCompoundRule.MAX_TERMS); | ||
} | ||
if (parts.length == 1) { | ||
throw new IOException("Not a compound in file " + path + ": " + line); | ||
} | ||
if (line.endsWith("+")) { | ||
line = removeLastCharacter(line); | ||
noDashSuggestion.add(line.toLowerCase()); | ||
} else if (line.endsWith("*")) { | ||
line = removeLastCharacter(line); | ||
onlyDashSuggestion.add(line.toLowerCase()); | ||
} | ||
if (incorrectCompounds.contains(line.toLowerCase())) { | ||
throw new RuntimeException("Duplicated word in file " + path + ": " + line); | ||
} | ||
incorrectCompounds.add(line.toLowerCase()); | ||
} | ||
} | ||
} | ||
|
||
private String removeLastCharacter(String str) { | ||
return str.substring(0, str.length() - 1); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.