Skip to content

Commit

Permalink
cleanup to avoid code duplication
Browse files Browse the repository at this point in the history
  • Loading branch information
danielnaber committed Dec 15, 2015
1 parent 4dd5dde commit 2102dd5
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 51 deletions.
Expand Up @@ -18,59 +18,24 @@
*/ */
package org.languagetool.tokenizers; package org.languagetool.tokenizers;


import net.loomchild.segment.srx.SrxDocument;
import org.languagetool.Language; import org.languagetool.Language;


import java.util.List;
import java.util.Objects;

/** /**
* Class to tokenize sentences using an SRX file. * Class to tokenize sentences using an SRX file.
* See <a href="http://wiki.languagetool.org/customizing-sentence-segmentation-in-srx-rules">our wiki</a> * See <a href="http://wiki.languagetool.org/customizing-sentence-segmentation-in-srx-rules">our wiki</a>
* for a description of how we use SRX. * for a description of how we use SRX.
* @see SRXSentenceTokenizer * @see SRXSentenceTokenizer
* @since 2.6 * @since 2.6
* @deprecated use {@link SRXSentenceTokenizer} instead (deprecated since 3.2)
*/ */
public class LocalSRXSentenceTokenizer implements SentenceTokenizer { @Deprecated

public class LocalSRXSentenceTokenizer extends SRXSentenceTokenizer {
private final SrxDocument srxDocument;
private final Language language;

private String parCode;


/** /**
* @param srxInClassPath the path to an SRX file in the classpath * @param srxInClassPath the path to an SRX file in the classpath
*/ */
public LocalSRXSentenceTokenizer(Language language, String srxInClassPath) { public LocalSRXSentenceTokenizer(Language language, String srxInClassPath) {
this.language = Objects.requireNonNull(language); super(language, srxInClassPath);
this.srxDocument = SrxTools.createSrxDocument(srxInClassPath);
setSingleLineBreaksMarksParagraph(false);
}

@Override
public final List<String> tokenize(final String text) {
return SrxTools.tokenize(text, srxDocument, language.getShortName() + parCode);
}

@Override
public final boolean singleLineBreaksMarksPara() {
return "_one".equals(parCode);
}

/**
* @param lineBreakParagraphs
* if <code>true</code>, single line breaks are assumed to end a
* paragraph; if <code>false</code>, only two or more consecutive
* line breaks end a paragraph
*/
@Override
public final void setSingleLineBreaksMarksParagraph(
final boolean lineBreakParagraphs) {
if (lineBreakParagraphs) {
parCode = "_one";
} else {
parCode = "_two";
}
} }

} }
Expand Up @@ -22,31 +22,41 @@
import org.languagetool.Language; import org.languagetool.Language;


import java.util.List; import java.util.List;
import java.util.Objects;


/** /**
* Class to tokenize sentences using LanguageTool's global SRX file for all * Class to tokenize sentences using rules from an SRX file.
* languages. If you add a language that's not part of the official LanguageTool
* distribution, see {@link LocalSRXSentenceTokenizer} instead.
*
* @author Marcin Miłkowski * @author Marcin Miłkowski
* @author Jarek Lipski * @author Jarek Lipski
*/ */
public class SRXSentenceTokenizer implements SentenceTokenizer { public class SRXSentenceTokenizer implements SentenceTokenizer {


private static final SrxDocument DOCUMENT = SrxTools.createSrxDocument("/segment.srx"); private final SrxDocument srxDocument;

private final Language language; private final Language language;


private String parCode; private String parCode;


public SRXSentenceTokenizer(final Language language) { /**
this.language = language; * Build a sentence tokenizer based on the rules in the {@code segment.srx} file
* that comes with LanguageTool.
*/
public SRXSentenceTokenizer(Language language) {
this(language, "/segment.srx");
}

/**
* @param srxInClassPath the path to an SRX file in the classpath
* @since 3.2
*/
public SRXSentenceTokenizer(Language language, String srxInClassPath) {
this.language = Objects.requireNonNull(language);
this.srxDocument = SrxTools.createSrxDocument(srxInClassPath);
setSingleLineBreaksMarksParagraph(false); setSingleLineBreaksMarksParagraph(false);
} }


@Override @Override
public final List<String> tokenize(final String text) { public final List<String> tokenize(final String text) {
return SrxTools.tokenize(text, DOCUMENT, language.getShortName() + parCode); return SrxTools.tokenize(text, srxDocument, language.getShortName() + parCode);
} }


@Override @Override
Expand Down
Expand Up @@ -28,10 +28,10 @@


/** /**
* A very simple sentence tokenizer that splits on {@code [.!?…]} followed by whitespace * A very simple sentence tokenizer that splits on {@code [.!?…]} followed by whitespace
* or an uppercase letter. You probably want to use an adapted {@link LocalSRXSentenceTokenizer} instead. * or an uppercase letter. You probably want to use an adapted {@link SRXSentenceTokenizer} instead.
* @since 2.6 * @since 2.6
*/ */
public class SimpleSentenceTokenizer extends LocalSRXSentenceTokenizer { public class SimpleSentenceTokenizer extends SRXSentenceTokenizer {


public SimpleSentenceTokenizer() { public SimpleSentenceTokenizer() {
super(new AnyLanguage(), "/org/languagetool/tokenizers/segment-simple.srx"); super(new AnyLanguage(), "/org/languagetool/tokenizers/segment-simple.srx");
Expand Down

0 comments on commit 2102dd5

Please sign in to comment.