Skip to content

Commit

Permalink
fix ngram logic
Browse files: browse the repository at this point in the history
  • Loading branch information
danielnaber committed Apr 28, 2015
1 parent 9262e5f commit 55c4ce6
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 8 deletions.
Expand Up @@ -254,11 +254,10 @@ private int getLastPosition(Sentence sentence, String token) {
return pos; return pos;
} }


private List<String> getContext(Sentence sentence, int pos, String newToken, int toLeft, int toRight) { List<String> getContext(String plainText, int pos, String newToken, int toLeft, int toRight) {
String plainText = sentence.getText();
List<String> tokens = removeWhitespaceTokens(tokenizer.tokenize(plainText)); List<String> tokens = removeWhitespaceTokens(tokenizer.tokenize(plainText));
List<String> result = new ArrayList<>(); List<String> result = new ArrayList<>();
for (int i = 1; i > 0 && i <= toLeft; i++) { for (int i = toLeft; i > 0 && i <= toLeft; i--) {
if (pos-i < 0 ) { if (pos-i < 0 ) {
result.add(LanguageModel.GOOGLE_SENTENCE_START); // NOTE: only in v2 of the data! result.add(LanguageModel.GOOGLE_SENTENCE_START); // NOTE: only in v2 of the data!
} else { } else {
Expand All @@ -281,12 +280,13 @@ private double[] getFeatures(Sentence sentence, String token, String newToken) {


int position = getLastPosition(sentence, token); int position = getLastPosition(sentence, token);


double ngram2Left = getCountForTuple(getContext(sentence, position, newToken, 1, 0), maxVal); String text = sentence.getText();
double ngram2Right = getCountForTuple(getContext(sentence, position, newToken, 0, 1), maxVal); double ngram2Left = getCountForTuple(getContext(text, position, newToken, 1, 0), maxVal);
double ngram2Right = getCountForTuple(getContext(text, position, newToken, 0, 1), maxVal);


double ngram3Left = getCountForTriple(getContext(sentence, position, newToken, 0, 2), maxVal); double ngram3Left = getCountForTriple(getContext(text, position, newToken, 0, 2), maxVal);
double ngram3Middle = getCountForTriple(getContext(sentence, position, newToken, 1, 1), maxVal); double ngram3Middle = getCountForTriple(getContext(text, position, newToken, 1, 1), maxVal);
double ngram3Right = getCountForTriple(getContext(sentence, position, newToken, 2, 0), maxVal); double ngram3Right = getCountForTriple(getContext(text, position, newToken, 2, 0), maxVal);


return new double[] {ngram2Left, ngram2Right, ngram3Middle, ngram3Left, ngram3Right}; return new double[] {ngram2Left, ngram2Right, ngram3Middle, ngram3Left, ngram3Right};
} }
Expand Down
@@ -0,0 +1,52 @@
/* LanguageTool, a natural language style checker
* Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.dev.errorcorpus;

import org.junit.Test;
import org.languagetool.language.English;
import org.languagetool.languagemodel.LanguageModel;

import static org.hamcrest.core.Is.is;
import static org.junit.Assert.assertThat;

/**
 * Checks the token windows that {@code TrainingDataGenerator.getContext} returns
 * for a fixed sentence, position and replacement token while the requested
 * number of left and right context tokens varies.
 */
public class TrainingDataGeneratorTest {

  /** Sentence handed to {@code getContext} in every assertion. */
  private static final String TEXT = "This is a test.";

  /** Token position handed to {@code getContext} in every assertion. */
  private static final int POSITION = 2;

  /** Replacement token expected to show up in the returned context. */
  private static final String NEW_TOKEN = "XX";

  @Test
  public void test() {
    TrainingDataGenerator generator = new TrainingDataGenerator(new English(), new FakeLanguageModel());
    assertContext(generator, 1, 1, "[is, XX, test]");
    assertContext(generator, 0, 2, "[XX, test, _END_]");
    assertContext(generator, 2, 0, "[This, is, XX]");
    // Asking for more left context than the sentence offers is expected to yield the start marker.
    assertContext(generator, 3, 0, "[_START_, This, is, XX]");
  }

  /**
   * Asserts that the context for the shared sentence, position and replacement token,
   * rendered with {@code toString()}, equals the expected string.
   *
   * @param generator the generator under test
   * @param toLeft number of context tokens requested to the left
   * @param toRight number of context tokens requested to the right
   * @param expected expected {@code toString()} rendering of the returned list
   */
  private static void assertContext(TrainingDataGenerator generator, int toLeft, int toRight, String expected) {
    String actual = generator.getContext(TEXT, POSITION, NEW_TOKEN, toLeft, toRight).toString();
    assertThat(actual, is(expected));
  }

  /**
   * Language model stub that returns fixed counts: 1 for every two-token lookup
   * and 2 for every three-token lookup.
   */
  static class FakeLanguageModel implements LanguageModel {

    @Override
    public long getCount(String token1, String token2) {
      return 1;
    }

    @Override
    public long getCount(String token1, String token2, String token3) {
      return 2;
    }

    @Override
    public void close() {
      // nothing to release
    }
  }

}

0 comments on commit 55c4ce6

Please sign in to comment.