training class can now work on pre-extracted sentences (much faster than parsing the Wikipedia XML on each run)
danielnaber committed Apr 28, 2015
1 parent 1a7419d commit 9346c05
Showing 4 changed files with 120 additions and 19 deletions.
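The workflow this commit enables has two steps: extract the sentences containing each target word from the Wikipedia XML dump once, writing them to a <word>.txt file, then point the trainer at that directory on every later run. A minimal sketch of the one-time extraction step, assuming only the WikipediaSentenceSource API that appears in the diff below (the helper class itself is hypothetical, not part of this commit):

    // Hypothetical one-time extraction step (not part of this commit): write every
    // Wikipedia sentence containing a given word to <word>.txt, one per line, so
    // later training runs can skip the slow XML parsing.
    import org.languagetool.Language;
    import org.languagetool.Languages;
    import org.languagetool.dev.dumpcheck.Sentence;
    import org.languagetool.dev.dumpcheck.WikipediaSentenceSource;

    import java.io.*;
    import java.nio.charset.StandardCharsets;

    class SentenceExtractor {
      public static void main(String[] args) throws IOException {
        String xmlDump = args[0];  // e.g. a Wikipedia XML dump file
        String word = args[1];     // e.g. "their"
        Language language = Languages.getLanguageForShortName("en");
        try (FileInputStream fis = new FileInputStream(xmlDump);
             PrintWriter out = new PrintWriter(new OutputStreamWriter(
                 new FileOutputStream(word + ".txt"), StandardCharsets.UTF_8))) {
          WikipediaSentenceSource source = new WikipediaSentenceSource(fis, language);
          while (source.hasNext()) {
            Sentence sentence = source.next();
            // same naive word-boundary match the trainer itself uses (see the TODO in the diff)
            if (sentence.getText().matches(".*\\b" + word + "\\b.*")) {
              out.println(sentence.getText());
            }
          }
        }
      }
    }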
WikipediaTrainingDataGenerator.java
@@ -22,7 +22,9 @@
 import org.encog.neural.networks.BasicNetwork;
 import org.languagetool.Language;
 import org.languagetool.Languages;
+import org.languagetool.dev.dumpcheck.PlainTextSentenceSource;
 import org.languagetool.dev.dumpcheck.Sentence;
+import org.languagetool.dev.dumpcheck.SentenceSource;
 import org.languagetool.dev.dumpcheck.WikipediaSentenceSource;
 import org.languagetool.dev.eval.FMeasure;
 import org.languagetool.languagemodel.LanguageModel;
@@ -64,17 +66,17 @@ class WikipediaTrainingDataGenerator {
     this.languageModel = languageModel;
   }

-  private void run(File corpusFile, String token, String homophoneToken) throws IOException {
+  private void run(File corpusFileOrDir, String token, String homophoneToken) throws IOException {

-    List<Sentence> allSentences = getRelevantSentences(corpusFile, token, MAX_SENTENCES_CORRECT);
+    List<Sentence> allSentences = getRelevantSentences(corpusFileOrDir, token, MAX_SENTENCES_CORRECT);
     ListSplit<Sentence> split = split(allSentences, TEST_SET_FACTOR);
     List<Sentence> trainingSentences = split.trainingList;
     List<Sentence> testSentences = split.testList;
     System.out.println("Found " + trainingSentences.size() + " training sentences with '" + token + "'");
     System.out.println("Found " + testSentences.size() + " test sentences with '" + token + "'");

     // Load the sentences with a homophone to and later replace it so we get error sentences:
-    List<Sentence> allHomophoneSentences = getRelevantSentences(corpusFile, homophoneToken, MAX_SENTENCES_ERROR);
+    List<Sentence> allHomophoneSentences = getRelevantSentences(corpusFileOrDir, homophoneToken, MAX_SENTENCES_ERROR);
     ListSplit<Sentence> homophoneSplit = split(allHomophoneSentences, TEST_SET_FACTOR);
     List<Sentence> homophoneTrainingSentences = homophoneSplit.trainingList;
     List<Sentence> homophoneTestSentences = homophoneSplit.testList;
@@ -153,20 +155,37 @@ private String asString(boolean b) {
     return b ? "+" : "-";
   }

-  private List<Sentence> getRelevantSentences(File corpusFile, String token, int maxSentences) throws IOException {
+  private List<Sentence> getRelevantSentences(File corpusFileOrDir, String token, int maxSentences) throws IOException {
+    List<Sentence> sentences;
+    if (corpusFileOrDir.isDirectory()) {
+      File file = new File(corpusFileOrDir, token + ".txt");
+      if (!file.exists()) {
+        throw new RuntimeException("File with example sentences not found: " + file);
+      }
+      try (FileInputStream fis = new FileInputStream(file)) {
+        SentenceSource sentenceSource = new PlainTextSentenceSource(fis, language);
+        sentences = getSentencesFromSource(corpusFileOrDir, token, maxSentences, sentenceSource);
+      }
+    } else {
+      try (FileInputStream fis = new FileInputStream(corpusFileOrDir)) {
+        SentenceSource sentenceSource = new WikipediaSentenceSource(fis, language);
+        sentences = getSentencesFromSource(corpusFileOrDir, token, maxSentences, sentenceSource);
+      }
+    }
+    return sentences;
+  }
+
+  private List<Sentence> getSentencesFromSource(File corpusFile, String token, int maxSentences, SentenceSource sentenceSource) {
     List<Sentence> sentences = new ArrayList<>();
-    try (FileInputStream fis = new FileInputStream(corpusFile)) {
-      WikipediaSentenceSource source = new WikipediaSentenceSource(fis, language);
-      while (source.hasNext()) {
-        Sentence sentence = source.next();
-        if (sentence.getText().matches(".*\\b" + token + "\\b.*")) { // TODO: use real tokenizer?
-          sentences.add(sentence);
-          if (sentences.size() % 25 == 0) {
-            System.out.println("Loaded sentence " + sentences.size() + " with '" + token + "' from " + corpusFile.getName());
-          }
-          if (sentences.size() >= maxSentences) {
-            break;
-          }
-        }
-      }
-    }
+    while (sentenceSource.hasNext()) {
+      Sentence sentence = sentenceSource.next();
+      if (sentence.getText().matches(".*\\b" + token + "\\b.*")) {
+        sentences.add(sentence);
+        if (sentences.size() % 25 == 0) {
+          System.out.println("Loaded sentence " + sentences.size() + " with '" + token + "' from " + corpusFile.getName());
+        }
+        if (sentences.size() >= maxSentences) {
+          break;
+        }
+      }
+    }
@@ -296,8 +315,10 @@ static class ListSplit<T> {
   public static void main(String[] args) throws IOException {
     if (args.length != 3) {
       System.err.println("Usage: " + WikipediaTrainingDataGenerator.class.getSimpleName()
-          + " <langCode> <wikipediaXml> <languageModelTopDir>");
+          + " <langCode> <wikipediaXml|dir> <languageModelTopDir>");
       System.err.println("   <languageModelTopDir> is a directory with sub-directories '2grams' and/or '3grams' with Lucene indexes");
+      System.err.println("   <wikipediaXml|dir> either a Wikipedia XML dump or");
+      System.err.println("       a directory with example sentences (where <word>.txt contains only the sentences for <word>)");
       System.exit(1);
     }
     Language lang = Languages.getLanguageForShortName(args[0]);
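For illustration, the two invocation forms the new usage message describes would look roughly like this (class path, language code, and paths are placeholders, not from the commit):

    java <classpath> WikipediaTrainingDataGenerator en enwiki-pages-articles.xml /data/ngram-index
    java <classpath> WikipediaTrainingDataGenerator en /data/extracted-sentences /data/ngram-index

In the second form, /data/extracted-sentences would hold one <word>.txt file per target word, with one sentence per line.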
PlainTextSentenceSource.java (new file)
@@ -0,0 +1,80 @@
/* LanguageTool, a natural language style checker
* Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.dev.dumpcheck;

import org.languagetool.Language;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Scanner;

/**
* Provides access to the relevant sentences of a plain text file
* with one sentence per line.
* @since 3.0
*/
public class PlainTextSentenceSource extends SentenceSource {

private final List<String> sentences;
private final Scanner scanner;

// Each sentence is one article, but count anyway so it's coherent with what the Wikipedia code does:
private int articleCount = 0;

public PlainTextSentenceSource(InputStream textInput, Language language) {
super(language);
scanner = new Scanner(textInput);
sentences = new ArrayList<>();
}

@Override
public boolean hasNext() {
fillSentences();
return sentences.size() > 0;
}

@Override
public Sentence next() {
fillSentences();
if (sentences.size() == 0) {
throw new NoSuchElementException();
}
return new Sentence(sentences.remove(0), getSource(), "<plaintext>", null, ++articleCount);
}

@Override
public String getSource() {
return "plaintext";
}

private void fillSentences() {
while (sentences.size() == 0 && scanner.hasNextLine()) {
String line = scanner.nextLine();
if (line.isEmpty()) {
continue;
}
if (acceptSentence(line)) {
sentences.add(line);
}
}
}

}
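A short usage sketch for the new class (the demo class, file name, and language code are illustrative): iterating the source returns the pre-extracted sentences one by one, with the acceptSentence() filter from the SentenceSource base class already applied.

    // Illustrative only: read back a pre-extracted sentence file through the new source.
    import org.languagetool.Language;
    import org.languagetool.Languages;
    import org.languagetool.dev.dumpcheck.PlainTextSentenceSource;
    import org.languagetool.dev.dumpcheck.Sentence;

    import java.io.FileInputStream;
    import java.io.IOException;

    class PlainTextSourceDemo {
      public static void main(String[] args) throws IOException {
        Language language = Languages.getLanguageForShortName("en");
        try (FileInputStream fis = new FileInputStream("their.txt")) {  // one sentence per line
          PlainTextSentenceSource source = new PlainTextSentenceSource(fis, language);
          while (source.hasNext()) {
            Sentence sentence = source.next();
            System.out.println(sentence.getText());
          }
        }
      }
    }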
SentenceSource.java
@@ -29,7 +29,7 @@
  * or plain text sources.
  * @since 2.4
  */
-abstract class SentenceSource implements Iterator<Sentence> {
+public abstract class SentenceSource implements Iterator<Sentence> {

   private static final int MIN_SENTENCE_SIZE = 10;
   private static final int MIN_SENTENCE_TOKEN_COUNT = 4;
TatoebaSentenceSource.java
@@ -25,7 +25,7 @@

 /**
  * Provides access to the sentences of a Tatoeba (http://tatoeba.org) text
- * file that has already been filtered to contain only one language,
+ * file (tab separated) that has already been filtered to contain only one language.
  * @since 2.4
  */
 class TatoebaSentenceSource extends SentenceSource {
