training class can now work on pre-extracted sentences (much faster than parsing the Wikipedia XML on each run)
danielnaber committed Apr 28, 2015
1 parent 1a7419d commit 9346c05
Showing 4 changed files with 120 additions and 19 deletions.
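The workflow this commit enables has two steps: extract the sentences containing each target word from the Wikipedia XML dump once, writing them to a <word>.txt file, then point the trainer at that directory on every later run. A minimal sketch of the one-time extraction step, assuming only the WikipediaSentenceSource API that appears in the diff below (the helper class itself is hypothetical, not part of this commit):

    // Hypothetical one-time extraction step (not part of this commit): write every
    // Wikipedia sentence containing a given word to <word>.txt, one per line, so
    // later training runs can skip the slow XML parsing.
    import org.languagetool.Language;
    import org.languagetool.Languages;
    import org.languagetool.dev.dumpcheck.Sentence;
    import org.languagetool.dev.dumpcheck.WikipediaSentenceSource;

    import java.io.*;
    import java.nio.charset.StandardCharsets;

    class SentenceExtractor {
      public static void main(String[] args) throws IOException {
        String xmlDump = args[0];  // e.g. a Wikipedia XML dump file
        String word = args[1];     // e.g. "their"
        Language language = Languages.getLanguageForShortName("en");
        try (FileInputStream fis = new FileInputStream(xmlDump);
             PrintWriter out = new PrintWriter(new OutputStreamWriter(
                 new FileOutputStream(word + ".txt"), StandardCharsets.UTF_8))) {
          WikipediaSentenceSource source = new WikipediaSentenceSource(fis, language);
          while (source.hasNext()) {
            Sentence sentence = source.next();
            // same naive word-boundary match the trainer itself uses (see the TODO in the diff)
            if (sentence.getText().matches(".*\\b" + word + "\\b.*")) {
              out.println(sentence.getText());
            }
          }
        }
      }
    }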
WikipediaTrainingDataGenerator.java
@@ -22,7 +22,9 @@
 import org.encog.neural.networks.BasicNetwork;
 import org.languagetool.Language;
 import org.languagetool.Languages;
+import org.languagetool.dev.dumpcheck.PlainTextSentenceSource;
 import org.languagetool.dev.dumpcheck.Sentence;
+import org.languagetool.dev.dumpcheck.SentenceSource;
 import org.languagetool.dev.dumpcheck.WikipediaSentenceSource;
 import org.languagetool.dev.eval.FMeasure;
 import org.languagetool.languagemodel.LanguageModel;
@@ -64,17 +66,17 @@ class WikipediaTrainingDataGenerator {
     this.languageModel = languageModel;
   }

-  private void run(File corpusFile, String token, String homophoneToken) throws IOException {
+  private void run(File corpusFileOrDir, String token, String homophoneToken) throws IOException {

-    List<Sentence> allSentences = getRelevantSentences(corpusFile, token, MAX_SENTENCES_CORRECT);
+    List<Sentence> allSentences = getRelevantSentences(corpusFileOrDir, token, MAX_SENTENCES_CORRECT);
     ListSplit<Sentence> split = split(allSentences, TEST_SET_FACTOR);
     List<Sentence> trainingSentences = split.trainingList;
     List<Sentence> testSentences = split.testList;
     System.out.println("Found " + trainingSentences.size() + " training sentences with '" + token + "'");
     System.out.println("Found " + testSentences.size() + " test sentences with '" + token + "'");

     // Load the sentences with a homophone to and later replace it so we get error sentences:
-    List<Sentence> allHomophoneSentences = getRelevantSentences(corpusFile, homophoneToken, MAX_SENTENCES_ERROR);
+    List<Sentence> allHomophoneSentences = getRelevantSentences(corpusFileOrDir, homophoneToken, MAX_SENTENCES_ERROR);
     ListSplit<Sentence> homophoneSplit = split(allHomophoneSentences, TEST_SET_FACTOR);
     List<Sentence> homophoneTrainingSentences = homophoneSplit.trainingList;
     List<Sentence> homophoneTestSentences = homophoneSplit.testList;
@@ -153,20 +155,37 @@ private String asString(boolean b) {
     return b ? "+" : "-";
   }

-  private List<Sentence> getRelevantSentences(File corpusFile, String token, int maxSentences) throws IOException {
+  private List<Sentence> getRelevantSentences(File corpusFileOrDir, String token, int maxSentences) throws IOException {
+    List<Sentence> sentences;
+    if (corpusFileOrDir.isDirectory()) {
+      File file = new File(corpusFileOrDir, token + ".txt");
+      if (!file.exists()) {
+        throw new RuntimeException("File with example sentences not found: " + file);
+      }
+      try (FileInputStream fis = new FileInputStream(file)) {
+        SentenceSource sentenceSource = new PlainTextSentenceSource(fis, language);
+        sentences = getSentencesFromSource(corpusFileOrDir, token, maxSentences, sentenceSource);
+      }
+    } else {
+      try (FileInputStream fis = new FileInputStream(corpusFileOrDir)) {
+        SentenceSource sentenceSource = new WikipediaSentenceSource(fis, language);
+        sentences = getSentencesFromSource(corpusFileOrDir, token, maxSentences, sentenceSource);
+      }
+    }
+    return sentences;
+  }
+
+  private List<Sentence> getSentencesFromSource(File corpusFile, String token, int maxSentences, SentenceSource sentenceSource) {
     List<Sentence> sentences = new ArrayList<>();
-    try (FileInputStream fis = new FileInputStream(corpusFile)) {
-      WikipediaSentenceSource source = new WikipediaSentenceSource(fis, language);
-      while (source.hasNext()) {
-        Sentence sentence = source.next();
-        if (sentence.getText().matches(".*\\b" + token + "\\b.*")) { // TODO: use real tokenizer?
-          sentences.add(sentence);
-          if (sentences.size() % 25 == 0) {
-            System.out.println("Loaded sentence " + sentences.size() + " with '" + token + "' from " + corpusFile.getName());
-          }
-          if (sentences.size() >= maxSentences) {
-            break;
-          }
-        }
-      }
-    }
+    while (sentenceSource.hasNext()) {
+      Sentence sentence = sentenceSource.next();
+      if (sentence.getText().matches(".*\\b" + token + "\\b.*")) {
+        sentences.add(sentence);
+        if (sentences.size() % 25 == 0) {
+          System.out.println("Loaded sentence " + sentences.size() + " with '" + token + "' from " + corpusFile.getName());
+        }
+        if (sentences.size() >= maxSentences) {
+          break;
+        }
+      }
+    }
@@ -296,8 +315,10 @@ static class ListSplit<T> {
   public static void main(String[] args) throws IOException {
     if (args.length != 3) {
       System.err.println("Usage: " + WikipediaTrainingDataGenerator.class.getSimpleName()
-          + " <langCode> <wikipediaXml> <languageModelTopDir>");
+          + " <langCode> <wikipediaXml|dir> <languageModelTopDir>");
       System.err.println("   <languageModelTopDir> is a directory with sub-directories '2grams' and/or '3grams' with Lucene indexes");
+      System.err.println("   <wikipediaXml|dir> either a Wikipedia XML dump or");
+      System.err.println("       a directory with example sentences (where <word>.txt contains only the sentences for <word>)");
       System.exit(1);
     }
     Language lang = Languages.getLanguageForShortName(args[0]);
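For illustration, the two invocation forms the new usage message describes would look roughly like this (class path, language code, and paths are placeholders, not from the commit):

    java <classpath> WikipediaTrainingDataGenerator en enwiki-pages-articles.xml /data/ngram-index
    java <classpath> WikipediaTrainingDataGenerator en /data/extracted-sentences /data/ngram-index

In the second form, /data/extracted-sentences would hold one <word>.txt file per target word, with one sentence per line.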
PlainTextSentenceSource.java (new file)
@@ -0,0 +1,80 @@
/* LanguageTool, a natural language style checker
* Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.dev.dumpcheck;

import org.languagetool.Language;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Scanner;

/**
* Provides access to the relevant sentences of a plain text file
* with one sentence per line.
* @since 3.0
*/
public class PlainTextSentenceSource extends SentenceSource {

private final List<String> sentences;
private final Scanner scanner;

// Each sentence is one article, but count anyway so it's coherent with what the Wikipedia code does:
private int articleCount = 0;

public PlainTextSentenceSource(InputStream textInput, Language language) {
super(language);
scanner = new Scanner(textInput);
sentences = new ArrayList<>();
}

@Override
public boolean hasNext() {
fillSentences();
return sentences.size() > 0;
}

@Override
public Sentence next() {
fillSentences();
if (sentences.size() == 0) {
throw new NoSuchElementException();
}
return new Sentence(sentences.remove(0), getSource(), "<plaintext>", null, ++articleCount);
}

@Override
public String getSource() {
return "plaintext";
}

private void fillSentences() {
while (sentences.size() == 0 && scanner.hasNextLine()) {
String line = scanner.nextLine();
if (line.isEmpty()) {
continue;
}
if (acceptSentence(line)) {
sentences.add(line);
}
}
}

}
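A short usage sketch for the new class (the demo class, file name, and language code are illustrative): iterating the source returns the pre-extracted sentences one by one, with the acceptSentence() filter from the SentenceSource base class already applied.

    // Illustrative only: read back a pre-extracted sentence file through the new source.
    import org.languagetool.Language;
    import org.languagetool.Languages;
    import org.languagetool.dev.dumpcheck.PlainTextSentenceSource;
    import org.languagetool.dev.dumpcheck.Sentence;

    import java.io.FileInputStream;
    import java.io.IOException;

    class PlainTextSourceDemo {
      public static void main(String[] args) throws IOException {
        Language language = Languages.getLanguageForShortName("en");
        try (FileInputStream fis = new FileInputStream("their.txt")) {  // one sentence per line
          PlainTextSentenceSource source = new PlainTextSentenceSource(fis, language);
          while (source.hasNext()) {
            Sentence sentence = source.next();
            System.out.println(sentence.getText());
          }
        }
      }
    }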
SentenceSource.java
@@ -29,7 +29,7 @@
  * or plain text sources.
  * @since 2.4
  */
-abstract class SentenceSource implements Iterator<Sentence> {
+public abstract class SentenceSource implements Iterator<Sentence> {

   private static final int MIN_SENTENCE_SIZE = 10;
   private static final int MIN_SENTENCE_TOKEN_COUNT = 4;
TatoebaSentenceSource.java
@@ -25,7 +25,7 @@

 /**
  * Provides access to the sentences of a Tatoeba (http://tatoeba.org) text
- * file that has already been filtered to contain only one language,
+ * file (tab separated) that has already been filtered to contain only one language.
  * @since 2.4
  */
 class TatoebaSentenceSource extends SentenceSource {
