Skip to content

Commit

Permalink
Makes GermanTagger more accurate and adds support for detecting dash-separated words
Browse files Browse the repository at this point in the history
  • Loading branch information
nopepper authored and f-knorr committed Sep 30, 2017
1 parent ad43253 commit d13570c
Showing 1 changed file with 53 additions and 16 deletions.
Expand Up @@ -59,6 +59,20 @@ public GermanTagger() {
}
}

/**
 * Strips leading dash-linked parts from a compound so only the governing last
 * part remains, e.g. {@code "SSL-Zertifikat" -> "Zertifikat"}.
 *
 * @param word the token to sanitize; may or may not contain dashes
 * @return the last non-blank dash-separated segment, or {@code word} unchanged
 *         when it contains no dash (or every segment is blank)
 */
private static String sanitizeWord(String word) {
  String[] parts = word.split("-");
  if (parts.length > 1) {
    // Walk backwards so the last non-blank segment wins.
    // Note: the original also tested 'trim().equals(null)', which is always
    // false by the equals() contract (and trim() never returns null) — that
    // dead clause is removed; a blank-segment check is all that was intended.
    for (int i = parts.length - 1; i >= 0; i--) {
      if (!parts[i].trim().isEmpty()) {
        return parts[i];
      }
    }
  }
  return word;  // no dash, or nothing but blank segments — keep the word as-is
}

@Override
public String getManualAdditionsFileName() {
return "/de/added.txt";
Expand Down Expand Up @@ -95,47 +109,70 @@ public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens, boolean igno
int pos = 0;

for (String word : sentenceTokens) {
List<AnalyzedToken> l = new ArrayList<>();
List<AnalyzedToken> readings = new ArrayList<>();
List<TaggedWord> taggerTokens = getWordTagger().tag(word);

//Only first iteration
if (firstWord && taggerTokens.isEmpty() && ignoreCase) { // e.g. "Das" -> "das" at start of sentence
taggerTokens = getWordTagger().tag(word.toLowerCase());
firstWord = word.matches("^\\W?$");
} else if (pos == 0 && ignoreCase) { // "Haben", "Sollen", "Können", "Gerade" etc. at start of sentence
taggerTokens.addAll(getWordTagger().tag(word.toLowerCase()));
}
if (taggerTokens.size() > 0) {
l.addAll(getAnalyzedTokens(taggerTokens, word));
} else {
// word not known, try to decompose it and use the last part for POS tagging:

//1+ iterations
if (taggerTokens.size() > 0) { //Word known, just add analyzed token to readings
readings.addAll(getAnalyzedTokens(taggerTokens, word));
} else { // Word not known, try to decompose it and use the last part for POS tagging:
if (!StringTools.isEmpty(word.trim())) {
List<String> compoundParts = compoundTokenizer.tokenize(word);
if (compoundParts.size() <= 1) {
// recognize alternative imperative forms (e.g., "Geh bitte!" in addition to "Gehe bitte!")

if (compoundParts.size() <= 1) { //Could not find simple compound parts
// Recognize alternative imperative forms (e.g., "Geh bitte!" in addition to "Gehe bitte!")
List<AnalyzedToken> imperativeFormList = getImperativeForm(word, sentenceTokens, pos);
if (imperativeFormList != null && imperativeFormList.size() > 0) {
l.addAll(imperativeFormList);
} else {
l.add(getNoInfoToken(word));
readings.addAll(imperativeFormList);
} else { //Separate dash-linked words
//Only check single word tokens
if (word.split(" ").length == 1) {
String wordOrig = word;
word = sanitizeWord(word);
List<String> compoundedWord = compoundTokenizer.tokenize(word);
if (compoundedWord.size() > 1) { //Only start word with uppercase if it's a result of splitting
word = StringTools.uppercaseFirstChar(compoundedWord.get(compoundedWord.size() - 1));
} else {
word = compoundedWord.get(compoundedWord.size() - 1);
}
List<TaggedWord> linkedTaggerTokens = getWordTagger().tag(word); //Try to analyze the last part found
word = wordOrig;
if (linkedTaggerTokens.size() > 0) {
readings.addAll(getAnalyzedTokens(linkedTaggerTokens, wordOrig, compoundedWord));
} else {
readings.add(getNoInfoToken(wordOrig));
}
} else {
readings.add(getNoInfoToken(word));
}
}
} else {
}
else {
// last part governs a word's POS:
String lastPart = compoundParts.get(compoundParts.size()-1);
if (StringTools.startsWithUppercase(word)) {
lastPart = StringTools.uppercaseFirstChar(lastPart);
}
List<TaggedWord> partTaggerTokens = getWordTagger().tag(lastPart);
if (partTaggerTokens.size() > 0) {
l.addAll(getAnalyzedTokens(partTaggerTokens, word, compoundParts));
readings.addAll(getAnalyzedTokens(partTaggerTokens, word, compoundParts));
} else {
l.add(getNoInfoToken(word));
readings.add(getNoInfoToken(word));
}
}
} else {
l.add(getNoInfoToken(word));
readings.add(getNoInfoToken(word));
}
}

tokenReadings.add(new AnalyzedTokenReadings(l.toArray(new AnalyzedToken[l.size()]), pos));
tokenReadings.add(new AnalyzedTokenReadings(readings.toArray(new AnalyzedToken[readings.size()]), pos));
pos += word.length();
}
return tokenReadings;
Expand Down

0 comments on commit d13570c

Please sign in to comment.