language specification for sentence segmentation process

kermitt2 · Jan 7, 2021 · 4ff805e · 4ff805e
1 parent aa19f66
commit 4ff805e
Show file tree

Hide file tree

Showing 8 changed files with 75 additions and 25 deletions.
diff --git a/grobid-core/src/main/java/org/grobid/core/data/Figure.java b/grobid-core/src/main/java/org/grobid/core/data/Figure.java
@@ -338,7 +338,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
             }
 
             if (desc != null && config.isWithSentenceSegmentation()) {
-                formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config);
+                formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage());
 
                 // we need a sentence segmentation of the figure caption, for that we need to introduce 
                 // a <div>, then a <p>

diff --git a/grobid-core/src/main/java/org/grobid/core/data/Table.java b/grobid-core/src/main/java/org/grobid/core/data/Table.java
@@ -135,7 +135,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
                     }
 
                     if (desc != null && config.isWithSentenceSegmentation()) {
-                        formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config);
+                        formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage());
 
                         // we need a sentence segmentation of the table caption, for that we need to introduce 
                         // a <div>, then a <p>
@@ -202,7 +202,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
                     }
 
                     if (noteNode != null && config.isWithSentenceSegmentation()) {
-                        formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config);
+                        formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config, doc.getLanguage());
 
                         // we need a sentence segmentation of the table note, for that we need to introduce 
                         // a <p>

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -89,6 +89,10 @@ public enum SchemaDeclaration {
 
     private static Pattern startNum = Pattern.compile("^(\\d+)(.*)");
 
+    private static final String SCHEMA_XSD_LOCATION = "https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd";
+    private static final String SCHEMA_DTD_LOCATION = "https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/dtd/Grobid.dtd";
+    private static final String SCHEMA_RNG_LOCATION = "https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/rng/Grobid.rng";
+
     public TEIFormatter(Document document, FullTextParser fullTextParser) {
         this.doc = document;
         this.fullTextParser = fullTextParser;
@@ -112,29 +116,21 @@ public StringBuilder toTEIHeader(BiblioItem biblio,
             tei.append("<?xml-stylesheet type=\"text/xsl\" href=\"../jsp/xmlverbatimwrapper.xsl\"?> \n");
         }
         if (schemaDeclaration == SchemaDeclaration.DTD) {
-            tei.append("<!DOCTYPE TEI SYSTEM \"" + GrobidProperties.get_GROBID_HOME_PATH()
-                    + "/schemas/dtd/Grobid.dtd" + "\">\n");
+            tei.append("<!DOCTYPE TEI SYSTEM \"" + SCHEMA_DTD_LOCATION + "\">\n");
         } else if (schemaDeclaration == SchemaDeclaration.XSD) {
             // XML schema
             tei.append("<TEI xml:space=\"preserve\" xmlns=\"http://www.tei-c.org/ns/1.0\" \n" +
                     "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" \n" +
-                    //"\n xsi:noNamespaceSchemaLocation=\"" +
-                    //GrobidProperties.get_GROBID_HOME_PATH() + "/schemas/xsd/Grobid.xsd\""	+
                     "xsi:schemaLocation=\"http://www.tei-c.org/ns/1.0 " +
-                    GrobidProperties.get_GROBID_HOME_PATH() + "/schemas/xsd/Grobid.xsd\"" +
-                    "\n xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n");
+                    SCHEMA_XSD_LOCATION +
+                    "\"\n xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n");
 //				"\n xmlns:mml=\"http://www.w3.org/1998/Math/MathML\">\n");
         } else if (schemaDeclaration == SchemaDeclaration.RNG) {
             // standard RelaxNG
-            tei.append("<?xml-model href=\"file://" +
-                    GrobidProperties.get_GROBID_HOME_PATH() + "/schemas/rng/Grobid.rng" +
+            tei.append("<?xml-model href=\"" + SCHEMA_RNG_LOCATION +
                     "\" schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n");
-        } else if (schemaDeclaration == SchemaDeclaration.RNC) {
-            // compact RelaxNG
-            tei.append("<?xml-model href=\"file://" +
-                    GrobidProperties.get_GROBID_HOME_PATH() + "/schemas/rng/Grobid.rnc" +
-                    "\" type=\"application/relax-ng-compact-syntax\"?>\n");
-        }
+        } 
+
         // by default there is no schema association
 
         if (schemaDeclaration != SchemaDeclaration.XSD) {
@@ -1271,7 +1267,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
                 String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());
                 if (isNewParagraph(lastClusterLabel, curParagraph)) {
                     if (curParagraph != null && config.isWithSentenceSegmentation()) {
-                        segmentIntoSentences(curParagraph, curParagraphTokens, config);
+                        segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage());
                     }
                     curParagraph = teiElement("p");
                     if (config.isGenerateTeiIds()) {
@@ -1329,7 +1325,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
 
         // in case we segment paragraph into sentences, we still need to do it for the last paragraph 
         if (curParagraph != null && config.isWithSentenceSegmentation()) {
-            segmentIntoSentences(curParagraph, curParagraphTokens, config);
+            segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage());
         }
 
         // remove possibly empty div in the div list
@@ -1395,7 +1391,7 @@ private boolean isNewParagraph(TaggingLabel lastClusterLabel, Element curParagra
                 && lastClusterLabel != TaggingLabels.TABLE) || curParagraph == null;
     }
 
-    public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curParagraphTokens, GrobidAnalysisConfig config) {
+    public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curParagraphTokens, GrobidAnalysisConfig config, String lang) {
         // in order to avoid having a sentence boundary in the middle of a ref element 
         // (which is frequent given the abbreviation in the reference expression, e.g. Fig.)
         // we only consider for sentence segmentation texts under <p> and skip the text under <ref>.
@@ -1430,7 +1426,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
         }
 
         List<OffsetPosition> theSentences = 
-            SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens);
+            SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, new Language(lang));
 
         /*if (theSentences.size() == 0) {
             // this should normally not happen, but it happens (depending on sentence splitter, usually the text 

diff --git a/grobid-core/src/main/java/org/grobid/core/lang/SentenceDetector.java b/grobid-core/src/main/java/org/grobid/core/lang/SentenceDetector.java
@@ -15,4 +15,14 @@ public interface SentenceDetector {
      *         position of the recognized sentence in the text
      */
     public List<OffsetPosition> detect(String text);
+
+
+    /**
+     * Detects sentence boundaries in a text, using a specified language.
+     * @param text text in which sentence boundaries are to be detected
+     * @param lang language to be used for detecting sentence boundaries
+     * @return a list of offset positions indicating the start and end character
+     *         positions of each recognized sentence in the text
+     */
+    public List<OffsetPosition> detect(String text, Language lang);
 }
diff --git a/grobid-core/src/main/java/org/grobid/core/lang/impl/OpenNLPSentenceDetector.java b/grobid-core/src/main/java/org/grobid/core/lang/impl/OpenNLPSentenceDetector.java
@@ -5,6 +5,7 @@
 import opennlp.tools.util.Span;
 
 import org.grobid.core.lang.SentenceDetector;
+import org.grobid.core.lang.Language;
 import org.grobid.core.utilities.OffsetPosition;
 import org.grobid.core.utilities.GrobidProperties;
 
@@ -38,6 +39,11 @@ public OpenNLPSentenceDetector() {
 
     @Override
     public List<OffsetPosition> detect(String text) {
+        return detect(text, new Language(Language.EN)); // no language given: delegate to the language-aware variant with English
+    }
+
+    @Override
+    public List<OffsetPosition> detect(String text, Language lang) {
         // unfortunately the OpenNLP sentence detector is not thread safe, only the model can be shared
         SentenceDetectorME detector = new SentenceDetectorME(model);
         Span spans[] = detector.sentPosDetect(text); 

diff --git a/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java b/grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java
@@ -6,6 +6,7 @@
 import org.jruby.embed.LocalVariableBehavior;
 
 import org.grobid.core.lang.SentenceDetector;
+import org.grobid.core.lang.Language;
 import org.grobid.core.utilities.OffsetPosition;
 import org.grobid.core.utilities.GrobidProperties;
 
@@ -48,8 +49,17 @@ public PragmaticSentenceDetector() {
 
     @Override
     public List<OffsetPosition> detect(String text) {
+        return detect(text, new Language(Language.EN));  // no language given: delegate to the language-aware variant with English
+    }
+
+    @Override
+    public List<OffsetPosition> detect(String text, Language lang) {
         instance.put("text", text);
-        String script = "ps = PragmaticSegmenter::Segmenter.new(text: text, clean: false)\nps.segment";
+        String script = null;
+        if (lang == null || "en".equals(lang.getLang()))
+            script = "ps = PragmaticSegmenter::Segmenter.new(text: text, clean: false)\nps.segment";
+        else
+            script = "ps = PragmaticSegmenter::Segmenter.new(text: text, language: '" + lang.getLang() + "', clean: false)\nps.segment";
         Object ret = instance.runScriptlet(script);
 
 //System.out.println(text);

diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/GrobidProperties.java b/grobid-core/src/main/java/org/grobid/core/utilities/GrobidProperties.java
@@ -175,6 +175,10 @@ public static File getGrobidHomePath() {
         return GROBID_HOME_PATH;
     }
 
+    public static String getGrobidHome() {
+        return GROBID_HOME_PATH.getPath(); // String form of the File returned by getGrobidHomePath(); NOTE(review): fails with a NullPointerException if GROBID_HOME_PATH is still null - confirm it is always initialized first
+    }
+
     /**
      * Set the GROBID_HOME path.
      */

diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java
@@ -2,6 +2,7 @@
 
 import org.grobid.core.exceptions.GrobidException;
 import org.grobid.core.lang.SentenceDetectorFactory;
+import org.grobid.core.lang.Language;
 import org.grobid.core.layout.LayoutToken;
 
 import java.util.*;
@@ -75,6 +76,27 @@ public List<OffsetPosition> runSentenceDetection(String text) {
         }
     }
 
+    /**
+     * Basic run for sentence identification with a specified language to be considered when segmenting.
+     * Returns the sentence offset positions, or null if the text is null or detection fails (warning logged).
+     *
+     * @param text
+     *            text to segment into sentences
+     * @param lang
+     *            specified language to be used when segmenting text
+     * @return list of offset positions for the identified sentences, relative to the input text
+     */
+    public List<OffsetPosition> runSentenceDetection(String text, Language lang) {
+        if (text == null)
+            return null;
+        try {
+            return sdf.getInstance().detect(text, lang);
+        } catch (Exception e) { // best effort: any failure of the detector is logged and reported as null
+            LOGGER.warn("Cannot detect sentences. ", e);
+            return null;
+        }
+    }
+
     /**
      * Run for sentence identification with some forbidden span constraints, return the offset positions of the 
      * identified sentences without sentence boundaries within a forbidden span (typically a reference marker
@@ -87,7 +109,7 @@ public List<OffsetPosition> runSentenceDetection(String text) {
      * @return list of offset positions for the identified sentence, relative to the input text
      */
     public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPosition> forbidden) {
-        return runSentenceDetection(text, forbidden, null);
+        return runSentenceDetection(text, forbidden, null, null); // delegate with no layout tokens (pure textual input) and no language specified
     }
 
     /**
@@ -103,13 +125,15 @@ public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPositio
      * @param textLayoutTokens
      *            list of LayoutToken objects from which the text has been created, if this list is null
      *            we consider that we have a pure textual input (e.g. text is not from a PDF)
+     * @param lang 
+     *            specified language to be used when segmenting text  
      * @return list of offset positions for the identified sentence, relative to the input text
      */
-    public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPosition> forbidden, List<LayoutToken> textLayoutTokens) {
+    public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPosition> forbidden, List<LayoutToken> textLayoutTokens, Language lang) {
         if (text == null)
             return null;
         try {
-            List<OffsetPosition> sentencePositions = sdf.getInstance().detect(text);
+            List<OffsetPosition> sentencePositions = sdf.getInstance().detect(text, lang);
 
             // to be sure, we sort the forbidden positions
             if (forbidden == null)