Merge dbc7312 into 506c00a

kermitt2 · Nov 5, 2020 · 0143025 · 0143025
2 parents 506c00a + dbc7312
commit 0143025
Show file tree

Hide file tree

Showing 8 changed files with 926 additions and 52 deletions.
diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java
@@ -507,7 +507,6 @@ static public Pair<String, LayoutTokenization> getBodyTextFeatured(Document doc,
                 else if (nbNumbType > (bibDataSets.size() / 2))
                     bibRefCalloutType = "AUTHOR";
             } catch(EntityMatcherException e) {
-                e.printStackTrace();
                 LOGGER.info("Could not build the bibliographical matcher", e);
             }
         }

diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java
@@ -5,13 +5,15 @@
 import org.grobid.core.layout.LayoutToken;
 
 import java.util.*;
+import java.util.function.Predicate;
+import java.util.stream.Collectors;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 /**
  * Class for using sentence segmentation (singleton). The actual sentence segmentation implementation
- * is specified in the Grobid configuration. See org.grobid.core.lang.impl.* for the available 
+ * is specified in the Grobid configuration. See org.grobid.core.lang.impl.* for the available
  * implementations.
  *
  */
@@ -60,8 +62,7 @@ private SentenceUtilities() {
      * Basic run for sentence identification, return the offset positions of the 
      * identified sentences
      *
-     * @param text
-     *            text to segment into sentences
+     * @param text text to segment into sentences
      * @return list of offset positions for the identified sentence, relative to the input text
      */
     public List<OffsetPosition> runSentenceDetection(String text) {
@@ -80,21 +81,19 @@ public List<OffsetPosition> runSentenceDetection(String text) {
      * identified sentences without sentence boundaries within a forbidden span (typically a reference marker
      * and we don't want a sentence end/start in the middle of that).
      *
-     * @param text
-     *            text to segment into sentences
-     * @param forbidden
-     *            list of offset positions where sentence boundaries are forbidden
+     * @param text      text to segment into sentences
+     * @param forbidden list of offset positions where sentence boundaries are forbidden
      * @return list of offset positions for the identified sentence, relative to the input text
      */
     public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPosition> forbidden) {
         return runSentenceDetection(text, forbidden, null);
     }
 
     /**
-     * Run for sentence identification with some forbidden span constraints, return the offset positions of the 
+     * Run for sentence identification with some forbidden span constraints, return the offset positions of the
      * identified sentences without sentence boundaries within a forbidden span (typically a reference marker
      * and we don't want a sentence end/start in the middle of that). The original LayoutToken objects are
-     * provided, which allows to apply additional heuristics based on document layout and font features. 
+     * provided, which allows to apply additional heuristics based on document layout and font features.
      *
      * @param text
      *            text to segment into sentences
@@ -114,6 +113,7 @@ public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPositio
             // to be sure, we sort the forbidden positions
             if (forbidden == null)
                 return sentencePositions;
+
             Collections.sort(forbidden);
 
             // cancel sentence boundaries within the forbidden spans
@@ -139,21 +139,48 @@ public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPositio
                 finalSentencePositions.add(position);
             }
 
-            // as a heuristics for all implementations, because they clearly all fail for this case, we 
-            // attached to the right sentence the numerical bibliographical references markers expressed 
+            // adjust the forbidden spans - if they are present at the beginning of the sentence, move them to the
+            // end of the previous sentence
+
+            for (int index = 0; index < finalSentencePositions.size(); index++) {
+                OffsetPosition currentSentence = finalSentencePositions.get(index);
+                for (OffsetPosition forbiddenSpan : forbidden) {
+                    if (forbiddenSpan.start == currentSentence.start && index > 0) {
+                        // Adjust the previous sentence to include this span
+                        OffsetPosition previousSentence = finalSentencePositions.get(index - 1);
+                        previousSentence.end = forbiddenSpan.end;
+                        currentSentence.start = forbiddenSpan.end;
+                        while (text.charAt(currentSentence.start) == ' ') {
+                            if (currentSentence.start == text.length() - 1) {
+                                break;
+                            } else {
+                                currentSentence.start++;
+                            }
+                        }
+                    }
+                }
+            }
+
+            finalSentencePositions = finalSentencePositions
+                .stream()
+                .filter(offsetPosition -> offsetPosition.end - offsetPosition.start > 0)
+                .collect(Collectors.toList());
+
+            // as a heuristics for all implementations, because they clearly all fail for this case, we
+            // attached to the right sentence the numerical bibliographical references markers expressed
             // in superscript just *after* the final sentence comma, e.g.
             // "Laboratory tests at the time of injury were not predictive of outcome. 32"
             // or
-            // "CSF-1 has been linked to tumor growth and progression in breast cancer, 5,6 and has been 
-            // shown to effectively reduce the number of tumor-associated macrophages in different tumor 
+            // "CSF-1 has been linked to tumor growth and progression in breast cancer, 5,6 and has been
+            // shown to effectively reduce the number of tumor-associated macrophages in different tumor
             // types. 4,5"
-            // or 
+            // or
             // "Even if the symmetry is s- like, it does not necessarily indicate that the
             // superconductivity is not exotic, because the s- like symmetry or the fully gapped state
             // may be realized by the pairing mediated by the interband excitations of the electrons. 23) "
 
             if (finalSentencePositions.size() == 0) {
-                // this should normally not happen, but it happens (depending on sentence splitter, usually the text 
+                // this should normally not happen, but it happens (depending on sentence splitter, usually the text
                 // is just a punctuation)
                 // in this case we consider the current text as a unique sentence as fall back
                 finalSentencePositions.add(new OffsetPosition(0, text.length()));
@@ -166,14 +193,14 @@ public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPositio
 
             // init sentence index
             int currentSentenceIndex = 0;
-            String sentenceChunk = text.substring(finalSentencePositions.get(currentSentenceIndex).start, 
+            String sentenceChunk = text.substring(finalSentencePositions.get(currentSentenceIndex).start,
                 finalSentencePositions.get(currentSentenceIndex).end);
             boolean moved = false;
 
             // iterate on layout tokens in sync with sentences
             for(int i=0; i<textLayoutTokens.size(); i++) {
                 LayoutToken token = textLayoutTokens.get(i);
-                if (token.getText() == null || token.getText().length() == 0) 
+                if (token.getText() == null || token.getText().length() == 0)
                     continue;
 
                 if (this.toSkipToken(token.getText()))
@@ -191,10 +218,10 @@ public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPositio
                     int j = i;
                     for(; j<textLayoutTokens.size(); j++) {
                         LayoutToken nextToken = textLayoutTokens.get(j);
-                        if (nextToken.getText() == null || nextToken.getText().length() == 0) 
+                        if (nextToken.getText() == null || nextToken.getText().length() == 0)
                             continue;
 
-                        // we don't look beyond an end of line (to prevent from numbered list/notes) 
+                        // we don't look beyond an end of line (to prevent from numbered list/notes)
                         if (nextToken.getText().equals("\n"))
                             break;
 
@@ -210,7 +237,7 @@ public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPositio
                         if (this.isValidSuperScriptNumericalReferenceMarker(nextToken)) {
                             pushedEnd += buffer + nextToken.getText().length();
                             buffer = 0;
-                        } else 
+                        } else
                             break;
                     }
 
@@ -240,30 +267,52 @@ public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPositio
                         currentSentenceIndex++;
                         if (currentSentenceIndex >= finalSentencePositions.size())
                             break;
-                        sentenceChunk = text.substring(finalSentencePositions.get(currentSentenceIndex).start, 
+                        sentenceChunk = text.substring(finalSentencePositions.get(currentSentenceIndex).start,
                             finalSentencePositions.get(currentSentenceIndex).end);
                         moved = false;
                     }
                     pos = 0;
                 }
-                
+
                 if (currentSentenceIndex >= finalSentencePositions.size())
                     break;
             }
 
+
             // other heuristics/post-corrections based on layout/style features of the tokens could be added
-            // here, for instance non-breakable italic or bold chunks, or adding sentence split based on 
+            // here, for instance non-breakable italic or bold chunks, or adding sentence split based on
             // spacing/indent
 
             return finalSentencePositions;
+
         } catch (Exception e) {
             LOGGER.warn("Cannot detect sentences. ", e);
             return null;
         }
+
     }
 
+    /**
+     * Render the sentence segmentation of a text as a simple XML string: each span of
+     * {@code offsetPositions} is wrapped in {@code <s>...</s>}, the text located before, between and
+     * after the spans is copied as is, and the whole result is enclosed in {@code <sents>...</sents>}.
+     *
+     * The text is copied verbatim, without any XML escaping.
+     * NOTE(review): characters such as {@code <} or {@code &} in the input will therefore produce
+     * malformed XML - confirm whether callers are expected to escape the text beforehand.
+     *
+     * @param text            the text that was segmented
+     * @param offsetPositions sentence spans relative to {@code text}, expected in increasing order and
+     *                        not overlapping; null or empty means that no sentence is marked
+     * @return the text with sentence markup, enclosed in a {@code <sents>} element
+     */
+    public String getXml(String text, List<OffsetPosition> offsetPositions) {
+        StringBuilder outputText = new StringBuilder("<sents>");
+
+        // position in the text up to which everything has already been written
+        int consumed = 0;
+
+        // runSentenceDetection may return null when the detection fails, so a null list is
+        // handled like an empty one: the text is output without any sentence markup
+        if (offsetPositions != null) {
+            // single left-to-right pass with appends, instead of repeated insertions at index 0
+            for (OffsetPosition sentence : offsetPositions) {
+                // text between the previous sentence (or the beginning of the text) and this one
+                outputText.append(text.substring(consumed, sentence.start));
+                outputText.append("<s>")
+                    .append(text.substring(sentence.start, sentence.end))
+                    .append("</s>");
+                consumed = sentence.end;
+            }
+        }
+
+        // remaining text after the last sentence (the whole text when no sentence is given)
+        outputText.append(text.substring(consumed));
+
+        return outputText.append("</sents>").toString();
+    }
     /**
-     * Return true if the token should be skipped when considering sentence content. 
+     * Return true if the token should be skipped when considering sentence content.
      */
     public static boolean toSkipToken(String tok) {
         // the hyphen is considered to be skipped to cover the case of word hyphenation
@@ -282,7 +331,7 @@ private static boolean toSkipTokenNoHyphen(String tok) {
 
 
     /**
-     * Return true if the token is a valid numerical reference markers ([0-9,())\-\]\[) in supercript. 
+     * Return true if the token is a valid numerical reference markers ([0-9,())\-\]\[) in supercript.
      */
     private static boolean isValidSuperScriptNumericalReferenceMarker(LayoutToken token) {
 
@@ -292,10 +341,10 @@ private static boolean isValidSuperScriptNumericalReferenceMarker(LayoutToken to
             return true;
         }
         if (token.isSuperscript() && token.getText().matches("[0-9,\\-\\(\\)\\[\\]]+")) {
-//System.out.println("isValidSuperScriptNumericalReferenceMarker: " + token.getText() + " -> true");            
+//System.out.println("isValidSuperScriptNumericalReferenceMarker: " + token.getText() + " -> true");
             return true;
         } else {
-//System.out.println("isValidSuperScriptNumericalReferenceMarker: " + token.getText() + " -> false");                        
+//System.out.println("isValidSuperScriptNumericalReferenceMarker: " + token.getText() + " -> false");
             return false;
         }
     }

diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/crossref/CrossrefRequest.java b/grobid-core/src/main/java/org/grobid/core/utilities/crossref/CrossrefRequest.java
@@ -200,12 +200,12 @@ public Void handleResponse(HttpResponse response) throws ClientProtocolException
 	}
 
 	public String toString() {
-		String str = " (";
+		StringBuilder str = new StringBuilder(" (");
 		if (params != null) {
 			for (Entry<String, String> cursor : params.entrySet())
-				str += ","+cursor.getKey()+"="+cursor.getValue();
+				str.append(",").append(cursor.getKey()).append("=").append(cursor.getValue());
 		}
-		str += ")";
-		return str;
+		str.append(")");
+		return str.toString();
 	}
 }
diff --git a/grobid-core/src/test/java/org/grobid/core/utilities/SentenceUtilitiesTest.java b/grobid-core/src/test/java/org/grobid/core/utilities/SentenceUtilitiesTest.java
@@ -4,13 +4,13 @@
 import org.junit.Before;
 import org.junit.Test;
 
-import java.util.List;
 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
 
 import static org.hamcrest.CoreMatchers.is;
-import static org.hamcrest.Matchers.hasSize;
-import static org.junit.Assert.assertThat;
 import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertThat;
 
 public class SentenceUtilitiesTest {
 
@@ -66,4 +66,41 @@ public void testTwoSentencesTextWithUsefullForbidden() throws Exception {
         List<OffsetPosition> theSentences = SentenceUtilities.getInstance().runSentenceDetection(text, forbidden);
         assertThat(theSentences.size(), is(1));
     }
+
+    /**
+     * getXml must wrap every given span in an s element, copy the text outside the spans unchanged
+     * and enclose the result in a sents element.
+     */
+    @Test
+    public void testGetText() throws Exception {
+        final String input = "Bla bla bla. Bli bli bli.";
+
+        // the second span covers "li bli " only, so text remains before and after it
+        final OffsetPosition firstSpan = new OffsetPosition(0, 12);
+        final OffsetPosition secondSpan = new OffsetPosition(14, 21);
+        final List<OffsetPosition> spans = Arrays.asList(firstSpan, secondSpan);
+
+        final String expected = "<sents><s>Bla bla bla.</s> B<s>li bli </s>bli.</sents>";
+
+        assertThat(SentenceUtilities.getInstance().getXml(input, spans), is(expected));
+    }
+
+    /**
+     * Two sentences, each followed by a reference marker ("1−4" at offsets 174-177 and "5−8" at
+     * offsets 383-386) declared as forbidden span: the expected markup keeps each marker inside the
+     * sentence that precedes it.
+     */
+    @Test
+    public void testParagraphWithMarkersOutsideTheSentence() throws Exception {
+        final String paragraph =
+            "Precisely controlling surface chemistry using self-assembled monolayers (SAMs) and bilayers " +
+            "has been a central focus of research in both synthetic and biological interfaces. " +
+            "1−4 Much synthetic monolayer chemistry has its basis in the formation of SAMs of alkanethiols " +
+            "on gold and the coinage metals, pioneered by groups including those of Whitesides, " +
+            "Nuzzo, and Allara in the 1980s. 5−8 ";
+
+        final OffsetPosition firstMarker = new OffsetPosition(174, 177);
+        final OffsetPosition secondMarker = new OffsetPosition(383, 386);
+        final List<OffsetPosition> markers = Arrays.asList(firstMarker, secondMarker);
+
+        final String expected = "<sents><s>Precisely controlling " +
+            "surface chemistry using self-assembled monolayers (SAMs) and bilayers " +
+            "has been a central focus of research in both synthetic and biological interfaces. " +
+            "1−4</s> <s>Much synthetic monolayer chemistry has its basis in the formation of SAMs of alkanethiols " +
+            "on gold and the coinage metals, pioneered by groups including those of Whitesides, " +
+            "Nuzzo, and Allara in the 1980s. 5−8</s> </sents>";
+
+        final List<OffsetPosition> sentences =
+            SentenceUtilities.getInstance().runSentenceDetection(paragraph, markers);
+        final String actual = SentenceUtilities.getInstance().getXml(paragraph, sentences);
+
+        assertThat(actual, is(expected));
+    }
 }
diff --git a/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java b/grobid-service/src/main/java/org/grobid/service/GrobidRestService.java
@@ -702,6 +702,21 @@ public Response annotatePDFPatentCitation(
         return restProcessFiles.annotateCitationPatentPDF(inputStream, consol, includeRaw);
     }
 
+    /**
+     * POST entry point of the /segmentSentence service: the text is read from the multipart form
+     * field named by INPUT and handed over to restProcessString.segmentSentences, whose response is
+     * returned as is (declared as XML).
+     */
+    @POST
+    @Path("/segmentSentence")
+    @Consumes(MediaType.MULTIPART_FORM_DATA)
+    @Produces(MediaType.APPLICATION_XML)
+    public Response segmentSentenceText_post(@FormDataParam(INPUT) String text) {
+        final Response segmented = restProcessString.segmentSentences(text);
+        return segmented;
+    }
+    }
+
+    /**
+     * GET entry point of the /segmentSentence service: the text is read from the query parameter
+     * named by INPUT and handed over to restProcessString.segmentSentences, whose response is
+     * returned as is (declared as XML).
+     */
+    @GET
+    @Path("/segmentSentence")
+    @Produces(MediaType.APPLICATION_XML)
+    public Response segmentSentenceText_get(@QueryParam(INPUT) String text) {
+        final Response segmented = restProcessString.segmentSentences(text);
+        return segmented;
+    }
+
     public void setRestProcessFiles(GrobidRestProcessFiles restProcessFiles) {
         this.restProcessFiles = restProcessFiles;
     }