Merge branch 'master' of https://github.com/kermitt2/grobid

kermitt2 · Nov 1, 2020 · 0d568c5 · 0d568c5
2 parents e686dfc + a9eae59
commit 0d568c5
Show file tree

Hide file tree

Showing 4 changed files with 11 additions and 28 deletions.
diff --git a/doc/Coordinates-in-PDF.md b/doc/Coordinates-in-PDF.md
@@ -11,11 +11,12 @@ Since April 2017, GROBID version 0.4.2 and higher, coordinate areas can be obtai
 * ```persName``` for a complete author name,
 * ```figure``` for figure AND table,
 * ```formula``` for mathematical equations,
-* ```s``` for optional sentence structure. 
+* ```head``` for section titles,
+* ```s``` for optional sentence structure (the GROBID fulltext service must be called with the `segmentSentences` parameter to provide the optional sentence-level elements). 
 
 However, there is normally no particular limitation to the type of structures which can have their coordinates in the results, the implementation is on-going, see [issue #69](https://github.com/kermitt2/grobid/issues/69), and it is expected that more or less any structures could be associated with their coordinates in the original PDF. 
 
-Coordinates are currently available in full text processing (returning a TEI document) and the PDF annotation services (returning JSON). 
+Coordinates are currently available in full text processing (returning a TEI document) and the PDF annotation services (returning JSON for `ref`, `figure` and `formula` only). 
 
 ### GROBID service
 
@@ -47,8 +48,7 @@ Example (under the project main directory `grobid/`):
 > java -Xmx1024m -jar grobid-core/build/libs/grobid-core-0.5.0-onejar.jar -gH grobid-home -dIn /path/to/input/directory -dOut /path/to/output/directory -teiCoordinates -exe processFullText 
 ```
 
-See the [batch mode details](https://grobid.readthedocs.io/en/latest/Grobid-batch/#processfulltext). With the batch mode, it is currenlty not possible to cherry pick up certain elements, coordinates will appear for all.
-
+See the [batch mode details](https://grobid.readthedocs.io/en/latest/Grobid-batch/#processfulltext). With the batch mode, it is currently not possible to cherry-pick certain elements; coordinates will appear for all. Again, we recommend using the service for significantly better performance and more customization options. 
 
 ## Coordinate system in the PDF
 

diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java
@@ -1195,11 +1195,6 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
             if (clusterLabel.equals(TaggingLabels.SECTION)) {
                 String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());
                 curDiv = teiElement("div");
-                /*if (config.isGenerateTeiIds()) {
-                    String divID = KeyGen.getKey().substring(0, 7);
-                    addXmlId(curDiv, "_" + divID);
-                }*/
-
                 Element head = teiElement("head");
                 // section numbers
                 org.grobid.core.utilities.Pair<String, String> numb = getSectionNumber(clusterContent);
@@ -1215,6 +1210,13 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
                     addXmlId(head, "_" + divID);
                 }
 
+                if (config.isGenerateTeiCoordinates("head") ) {
+                    String coords = LayoutTokensUtil.getCoordsString(cluster.concatTokens());
+                    if (coords != null) {
+                        head.addAttribute(new Attribute("coords", coords));
+                    }
+                }
+
                 curDiv.appendChild(head);
                 divResults.add(curDiv);
             } else if (clusterLabel.equals(TaggingLabels.EQUATION) || 

diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/SentenceUtilities.java
@@ -110,10 +110,6 @@ public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPositio
             return null;
         try {
             List<OffsetPosition> sentencePositions = sdf.getInstance().detect(text);
-/*System.out.println(text); 
-for(OffsetPosition position : sentencePositions) {
-    System.out.println("detect: " + text.substring(position.start, position.end));
-}*/
 
             // to be sure, we sort the forbidden positions
             if (forbidden == null)
@@ -166,12 +162,6 @@ public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPositio
             if (textLayoutTokens == null || textLayoutTokens.size() == 0)
                 return finalSentencePositions;
 
-
-/*System.out.println("before finalSentencePositions.size(): " + finalSentencePositions.size());
-for(OffsetPosition position : finalSentencePositions) {
-    System.out.println(text.substring(position.start, position.end));
-}*/
-
             int pos = 0;
 
             // init sentence index
@@ -225,8 +215,6 @@ public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPositio
                     }
 
                     if (pushedEnd > 0) {
-//System.out.println("found extra ref marker: " + text.substring(finalSentencePositions.get(currentSentenceIndex).end, 
-//    finalSentencePositions.get(currentSentenceIndex).end+pushedEnd+1));
 
                         OffsetPosition newPosition = finalSentencePositions.get(currentSentenceIndex);
                         newPosition.end += pushedEnd+1;
@@ -267,12 +255,6 @@ public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPositio
             // here, for instance non-breakable italic or bold chunks, or adding sentence split based on 
             // spacing/indent
 
-/*System.out.println(text);            
-System.out.println("after finalSentencePositions.size(): " + finalSentencePositions.size());
-for(OffsetPosition position : finalSentencePositions) {
-    System.out.println(text.substring(position.start, position.end));
-}*/
-
             return finalSentencePositions;
         } catch (Exception e) {
             LOGGER.warn("Cannot detect sentences. ", e);

diff --git a/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessFiles.java b/grobid-service/src/main/java/org/grobid/service/process/GrobidRestProcessFiles.java
@@ -677,7 +677,6 @@ public Response processPDFReferenceAnnotation(final InputStream inputStream,
                 .generateTeiCoordinates(elementWithCoords)
                 .consolidateCitations(consolidateCitations)
                 .includeRawCitations(includeRawCitations)
-                .generateTeiCoordinates(elementWithCoords)
                 .build();
 
             DocumentSource documentSource = DocumentSource.fromPdf(originFile);