Skip to content

Commit

Permalink
implement #523; put back optional coordinates for persName
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed Jun 1, 2020
1 parent 27ece93 commit 672fa61
Show file tree
Hide file tree
Showing 7 changed files with 79 additions and 5 deletions.
2 changes: 1 addition & 1 deletion build.gradle
Expand Up @@ -408,7 +408,7 @@ project(":grobid-trainer") {
return project.hasProperty(propName) ? project.getProperty(propName) : defaultVal;
}

// run like:
// run like this:
// ./gradlew PubMedCentralEval -Pp2t=/path/to/goldenSet
// ./gradlew PubMedCentralEval -Pp2t=/path/to/goldenSet -Prun=1 -PfileRatio=0.1
// ./gradlew PrepareDOIMatching -Pp2t=ABS_PATH_TO_PMC/PMC_sample_1943
Expand Down
21 changes: 20 additions & 1 deletion doc/Coordinates-in-PDF.md
Expand Up @@ -115,7 +115,26 @@ The GROBID console offers a reference implementation with PDF.js for dynamically

Coordinates for a given structure appear via an extra attribute ```@coord```. This is part of the [customization to the TEI](TEI-encoding-of-results.md) used by GROBID.

Similarly as for JSON, the coordinates of a structure is provided as a list of bounding boxes, each one separated by a semicolon ```;```, each bounding box being defined by 5 attributes separated by a comma ```,```:
* the list of page sizes is encoded under the TEI element `<facsimile>`. The dimensions of each page are given by the TEI attributes `@lrx` and `@lry` of the successive `<surface>` elements, in conformance with the TEI (`@ulx` and `@uly` are used to set the origin coordinates, which are always `(0,0)` for us).

Example:


```xml
...
</teiHeader>
<facsimile>
<surface n="1" ulx="0.0" uly="0.0" lrx="612.0" lry="794.0"/>
<surface n="2" ulx="0.0" uly="0.0" lrx="612.0" lry="794.0"/>
<surface n="3" ulx="0.0" uly="0.0" lrx="612.0" lry="794.0"/>
<surface n="4" ulx="0.0" uly="0.0" lrx="612.0" lry="794.0"/>
<surface n="5" ulx="0.0" uly="0.0" lrx="612.0" lry="794.0"/>
</facsimile>
<text xml:lang="en">
...
```

* for each entity, similarly to JSON, the coordinates of a structure are provided as a list of bounding boxes, each one separated by a semicolon ```;```, each bounding box being defined by 5 attributes separated by a comma ```,```:

Example 1:
```xml
Expand Down
7 changes: 7 additions & 0 deletions grobid-core/src/main/java/org/grobid/core/data/Person.java
Expand Up @@ -182,6 +182,13 @@ public List<LayoutToken> getLayoutTokens() {
return layoutTokens;
}

/**
 * Appends the given layout tokens to the tokens already attached to this person.
 * The internal list is created on first use.
 *
 * @param theTokens the tokens to append; a {@code null} value is ignored and
 *                  leaves the current state untouched
 */
public void addLayoutTokens(List<LayoutToken> theTokens) {
    if (theTokens == null) {
        // nothing to add: avoid a NullPointerException from addAll() and do not
        // allocate the list as a side effect of a no-op call
        return;
    }
    if (layoutTokens == null) {
        layoutTokens = new ArrayList<>();
    }
    layoutTokens.addAll(theTokens);
}

public String toTEI(boolean withCoordinates) {
if ( (firstName == null) && (middleName == null) &&
(lastName == null) ) {
Expand Down
Expand Up @@ -27,6 +27,7 @@
import org.grobid.core.layout.GraphicObject;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.layout.LayoutTokenization;
import org.grobid.core.layout.Page;
import org.grobid.core.tokenization.TaggingTokenCluster;
import org.grobid.core.tokenization.TaggingTokenClusteror;
import org.grobid.core.utilities.*;
Expand Down Expand Up @@ -923,6 +924,13 @@ else if (biblio.getE_Year().length() == 4)

tei.append("\t</teiHeader>\n");

// output pages dimensions in the case coordinates will also be provided for some structures
try {
tei = toTEIPages(tei, doc, config);
} catch(Exception e) {
LOGGER.warn("Problem when serializing page size", e);
}

if (doc.getLanguage() != null) {
tei.append("\t<text xml:lang=\"").append(doc.getLanguage()).append("\">\n");
} else {
Expand Down Expand Up @@ -1690,4 +1698,35 @@ private String normalizeText(String localText) {

return localText.trim();
}

/**
 * Serializes the page dimensions of a document as a TEI {@code <facsimile>}
 * element, with one {@code <surface>} child per page.
 * <p>
 * When the coordinates of structural elements are provided in the TEI
 * representation, the page sizes are needed in order to scale these
 * coordinates appropriately. Each {@code <surface>} carries the 1-based page
 * rank in {@code @n}, a fixed origin {@code (0.0, 0.0)} in {@code @ulx}/{@code @uly},
 * and the page width and height in {@code @lrx}/{@code @lry}.
 * <p>
 * Nothing is written when coordinate generation is disabled in the
 * configuration, or when the document has no page: the buffer is never left
 * with a dangling or empty {@code <facsimile>} element.
 *
 * @param buffer the TEI output buffer to append to
 * @param doc    the document whose pages are described
 * @param config the analysis configuration, checked for requested coordinates
 * @return the same {@code buffer} instance, possibly extended
 * @throws Exception kept in the signature for backward compatibility with callers
 */
public StringBuilder toTEIPages(StringBuilder buffer,
                                Document doc,
                                GrobidAnalysisConfig config) throws Exception {
    if (!config.isGenerateTeiCoordinates()) {
        // no coordinates requested, nothing to do
        return buffer;
    }

    List<Page> pages = doc.getPages();
    if (pages == null || pages.isEmpty()) {
        // check before writing anything: failing (or looping zero times) after the
        // opening tag was appended would leave a broken or empty <facsimile>
        return buffer;
    }

    buffer.append("\t<facsimile>\n");
    int pageNumber = 1;
    for (Page page : pages) {
        buffer.append("\t\t<surface n=\"").append(pageNumber)
            .append("\" ulx=\"0.0\" uly=\"0.0\" lrx=\"").append(page.getWidth())
            .append("\" lry=\"").append(page.getHeight())
            .append("\"/>\n");
        pageNumber++;
    }
    buffer.append("\t</facsimile>\n");

    return buffer;
}
}
Expand Up @@ -160,6 +160,7 @@ public List<Person> processing(List<LayoutToken> tokens, boolean head) {
} else {
aut.setTitle(clusterContent);
}
aut.addLayoutTokens(cluster.concatTokens());
} else if (clusterLabel.equals(TaggingLabels.NAMES_HEADER_FORENAME) ||
clusterLabel.equals(TaggingLabels.NAMES_CITATION_FORENAME)) {
if (newMarker) {
Expand All @@ -177,6 +178,7 @@ public List<Person> processing(List<LayoutToken> tokens, boolean head) {
} else {
aut.setFirstName(clusterContent);
}
aut.addLayoutTokens(cluster.concatTokens());
} else if (clusterLabel.equals(TaggingLabels.NAMES_HEADER_MIDDLENAME) ||
clusterLabel.equals(TaggingLabels.NAMES_CITATION_MIDDLENAME)) {
if (newMarker) {
Expand All @@ -187,6 +189,7 @@ public List<Person> processing(List<LayoutToken> tokens, boolean head) {
} else {
aut.setMiddleName(clusterContent);
}
aut.addLayoutTokens(cluster.concatTokens());
} else if (clusterLabel.equals(TaggingLabels.NAMES_HEADER_SURNAME) ||
clusterLabel.equals(TaggingLabels.NAMES_CITATION_SURNAME)) {
if (newMarker) {
Expand All @@ -204,6 +207,7 @@ public List<Person> processing(List<LayoutToken> tokens, boolean head) {
} else {
aut.setLastName(clusterContent);
}
aut.addLayoutTokens(cluster.concatTokens());
} else if (clusterLabel.equals(TaggingLabels.NAMES_HEADER_SUFFIX) ||
clusterLabel.equals(TaggingLabels.NAMES_CITATION_SUFFIX)) {
/*if (newMarker) {
Expand All @@ -215,6 +219,7 @@ public List<Person> processing(List<LayoutToken> tokens, boolean head) {
} else {
aut.setSuffix(clusterContent);
}
aut.addLayoutTokens(cluster.concatTokens());
}
}

Expand Down
Expand Up @@ -225,6 +225,10 @@ public List<String> getGenerateTeiCoordinates() {
return generateTeiCoordinates;
}

/**
 * Tells whether coordinates are requested for at least one type of structure.
 *
 * @return {@code true} when the list returned by
 *         {@link #getGenerateTeiCoordinates()} is neither {@code null} nor empty
 */
public boolean isGenerateTeiCoordinates() {
    List<String> requestedTypes = getGenerateTeiCoordinates();
    if (requestedTypes == null) {
        return false;
    }
    return !requestedTypes.isEmpty();
}

/**
 * Tells whether coordinates are requested for the given type of structure.
 *
 * @param type the structure type to look up in the list returned by
 *             {@link #getGenerateTeiCoordinates()}
 * @return {@code true} when that list is not {@code null} and contains {@code type}
 */
public boolean isGenerateTeiCoordinates(String type) {
    List<String> requestedTypes = getGenerateTeiCoordinates();
    if (requestedTypes == null) {
        return false;
    }
    return requestedTypes.contains(type);
}
Expand Down
6 changes: 3 additions & 3 deletions grobid-service/src/main/resources/web/index.html
Expand Up @@ -161,7 +161,7 @@ <h2>
</td>
</tr>
<tr>
<td>&nbsp;</td><td colspan="2"><input id="submitRequest" type="submit" value="Submit" class="btn"/>
<td>&nbsp;</td><td colspan="2"><input id="submitRequest" type="submit" value="Submit" class="btn btn-success"/>
<input id="btn_download" class="btn" value="Download TEI Result"/></td>
</tr>
</table>
Expand Down Expand Up @@ -214,7 +214,7 @@ <h2>
</td>
</tr>
<tr>
<td>&nbsp;</td><td colspan="2"><input id="submitRequest2" value="Submit" class="btn"/></td>
<td>&nbsp;</td><td colspan="2"><input id="submitRequest2" value="Submit" class="btn btn-success"/></td>
</tr>
</table>
</form>
Expand Down Expand Up @@ -280,7 +280,7 @@ <h2>
</td>
</tr>
<tr>
<td>&nbsp;</td><td colspan="2"><input id="submitRequest3" value="Submit" class="btn"/>
<td>&nbsp;</td><td colspan="2"><input id="submitRequest3" value="Submit" class="btn btn-success"/>
<input id="btn_download3" class="btn" value="Download TEI Result"/></td>
</tr>
</table>
Expand Down

0 comments on commit 672fa61

Please sign in to comment.