Skip to content

Commit

Permalink
Merge ca98798 into 53044fd
Browse files Browse the repository at this point in the history
  • Loading branch information
Vitaliy-1 committed Feb 25, 2020
2 parents 53044fd + ca98798 commit 60388e5
Show file tree
Hide file tree
Showing 5 changed files with 595 additions and 1 deletion.
43 changes: 42 additions & 1 deletion grobid-core/src/main/java/org/grobid/core/data/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

import org.grobid.core.GrobidModels;
import org.apache.commons.lang3.StringUtils;
import org.grobid.core.data.table.Cell;
import org.grobid.core.data.table.Line;
import org.grobid.core.data.table.LinePart;
import org.grobid.core.data.table.Row;
import org.grobid.core.document.xml.XmlBuilderUtils;
import org.grobid.core.document.Document;
import org.grobid.core.document.TEIFormatter;
Expand Down Expand Up @@ -131,7 +135,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form


Element contentEl = XmlBuilderUtils.teiElement("table");
contentEl.appendChild(LayoutTokensUtil.toText(getContentTokens()));
processTableContent(contentEl, this.getContentTokens());
if ((config.getGenerateTeiCoordinates() != null) && (config.getGenerateTeiCoordinates().contains("figure"))) {
XmlBuilderUtils.addCoords(contentEl, LayoutTokensUtil.getCoordsStringForOneBox(getContentTokens()));
}
Expand Down Expand Up @@ -179,6 +183,43 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
// return theTable.toString();
}

/**
 * Line-based algorithm for parsing tables: uses the tokens' coordinates to identify lines,
 * groups the lines into rows and cells, and appends one {@code row} element per row (with
 * one {@code cell} child per cell) to the given table element.
 *
 * @param contentEl table element to append parsed rows and cells.
 * @param contentTokens tokens that are used to build cells
 */
void processTableContent(Element contentEl, List<LayoutToken> contentTokens) {
    // Tokens -> line parts (as originally split by PDFAlto) -> lines (by comparing borders)
    List<LinePart> fragments = Line.extractLineParts(contentTokens);
    List<Line> visualLines = Line.extractLines(fragments);

    // Lines -> rows of cells, then the Row helpers adjust the grid in place
    List<Row> tableRows = Row.extractRows(visualLines);
    int totalColumns = Row.columnCount(tableRows);
    Row.insertEmptyCells(tableRows, totalColumns);
    Row.mergeMulticolumnCells(tableRows);

    // Serialise the grid
    for (Row tableRow : tableRows) {
        Element rowEl = XmlBuilderUtils.teiElement("row");
        contentEl.appendChild(rowEl);

        List<Cell> rowCells = tableRow.getContent();
        for (Cell tableCell : rowCells) {
            Element cellEl = XmlBuilderUtils.teiElement("cell");
            rowEl.appendChild(cellEl);

            // Only cells spanning more than one column carry an explicit "cols" attribute
            int span = tableCell.getColspan();
            if (span > 1) {
                cellEl.addAttribute(new Attribute("cols", Integer.toString(span)));
            }
            cellEl.appendChild(tableCell.getText().trim());
        }
    }
}

/**
 * Flattens a string for single-line output: every newline is replaced by a space, a second
 * literal replacement is applied, and the result is trimmed.
 *
 * NOTE(review): the target of the second replace is indistinguishable from a plain space in
 * this view; presumably it is a double space or a non-breaking space - confirm against the
 * repository file before editing this line.
 *
 * @param input text to normalise
 * @return the normalised, trimmed text
 */
private String cleanString(String input) {
return input.replace("\n", " ").replace(" ", " ").trim();
}
Expand Down
60 changes: 60 additions & 0 deletions grobid-core/src/main/java/org/grobid/core/data/table/Cell.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
package org.grobid.core.data.table;

import java.util.ArrayList;
import java.util.List;

public class Cell extends Line {

private int positionRow = -1;
private int positionColumn = -1;
private int colspan = 1;
private boolean merged = false;

public boolean linePartInBorders(LinePart linePart) {
if (this.getContent().isEmpty()) return true;

if ((this.getLeft() > linePart.getRight()) || this.getRight() < linePart.getLeft()) return false;

return true;
}

public int getColspan() {
return this.colspan;
}

public void setColspan(int colspan) {
this.colspan = colspan;
}

public int getPositionRow() {
return positionRow;
}

public void setPositionRow(int positionRow) {
this.positionRow = positionRow;
}

public int getPositionColumn() {
return positionColumn;
}

public void setPositionColumn(int positionColumn) {
this.positionColumn = positionColumn;
}

public void setRight(double rightpos) {
this.right = rightpos;
}

public void setLeft(double leftpos) {
this.left = leftpos;
}

public void setMerged(boolean merged) {
this.merged = merged;
}

public boolean isMerged() {
return this.merged;
}
}
149 changes: 149 additions & 0 deletions grobid-core/src/main/java/org/grobid/core/data/table/Line.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
package org.grobid.core.data.table;

import org.grobid.core.layout.LayoutToken;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

/**
 * A visual line of a table: a group of {@link LinePart}s whose vertical extents overlap.
 * The line's bounding box is grown every time a part is added.
 */
public class Line extends LinePart {

    private List<LinePart> contentParts = new ArrayList<>();

    /**
     * Adds a part to this line and extends the line's bounding box to cover it.
     *
     * @param contentPart part to add
     */
    public void add(LinePart contentPart) {
        contentParts.add(contentPart);

        setTop(contentPart);
        setBottom(contentPart);
        setLeft(contentPart);
        setRight(contentPart);
    }

    // The four setters below take the part's bound when this line's bound is still unset
    // (GROBID_TOKEN_DEFAULT_DOUBLE) or when the part reaches further out.
    // NOTE(review): a part whose own top/left is still GROBID_TOKEN_DEFAULT_DOUBLE (-1.0)
    // compares as "further out" and resets this line's top/left to -1.0 - confirm intended.

    private void setTop(LinePart contentPart) {
        double partTop = contentPart.getTop();

        if (top == GROBID_TOKEN_DEFAULT_DOUBLE || top > partTop) {
            top = partTop;
        }
    }

    private void setBottom(LinePart contentPart) {
        double partBottom = contentPart.getBottom();

        if (bottom == GROBID_TOKEN_DEFAULT_DOUBLE || bottom < partBottom) {
            bottom = partBottom;
        }
    }

    private void setLeft(LinePart contentPart) {
        double partLeft = contentPart.getLeft();

        if (left == GROBID_TOKEN_DEFAULT_DOUBLE || left > partLeft) {
            left = partLeft;
        }
    }

    private void setRight(LinePart contentPart) {
        double partRight = contentPart.getRight();

        if (right == GROBID_TOKEN_DEFAULT_DOUBLE || right < partRight) {
            right = partRight;
        }
    }

    /**
     * @return the parts of this line, or {@code null} (not an empty list) when the line has
     *         no parts; callers must null-check or use {@link #isEmpty()} first
     */
    public List<LinePart> getContent() {
        if (!this.contentParts.isEmpty()) return this.contentParts;
        return null;
    }

    /** @return {@code true} if no part has been added to this line */
    @Override
    public boolean isEmpty() {
        return this.contentParts.isEmpty();
    }

    /**
     * Tells whether the given part overlaps this line vertically.
     *
     * @param linePart part to test against this line's top/bottom borders
     * @return {@code true} if the line is empty or the vertical extents overlap
     */
    public boolean linePartInBorders(LinePart linePart) {
        if (this.contentParts.isEmpty()) return true;

        // token is fully above the line or below, it doesn't overlap
        if ((this.getTop() > linePart.getBottom()) || this.getBottom() < linePart.getTop()) return false;

        return true;
    }

    /** @return the concatenated text of all parts, in insertion order */
    @Override
    public String getText() {
        StringBuilder stringBuilder = new StringBuilder();
        for (LinePart linePart: contentParts) {
            stringBuilder.append(linePart.getText());
        }

        return stringBuilder.toString();
    }

    /**
     * Splits the tokens into line parts at every {@code "\n"} token. The newline tokens
     * themselves are not stored; each one closes the current part and opens a new one, so
     * the result may contain parts without any token.
     *
     * @param contentTokens tokens to split
     * @return the line parts in token order; empty if there are no tokens
     */
    public static List<LinePart> extractLineParts(List<LayoutToken> contentTokens) {
        List<LinePart> lineParts = new ArrayList<>();
        LinePart currentLinePart = null;
        for (int i = 0; i < contentTokens.size(); i++) {
            LayoutToken contentToken = contentTokens.get(i);
            if (i == 0) {
                // The first part is created lazily so that no tokens yields no parts
                currentLinePart = new LinePart();
                lineParts.add(currentLinePart);
            }

            if (contentToken.getText().equals("\n")) {
                currentLinePart = new LinePart();
                lineParts.add(currentLinePart);
            } else {
                currentLinePart.add(contentToken);
            }
        }
        return lineParts;
    }

    /*
     * Algorithm for extracting lines.
     * See algorithm 1: Burcu Yildiz, Katharina Kaiser, Silvia Miksch. pdf2table: A Method to Extract Table Information
     * from PDF Files.
     */
    /**
     * Groups line parts into lines by vertical overlap. The scan runs from the end of the
     * list; the first remaining part seeds a new line, every part overlapping the line is
     * moved into it (which may widen the line, so the scan restarts), and when a full pass
     * finds nothing more the next line is started.
     *
     * @param lineParts parts to group; this list is consumed (emptied) by the call and must
     *                  therefore be modifiable
     * @return the lines, sorted by their top position
     */
    public static List<Line> extractLines(List<LinePart> lineParts) {
        List<Line> lines = new ArrayList<>();
        if (lineParts.isEmpty()) {
            return lines;
        }

        // Discard blank parts before scanning. Removing them lazily during the scan ended
        // the loop early when a blank part sat at index 0 (the index dropped to -1), which
        // silently lost every part that had not been assigned to a line yet.
        lineParts.removeIf(part -> part.getText().isEmpty());

        Line currentLine = null;
        int i = lineParts.size() - 1;
        while (i >= 0) {
            LinePart linePart = lineParts.get(i);

            if (currentLine == null) {
                // Seed a new line with the last remaining part
                currentLine = new Line();
                lines.add(currentLine);
                currentLine.add(linePart);
                lineParts.remove(i);
                i--;
                continue;
            }

            if (currentLine.linePartInBorders(linePart)) {
                currentLine.add(linePart);
                lineParts.remove(i);
                i = lineParts.size() - 1; // borders may have grown: rescan all remaining parts
                continue;
            }

            if (i == 0) {
                // Full pass without a match: the current line is complete
                currentLine = null;
                i = lineParts.size() - 1;
            } else {
                i--;
            }
        }

        lines.sort(Comparator.comparingDouble(Line::getTop)); // sorting by top position
        return lines;
    }
}
118 changes: 118 additions & 0 deletions grobid-core/src/main/java/org/grobid/core/data/table/LinePart.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
package org.grobid.core.data.table;

import org.grobid.core.layout.LayoutToken;

import java.util.ArrayList;
import java.util.List;

/**
 * A run of layout tokens together with the bounding box that encloses them. Each border
 * holds {@link #GROBID_TOKEN_DEFAULT_DOUBLE} until a token with a usable coordinate is added.
 */
public class LinePart {

    /** Sentinel meaning "coordinate not set", for borders and for token coordinates alike. */
    public static final double GROBID_TOKEN_DEFAULT_DOUBLE = -1.0;
    private List<LayoutToken> contentTokens = new ArrayList<>();

    // Bounding box; package-private so that the subclasses in this package can update it
    double top = GROBID_TOKEN_DEFAULT_DOUBLE;
    double bottom = GROBID_TOKEN_DEFAULT_DOUBLE;
    double left = GROBID_TOKEN_DEFAULT_DOUBLE;
    double right = GROBID_TOKEN_DEFAULT_DOUBLE;

    /**
     * Appends a token and grows the bounding box so that it covers the token.
     *
     * @param contentToken token to append
     */
    public void add(LayoutToken contentToken) {
        contentTokens.add(contentToken);
        setTop(contentToken);
        setBottom(contentToken);
        setLeft(contentToken);
        setRight(contentToken);
    }

    // Raises the top border to the token's y when y is set and lies above the current top.
    private void setTop(LayoutToken contentToken) {
        double y = contentToken.getY();
        boolean usable = y != GROBID_TOKEN_DEFAULT_DOUBLE;

        if (usable && (top == GROBID_TOKEN_DEFAULT_DOUBLE || y < top)) {
            top = y;
        }
    }

    // Lowers the bottom border to y + height when both are set and reach below the current bottom.
    private void setBottom(LayoutToken contentToken) {
        double y = contentToken.getY();
        double height = contentToken.getHeight();

        if (y == GROBID_TOKEN_DEFAULT_DOUBLE || height == GROBID_TOKEN_DEFAULT_DOUBLE) {
            return;
        }

        double candidate = y + height;
        if (bottom == GROBID_TOKEN_DEFAULT_DOUBLE || candidate > bottom) {
            bottom = candidate;
        }
    }

    // Moves the left border to the token's x when x is set and lies left of the current border.
    private void setLeft(LayoutToken contentToken) {
        double x = contentToken.getX();
        boolean usable = x != GROBID_TOKEN_DEFAULT_DOUBLE;

        if (usable && (left == GROBID_TOKEN_DEFAULT_DOUBLE || x < left)) {
            left = x;
        }
    }

    // Moves the right border to x + width when both are set and reach right of the current border.
    private void setRight(LayoutToken contentToken) {
        double x = contentToken.getX();
        double width = contentToken.getWidth();

        if (x == GROBID_TOKEN_DEFAULT_DOUBLE || width == GROBID_TOKEN_DEFAULT_DOUBLE) {
            return;
        }

        double candidate = x + width;
        if (right == GROBID_TOKEN_DEFAULT_DOUBLE || candidate > right) {
            right = candidate;
        }
    }

    /** @return the top border, or {@link #GROBID_TOKEN_DEFAULT_DOUBLE} if unset */
    public double getTop() {
        return top;
    }

    /** @return the bottom border, or {@link #GROBID_TOKEN_DEFAULT_DOUBLE} if unset */
    public double getBottom() {
        return bottom;
    }

    /** @return the left border, or {@link #GROBID_TOKEN_DEFAULT_DOUBLE} if unset */
    public double getLeft() {
        return left;
    }

    /** @return the right border, or {@link #GROBID_TOKEN_DEFAULT_DOUBLE} if unset */
    public double getRight() {
        return right;
    }

    /** @return the texts of all tokens concatenated in insertion order, without separators */
    public String getText() {
        StringBuilder joined = new StringBuilder();
        contentTokens.forEach(token -> joined.append(token.getText()));
        return joined.toString();
    }

    /** @return {@code true} if no token has been added */
    public boolean isEmpty() {
        return this.contentTokens.isEmpty();
    }
}

0 comments on commit 60388e5

Please sign in to comment.