-
Notifications
You must be signed in to change notification settings - Fork 428
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
595 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
60 changes: 60 additions & 0 deletions
60
grobid-core/src/main/java/org/grobid/core/data/table/Cell.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
package org.grobid.core.data.table; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
public class Cell extends Line { | ||
|
||
private int positionRow = -1; | ||
private int positionColumn = -1; | ||
private int colspan = 1; | ||
private boolean merged = false; | ||
|
||
public boolean linePartInBorders(LinePart linePart) { | ||
if (this.getContent().isEmpty()) return true; | ||
|
||
if ((this.getLeft() > linePart.getRight()) || this.getRight() < linePart.getLeft()) return false; | ||
|
||
return true; | ||
} | ||
|
||
public int getColspan() { | ||
return this.colspan; | ||
} | ||
|
||
public void setColspan(int colspan) { | ||
this.colspan = colspan; | ||
} | ||
|
||
public int getPositionRow() { | ||
return positionRow; | ||
} | ||
|
||
public void setPositionRow(int positionRow) { | ||
this.positionRow = positionRow; | ||
} | ||
|
||
public int getPositionColumn() { | ||
return positionColumn; | ||
} | ||
|
||
public void setPositionColumn(int positionColumn) { | ||
this.positionColumn = positionColumn; | ||
} | ||
|
||
public void setRight(double rightpos) { | ||
this.right = rightpos; | ||
} | ||
|
||
public void setLeft(double leftpos) { | ||
this.left = leftpos; | ||
} | ||
|
||
public void setMerged(boolean merged) { | ||
this.merged = merged; | ||
} | ||
|
||
public boolean isMerged() { | ||
return this.merged; | ||
} | ||
} |
149 changes: 149 additions & 0 deletions
149
grobid-core/src/main/java/org/grobid/core/data/table/Line.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
package org.grobid.core.data.table; | ||
|
||
import org.grobid.core.layout.LayoutToken; | ||
|
||
import java.util.ArrayList; | ||
import java.util.Comparator; | ||
import java.util.List; | ||
|
||
public class Line extends LinePart { | ||
|
||
private List<LinePart> contentParts = new ArrayList<>(); | ||
|
||
public void add(LinePart contentPart) { | ||
contentParts.add(contentPart); | ||
|
||
setTop(contentPart); | ||
setBottom(contentPart); | ||
setLeft(contentPart); | ||
setRight(contentPart); | ||
} | ||
|
||
private void setTop(LinePart contentPart) { | ||
double partTop = contentPart.getTop(); | ||
|
||
if (top == GROBID_TOKEN_DEFAULT_DOUBLE || top > partTop) { | ||
top = partTop; | ||
} | ||
} | ||
|
||
private void setBottom(LinePart contentPart) { | ||
double partBottom = contentPart.getBottom(); | ||
|
||
if (bottom == GROBID_TOKEN_DEFAULT_DOUBLE || bottom < partBottom) { | ||
bottom = partBottom; | ||
} | ||
} | ||
|
||
private void setLeft(LinePart contentPart) { | ||
double partLeft = contentPart.getLeft(); | ||
|
||
if (left == GROBID_TOKEN_DEFAULT_DOUBLE || left > partLeft) { | ||
left = partLeft; | ||
} | ||
} | ||
|
||
private void setRight(LinePart contentPart) { | ||
double partRight = contentPart.getRight(); | ||
|
||
if (right == GROBID_TOKEN_DEFAULT_DOUBLE || right < partRight) { | ||
right = partRight; | ||
} | ||
} | ||
|
||
public List<LinePart> getContent() { | ||
if (!this.contentParts.isEmpty()) return this.contentParts; | ||
return null; | ||
} | ||
|
||
public boolean isEmpty() { | ||
return this.contentParts.size() == 0; | ||
} | ||
|
||
public boolean linePartInBorders(LinePart linePart) { | ||
if (this.contentParts.isEmpty()) return true; | ||
|
||
// token is fully above the line or below, it doesn't overlap | ||
if ((this.getTop() > linePart.getBottom()) || this.getBottom() < linePart.getTop()) return false; | ||
|
||
return true; | ||
} | ||
|
||
@Override | ||
public String getText() { | ||
StringBuilder stringBuilder = new StringBuilder(); | ||
for (LinePart linePart: contentParts) { | ||
stringBuilder.append(linePart.getText()); | ||
} | ||
|
||
return stringBuilder.toString(); | ||
} | ||
|
||
public static List<LinePart> extractLineParts(List<LayoutToken> contentTokens) { | ||
List<LinePart> lineParts = new ArrayList<>(); | ||
LinePart currentLinePart = null; | ||
for (int i = 0; i < contentTokens.size(); i++) { | ||
LayoutToken contentToken = contentTokens.get(i); | ||
if (i == 0) { | ||
currentLinePart = new LinePart(); | ||
lineParts.add(currentLinePart); | ||
} | ||
|
||
if (!contentToken.getText().equals("\n")) { | ||
currentLinePart.add(contentToken); | ||
} | ||
|
||
if (contentToken.getText().equals("\n")) { | ||
LinePart newLinePart = new LinePart(); | ||
lineParts.add(newLinePart); | ||
currentLinePart = newLinePart; | ||
} | ||
} | ||
return lineParts; | ||
} | ||
|
||
/* | ||
* Algorithm for extracting lines. | ||
* See algorithm 1: Burcu Yildiz, Katharina Kaiser, Silvia Miksch. pdf2table: A Method to Extract Table Information | ||
* from PDF Files. | ||
*/ | ||
public static List<Line> extractLines(List<LinePart> lineParts) { | ||
List<Line> lines = new ArrayList<>(); | ||
Line currentLine = null; | ||
int i = lineParts.size() - 1; | ||
while (!lineParts.isEmpty() && i >= 0) { | ||
LinePart linePart = lineParts.get(i); | ||
if (linePart.getText().isEmpty()) { | ||
lineParts.remove(i); | ||
i--; | ||
continue; | ||
} | ||
|
||
if (currentLine == null) { | ||
currentLine = new Line(); | ||
lines.add(currentLine); | ||
currentLine.add(linePart); | ||
lineParts.remove(i); | ||
i--; | ||
continue; | ||
} | ||
|
||
if (currentLine.linePartInBorders(linePart)){ | ||
currentLine.add(linePart); | ||
lineParts.remove(i); | ||
i = lineParts.size() - 1; // return to the first item and recheck borders | ||
continue; | ||
} | ||
|
||
if (i == 0) { | ||
currentLine = null; | ||
i = lineParts.size() - 1; | ||
} else { | ||
i--; | ||
} | ||
} | ||
|
||
lines.sort(Comparator.comparingDouble(Line::getTop)); // sorting by top position | ||
return lines; | ||
} | ||
} |
118 changes: 118 additions & 0 deletions
118
grobid-core/src/main/java/org/grobid/core/data/table/LinePart.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
package org.grobid.core.data.table; | ||
|
||
import org.grobid.core.layout.LayoutToken; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
public class LinePart { | ||
|
||
public static final double GROBID_TOKEN_DEFAULT_DOUBLE = -1.0; | ||
private List<LayoutToken> contentTokens = new ArrayList<>(); | ||
|
||
double top = GROBID_TOKEN_DEFAULT_DOUBLE; | ||
double bottom = GROBID_TOKEN_DEFAULT_DOUBLE; | ||
double left = GROBID_TOKEN_DEFAULT_DOUBLE; | ||
double right = GROBID_TOKEN_DEFAULT_DOUBLE; | ||
|
||
public void add(LayoutToken contentToken) { | ||
contentTokens.add(contentToken); | ||
setTop(contentToken); | ||
setBottom(contentToken); | ||
setLeft(contentToken); | ||
setRight(contentToken); | ||
} | ||
|
||
private void setTop(LayoutToken contentToken) { | ||
double tokenY = contentToken.getY(); | ||
if (tokenY == GROBID_TOKEN_DEFAULT_DOUBLE) return; | ||
|
||
if (top == GROBID_TOKEN_DEFAULT_DOUBLE) { | ||
top = tokenY; | ||
return; | ||
} | ||
|
||
if (tokenY < top) { | ||
top = tokenY; | ||
} | ||
} | ||
|
||
private void setBottom(LayoutToken contentToken) { | ||
double tokenY = contentToken.getY(); | ||
double tokenHeight = contentToken.getHeight(); | ||
|
||
if (tokenY == GROBID_TOKEN_DEFAULT_DOUBLE || tokenHeight == GROBID_TOKEN_DEFAULT_DOUBLE) return; | ||
|
||
double tokenBottom = Double.sum(tokenY, tokenHeight); | ||
|
||
if (bottom == GROBID_TOKEN_DEFAULT_DOUBLE) { | ||
bottom = tokenBottom; | ||
return; | ||
} | ||
|
||
if (tokenBottom > bottom) { | ||
bottom = tokenBottom; | ||
} | ||
} | ||
|
||
private void setLeft(LayoutToken contentToken) { | ||
double tokenX = contentToken.getX(); | ||
if (tokenX == GROBID_TOKEN_DEFAULT_DOUBLE) return; | ||
|
||
if (left == GROBID_TOKEN_DEFAULT_DOUBLE) { | ||
left = tokenX; | ||
return; | ||
} | ||
|
||
if (tokenX < left) { | ||
left = tokenX; | ||
} | ||
} | ||
|
||
private void setRight(LayoutToken contentToken) { | ||
double tokenX = contentToken.getX(); | ||
double tokenWidth = contentToken.getWidth(); | ||
|
||
if (tokenX == GROBID_TOKEN_DEFAULT_DOUBLE || tokenWidth == GROBID_TOKEN_DEFAULT_DOUBLE) return; | ||
|
||
double tokenRight = Double.sum(tokenX, tokenWidth); | ||
|
||
if (right == GROBID_TOKEN_DEFAULT_DOUBLE) { | ||
right = tokenRight; | ||
return; | ||
} | ||
|
||
if (tokenRight > right) { | ||
right = tokenRight; | ||
} | ||
} | ||
|
||
public double getTop() { | ||
return top; | ||
} | ||
|
||
public double getBottom() { | ||
return bottom; | ||
} | ||
|
||
public double getLeft() { | ||
return left; | ||
} | ||
|
||
public double getRight() { | ||
return right; | ||
} | ||
|
||
public String getText() { | ||
StringBuilder stringBuilder = new StringBuilder(); | ||
for (LayoutToken token: contentTokens) { | ||
stringBuilder.append(token.getText()); | ||
} | ||
|
||
return stringBuilder.toString(); | ||
} | ||
|
||
public boolean isEmpty() { | ||
return this.contentTokens.size() == 0; | ||
} | ||
} |
Oops, something went wrong.