Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pdfalto parser fixes #496

Merged
merged 10 commits into from Oct 14, 2019
@@ -1,5 +1,6 @@
package org.grobid.core.sax;

import org.apache.commons.lang3.StringUtils;
import org.grobid.core.analyzers.Analyzer;
import org.grobid.core.analyzers.GrobidAnalyzer;
import org.grobid.core.document.Document;
Expand All @@ -16,6 +17,8 @@
import java.util.HashMap;
import java.util.List;

import static shadedwipo.org.apache.commons.lang3.StringUtils.isNotBlank;


/**
* SAX parser for XML ALTO representation of fixed layout documents. Typographical and layout information
Expand Down Expand Up @@ -612,39 +615,49 @@ public void startElement(String namespaceURI, String localName,
String name = atts.getQName(i);
String value = atts.getValue(i);

if ((name != null) && (value != null)) {
if (isNotBlank(name)&& isNotBlank(value)) {
if (name.equals("ID")) {
fontId = value;
blabla.append(" ");
} else if (name.equals("FONTFAMILY")) {
if (StringUtils.containsIgnoreCase(value, "bold") || StringUtils.endsWithIgnoreCase(value, "_bd")) {
kermitt2 marked this conversation as resolved.
Show resolved Hide resolved
textStyle.setBold(true);
}

if (StringUtils.containsIgnoreCase(value, "italic") || StringUtils.endsWithIgnoreCase(value, "_it")) {
textStyle.setItalic(true);
}

textStyle.setFontName(value);
blabla.append(" ");
} else if (name.equals("FONTSIZE")) {
double fontSize = Double.parseDouble(value);
textStyle.setFontSize(fontSize);
blabla.append(" ");
} else if (name.equals("FONTSTYLE")) {
if (value.contains("bold")) {
// font properties, we are interested by subscript or superscript
if (StringUtils.containsIgnoreCase(value, "subscript")) {
textStyle.setSubscript(true);
}

if (StringUtils.containsIgnoreCase(value, "superscript")) {
textStyle.setSuperscript(true);
}

if (StringUtils.containsIgnoreCase(value, "bold")) {
textStyle.setBold(true);
} else if (value.contains("italics")){
}

if (StringUtils.containsIgnoreCase(value, "italic") || StringUtils.containsIgnoreCase(value, "italics")) {
textStyle.setItalic(true);
} else {
textStyle.setBold(false);
textStyle.setItalic(false);
}

blabla.append(" ");
} else if (name.equals("FONTCOLOR")) {
textStyle.setFontColor(value);
}
else if (name.equals("FONTTYPE")) {
// value can be empty or a sequency of font properties separated by space, out of these
// font properties, we are interested by subscript or superscript
if ( (value != null) && (value.length() > 0) ) {
if ( (value.indexOf("subscript") != -1) || (value.indexOf("SUBSCRIPT") != -1) )
textStyle.setSubscript(true);
else if ( (value.indexOf("supercript") != -1) || (value.indexOf("SUPERSCRIPT") != -1) )
textStyle.setSuperscript(true);
}
// value can be empty or a sequence of font properties separated by space, out of these
/*if (value.equals("serif")) {
textStyle.setSerif(true);
} else {
Expand Down
Expand Up @@ -16,6 +16,8 @@

import static org.easymock.EasyMock.createMock;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.hasSize;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;

Expand Down Expand Up @@ -82,6 +84,64 @@ public void testParsing_shouldWork() throws Exception {
List<LayoutToken> tokenList = target.getTokenization();

assertThat(tokenList.stream().filter(t -> t.getText().equals("newly")).count(), is(1L));

assertThat(tokenList.get(0).getText(), is("Microscopic"));
assertThat(tokenList.get(0).getBold(), is(true));
assertThat(tokenList.get(25).getText(), is("BaFe"));
assertThat(tokenList.get(25).isSubscript(), is(false));
assertThat(tokenList.get(27).getText(), is("2"));
assertThat(tokenList.get(27).isSubscript(), is(true));
}

/**
 * Parses the pdfalto fixture {@code s3xKQzHmBR.xml} and verifies the bold, italic, subscript and
 * superscript flags carried by the resulting layout tokens.
 *
 * <p>The expected token indices and flag values below are tied to the content of that fixture;
 * if the fixture changes, the indices must be revisited.
 *
 * @throws Exception if the SAX parser cannot be created or the fixture cannot be parsed
 */
@Test
public void testParsing_BoldItalic_shouldWork() throws Exception {
    // try-with-resources guarantees the fixture stream is closed even when parsing fails.
    try (InputStream inputStream = this.getClass().getResourceAsStream("s3xKQzHmBR.xml")) {
        // Fail with an explicit message rather than an opaque parser error when the resource is missing.
        assertTrue("test resource s3xKQzHmBR.xml not found on the classpath", inputStream != null);

        SAXParser p = spf.newSAXParser();
        p.parse(inputStream, target);
    }

    List<LayoutToken> tokenList = target.getTokenization();

    // Guard first: every check below indexes into the list.
    assertThat(tokenList, hasSize(greaterThan(0)));

    assertThat(tokenList.stream().filter(LayoutToken::isSuperscript).count(), is(4L));
    assertThat(tokenList.stream().filter(LayoutToken::isSubscript).count(), is(3L));

    //                          index  text   sub    super  bold   italic
    assertTokenStyle(tokenList, 0,     "We",  false, false, false, false);
    assertTokenStyle(tokenList, 14,    "CO",  false, false, false, false);
    assertTokenStyle(tokenList, 16,    "2",   true,  false, false, false);
    assertTokenStyle(tokenList, 35,    "Ur",  false, false, true,  true);
    assertTokenStyle(tokenList, 37,    "123", true,  false, true,  true);
    assertTokenStyle(tokenList, 39,    "6a",  false, true,  false, true);
}

/**
 * Asserts the text and the four style flags of the token at {@code index}.
 *
 * <p>Checks run in the order text, subscript, superscript, bold, italic, and each one carries a
 * reason naming the token index and the property, so a failure identifies what diverged.
 *
 * @param tokens      tokenization produced by the parser under test
 * @param index       position of the token to inspect
 * @param text        expected token text
 * @param subscript   expected value of {@code isSubscript()}
 * @param superscript expected value of {@code isSuperscript()}
 * @param bold        expected value of {@code getBold()}
 * @param italic      expected value of {@code getItalic()}
 */
private static void assertTokenStyle(List<LayoutToken> tokens, int index, String text,
                                     boolean subscript, boolean superscript,
                                     boolean bold, boolean italic) {
    LayoutToken token = tokens.get(index);
    String where = "token " + index;

    assertThat(where + " text", token.getText(), is(text));
    assertThat(where + " subscript", token.isSubscript(), is(subscript));
    assertThat(where + " superscript", token.isSuperscript(), is(superscript));
    assertThat(where + " bold", token.getBold(), is(bold));
    assertThat(where + " italic", token.getItalic(), is(italic));
}

}
102 changes: 102 additions & 0 deletions grobid-core/src/test/resources/org/grobid/core/sax/s3xKQzHmBR.xml
@@ -0,0 +1,102 @@
<?xml version="1.0" encoding="UTF-8"?>
<alto xmlns="http://www.loc.gov/standards/alto/v3/alto.xsd">
<Description>
<MeasurementUnit>pixel</MeasurementUnit>
<sourceImageInformation>
<fileName>/Users/lfoppiano/development/projects/grobid/grobid-home/tmp/origin1823875201361402337.pdf
</fileName>
</sourceImageInformation>
<OCRProcessing ID="IdOcr">
<ocrProcessingStep>
<processingDateTime>Wed Sep 4 08:46:44 2019
</processingDateTime>
<processingSoftware>
<softwareCreator>CONTRIBUTORS</softwareCreator>
<softwareName>pdfalto</softwareName>
<softwareVersion>0.1</softwareVersion>
</processingSoftware>
</ocrProcessingStep>
</OCRProcessing>
</Description>
<Styles>
<TextStyle ID="font0" FONTFAMILY="baaaaa+liberationserif" FONTSIZE="12.000" FONTTYPE="sans-serif"
FONTWIDTH="proportional" FONTCOLOR="#000000" FONTSTYLE=""/>
<TextStyle ID="font1" FONTFAMILY="caaaaa+liberationserif-italic" FONTSIZE="12.000" FONTTYPE="sans-serif"
FONTWIDTH="proportional" FONTCOLOR="#000000" FONTSTYLE="italics"/>
<TextStyle ID="font2" FONTFAMILY="daaaaa+liberationserif-bold" FONTSIZE="12.000" FONTTYPE="sans-serif"
FONTWIDTH="proportional" FONTCOLOR="#000000" FONTSTYLE="bold"/>
<TextStyle ID="font3" FONTFAMILY="eaaaaa+liberationserif-bolditalic" FONTSIZE="12.000" FONTTYPE="sans-serif"
FONTWIDTH="proportional" FONTCOLOR="#000000" FONTSTYLE="bold italics"/>
<TextStyle ID="font4" FONTFAMILY="baaaaa+liberationserif" FONTSIZE="7.000" FONTTYPE="sans-serif"
FONTWIDTH="proportional" FONTCOLOR="#000000" FONTSTYLE="subscript"/>
<TextStyle ID="font5" FONTFAMILY="daaaaa+liberationserif-bold" FONTSIZE="7.000" FONTTYPE="sans-serif"
FONTWIDTH="proportional" FONTCOLOR="#000000" FONTSTYLE="bold superscript"/>
<TextStyle ID="font6" FONTFAMILY="eaaaaa+liberationserif-bolditalic" FONTSIZE="7.000" FONTTYPE="sans-serif"
FONTWIDTH="proportional" FONTCOLOR="#000000" FONTSTYLE="bold italics subscript"/>
<TextStyle ID="font7" FONTFAMILY="caaaaa+liberationserif-italic" FONTSIZE="7.000" FONTTYPE="sans-serif"
FONTWIDTH="proportional" FONTCOLOR="#000000" FONTSTYLE="italics superscript"/>
</Styles>
<Layout>
<Page ID="Page1" PHYSICAL_IMG_NR="1" WIDTH="595.304" HEIGHT="841.89">
<PrintSpace>
<TextBlock ID="p1_b1" HPOS="56.8" VPOS="56.7088" HEIGHT="-56.708" WIDTH="-56.8">
<TextLine WIDTH="333.233" HEIGHT="13.504" ID="p1_t1" HPOS="56.8" VPOS="56.6638">
<String ID="p1_w1" CONTENT="We" HPOS="56.8" VPOS="56.7088" WIDTH="15.72" HEIGHT="13.284"
STYLEREFS="font0"/>
<SP WIDTH="2.98" VPOS="56.7088" HPOS="72.52"/>
<String ID="p1_w2" CONTENT="went" HPOS="75.5" VPOS="56.7088" WIDTH="22.62" HEIGHT="13.284"
STYLEREFS="font1"/>
<SP WIDTH="3.08" VPOS="56.7088" HPOS="98.12"/>
<String ID="p1_w3" CONTENT="to" HPOS="101.2" VPOS="56.7088" WIDTH="9.3" HEIGHT="13.284"
STYLEREFS="font0"/>
<SP WIDTH="3.084" VPOS="56.7088" HPOS="110.5"/>
<String ID="p1_w4" CONTENT="the" HPOS="113.584" VPOS="56.7088" WIDTH="14.616" HEIGHT="13.284"
STYLEREFS="font0"/>
<SP WIDTH="3" VPOS="56.7088" HPOS="128.2"/>
<String ID="p1_w5" CONTENT="lab" HPOS="131.2" VPOS="56.7088" WIDTH="15.972" HEIGHT="13.284"
STYLEREFS="font2"/>
<SP WIDTH="3.028" VPOS="56.7088" HPOS="147.172"/>
<String ID="p1_w6" CONTENT="and" HPOS="150.2" VPOS="56.7088" WIDTH="17.304" HEIGHT="13.284"
STYLEREFS="font0"/>
<SP WIDTH="2.996" VPOS="56.7088" HPOS="167.504"/>
<String ID="p1_w7" CONTENT="tested" HPOS="170.5" VPOS="56.7088" WIDTH="27.9" HEIGHT="13.284"
STYLEREFS="font3"/>
<SP WIDTH="3.1" VPOS="56.7088" HPOS="198.4"/>
<String ID="p1_w8" CONTENT="CO" HPOS="201.5" VPOS="56.7088" WIDTH="16.56" HEIGHT="13.284"
STYLEREFS="font0"/>
<String ID="p1_w9" CONTENT="2" HPOS="218.2" VPOS="62.4638" WIDTH="3.5" HEIGHT="7.749"
STYLEREFS="font4"/>
<String ID="p1_w10" CONTENT="Al" HPOS="221.7" VPOS="56.7088" WIDTH="12.012" HEIGHT="13.284"
STYLEREFS="font0"/>
<String ID="p1_w11" CONTENT="3" HPOS="233.7" VPOS="62.4638" WIDTH="3.5" HEIGHT="7.749"
STYLEREFS="font4"/>
<String ID="p1_w12" CONTENT="4)" HPOS="237.2" VPOS="56.6638" WIDTH="5.831" HEIGHT="7.749"
STYLEREFS="font5"/>
<SP WIDTH="1.669" VPOS="56.6638" HPOS="243.031"/>
<String ID="p1_w13" CONTENT="and" HPOS="244.7" VPOS="56.7088" WIDTH="17.304" HEIGHT="13.284"
STYLEREFS="font0"/>
<SP WIDTH="3" VPOS="56.7088" HPOS="262.004"/>
<String ID="p1_w14" CONTENT="we" HPOS="265.004" VPOS="56.7088" WIDTH="14.004" HEIGHT="13.284"
STYLEREFS="font0"/>
<SP WIDTH="2.988" VPOS="56.7088" HPOS="279.008"/>
<String ID="p1_w15" CONTENT="found" HPOS="281.996" VPOS="56.7088" WIDTH="27.996" HEIGHT="13.284"
STYLEREFS="font0"/>
<SP WIDTH="3" VPOS="56.7088" HPOS="309.992"/>
<String ID="p1_w16" CONTENT="traces" HPOS="312.992" VPOS="56.7088" WIDTH="27.96" HEIGHT="13.284"
STYLEREFS="font0"/>
<SP WIDTH="3.024" VPOS="56.7088" HPOS="340.952"/>
<String ID="p1_w17" CONTENT="of" HPOS="343.976" VPOS="56.7088" WIDTH="9.996" HEIGHT="13.284"
STYLEREFS="font0"/>
<SP WIDTH="3.028" VPOS="56.7088" HPOS="353.972"/>
<String ID="p1_w18" CONTENT="Ur" HPOS="357" VPOS="56.7088" WIDTH="13.272" HEIGHT="13.284"
STYLEREFS="font3"/>
<String ID="p1_w19" CONTENT="123" HPOS="370.4" VPOS="62.4638" WIDTH="10.5" HEIGHT="7.749"
STYLEREFS="font6"/>
<String ID="p1_w20" CONTENT="6a)" HPOS="380.8" VPOS="56.6638" WIDTH="9.233" HEIGHT="7.749"
STYLEREFS="font7"/>
</TextLine>
</TextBlock>
</PrintSpace>
</Page>
</Layout>
</alto>