Skip to content

Commit

Permalink
remove suffix space when there is no more text
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Sep 12, 2022
1 parent b82cc43 commit 053b235
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 6 deletions.
2 changes: 0 additions & 2 deletions grobid-core/src/main/java/org/grobid/core/data/Table.java
Expand Up @@ -33,11 +33,9 @@
import nu.xom.Attribute;
import nu.xom.Element;
import nu.xom.Node;
import nu.xom.Text;

import static org.grobid.core.document.TEIFormatter.applyStyleList;
import static org.grobid.core.document.TEIFormatter.extractStylesList;
import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement;
import static org.grobid.core.document.xml.XmlBuilderUtils.addXmlId;
import static org.grobid.core.document.xml.XmlBuilderUtils.textNode;

Expand Down
Expand Up @@ -1424,7 +1424,7 @@ public static Element applyStyleList(Element paragraphElem, String text, List<Tr
String subString = text.substring(lastPosition, offsetStyle.start);
String prefixSpace = StringUtils.startsWith(subString, " ") ? " " : "";
String suffixSpace = "";
if (subString.length() > 1) {
if (subString.length() > prefixSpace.length()) {
suffixSpace = StringUtils.endsWith(subString, " ") ? " " : "";
}
paragraphElem.appendChild(prefixSpace + StringUtils.normalizeSpace(subString.replace("\n", " ")) + suffixSpace);
Expand All @@ -1435,7 +1435,11 @@ public static Element applyStyleList(Element paragraphElem, String text, List<Tr
paragraphElem.appendChild(rend);
}
String subString = text.substring(lastPosition);
String prefixSpace = StringUtils.startsWith(subString, " ") ? " " : "";
String subStringNormalized = StringUtils.normalizeSpace(subString);
String prefixSpace = "";
if (subStringNormalized.length() > 0) {
prefixSpace = StringUtils.startsWith(subString, " ") ? " " : "";
}
paragraphElem.appendChild(prefixSpace + StringUtils.normalizeSpace(subString.replace("\n", " ")));

return paragraphElem;
Expand Down Expand Up @@ -1819,10 +1823,12 @@ public static List<Triple<String, String, OffsetPosition>> extractStylesList(Lis
styleList.set(styleList.size()-1, Triple.of(last.getLeft(), value.toString(), new OffsetPosition(last.getRight().start, endOffset)));
} else {
styleList.add(Triple.of(styleNameTrimmed, value.toString(), new OffsetPosition(startOffset, endOffset)));
// value = new StringBuilder();
}

previousStyleName = styleNameTrimmed;
}
// List<Triple<String, String, OffsetPosition>> postProcessedStyleList = styleList.stream().map(s -> Triple.of(s.getLeft(), s.getMiddle().substring(s.getRight().start, s.getRight().end), s.getRight())).collect(Collectors.toList());

return styleList;
}
Expand Down
Expand Up @@ -14,6 +14,7 @@
import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.utilities.SentenceUtilities;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;

import java.util.ArrayList;
Expand All @@ -23,7 +24,6 @@
import java.util.stream.Collectors;

import static org.grobid.core.document.TEIFormatter.*;
import static org.hamcrest.CoreMatchers.any;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.Matchers.hasSize;
import static org.junit.Assert.assertThat;
Expand Down Expand Up @@ -144,7 +144,7 @@ public void testSegmentIntoSentences_Style_ShouldWork() throws Exception {
new TEIFormatter(null, null).segmentIntoSentences(currentParagraph, tokens, config, "en");

assertThat(currentParagraph.toXML(),
is("<p xmlns=\"http://www.tei-c.org/ns/1.0\"><s><hi rend=\"bold\">One</hi> <hi rend=\"bold italic\">sentence</hi> <ref>(Foppiano et al.)</ref>.</s><s>Second sentence <ref>(Lopez et al.)</ref>.</s></p>"));
is("<p xmlns=\"http://www.tei-c.org/ns/1.0\"><s><hi rend=\"bold\">One</hi> <hi rend=\"bold italic\">sentence</hi> <ref>(Foppiano et al.)</ref>.</s><s>Second sentence <ref>(Lopez et al.)</ref>.</s></p>"));
}

@Test
Expand Down Expand Up @@ -527,6 +527,32 @@ public void testExtractStylesList_ignoreBold_shouldWork() throws Exception {
assertThat(pairs.get(1).getRight().end, is(86));
}

/**
 * Checks the text stored in the middle element of each triple returned by
 * {@code TEIFormatter.extractStylesList} for a sentence containing one bold
 * run followed by one italic run.
 *
 * The test is disabled on purpose (see the {@code @Ignore} reason). It is
 * still annotated with {@code @Test} so that the runner lists it as a skipped
 * test together with that reason, rather than not discovering it at all.
 */
@Ignore("The middle is actually not used")
@Test
public void testExtractStylesList_checkProducedText_ShouldWork() throws Exception {
    String text = "I. Introduction 1.1. Généralités et rappels ";
    List<LayoutToken> textTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);

    // Tokens flagged as bold.
    for (int boldIndex : new int[]{0, 1, 3}) {
        textTokens.get(boldIndex).setBold(true);
    }

    // Tokens flagged as italic.
    for (int italicIndex : new int[]{6, 7, 8, 9, 11, 13, 15}) {
        textTokens.get(italicIndex).setItalic(true);
    }

    List<Triple<String, String, OffsetPosition>> pairs = TEIFormatter.extractStylesList(textTokens);

    // One triple per style run: left = style name, middle = styled text.
    assertThat(pairs, hasSize(2));
    assertThat(pairs.get(0).getLeft(), is("bold"));
    assertThat(pairs.get(0).getMiddle(), is("I. Introduction"));
    assertThat(pairs.get(1).getLeft(), is("italic"));
    assertThat(pairs.get(1).getMiddle(), is("1.1. Généralités et rappels"));
}

@Test
public void testGetSectionNumber_simple_ShouldWork() throws Exception {
String text = "3 Supercon 2";
Expand Down

0 comments on commit 053b235

Please sign in to comment.