Skip to content

Commit

Permalink
language specification for sentence segmentation process
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed Jan 7, 2021
1 parent aa19f66 commit 4ff805e
Show file tree
Hide file tree
Showing 8 changed files with 75 additions and 25 deletions.
2 changes: 1 addition & 1 deletion grobid-core/src/main/java/org/grobid/core/data/Figure.java
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
}

if (desc != null && config.isWithSentenceSegmentation()) {
formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config);
formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage());

// we need a sentence segmentation of the figure caption, for that we need to introduce
// a <div>, then a <p>
Expand Down
4 changes: 2 additions & 2 deletions grobid-core/src/main/java/org/grobid/core/data/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
}

if (desc != null && config.isWithSentenceSegmentation()) {
formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config);
formatter.segmentIntoSentences(desc, this.captionLayoutTokens, config, doc.getLanguage());

// we need a sentence segmentation of the table caption, for that we need to introduce
// a <div>, then a <p>
Expand Down Expand Up @@ -202,7 +202,7 @@ public String toTEI(GrobidAnalysisConfig config, Document doc, TEIFormatter form
}

if (noteNode != null && config.isWithSentenceSegmentation()) {
formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config);
formatter.segmentIntoSentences(noteNode, this.noteLayoutTokens, config, doc.getLanguage());

// we need a sentence segmentation of the table note, for that we need to introduce
// a <p>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,10 @@ public enum SchemaDeclaration {

private static Pattern startNum = Pattern.compile("^(\\d+)(.*)");

// Fixed URLs of the GROBID schemas (XSD, DTD, RelaxNG). They are written into the schema
// declaration at the top of the generated TEI (DOCTYPE, xsi:schemaLocation, xml-model).
private static final String SCHEMA_XSD_LOCATION = "https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd";
private static final String SCHEMA_DTD_LOCATION = "https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/dtd/Grobid.dtd";
private static final String SCHEMA_RNG_LOCATION = "https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/rng/Grobid.rng";

public TEIFormatter(Document document, FullTextParser fullTextParser) {
this.doc = document;
this.fullTextParser = fullTextParser;
Expand All @@ -112,29 +116,21 @@ public StringBuilder toTEIHeader(BiblioItem biblio,
tei.append("<?xml-stylesheet type=\"text/xsl\" href=\"../jsp/xmlverbatimwrapper.xsl\"?> \n");
}
if (schemaDeclaration == SchemaDeclaration.DTD) {
tei.append("<!DOCTYPE TEI SYSTEM \"" + GrobidProperties.get_GROBID_HOME_PATH()
+ "/schemas/dtd/Grobid.dtd" + "\">\n");
tei.append("<!DOCTYPE TEI SYSTEM \"" + SCHEMA_DTD_LOCATION + "\">\n");
} else if (schemaDeclaration == SchemaDeclaration.XSD) {
// XML schema
tei.append("<TEI xml:space=\"preserve\" xmlns=\"http://www.tei-c.org/ns/1.0\" \n" +
"xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" \n" +
//"\n xsi:noNamespaceSchemaLocation=\"" +
//GrobidProperties.get_GROBID_HOME_PATH() + "/schemas/xsd/Grobid.xsd\"" +
"xsi:schemaLocation=\"http://www.tei-c.org/ns/1.0 " +
GrobidProperties.get_GROBID_HOME_PATH() + "/schemas/xsd/Grobid.xsd\"" +
"\n xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n");
SCHEMA_XSD_LOCATION +
"\"\n xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n");
// "\n xmlns:mml=\"http://www.w3.org/1998/Math/MathML\">\n");
} else if (schemaDeclaration == SchemaDeclaration.RNG) {
// standard RelaxNG
tei.append("<?xml-model href=\"file://" +
GrobidProperties.get_GROBID_HOME_PATH() + "/schemas/rng/Grobid.rng" +
tei.append("<?xml-model href=\"" + SCHEMA_RNG_LOCATION +
"\" schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n");
} else if (schemaDeclaration == SchemaDeclaration.RNC) {
// compact RelaxNG
tei.append("<?xml-model href=\"file://" +
GrobidProperties.get_GROBID_HOME_PATH() + "/schemas/rng/Grobid.rnc" +
"\" type=\"application/relax-ng-compact-syntax\"?>\n");
}
}

// by default there is no schema association

if (schemaDeclaration != SchemaDeclaration.XSD) {
Expand Down Expand Up @@ -1271,7 +1267,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(cluster.concatTokens());
if (isNewParagraph(lastClusterLabel, curParagraph)) {
if (curParagraph != null && config.isWithSentenceSegmentation()) {
segmentIntoSentences(curParagraph, curParagraphTokens, config);
segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage());
}
curParagraph = teiElement("p");
if (config.isGenerateTeiIds()) {
Expand Down Expand Up @@ -1329,7 +1325,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,

// in case we segment paragraph into sentences, we still need to do it for the last paragraph
if (curParagraph != null && config.isWithSentenceSegmentation()) {
segmentIntoSentences(curParagraph, curParagraphTokens, config);
segmentIntoSentences(curParagraph, curParagraphTokens, config, doc.getLanguage());
}

// remove possibly empty div in the div list
Expand Down Expand Up @@ -1395,7 +1391,7 @@ private boolean isNewParagraph(TaggingLabel lastClusterLabel, Element curParagra
&& lastClusterLabel != TaggingLabels.TABLE) || curParagraph == null;
}

public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curParagraphTokens, GrobidAnalysisConfig config) {
public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curParagraphTokens, GrobidAnalysisConfig config, String lang) {
// in order to avoid having a sentence boundary in the middle of a ref element
// (which is frequent given the abbreviation in the reference expression, e.g. Fig.)
// we only consider for sentence segmentation texts under <p> and skip the text under <ref>.
Expand Down Expand Up @@ -1430,7 +1426,7 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
}

List<OffsetPosition> theSentences =
SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens);
SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, new Language(lang));

/*if (theSentences.size() == 0) {
// this should normally not happen, but it happens (depending on sentence splitter, usually the text
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,14 @@ public interface SentenceDetector {
* position of the recognized sentence in the text
*/
public List<OffsetPosition> detect(String text);


/**
* Detects sentence boundaries in a text, using a specified language.
*
* @param text the text in which sentence boundaries are detected
* @param lang the language to be used for detecting sentence boundaries
* @return a list of offset positions indicating the start and end character
* position of each recognized sentence in the text
*/
public List<OffsetPosition> detect(String text, Language lang);
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import opennlp.tools.util.Span;

import org.grobid.core.lang.SentenceDetector;
import org.grobid.core.lang.Language;
import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.utilities.GrobidProperties;

Expand Down Expand Up @@ -38,6 +39,11 @@ public OpenNLPSentenceDetector() {

@Override
public List<OffsetPosition> detect(String text) {
    // No language given by the caller: delegate to the language-aware variant with Language.EN.
    Language defaultLanguage = new Language(Language.EN);
    return detect(text, defaultLanguage);
}

@Override
public List<OffsetPosition> detect(String text, Language lang) {
// unfortunately the OpenNLP sentence detector is not thread safe, only the model can be shared
SentenceDetectorME detector = new SentenceDetectorME(model);
Span spans[] = detector.sentPosDetect(text);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import org.jruby.embed.LocalVariableBehavior;

import org.grobid.core.lang.SentenceDetector;
import org.grobid.core.lang.Language;
import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.utilities.GrobidProperties;

Expand Down Expand Up @@ -48,8 +49,17 @@ public PragmaticSentenceDetector() {

@Override
public List<OffsetPosition> detect(String text) {
    // No language given by the caller: delegate to the language-aware variant with Language.EN.
    Language defaultLanguage = new Language(Language.EN);
    return detect(text, defaultLanguage);
}

@Override
public List<OffsetPosition> detect(String text, Language lang) {
instance.put("text", text);
String script = "ps = PragmaticSegmenter::Segmenter.new(text: text, clean: false)\nps.segment";
String script = null;
if (lang == null || "en".equals(lang.getLang()))
script = "ps = PragmaticSegmenter::Segmenter.new(text: text, clean: false)\nps.segment";
else
script = "ps = PragmaticSegmenter::Segmenter.new(text: text, language: '" + lang.getLang() + "', clean: false)\nps.segment";
Object ret = instance.runScriptlet(script);

//System.out.println(text);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,10 @@ public static File getGrobidHomePath() {
return GROBID_HOME_PATH;
}

/**
 * Return the GROBID_HOME path as a string, i.e. the path name of the same
 * GROBID_HOME_PATH file object that getGrobidHomePath() returns.
 */
public static String getGrobidHome() {
    final String homeLocation = GROBID_HOME_PATH.getPath();
    return homeLocation;
}

/**
* Set the GROBID_HOME path.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.lang.SentenceDetectorFactory;
import org.grobid.core.lang.Language;
import org.grobid.core.layout.LayoutToken;

import java.util.*;
Expand Down Expand Up @@ -75,6 +76,27 @@ public List<OffsetPosition> runSentenceDetection(String text) {
}
}

/**
 * Identify the sentences of a text with the configured sentence detector, passing along
 * the language to be considered when segmenting.
 *
 * @param text
 *            text to segment into sentences
 * @param lang
 *            specified language to be used when segmenting text
 * @return list of offset positions for the identified sentences, relative to the input text,
 *         or null when the input text is null or when the detection raised an exception
 *         (the exception is logged as a warning)
 */
public List<OffsetPosition> runSentenceDetection(String text, Language lang) {
    List<OffsetPosition> sentencePositions = null;
    if (text != null) {
        try {
            sentencePositions = sdf.getInstance().detect(text, lang);
        } catch (Exception e) {
            // best effort: a failing detector is reported but never propagated to the caller
            LOGGER.warn("Cannot detect sentences. ", e);
        }
    }
    return sentencePositions;
}

/**
* Run for sentence identification with some forbidden span constraints, return the offset positions of the
* identified sentences without sentence boundaries within a forbidden span (typically a reference marker
Expand All @@ -87,7 +109,7 @@ public List<OffsetPosition> runSentenceDetection(String text) {
* @return list of offset positions for the identified sentence, relative to the input text
*/
public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPosition> forbidden) {
return runSentenceDetection(text, forbidden, null);
return runSentenceDetection(text, forbidden, null, null);
}

/**
Expand All @@ -103,13 +125,15 @@ public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPositio
* @param textLayoutTokens
* list of LayoutToken objects from which the text has been created, if this list is null
* we consider that we have a pure textual input (e.g. text is not from a PDF)
* @param lang
* specified language to be used when segmenting text
* @return list of offset positions for the identified sentence, relative to the input text
*/
public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPosition> forbidden, List<LayoutToken> textLayoutTokens) {
public List<OffsetPosition> runSentenceDetection(String text, List<OffsetPosition> forbidden, List<LayoutToken> textLayoutTokens, Language lang) {
if (text == null)
return null;
try {
List<OffsetPosition> sentencePositions = sdf.getInstance().detect(text);
List<OffsetPosition> sentencePositions = sdf.getInstance().detect(text, lang);

// to be sure, we sort the forbidden positions
if (forbidden == null)
Expand Down

0 comments on commit 4ff805e

Please sign in to comment.