Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add pragmatic segmenter implementation via jruby
- Loading branch information
Showing
79 changed files
with
2,943 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
64 changes: 64 additions & 0 deletions
64
grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
package org.grobid.core.lang.impl; | ||
|
||
import org.jruby.embed.PathType; | ||
import org.jruby.embed.ScriptingContainer; | ||
import org.jruby.embed.LocalContextScope; | ||
|
||
import org.grobid.core.lang.SentenceDetector; | ||
import org.grobid.core.utilities.OffsetPosition; | ||
import org.grobid.core.utilities.GrobidProperties; | ||
|
||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.io.*; | ||
|
||
/** | ||
* Implementation of sentence segmentation via the Pragmatic Segmenter | ||
* | ||
*/ | ||
public class PragmaticSentenceDetector implements SentenceDetector { | ||
private static final Logger LOGGER = LoggerFactory.getLogger(PragmaticSentenceDetector.class); | ||
|
||
private ScriptingContainer instance = null; | ||
|
||
public PragmaticSentenceDetector() { | ||
String segmenterRbFile = GrobidProperties.getGrobidHomePath() + | ||
File.separator + "lexicon" + File.separator + "pragmatic_segmenter"+ File.separator + "segmenter.rb"; | ||
String segmenterLoadPath = GrobidProperties.getGrobidHomePath() + File.separator + "lexicon"; | ||
String unicodeLoadPath = GrobidProperties.getGrobidHomePath() + File.separator + "lexicon" + | ||
File.separator + "pragmatic_segmenter" + File.separator + "gem" + File.separator + "gems" + | ||
File.separator + "unicode-0.4.4.4-java" + File.separator + "lib"; | ||
//System.out.println(vendorLoadPath); | ||
|
||
List<String> loadPaths = new ArrayList(); | ||
loadPaths.add(segmenterLoadPath); | ||
loadPaths.add(unicodeLoadPath); | ||
|
||
instance = new ScriptingContainer(LocalContextScope.THREADSAFE); | ||
instance.setClassLoader(instance.getClass().getClassLoader()); | ||
instance.setLoadPaths(loadPaths); | ||
instance.runScriptlet(PathType.ABSOLUTE, segmenterRbFile); | ||
} | ||
|
||
@Override | ||
public List<OffsetPosition> detect(String text) { | ||
instance.put("text", text); | ||
String script = "ps = PragmaticSegmenter::Segmenter.new(text: text)\nps.segment"; | ||
Object ret = instance.runScriptlet(script); | ||
//System.out.println(ret.toString()); | ||
|
||
// build offset positions from the string chunks | ||
List<OffsetPosition> result = new ArrayList<>(); | ||
int pos = 0; | ||
for(String chunk : (List<String>) ret) { | ||
int start = text.indexOf(chunk, pos); | ||
result.add(new OffsetPosition(start, start+chunk.length())); | ||
pos = start+chunk.length(); | ||
} | ||
|
||
return result; | ||
} | ||
} |
29 changes: 29 additions & 0 deletions
29
grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetectorFactory.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
package org.grobid.core.lang.impl; | ||
|
||
import org.grobid.core.lang.SentenceDetector; | ||
import org.grobid.core.lang.SentenceDetectorFactory; | ||
|
||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import java.io.File; | ||
|
||
/** | ||
* Implementation of a sentence segmenter factory with OpenNLP language identifier | ||
*/ | ||
public class PragmaticSentenceDetectorFactory implements SentenceDetectorFactory { | ||
private static final Logger LOGGER = LoggerFactory.getLogger(PragmaticSentenceDetectorFactory.class); | ||
private static volatile SentenceDetector instance = null; | ||
|
||
public SentenceDetector getInstance() { | ||
if (instance == null) { | ||
synchronized (this) { | ||
if(instance == null) { | ||
LOGGER.debug("synchronized getNewInstance"); | ||
instance = new PragmaticSentenceDetector(); | ||
} | ||
} | ||
} | ||
return instance; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
121 changes: 121 additions & 0 deletions
121
grobid-home/lexicon/pragmatic_segmenter/abbreviation_replacer.rb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
# -*- encoding : utf-8 -*- | ||
# frozen_string_literal: true | ||
|
||
require 'unicode' | ||
|
||
module PragmaticSegmenter
  # This class searches for periods within an abbreviation and
  # replaces the periods.
  # NOTE(review): vendored from the pragmatic_segmenter gem; '∯' is used
  # throughout as a placeholder for a period that is NOT a sentence boundary,
  # to be restored later in the pipeline — TODO confirm against segmenter.rb.
  class AbbreviationReplacer

    attr_reader :text
    # text: raw input string (wrapped in Text); language: a language module
    # supplying the abbreviation lists and replacement rules referenced below
    def initialize(text:, language: )
      @text = Text.new(text)
      @language = language
    end

    # Apply all abbreviation-related period replacements and return the
    # resulting text.
    def replace
      @text.apply(@language::PossessiveAbbreviationRule,
        @language::KommanditgesellschaftRule,
        @language::SingleLetterAbbreviationRules::All)

      @text = search_for_abbreviations_in_string(@text)
      @text = replace_multi_period_abbreviations(@text)
      @text.apply(@language::AmPmRules::All)
      replace_abbreviation_as_sentence_boundary(@text)
    end

    private

    # Scan txt for each known abbreviation and, where found, dispatch to the
    # appropriate replacement depending on the following character.
    def search_for_abbreviations_in_string(txt)
      original = txt.dup
      downcased = Unicode::downcase(txt)
      @language::Abbreviation::ABBREVIATIONS.each do |abbreviation|
        stripped = abbreviation.strip
        # cheap substring check before the more expensive regex scan
        next unless downcased.include?(stripped)
        abbrev_match = original.scan(/(?:^|\s|\r|\n)#{Regexp.escape(stripped)}/i)
        next if abbrev_match.empty?
        # capture the single character following each "<abbrev> " occurrence
        next_word_start = /(?<=#{Regexp.escape(stripped)} ).{1}/
        character_array = @text.scan(next_word_start)
        abbrev_match.each_with_index do |am, index|
          txt = scan_for_replacements(txt, am, index, character_array)
        end
      end
      txt
    end

    # Decide how to replace the period of one matched abbreviation occurrence:
    # only replace when the next word does not start with an uppercase letter,
    # or when the abbreviation is prepositive (e.g. titles) regardless of case.
    def scan_for_replacements(txt, am, index, character_array)
      character = character_array[index]
      prepositive = @language::Abbreviation::PREPOSITIVE_ABBREVIATIONS
      number_abbr = @language::Abbreviation::NUMBER_ABBREVIATIONS
      upper = /[[:upper:]]/.match(character.to_s)
      if upper.nil? || prepositive.include?(Unicode::downcase(am.strip))
        if prepositive.include?(Unicode::downcase(am.strip))
          txt = replace_prepositive_abbr(txt, am)
        elsif number_abbr.include?(Unicode::downcase(am.strip))
          txt = replace_pre_number_abbr(txt, am)
        else
          txt = replace_period_of_abbr(txt, am)
        end
      end
      txt
    end

    def replace_abbreviation_as_sentence_boundary(txt)
      # As we are being conservative and keeping ambiguous
      # sentence boundaries as one sentence instead of
      # splitting into two, we can split at words that
      # we know for certain never follow these abbreviations.
      # Some might say that the set of words that follow an
      # abbreviation such as U.S. (i.e. U.S. Government) is smaller than
      # the set of words that could start a sentence and
      # never follow U.S. However, we are being conservative
      # and not splitting by default, so we need to look for places
      # where we definitely can split. Obviously SENTENCE_STARTERS
      # will never cover all cases, but as the gem is named
      # 'Pragmatic Segmenter' we need to be pragmatic
      # and try to cover the words that most often start a
      # sentence but could never follow one of the abbreviations below.

      # Rubular: http://rubular.com/r/PkBQ3PVBS8
      @language::AbbreviationReplacer::SENTENCE_STARTERS.each do |word|
        escaped = Regexp.escape(word)
        # restore the real period when a sentence starter follows the abbreviation
        regex = /(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯(?=\s#{escaped}\s)/
        txt.gsub!(regex, '\1.')
      end
      txt
    end

    # Replace every period of multi-period abbreviations (e.g. "a.m.")
    # with the placeholder in one pass per match.
    def replace_multi_period_abbreviations(txt)
      mpa = txt.scan(@language::MULTI_PERIOD_ABBREVIATION_REGEX)
      return txt if mpa.empty?
      mpa.each do |r|
        txt.gsub!(/#{Regexp.escape(r)}/, "#{r.gsub!('.', '∯')}")
      end
      txt
    end

    # Abbreviations followed by a number (e.g. "No. 5"): mask the period.
    def replace_pre_number_abbr(txt, abbr)
      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s\d)|(?<=^#{abbr.strip})\.(?=\s\d)/, '∯')
      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s+\()|(?<=^#{abbr.strip})\.(?=\s+\()/, '∯')
      txt
    end

    # Prepositive abbreviations (titles like "Mr."): mask the period.
    def replace_prepositive_abbr(txt, abbr)
      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s)|(?<=^#{abbr.strip})\.(?=\s)/, '∯')
      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=:\d+)|(?<=^#{abbr.strip})\.(?=:\d+)/, '∯')
      txt
    end

    # Generic abbreviation: mask the period when followed by punctuation or a
    # lowercase continuation (so it is not treated as a sentence end).
    def replace_period_of_abbr(txt, abbr)
      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=((\.|\:|-|\?)|(\s([a-z]|I\s|I'm|I'll|\d|\())))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=,)|(?<=^#{abbr.strip})\.(?=,)/, '∯')
      txt
    end

    # NOTE(review): appears unused within this class — the possessive rule is
    # applied via @language::PossessiveAbbreviationRule in #replace instead.
    def replace_possessive_abbreviations(txt)
      txt.gsub!(@language::POSSESSIVE_ABBREVIATION_REGEX, '∯')
      txt
    end
  end
end
119 changes: 119 additions & 0 deletions
119
grobid-home/lexicon/pragmatic_segmenter/between_punctuation.rb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
# -*- encoding : utf-8 -*- | ||
# frozen_string_literal: true | ||
|
||
module PragmaticSegmenter
  # This class searches for punctuation between quotes or parenthesis
  # and replaces it
  # NOTE(review): vendored from the pragmatic_segmenter gem; the actual
  # substitution is delegated to PragmaticSegmenter::PunctuationReplacer,
  # which mutates the text in place — TODO confirm against punctuation_replacer.rb.
  class BetweenPunctuation
    # Rubular: http://rubular.com/r/2YFrKWQUYi
    BETWEEN_SINGLE_QUOTES_REGEX = /(?<=\s)'(?:[^']|'[a-zA-Z])*'/

    BETWEEN_SINGLE_QUOTE_SLANTED_REGEX = /(?<=\s)‘(?:[^’]|’[a-zA-Z])*’/

    # Rubular: http://rubular.com/r/3Pw1QlXOjd
    BETWEEN_DOUBLE_QUOTES_REGEX = /"(?>[^"\\]+|\\{2}|\\.)*"/

    # Rubular: http://rubular.com/r/x6s4PZK8jc
    BETWEEN_QUOTE_ARROW_REGEX = /«(?>[^»\\]+|\\{2}|\\.)*»/

    # Rubular: http://rubular.com/r/JbAIpKdlSq
    BETWEEN_QUOTE_SLANTED_REGEX = /“(?>[^”\\]+|\\{2}|\\.)*”/

    # Rubular: http://rubular.com/r/WX4AvnZvlX
    BETWEEN_SQUARE_BRACKETS_REGEX = /\[(?>[^\]\\]+|\\{2}|\\.)*\]/

    # Rubular: http://rubular.com/r/6tTityPflI
    BETWEEN_PARENS_REGEX = /\((?>[^\(\)\\]+|\\{2}|\\.)*\)/

    # Rubular: http://rubular.com/r/mXf8cW025o
    WORD_WITH_LEADING_APOSTROPHE = /(?<=\s)'(?:[^']|'[a-zA-Z])*'\S/

    # Rubular: http://rubular.com/r/jTtDKfjxzr
    BETWEEN_EM_DASHES_REGEX = /\-\-(?>[^\-\-])*\-\-/

    attr_reader :text
    def initialize(text:)
      @text = text
    end

    # Run all the between-punctuation substitutions over the text.
    def replace
      sub_punctuation_between_quotes_and_parens(text)
    end

    private

    # Apply each bracket/quote variant in turn; each helper mutates txt
    # via PunctuationReplacer.
    def sub_punctuation_between_quotes_and_parens(txt)
      sub_punctuation_between_single_quotes(txt)
      sub_punctuation_between_single_quote_slanted(txt)
      sub_punctuation_between_double_quotes(txt)
      sub_punctuation_between_square_brackets(txt)
      sub_punctuation_between_parens(txt)
      sub_punctuation_between_quotes_arrow(txt)
      sub_punctuation_between_em_dashes(txt)
      sub_punctuation_between_quotes_slanted(txt)
    end

    def sub_punctuation_between_parens(txt)
      PragmaticSegmenter::PunctuationReplacer.new(
        matches_array: txt.scan(BETWEEN_PARENS_REGEX),
        text: txt
      ).replace
    end

    def sub_punctuation_between_square_brackets(txt)
      PragmaticSegmenter::PunctuationReplacer.new(
        matches_array: txt.scan(BETWEEN_SQUARE_BRACKETS_REGEX),
        text: txt
      ).replace
    end

    # Skip single-quote handling when the text contains a leading-apostrophe
    # word and no quote followed by whitespace, to avoid treating
    # apostrophes/contractions as quote pairs.
    def sub_punctuation_between_single_quotes(txt)
      unless !(txt !~ WORD_WITH_LEADING_APOSTROPHE) && txt !~ /'\s/
        PragmaticSegmenter::PunctuationReplacer.new(
          matches_array: txt.scan(BETWEEN_SINGLE_QUOTES_REGEX),
          text: txt,
          match_type: 'single'
        ).replace
      end
    end

    def sub_punctuation_between_single_quote_slanted(txt)
      PragmaticSegmenter::PunctuationReplacer.new(
        matches_array: txt.scan(BETWEEN_SINGLE_QUOTE_SLANTED_REGEX),
        text: txt
      ).replace
    end

    def sub_punctuation_between_double_quotes(txt)
      PragmaticSegmenter::PunctuationReplacer.new(
        matches_array: btwn_dbl_quote(txt),
        text: txt
      ).replace
    end

    def btwn_dbl_quote(txt)
      txt.scan(BETWEEN_DOUBLE_QUOTES_REGEX)
    end

    def sub_punctuation_between_quotes_arrow(txt)
      PragmaticSegmenter::PunctuationReplacer.new(
        matches_array: txt.scan(BETWEEN_QUOTE_ARROW_REGEX),
        text: txt
      ).replace
    end

    def sub_punctuation_between_em_dashes(txt)
      PragmaticSegmenter::PunctuationReplacer.new(
        matches_array: txt.scan(BETWEEN_EM_DASHES_REGEX),
        text: txt
      ).replace
    end

    def sub_punctuation_between_quotes_slanted(txt)
      PragmaticSegmenter::PunctuationReplacer.new(
        matches_array: txt.scan(BETWEEN_QUOTE_SLANTED_REGEX),
        text: txt
      ).replace
    end
  end
end
Oops, something went wrong.