
Commit

add pragmatic segmenter implementation via jruby
kermitt2 committed Aug 20, 2020
1 parent 68b8f2c commit 6551c95
Showing 79 changed files with 2,943 additions and 1 deletion.
1 change: 1 addition & 0 deletions build.gradle
@@ -247,6 +247,7 @@ project("grobid-core") {
implementation "org.apache.lucene:lucene-analyzers-common:4.5.1"
implementation 'black.ninia:jep:3.8.2'
implementation 'org.apache.opennlp:opennlp-tools:1.9.1'
implementation group: 'org.jruby', name: 'jruby-complete', version: '9.2.13.0'

shadedLib "org.apache.lucene:lucene-analyzers-common:4.5.1"
}
64 changes: 64 additions & 0 deletions grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java
@@ -0,0 +1,64 @@
package org.grobid.core.lang.impl;

import org.jruby.embed.PathType;
import org.jruby.embed.ScriptingContainer;
import org.jruby.embed.LocalContextScope;

import org.grobid.core.lang.SentenceDetector;
import org.grobid.core.utilities.OffsetPosition;
import org.grobid.core.utilities.GrobidProperties;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.List;
import java.io.*;

/**
 * Implementation of sentence segmentation via the Pragmatic Segmenter.
 */
public class PragmaticSentenceDetector implements SentenceDetector {
private static final Logger LOGGER = LoggerFactory.getLogger(PragmaticSentenceDetector.class);

private ScriptingContainer instance = null;

public PragmaticSentenceDetector() {
String segmenterRbFile = GrobidProperties.getGrobidHomePath() +
File.separator + "lexicon" + File.separator + "pragmatic_segmenter"+ File.separator + "segmenter.rb";
String segmenterLoadPath = GrobidProperties.getGrobidHomePath() + File.separator + "lexicon";
String unicodeLoadPath = GrobidProperties.getGrobidHomePath() + File.separator + "lexicon" +
File.separator + "pragmatic_segmenter" + File.separator + "gem" + File.separator + "gems" +
File.separator + "unicode-0.4.4.4-java" + File.separator + "lib";

        List<String> loadPaths = new ArrayList<>();
loadPaths.add(segmenterLoadPath);
loadPaths.add(unicodeLoadPath);

instance = new ScriptingContainer(LocalContextScope.THREADSAFE);
instance.setClassLoader(instance.getClass().getClassLoader());
instance.setLoadPaths(loadPaths);
instance.runScriptlet(PathType.ABSOLUTE, segmenterRbFile);
}

@Override
public List<OffsetPosition> detect(String text) {
instance.put("text", text);
String script = "ps = PragmaticSegmenter::Segmenter.new(text: text)\nps.segment";
Object ret = instance.runScriptlet(script);
//System.out.println(ret.toString());

// build offset positions from the string chunks
List<OffsetPosition> result = new ArrayList<>();
int pos = 0;
for(String chunk : (List<String>) ret) {
int start = text.indexOf(chunk, pos);
result.add(new OffsetPosition(start, start+chunk.length()));
pos = start+chunk.length();
}

return result;
}
}
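
For orientation, here is a standalone sketch (not part of the commit) of the offset-mapping step performed in detect(): each sentence string returned by the Ruby segmenter is searched for in the original text and converted into a character span. The sample text and chunk values are invented for the illustration.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

// Standalone illustration of the offset-mapping step used in detect():
// each sentence chunk is located in the original text and turned into a
// [start, end) character span.
public class OffsetMappingSketch {
    public static void main(String[] args) {
        String text = "Dr. Smith arrived at 5 p.m. He left early.";
        // chunks invented for the example, standing in for the segmenter output
        List<String> chunks = Arrays.asList("Dr. Smith arrived at 5 p.m.", "He left early.");

        List<int[]> spans = new ArrayList<>();
        int pos = 0;
        for (String chunk : chunks) {
            // assumes each chunk occurs verbatim in the original text, as detect() does
            int start = text.indexOf(chunk, pos);
            spans.add(new int[]{start, start + chunk.length()});
            pos = start + chunk.length();
        }
        for (int[] span : spans) {
            System.out.println(span[0] + "-" + span[1] + ": " + text.substring(span[0], span[1]));
        }
    }
}
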
29 changes: 29 additions & 0 deletions grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetectorFactory.java
@@ -0,0 +1,29 @@
package org.grobid.core.lang.impl;

import org.grobid.core.lang.SentenceDetector;
import org.grobid.core.lang.SentenceDetectorFactory;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;

/**
 * Implementation of a sentence detector factory based on the Pragmatic Segmenter (via JRuby).
 */
public class PragmaticSentenceDetectorFactory implements SentenceDetectorFactory {
private static final Logger LOGGER = LoggerFactory.getLogger(PragmaticSentenceDetectorFactory.class);
private static volatile SentenceDetector instance = null;

public SentenceDetector getInstance() {
if (instance == null) {
synchronized (this) {
if(instance == null) {
LOGGER.debug("synchronized getNewInstance");
instance = new PragmaticSentenceDetector();
}
}
}
return instance;
}
}
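
A minimal usage sketch of the two new classes (again, not part of the diff): it assumes grobid-core is on the classpath, a GROBID_HOME whose lexicon/ directory contains the vendored pragmatic_segmenter, and that OffsetPosition exposes public start/end fields as elsewhere in grobid-core.

import org.grobid.core.lang.SentenceDetector;
import org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory;
import org.grobid.core.utilities.OffsetPosition;

import java.util.List;

// Sketch only: requires a configured GROBID_HOME so that
// lexicon/pragmatic_segmenter/segmenter.rb can be found at construction time.
public class SentenceSplitExample {
    public static void main(String[] args) {
        SentenceDetector detector = new PragmaticSentenceDetectorFactory().getInstance();
        String text = "GROBID extracts bibliographic data. It can also segment sentences.";
        List<OffsetPosition> spans = detector.detect(text);
        for (OffsetPosition span : spans) {
            // OffsetPosition is assumed here to expose public start/end offsets
            System.out.println(text.substring(span.start, span.end));
        }
    }
}
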
3 changes: 2 additions & 1 deletion grobid-home/config/grobid.properties
@@ -60,7 +60,8 @@ grobid.use_language_id=true
grobid.language_detector_factory=org.grobid.core.lang.impl.CybozuLanguageDetectorFactory

# actual implementation of sentence segmentation to be used
grobid.sentence_detector_factory=org.grobid.core.lang.impl.OpenNLPSentenceDetectorFactory
#grobid.sentence_detector_factory=org.grobid.core.lang.impl.OpenNLPSentenceDetectorFactory
grobid.sentence_detector_factory=org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory

#determines if properties like the firstnames, lastnames, country codes and dictionaries are supposed to be read from the $GROBID_HOME path or not (possible values (true|false), default is false)
grobid.resources.inHome=true
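
As a purely hypothetical illustration of how the grobid.sentence_detector_factory property could drive the implementation choice, the sketch below resolves the configured factory class by reflection; GROBID's actual property-loading code is not shown in this commit, so the mechanism here is an assumption.

import org.grobid.core.lang.SentenceDetectorFactory;

// Purely illustrative: loads whatever factory class the property names and
// instantiates it via its no-argument constructor.
public class FactoryLookupSketch {
    public static void main(String[] args) throws Exception {
        String className = System.getProperty(
                "grobid.sentence_detector_factory",
                "org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory");
        SentenceDetectorFactory factory = (SentenceDetectorFactory)
                Class.forName(className).getDeclaredConstructor().newInstance();
        System.out.println("Sentence detector factory in use: " + factory.getClass().getName());
    }
}
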
121 changes: 121 additions & 0 deletions grobid-home/lexicon/pragmatic_segmenter/abbreviation_replacer.rb
@@ -0,0 +1,121 @@
# -*- encoding : utf-8 -*-
# frozen_string_literal: true

require 'unicode'

module PragmaticSegmenter
# This class searches for periods within an abbreviation and
# replaces the periods.
class AbbreviationReplacer

attr_reader :text
def initialize(text:, language: )
@text = Text.new(text)
@language = language
end

def replace
@text.apply(@language::PossessiveAbbreviationRule,
@language::KommanditgesellschaftRule,
@language::SingleLetterAbbreviationRules::All)

@text = search_for_abbreviations_in_string(@text)
@text = replace_multi_period_abbreviations(@text)
@text.apply(@language::AmPmRules::All)
replace_abbreviation_as_sentence_boundary(@text)
end

private

def search_for_abbreviations_in_string(txt)
original = txt.dup
downcased = Unicode::downcase(txt)
@language::Abbreviation::ABBREVIATIONS.each do |abbreviation|
stripped = abbreviation.strip
next unless downcased.include?(stripped)
abbrev_match = original.scan(/(?:^|\s|\r|\n)#{Regexp.escape(stripped)}/i)
next if abbrev_match.empty?
next_word_start = /(?<=#{Regexp.escape(stripped)} ).{1}/
character_array = @text.scan(next_word_start)
abbrev_match.each_with_index do |am, index|
txt = scan_for_replacements(txt, am, index, character_array)
end
end
txt
end

def scan_for_replacements(txt, am, index, character_array)
character = character_array[index]
prepositive = @language::Abbreviation::PREPOSITIVE_ABBREVIATIONS
number_abbr = @language::Abbreviation::NUMBER_ABBREVIATIONS
upper = /[[:upper:]]/.match(character.to_s)
if upper.nil? || prepositive.include?(Unicode::downcase(am.strip))
if prepositive.include?(Unicode::downcase(am.strip))
txt = replace_prepositive_abbr(txt, am)
elsif number_abbr.include?(Unicode::downcase(am.strip))
txt = replace_pre_number_abbr(txt, am)
else
txt = replace_period_of_abbr(txt, am)
end
end
txt
end

def replace_abbreviation_as_sentence_boundary(txt)
# As we are being conservative and keeping ambiguous
# sentence boundaries as one sentence instead of
# splitting into two, we can split at words that
# we know for certain never follow these abbreviations.
# Some might say that the set of words that follow an
# abbreviation such as U.S. (i.e. U.S. Government) is smaller than
# the set of words that could start a sentence and
# never follow U.S. However, we are being conservative
# and not splitting by default, so we need to look for places
# where we definitely can split. Obviously SENTENCE_STARTERS
# will never cover all cases, but as the gem is named
# 'Pragmatic Segmenter' we need to be pragmatic
# and try to cover the words that most often start a
# sentence but could never follow one of the abbreviations below.

# Rubular: http://rubular.com/r/PkBQ3PVBS8
@language::AbbreviationReplacer::SENTENCE_STARTERS.each do |word|
escaped = Regexp.escape(word)
regex = /(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯(?=\s#{escaped}\s)/
txt.gsub!(regex, '\1.')
end
txt
end

def replace_multi_period_abbreviations(txt)
mpa = txt.scan(@language::MULTI_PERIOD_ABBREVIATION_REGEX)
return txt if mpa.empty?
mpa.each do |r|
txt.gsub!(/#{Regexp.escape(r)}/, "#{r.gsub!('.', '∯')}")
end
txt
end

def replace_pre_number_abbr(txt, abbr)
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s\d)|(?<=^#{abbr.strip})\.(?=\s\d)/, '∯')
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s+\()|(?<=^#{abbr.strip})\.(?=\s+\()/, '∯')
txt
end

def replace_prepositive_abbr(txt, abbr)
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s)|(?<=^#{abbr.strip})\.(?=\s)/, '∯')
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=:\d+)|(?<=^#{abbr.strip})\.(?=:\d+)/, '∯')
txt
end

def replace_period_of_abbr(txt, abbr)
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=((\.|\:|-|\?)|(\s([a-z]|I\s|I'm|I'll|\d|\())))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
txt.gsub!(/(?<=\s#{abbr.strip})\.(?=,)|(?<=^#{abbr.strip})\.(?=,)/, '∯')
txt
end

def replace_possessive_abbreviations(txt)
txt.gsub!(@language::POSSESSIVE_ABBREVIATION_REGEX, '∯')
txt
end
end
end
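
To make the conservative abbreviation handling described in the SENTENCE_STARTERS comment above concrete, here is a hedged Java sketch (not part of the commit) that drives the vendored segmenter through JRuby, mirroring the embedding used by PragmaticSentenceDetector; the file-system paths are placeholders, and the exact segmentation of the sample text depends on the English SENTENCE_STARTERS list.

import org.jruby.embed.LocalContextScope;
import org.jruby.embed.PathType;
import org.jruby.embed.ScriptingContainer;

import java.util.Arrays;
import java.util.List;

// Illustrative only: the paths below are placeholders for the vendored layout
// added by this commit (grobid-home/lexicon/pragmatic_segmenter); point them
// at a real installation before running.
public class SegmenterDemo {
    public static void main(String[] args) {
        String lexicon = "/path/to/grobid-home/lexicon";  // placeholder path
        ScriptingContainer ruby = new ScriptingContainer(LocalContextScope.THREADSAFE);
        ruby.setLoadPaths(Arrays.asList(
                lexicon,
                lexicon + "/pragmatic_segmenter/gem/gems/unicode-0.4.4.4-java/lib"));
        ruby.runScriptlet(PathType.ABSOLUTE, lexicon + "/pragmatic_segmenter/segmenter.rb");

        // "However" is the kind of word the SENTENCE_STARTERS comment refers to:
        // if it is in the language's list, the segmenter splits before it even
        // though it directly follows the abbreviation "U.S.".
        ruby.put("text", "She worked for the U.S. However, she now lives abroad.");
        Object segments = ruby.runScriptlet(
                "ps = PragmaticSegmenter::Segmenter.new(text: text)\nps.segment");
        for (String sentence : (List<String>) segments) {
            System.out.println(sentence);
        }
    }
}
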
119 changes: 119 additions & 0 deletions grobid-home/lexicon/pragmatic_segmenter/between_punctuation.rb
@@ -0,0 +1,119 @@
# -*- encoding : utf-8 -*-
# frozen_string_literal: true

module PragmaticSegmenter
# This class searches for punctuation between quotes or parenthesis
# and replaces it
class BetweenPunctuation
# Rubular: http://rubular.com/r/2YFrKWQUYi
BETWEEN_SINGLE_QUOTES_REGEX = /(?<=\s)'(?:[^']|'[a-zA-Z])*'/

BETWEEN_SINGLE_QUOTE_SLANTED_REGEX = /(?<=\s)‘(?:[^’]|’[a-zA-Z])*’/

# Rubular: http://rubular.com/r/3Pw1QlXOjd
BETWEEN_DOUBLE_QUOTES_REGEX = /"(?>[^"\\]+|\\{2}|\\.)*"/

# Rubular: http://rubular.com/r/x6s4PZK8jc
BETWEEN_QUOTE_ARROW_REGEX = /«(?>[^»\\]+|\\{2}|\\.)*»/

# Rubular: http://rubular.com/r/JbAIpKdlSq
BETWEEN_QUOTE_SLANTED_REGEX = /“(?>[^”\\]+|\\{2}|\\.)*”/

# Rubular: http://rubular.com/r/WX4AvnZvlX
BETWEEN_SQUARE_BRACKETS_REGEX = /\[(?>[^\]\\]+|\\{2}|\\.)*\]/

# Rubular: http://rubular.com/r/6tTityPflI
BETWEEN_PARENS_REGEX = /\((?>[^\(\)\\]+|\\{2}|\\.)*\)/

# Rubular: http://rubular.com/r/mXf8cW025o
WORD_WITH_LEADING_APOSTROPHE = /(?<=\s)'(?:[^']|'[a-zA-Z])*'\S/

# Rubular: http://rubular.com/r/jTtDKfjxzr
BETWEEN_EM_DASHES_REGEX = /\-\-(?>[^\-\-])*\-\-/

attr_reader :text
def initialize(text:)
@text = text
end

def replace
sub_punctuation_between_quotes_and_parens(text)
end

private

def sub_punctuation_between_quotes_and_parens(txt)
sub_punctuation_between_single_quotes(txt)
sub_punctuation_between_single_quote_slanted(txt)
sub_punctuation_between_double_quotes(txt)
sub_punctuation_between_square_brackets(txt)
sub_punctuation_between_parens(txt)
sub_punctuation_between_quotes_arrow(txt)
sub_punctuation_between_em_dashes(txt)
sub_punctuation_between_quotes_slanted(txt)
end

def sub_punctuation_between_parens(txt)
PragmaticSegmenter::PunctuationReplacer.new(
matches_array: txt.scan(BETWEEN_PARENS_REGEX),
text: txt
).replace
end

def sub_punctuation_between_square_brackets(txt)
PragmaticSegmenter::PunctuationReplacer.new(
matches_array: txt.scan(BETWEEN_SQUARE_BRACKETS_REGEX),
text: txt
).replace
end

def sub_punctuation_between_single_quotes(txt)
unless !(txt !~ WORD_WITH_LEADING_APOSTROPHE) && txt !~ /'\s/
PragmaticSegmenter::PunctuationReplacer.new(
matches_array: txt.scan(BETWEEN_SINGLE_QUOTES_REGEX),
text: txt,
match_type: 'single'
).replace
end
end

def sub_punctuation_between_single_quote_slanted(txt)
PragmaticSegmenter::PunctuationReplacer.new(
matches_array: txt.scan(BETWEEN_SINGLE_QUOTE_SLANTED_REGEX),
text: txt
).replace
end

def sub_punctuation_between_double_quotes(txt)
PragmaticSegmenter::PunctuationReplacer.new(
matches_array: btwn_dbl_quote(txt),
text: txt
).replace
end

def btwn_dbl_quote(txt)
txt.scan(BETWEEN_DOUBLE_QUOTES_REGEX)
end

def sub_punctuation_between_quotes_arrow(txt)
PragmaticSegmenter::PunctuationReplacer.new(
matches_array: txt.scan(BETWEEN_QUOTE_ARROW_REGEX),
text: txt
).replace
end

def sub_punctuation_between_em_dashes(txt)
PragmaticSegmenter::PunctuationReplacer.new(
matches_array: txt.scan(BETWEEN_EM_DASHES_REGEX),
text: txt
).replace
end

def sub_punctuation_between_quotes_slanted(txt)
PragmaticSegmenter::PunctuationReplacer.new(
matches_array: txt.scan(BETWEEN_QUOTE_SLANTED_REGEX),
text: txt
).replace
end
end
end
