Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add pragmatic segmenter implementation via jruby
- Loading branch information
Showing
79 changed files
with
2,943 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
64 changes: 64 additions & 0 deletions
64
grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetector.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
package org.grobid.core.lang.impl; | ||
|
||
import org.jruby.embed.PathType; | ||
import org.jruby.embed.ScriptingContainer; | ||
import org.jruby.embed.LocalContextScope; | ||
|
||
import org.grobid.core.lang.SentenceDetector; | ||
import org.grobid.core.utilities.OffsetPosition; | ||
import org.grobid.core.utilities.GrobidProperties; | ||
|
||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.io.*; | ||
|
||
/** | ||
* Implementation of sentence segmentation via the Pragmatic Segmenter | ||
* | ||
*/ | ||
public class PragmaticSentenceDetector implements SentenceDetector { | ||
private static final Logger LOGGER = LoggerFactory.getLogger(PragmaticSentenceDetector.class); | ||
|
||
private ScriptingContainer instance = null; | ||
|
||
public PragmaticSentenceDetector() { | ||
String segmenterRbFile = GrobidProperties.getGrobidHomePath() + | ||
File.separator + "lexicon" + File.separator + "pragmatic_segmenter"+ File.separator + "segmenter.rb"; | ||
String segmenterLoadPath = GrobidProperties.getGrobidHomePath() + File.separator + "lexicon"; | ||
String unicodeLoadPath = GrobidProperties.getGrobidHomePath() + File.separator + "lexicon" + | ||
File.separator + "pragmatic_segmenter" + File.separator + "gem" + File.separator + "gems" + | ||
File.separator + "unicode-0.4.4.4-java" + File.separator + "lib"; | ||
//System.out.println(vendorLoadPath); | ||
|
||
List<String> loadPaths = new ArrayList(); | ||
loadPaths.add(segmenterLoadPath); | ||
loadPaths.add(unicodeLoadPath); | ||
|
||
instance = new ScriptingContainer(LocalContextScope.THREADSAFE); | ||
instance.setClassLoader(instance.getClass().getClassLoader()); | ||
instance.setLoadPaths(loadPaths); | ||
instance.runScriptlet(PathType.ABSOLUTE, segmenterRbFile); | ||
} | ||
|
||
@Override | ||
public List<OffsetPosition> detect(String text) { | ||
instance.put("text", text); | ||
String script = "ps = PragmaticSegmenter::Segmenter.new(text: text)\nps.segment"; | ||
Object ret = instance.runScriptlet(script); | ||
//System.out.println(ret.toString()); | ||
|
||
// build offset positions from the string chunks | ||
List<OffsetPosition> result = new ArrayList<>(); | ||
int pos = 0; | ||
for(String chunk : (List<String>) ret) { | ||
int start = text.indexOf(chunk, pos); | ||
result.add(new OffsetPosition(start, start+chunk.length())); | ||
pos = start+chunk.length(); | ||
} | ||
|
||
return result; | ||
} | ||
} |
29 changes: 29 additions & 0 deletions
29
grobid-core/src/main/java/org/grobid/core/lang/impl/PragmaticSentenceDetectorFactory.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
package org.grobid.core.lang.impl; | ||
|
||
import org.grobid.core.lang.SentenceDetector; | ||
import org.grobid.core.lang.SentenceDetectorFactory; | ||
|
||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import java.io.File; | ||
|
||
/** | ||
* Implementation of a sentence segmenter factory with OpenNLP language identifier | ||
*/ | ||
public class PragmaticSentenceDetectorFactory implements SentenceDetectorFactory { | ||
private static final Logger LOGGER = LoggerFactory.getLogger(PragmaticSentenceDetectorFactory.class); | ||
private static volatile SentenceDetector instance = null; | ||
|
||
public SentenceDetector getInstance() { | ||
if (instance == null) { | ||
synchronized (this) { | ||
if(instance == null) { | ||
LOGGER.debug("synchronized getNewInstance"); | ||
instance = new PragmaticSentenceDetector(); | ||
} | ||
} | ||
} | ||
return instance; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
121 changes: 121 additions & 0 deletions
121
grobid-home/lexicon/pragmatic_segmenter/abbreviation_replacer.rb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
# -*- encoding : utf-8 -*- | ||
# frozen_string_literal: true | ||
|
||
require 'unicode' | ||
|
||
module PragmaticSegmenter
  # This class searches for periods within an abbreviation and
  # replaces the periods.
  # NOTE(review): vendored from the pragmatic_segmenter gem; '∯' is used
  # throughout as a placeholder for a period that is NOT a sentence boundary,
  # to be restored later in the pipeline — TODO confirm against segmenter.rb.
  class AbbreviationReplacer

    attr_reader :text
    # text: raw input string (wrapped in Text); language: a language module
    # supplying the abbreviation lists and replacement rules referenced below
    def initialize(text:, language: )
      @text = Text.new(text)
      @language = language
    end

    # Apply all abbreviation-related period replacements and return the
    # resulting text.
    def replace
      @text.apply(@language::PossessiveAbbreviationRule,
        @language::KommanditgesellschaftRule,
        @language::SingleLetterAbbreviationRules::All)

      @text = search_for_abbreviations_in_string(@text)
      @text = replace_multi_period_abbreviations(@text)
      @text.apply(@language::AmPmRules::All)
      replace_abbreviation_as_sentence_boundary(@text)
    end

    private

    # Scan txt for each known abbreviation and, where found, dispatch to the
    # appropriate replacement depending on the following character.
    def search_for_abbreviations_in_string(txt)
      original = txt.dup
      downcased = Unicode::downcase(txt)
      @language::Abbreviation::ABBREVIATIONS.each do |abbreviation|
        stripped = abbreviation.strip
        # cheap substring check before the more expensive regex scan
        next unless downcased.include?(stripped)
        abbrev_match = original.scan(/(?:^|\s|\r|\n)#{Regexp.escape(stripped)}/i)
        next if abbrev_match.empty?
        # capture the single character following each "<abbrev> " occurrence
        next_word_start = /(?<=#{Regexp.escape(stripped)} ).{1}/
        character_array = @text.scan(next_word_start)
        abbrev_match.each_with_index do |am, index|
          txt = scan_for_replacements(txt, am, index, character_array)
        end
      end
      txt
    end

    # Decide how to replace the period of one matched abbreviation occurrence:
    # only replace when the next word does not start with an uppercase letter,
    # or when the abbreviation is prepositive (e.g. titles) regardless of case.
    def scan_for_replacements(txt, am, index, character_array)
      character = character_array[index]
      prepositive = @language::Abbreviation::PREPOSITIVE_ABBREVIATIONS
      number_abbr = @language::Abbreviation::NUMBER_ABBREVIATIONS
      upper = /[[:upper:]]/.match(character.to_s)
      if upper.nil? || prepositive.include?(Unicode::downcase(am.strip))
        if prepositive.include?(Unicode::downcase(am.strip))
          txt = replace_prepositive_abbr(txt, am)
        elsif number_abbr.include?(Unicode::downcase(am.strip))
          txt = replace_pre_number_abbr(txt, am)
        else
          txt = replace_period_of_abbr(txt, am)
        end
      end
      txt
    end

    def replace_abbreviation_as_sentence_boundary(txt)
      # As we are being conservative and keeping ambiguous
      # sentence boundaries as one sentence instead of
      # splitting into two, we can split at words that
      # we know for certain never follow these abbreviations.
      # Some might say that the set of words that follow an
      # abbreviation such as U.S. (i.e. U.S. Government) is smaller than
      # the set of words that could start a sentence and
      # never follow U.S. However, we are being conservative
      # and not splitting by default, so we need to look for places
      # where we definitely can split. Obviously SENTENCE_STARTERS
      # will never cover all cases, but as the gem is named
      # 'Pragmatic Segmenter' we need to be pragmatic
      # and try to cover the words that most often start a
      # sentence but could never follow one of the abbreviations below.

      # Rubular: http://rubular.com/r/PkBQ3PVBS8
      @language::AbbreviationReplacer::SENTENCE_STARTERS.each do |word|
        escaped = Regexp.escape(word)
        # restore the real period when a sentence starter follows the abbreviation
        regex = /(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯(?=\s#{escaped}\s)/
        txt.gsub!(regex, '\1.')
      end
      txt
    end

    # Replace every period of multi-period abbreviations (e.g. "a.m.")
    # with the placeholder in one pass per match.
    def replace_multi_period_abbreviations(txt)
      mpa = txt.scan(@language::MULTI_PERIOD_ABBREVIATION_REGEX)
      return txt if mpa.empty?
      mpa.each do |r|
        txt.gsub!(/#{Regexp.escape(r)}/, "#{r.gsub!('.', '∯')}")
      end
      txt
    end

    # Abbreviations followed by a number (e.g. "No. 5"): mask the period.
    def replace_pre_number_abbr(txt, abbr)
      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s\d)|(?<=^#{abbr.strip})\.(?=\s\d)/, '∯')
      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s+\()|(?<=^#{abbr.strip})\.(?=\s+\()/, '∯')
      txt
    end

    # Prepositive abbreviations (titles like "Mr."): mask the period.
    def replace_prepositive_abbr(txt, abbr)
      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s)|(?<=^#{abbr.strip})\.(?=\s)/, '∯')
      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=:\d+)|(?<=^#{abbr.strip})\.(?=:\d+)/, '∯')
      txt
    end

    # Generic abbreviation: mask the period when followed by punctuation or a
    # lowercase continuation (so it is not treated as a sentence end).
    def replace_period_of_abbr(txt, abbr)
      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=((\.|\:|-|\?)|(\s([a-z]|I\s|I'm|I'll|\d|\())))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=,)|(?<=^#{abbr.strip})\.(?=,)/, '∯')
      txt
    end

    # NOTE(review): appears unused within this class — the possessive rule is
    # applied via @language::PossessiveAbbreviationRule in #replace instead.
    def replace_possessive_abbreviations(txt)
      txt.gsub!(@language::POSSESSIVE_ABBREVIATION_REGEX, '∯')
      txt
    end
  end
end
119 changes: 119 additions & 0 deletions
119
grobid-home/lexicon/pragmatic_segmenter/between_punctuation.rb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
# -*- encoding : utf-8 -*- | ||
# frozen_string_literal: true | ||
|
||
module PragmaticSegmenter
  # This class searches for punctuation between quotes or parenthesis
  # and replaces it
  # NOTE(review): vendored from the pragmatic_segmenter gem; the actual
  # substitution is delegated to PragmaticSegmenter::PunctuationReplacer,
  # which mutates the text in place — TODO confirm against punctuation_replacer.rb.
  class BetweenPunctuation
    # Rubular: http://rubular.com/r/2YFrKWQUYi
    BETWEEN_SINGLE_QUOTES_REGEX = /(?<=\s)'(?:[^']|'[a-zA-Z])*'/

    BETWEEN_SINGLE_QUOTE_SLANTED_REGEX = /(?<=\s)‘(?:[^’]|’[a-zA-Z])*’/

    # Rubular: http://rubular.com/r/3Pw1QlXOjd
    BETWEEN_DOUBLE_QUOTES_REGEX = /"(?>[^"\\]+|\\{2}|\\.)*"/

    # Rubular: http://rubular.com/r/x6s4PZK8jc
    BETWEEN_QUOTE_ARROW_REGEX = /«(?>[^»\\]+|\\{2}|\\.)*»/

    # Rubular: http://rubular.com/r/JbAIpKdlSq
    BETWEEN_QUOTE_SLANTED_REGEX = /“(?>[^”\\]+|\\{2}|\\.)*”/

    # Rubular: http://rubular.com/r/WX4AvnZvlX
    BETWEEN_SQUARE_BRACKETS_REGEX = /\[(?>[^\]\\]+|\\{2}|\\.)*\]/

    # Rubular: http://rubular.com/r/6tTityPflI
    BETWEEN_PARENS_REGEX = /\((?>[^\(\)\\]+|\\{2}|\\.)*\)/

    # Rubular: http://rubular.com/r/mXf8cW025o
    WORD_WITH_LEADING_APOSTROPHE = /(?<=\s)'(?:[^']|'[a-zA-Z])*'\S/

    # Rubular: http://rubular.com/r/jTtDKfjxzr
    BETWEEN_EM_DASHES_REGEX = /\-\-(?>[^\-\-])*\-\-/

    attr_reader :text
    def initialize(text:)
      @text = text
    end

    # Run all the between-punctuation substitutions over the text.
    def replace
      sub_punctuation_between_quotes_and_parens(text)
    end

    private

    # Apply each bracket/quote variant in turn; each helper mutates txt
    # via PunctuationReplacer.
    def sub_punctuation_between_quotes_and_parens(txt)
      sub_punctuation_between_single_quotes(txt)
      sub_punctuation_between_single_quote_slanted(txt)
      sub_punctuation_between_double_quotes(txt)
      sub_punctuation_between_square_brackets(txt)
      sub_punctuation_between_parens(txt)
      sub_punctuation_between_quotes_arrow(txt)
      sub_punctuation_between_em_dashes(txt)
      sub_punctuation_between_quotes_slanted(txt)
    end

    def sub_punctuation_between_parens(txt)
      PragmaticSegmenter::PunctuationReplacer.new(
        matches_array: txt.scan(BETWEEN_PARENS_REGEX),
        text: txt
      ).replace
    end

    def sub_punctuation_between_square_brackets(txt)
      PragmaticSegmenter::PunctuationReplacer.new(
        matches_array: txt.scan(BETWEEN_SQUARE_BRACKETS_REGEX),
        text: txt
      ).replace
    end

    # Skip single-quote handling when the text contains a leading-apostrophe
    # word and no quote followed by whitespace, to avoid treating
    # apostrophes/contractions as quote pairs.
    def sub_punctuation_between_single_quotes(txt)
      unless !(txt !~ WORD_WITH_LEADING_APOSTROPHE) && txt !~ /'\s/
        PragmaticSegmenter::PunctuationReplacer.new(
          matches_array: txt.scan(BETWEEN_SINGLE_QUOTES_REGEX),
          text: txt,
          match_type: 'single'
        ).replace
      end
    end

    def sub_punctuation_between_single_quote_slanted(txt)
      PragmaticSegmenter::PunctuationReplacer.new(
        matches_array: txt.scan(BETWEEN_SINGLE_QUOTE_SLANTED_REGEX),
        text: txt
      ).replace
    end

    def sub_punctuation_between_double_quotes(txt)
      PragmaticSegmenter::PunctuationReplacer.new(
        matches_array: btwn_dbl_quote(txt),
        text: txt
      ).replace
    end

    def btwn_dbl_quote(txt)
      txt.scan(BETWEEN_DOUBLE_QUOTES_REGEX)
    end

    def sub_punctuation_between_quotes_arrow(txt)
      PragmaticSegmenter::PunctuationReplacer.new(
        matches_array: txt.scan(BETWEEN_QUOTE_ARROW_REGEX),
        text: txt
      ).replace
    end

    def sub_punctuation_between_em_dashes(txt)
      PragmaticSegmenter::PunctuationReplacer.new(
        matches_array: txt.scan(BETWEEN_EM_DASHES_REGEX),
        text: txt
      ).replace
    end

    def sub_punctuation_between_quotes_slanted(txt)
      PragmaticSegmenter::PunctuationReplacer.new(
        matches_array: txt.scan(BETWEEN_QUOTE_SLANTED_REGEX),
        text: txt
      ).replace
    end
  end
end
Oops, something went wrong.