Switch branches/tags
Nothing to show
Find file
Fetching contributors…
Cannot retrieve contributors at this time
454 lines (397 sloc) 16.7 KB
require 'rubygems'
require "pathname"
require "rjb"
require "singleton"
require "treebank"
gem "treebank", ">= 3.0.0"
rescue LoadError
require "treebank"
require "yaml"
# Wrapper for the {Stanford Natural Language
# Parser}[].
module StanfordParser
require "stanfordparser/java_object"
VERSION = "2.2.1"
# The default sentence segmenter and tokenizer. This is an English-language
# tokenizer with support for Penn Treebank markup.
EN_PENN_TREEBANK_TOKENIZER = "edu.stanford.nlp.process.PTBTokenizer"
# Path to an English PCFG model that comes with the Stanford Parser. The
# location is relative to the parser root directory. This is a valid value
# for the <em>grammar</em> parameter of the LexicalizedParser constructor.
ENGLISH_PCFG_MODEL = "$(ROOT)/englishPCFG.ser.gz"
# This function is executed once when the module is loaded. It initializes
# the Java virtual machine in which the Stanford parser will run. By
# default, it adds the parser installation root to the Java classpath and
# launches the VM with the arguments <tt>-server -Xmx150m</tt>. Different
# values may be specified with the <tt>ruby-stanford-parser.yaml</tt>
# configuration file.
# This function determines which operating system we are running on and sets
# default pathnames accordingly:
# UNIX:: /usr/local/stanford-parser/current, /etc/ruby-stanford-parser.yaml
# Windows:: C:\stanford-parser\current,
# C:\stanford-parser\ruby-stanford-parser.yaml
# This function returns the path of the parser installation root.
# Start the Java virtual machine used by the parser and return the parser
# installation root as a Pathname.
#
# Default locations for the installation root and the YAML configuration
# file are chosen from RUBY_PLATFORM. When the configuration file exists and
# holds a mapping, its "root" and "jvmargs" entries (if present and non-nil)
# override the defaults; "jvmargs" is one whitespace-separated string.
def StanfordParser.initialize_on_load
  if RUBY_PLATFORM =~ /(win|w)32$/
    # No trailing space after "current": it would end up inside the jar path.
    root = Pathname.new("C:\\stanford-parser\\current")
    config = Pathname.new("C:\\stanford-parser\\ruby-stanford-parser.yaml")
  else
    root = Pathname.new("/usr/local/stanford-parser/current")
    config = Pathname.new("/etc/ruby-stanford-parser.yaml")
  end
  jvmargs = ["-server", "-Xmx150m"]
  if config.file?
    # NOTE(review): YAML.load is used on a local configuration file; switch to
    # YAML.safe_load if this file can ever come from an untrusted source.
    configuration = File.open(config) { |f| YAML.load(f) }
    # An empty or non-mapping file yields false/nil/scalar; keep the defaults.
    if configuration.is_a?(Hash)
      unless configuration["root"].nil?
        root = Pathname.new(configuration["root"])
      end
      unless configuration["jvmargs"].nil?
        jvmargs = configuration["jvmargs"].split
      end
    end
  end
  # The classpath handed to the JVM is the parser jar under the root.
  Rjb::load((root + "stanford-parser.jar").to_s, jvmargs)
  root
end
private_class_method :initialize_on_load
# The root directory of the Stanford parser installation.
ROOT = initialize_on_load
# The documentation below is for the original Rjb::JavaObjectWrapper object.
# It is reproduced here because rdoc only takes the last document block
# defined. If Rjb is moved into its own gem, this documentation should go
# with it, and the following should be written as documentation for this
# class:
# Extension of the generic Ruby-Java Bridge wrapper object for the
# StanfordParser module.
# A generic wrapper for a Java object loaded via the {Ruby-Java
# Bridge}[]. The wrapper class handles
# initialization and stringification, and passes other method calls down to
# the underlying Java object. Objects returned by the underlying Java
# object are converted to the appropriate Ruby object.
# Other modules may extend the list of Java objects that are converted by
# adding their own converter functions. See wrap_java_object for details.
# This object is enumerable, yielding items in the order defined by the
# underlying Java object's iterator.
class Rjb::JavaObjectWrapper
# FeatureLabel objects go inside a FeatureLabel wrapper.
def wrap_edu_stanford_nlp_ling_FeatureLabel(object)
# Tree objects go inside a Tree wrapper. Various tree types are aliased
# to this function.
def wrap_edu_stanford_nlp_trees_Tree(object)
alias :wrap_edu_stanford_nlp_trees_LabeledScoredTreeLeaf :wrap_edu_stanford_nlp_trees_Tree
alias :wrap_edu_stanford_nlp_trees_LabeledScoredTreeNode :wrap_edu_stanford_nlp_trees_Tree
alias :wrap_edu_stanford_nlp_trees_SimpleTree :wrap_edu_stanford_nlp_trees_Tree
alias :wrap_edu_stanford_nlp_trees_TreeGraphNode :wrap_edu_stanford_nlp_trees_Tree
protected :wrap_edu_stanford_nlp_trees_Tree, :wrap_edu_stanford_nlp_ling_FeatureLabel
end # Rjb::JavaObjectWrapper
# Lexicalized probabilistic parser.
# This is a wrapper for the
# <tt>edu.stanford.nlp.parser.lexparser.LexicalizedParser</tt> object.
class LexicalizedParser < Rjb::JavaObjectWrapper
# The grammar used by the parser
attr_reader :grammar
# Create the parser given a grammar and options. The <em>grammar</em>
# argument is a path to a grammar file. This path may contain the string
# <tt>$(ROOT)</tt>, which will be replaced with the root directory of the
# Stanford Parser. By default, an English PCFG grammar is loaded.
# The <em>options</em> argument is a list of string arguments as they
# would appear on a command line. See the documentation of
# <tt>edu.stanford.nlp.parser.lexparser.Options.setOptions</tt> for more
# details.
# Create the underlying Java LexicalizedParser for a grammar file.
#
# grammar:: path to the grammar file; each "$(ROOT)" in it is replaced with
#           the parser installation root (ROOT)
# options:: list of command-line style option strings
def initialize(grammar = ENGLISH_PCFG_MODEL, options = [])
  # Convert ROOT explicitly and use the block form of gsub: the replacement is
  # then inserted verbatim, so a non-String root or backslashes in a Windows
  # path cannot break the substitution.
  @grammar = Pathname.new(grammar.gsub(/\$\(ROOT\)/) { ROOT.to_s })
  super("edu.stanford.nlp.parser.lexparser.LexicalizedParser", @grammar.to_s)
  # NOTE(review): options is not forwarded to the Java object in the visible
  # code -- confirm against the upstream source whether a call is missing here.
end
def to_s
end # LexicalizedParser
# A singleton instance of the default Stanford Natural Language parser. A
# singleton is used because the parser can take a few seconds to load.
class DefaultParser < StanfordParser::LexicalizedParser
include Singleton
# This is a wrapper for
# <tt>edu.stanford.nlp.trees.Tree</tt> objects. It customizes
# stringification.
class Tree < Rjb::JavaObjectWrapper
def initialize(obj = "edu.stanford.nlp.trees.Tree")
# Return the label along with the score if there is one.
def inspect
s = "#{label}" + (score.nan? ? "" : " [#{sprintf '%.2f', score}]")
# The Penn treebank representation. This prints with indenting instead of
# putting everything on one line.
def to_s
end # Tree
# This is a wrapper for
# <tt>edu.stanford.nlp.ling.Word</tt> objects. It customizes
# stringification and adds an equivalence operator.
class Word < Rjb::JavaObjectWrapper
def initialize(obj = "edu.stanford.nlp.ling.Word", *args)
super(obj, *args)
# See the word values.
def inspect
# Equivalence is defined relative to the word value.
def ==(other)
word == other
end # Word
# This is a wrapper for <tt>edu.stanford.nlp.ling.FeatureLabel</tt> objects.
# It customizes stringification.
class FeatureLabel < Rjb::JavaObjectWrapper
def initialize(obj = "edu.stanford.nlp.ling.FeatureLabel")
# Stringify with just the token and its begin and end position.
def to_s
# BUGBUG The position values come back as java.lang.Integer though I
# would expect Rjb to convert them to Ruby integers.
begin_position = get(self.BEGIN_POSITION_KEY)
end_position = get(self.END_POSITION_KEY)
"#{current} [#{begin_position},#{end_position}]"
# More verbose stringification with all the fields and their values.
def inspect
# Tokenizes documents into words and sentences.
# This is a wrapper for the
# <tt>edu.stanford.nlp.process.DocumentPreprocessor</tt> object.
class DocumentPreprocessor < Rjb::JavaObjectWrapper
def initialize(suppressEscaping = false)
super("edu.stanford.nlp.process.DocumentPreprocessor", suppressEscaping)
# Returns a list of sentences in a string.
def getSentencesFromString(s)
s ="", s)
_invoke(:getSentencesFromText, ";", s.java_object)
def inspect
def to_s
end # DocumentPreprocessor
# A text token that contains raw and normalized token identity (e.g. "(" and
# "-LRB-"), an offset span, and the characters immediately preceding and
# following the token. Given a list of these objects it is possible to
# recreate the text from which they came verbatim.
# Fields, in constructor order:
# current::        the raw token text
# word::           the normalized token text
# before::         the characters immediately preceding the token
# after::          the characters immediately following the token
# begin_position:: start of the token's offset span
# end_position::   end of the token's offset span
class StandoffToken < Struct.new(:current, :word, :before, :after,
                                 :begin_position, :end_position)
  # Stringify with just the raw token and its begin and end position.
  def to_s
    "#{current} [#{begin_position},#{end_position}]"
  end
end
# A preprocessor that segments text into sentences and tokens that contain
# character offset and token context information that can be used for
# standoff annotation.
class StandoffDocumentPreprocessor < DocumentPreprocessor
def initialize(tokenizer = EN_PENN_TREEBANK_TOKENIZER)
# PTBTokenizer.factory is a static function, so use RJB to call it
# directly instead of going through a JavaObjectWrapper. We do it this
# way because the Standford parser Java code does not provide a
# constructor that allows you to specify the second parameter,
# invertible, to true, and we need this to write character offset
# information into the tokens.
ptb_tokenizer_class = Rjb::import(tokenizer)
# See the documentation for
# <tt>edu.stanford.nlp.process.DocumentPreprocessor</tt> for a
# description of these parameters.
ptb_tokenizer_factory = ptb_tokenizer_class.factory(false, true, false)
# Returns a list of sentences in a string. This wraps the returned
# sentences in a StandoffSentence object.
def getSentencesFromString(s)
# A sentence is an array of StandoffToken objects.
class StandoffSentence < Array
  # Construct an array of StandoffToken objects from a Java list sentence
  # object returned by the preprocessor.
  #
  # stanford_parser_sentence:: any object whose to_a yields label wrappers
  #                            responding to current, word, before, after,
  #                            get, BEGIN_POSITION_KEY and END_POSITION_KEY
  def initialize(stanford_parser_sentence)
    tokens = stanford_parser_sentence.to_a.map do |fs|
      # The to_s.to_i is necessary because the get function returns
      # java.lang.Integer objects instead of Ruby integers.
      begin_position = fs.get(fs.BEGIN_POSITION_KEY).to_s.to_i
      end_position = fs.get(fs.END_POSITION_KEY).to_s.to_i
      StandoffToken.new(fs.current, fs.word, fs.before, fs.after,
                        begin_position, end_position)
    end
    super(tokens)
  end

  # Return the original string verbatim: every token followed by its trailing
  # characters, except the last token, whose trailing characters are dropped.
  # An empty sentence stringifies to the empty string.
  def to_s
    return "" if empty?
    self[0..-2].inject("") { |s, token| s + token.current + token.after } + last.current
  end

  # Return the original string verbatim.
  def inspect
    to_s
  end
end
# Standoff syntactic annotation of natural language text which may contain
# multiple sentences.
# This is an Array of StandoffNode objects, one for each sentence in the
# text.
class StandoffParsedText < Array
# Parse the text and create the standoff annotation.
# The default parser is a singleton instance of the English language
# Stanford Natural Language parser. There may be a delay of a few
# seconds for it to load the first time it is created.
def initialize(text, nodetype = StandoffNode,
parser = DefaultParser.instance)
preprocessor =
# Segment the text into sentences. Parse each sentence, writing
# standoff annotation information into the terminal nodes.
preprocessor.getSentencesFromString(text).map do |sentence|
parse = parser.apply(sentence.to_s)
push(, sentence))
# Print class name and number of sentences.
def inspect
"<#{}, #{length} sentences>"
# Print parses.
def to_s
flatten.join(" ")
# Standoff syntactic tree annotation of text. Terminal nodes are labeled
# with the appropriate StandoffToken objects. Standoff parses can reproduce
# the original string from which they were generated verbatim, optionally
# with brackets around the yields of specified non-terminal nodes.
class StandoffNode < Treebank::ParentedNode
# Create the standoff tree from a tree returned by the Stanford parser.
# For non-terminal nodes, the <em>tokens</em> argument will be a
# StandoffSentence containing the StandoffToken objects representing all
# the tokens beneath and after this node. For terminal nodes, the
# <em>tokens</em> argument will be a StandoffToken.
def initialize(stanford_parser_node, tokens)
# Annotate this node with a non-terminal label or a StandoffToken as
# appropriate.
super(tokens.instance_of?(StandoffSentence) ?
stanford_parser_node.value : tokens)
# Enumerate the children depth-first. Tokens are removed from the list
# left-to-right as terminal nodes are added to the tree.
stanford_parser_node.children.each do |child|
subtree =, child.leaf? ? tokens.shift : tokens)
# Return the original text string dominated by this node.
# Concatenate, for every leaf under this node in order, the leaf label's raw
# text followed by the characters that come after it.
def to_original_string
  leaves.inject("") do |text, leaf|
    text + leaf.label.current + leaf.label.after
  end
end
# Print the original string with brackets around word spans dominated by
# the specified constituents.
# The constituents to bracket are specified by passing a list of node
# coordinates, which are arrays of integers of the form returned by the
# tree enumerators of Treebank::Node objects.
# _coords_:: the coordinates of the nodes around which to place brackets
# _open_:: the open bracket symbol
# _close_:: the close bracket symbol
# Returns the text under this node as one string, with _open_ and _close_
# placed around the yield of every node listed in _coords_.
def to_bracketed_string(coords, open = "[", close = "]")
  # Get a list of all the leaf nodes and their coordinates.
  items = depth_first_enumerator(true).find_all { |n| n.first.leaf? }
  # Enumerate over all the matching constituents inserting open and close
  # brackets around their yields in the items list.
  coords.each do |matching|
    # Insert using a simple state machine with three states: :start,
    # :open, and :close.
    state = :start
    # Enumerate over the items list looking for nodes that are the
    # children of the matching constituent.
    items.each_with_index do |item, index|
      # Skip inserted bracket characters.
      next if item.is_a? String
      # Handle terminal node items with the state machine.
      _node, terminal_coordinate = item
      if state == :start
        next if not in_yield?(matching, terminal_coordinate)
        # Inserting shifts the current terminal one slot right, so it is seen
        # again on the next iteration, now in the :open state.
        items.insert(index, open)
        state = :open
      else # state == :open
        next if in_yield?(matching, terminal_coordinate)
        items.insert(index, close)
        state = :close
        # Stop scanning: continuing would insert another close bracket before
        # the same terminal on every following iteration.
        break
      end
    end # items.each_with_index
    # Handle the case where a matching constituent is flush with the end
    # of the sentence.
    items << close if state == :open
  end # each
  # Replace terminal nodes with their string representations. Insert
  # spacing characters in the list.
  items.each_with_index do |item, index|
    next if item.is_a? String
    text = item.first.label.current
    spacing = item.first.label.after
    # Replace the terminal node with its text.
    items[index] = text
    # Insert the spacing that comes after this text before the first
    # non-close bracket character.
    close_pos = find_index(items[(index + 1)..-1]) { |following| not following == close }
    items.insert(index + close_pos + 1, spacing)
  end
  items.join
end # to_bracketed_string
# Find the index of the first item in _list_ for which _block_ is true.
# Return 0 if no items are found.
# list::  the items to search, in order
# block:: predicate called with each item
# Returns the position of the first item for which the block is true, or 0
# when no item matches (so callers cannot tell "first item" from "none").
def find_index(list, &block)
  list.find_index(&block) || 0
end
# Is the node at _terminal_ in the yield of the node at _node_?
# node::     coordinate (array of integers) of the candidate ancestor
# terminal:: coordinate (array of integers) of a terminal node
def in_yield?(node, terminal)
  # If node A's coordinates match the prefix of node B's coordinates, node
  # B is in the yield of node A.
  terminal.first(node.length) == node
end
private :in_yield?, :find_index
end # StandoffNode
end # StanfordParser