# Skip to content
# jweese edited this page Jul 5, 2011 · 12 revisions
# this is an example Thrax configuration file	 	
# <- this symbol indicates a comment	 	
# each line should be a key-value pair separated by whitespace

input-file  example/nist09.unified.1

# size to chunk input file for parallelization, in bytes (here 4M)
# this can make a significant difference to extraction performance
# by default, hadoop's split size is larger than a normal corpus size. 	
# we make it smaller to force hadoop to parallelize the extraction	 	
max-split-size    4194304

grammar     samt    # or hiero
# for SAMT, at least one of these has to be true
source-is-parsed   false
target-is-parsed   true 

target-is-samt-syntax    true  

min-rule-count  2    # the minimum number of times a rule must be seen in the corpus before we keep it

# default-nt    X   # X is the default anyway
# goal-symbol   GOAL    # NT to use in the glue grammar (GOAL is default)

# not only do these next six options have the suggested values as given
# in Chiang's "Hierarchical Phrase-based Translation" (CL), they are also
# Thrax's default values! You could comment them out and the resulting grammar
# would be identical.

initial-phrase-length   10  # maximum length of initial phrase pairs (NT span)
arity                   2   # maximum number of NTs in a rule
lexicality              1   # minimum number of aligned terminals in a rule
adjacent-nts    false   # allow adjacent nonterminals on source side
loose           false   # allow unaligned words at boundaries of phrases

allow-abstract-rules    false   # allow rules with no terminals on RHS
allow-nonlexical-x      false   # allow the default NT in rules that have
                                # NTs on the right hand side

nonlex-source-length    5   # maximum number of symbols on nonlexical rule
                            # source side
nonlex-target-length    5   # as above, but counting on the target side
nonlex-source-words     5   # maximum number of terminals on nonlexical rule
                            # source side
nonlex-target-words     5   # as above, but counting on the target side

rule-span-limit         12  # span limit for rule length
allow-full-sentence-rules    false  # ignore span limit if the rule spans an entire sentence

# how to assign constituent labels to spans covered by unary rules
# bottom: use the bottom-most label
# top: use top-most label
# all: use one label that is a combination of all the covering constituents
unary-category-handler  bottom
# allow concatenation of three labels to cover a span
allow-double-plus       true

phrase-penalty  2.718

# a whitespace-separated list of features
# in this example, the features are phrase translation probability,
# lexical probability, whether the rule has no NTs, or only NTs, whether the
# nonterminals are adjacent on the source side, whether default NT is used
# in the rule, whether the rule has source side terminals but no target side
# terminals (or vice-versa), whether the rule is monotonic or has reordering,
# phrase penalty and glue rule indicator.
features        e2fphrase f2ephrase lexprob lexical abstract adjacent x-rule source-terminals-without-target target-terminals-without-source monotonic phrase-penalty glue-rule


output-format   joshua  # the only option and default
                        # later we will want to add formats for other decoders
                        # such as moses and cdec, if they use other formats

label-feature-scores true   # label feature scores?
                            # each score will be output as name=score
# Clone this wiki locally