Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Auto-formatted code to conform with Google style guide for Java.

  • Loading branch information...
commit 42f4404c3c8a7e82cd6dd271882562c1392416fa 1 parent 21990ad
Juri Ganitkevitch jganitkevitch authored
Showing with 13,985 additions and 14,155 deletions.
  1. +281 −0 .settings/org.eclipse.jdt.core.prefs
  2. +3 −0  .settings/org.eclipse.jdt.ui.prefs
  3. +193 −193 examples/ZMERT/README_ZMERT.txt
  4. +30 −30 examples/ZMERT/ZMERT_config_ex2.txt
  5. +70 −70 examples/ZMERT/config_ex2.txt
  6. +6 −6 examples/ZMERT/params.txt
  7. +74 −74 examples/example/example.config.bloomfilterlm
  8. +117 −128 src/joshua/corpus/AbstractPhrase.java
  9. +101 −103 src/joshua/corpus/BasicPhrase.java
  10. +167 −177 src/joshua/corpus/ContiguousPhrase.java
  11. +143 −170 src/joshua/corpus/Corpus.java
  12. +95 −108 src/joshua/corpus/Phrase.java
  13. +156 −165 src/joshua/corpus/Span.java
  14. +64 −69 src/joshua/corpus/TerminalIterator.java
  15. +272 −284 src/joshua/corpus/Vocabulary.java
  16. +386 −401 src/joshua/corpus/syntax/ArraySyntaxTree.java
  17. +9 −9 src/joshua/corpus/syntax/SyntaxTree.java
  18. +297 −282 src/joshua/decoder/BLEU.java
  19. +140 −151 src/joshua/decoder/DecoderFactory.java
  20. +303 −320 src/joshua/decoder/DecoderThread.java
  21. +159 −165 src/joshua/decoder/InputHandler.java
  22. +508 −506 src/joshua/decoder/JoshuaConfiguration.java
  23. +513 −539 src/joshua/decoder/JoshuaDecoder.java
  24. +401 −396 src/joshua/decoder/NbestMinRiskReranker.java
  25. +333 −355 src/joshua/decoder/ParserThread.java
  26. +83 −85 src/joshua/decoder/Support.java
  27. +117 −124 src/joshua/decoder/Translation.java
  28. +169 −161 src/joshua/decoder/chart_parser/BeamPruner.java
  29. +349 −341 src/joshua/decoder/chart_parser/Cell.java
  30. +580 −596 src/joshua/decoder/chart_parser/Chart.java
  31. +19 −19 src/joshua/decoder/chart_parser/Combiner.java
  32. +189 −196 src/joshua/decoder/chart_parser/ComputeNodeResult.java
  33. +228 −226 src/joshua/decoder/chart_parser/CubePruneCombiner.java
  34. +365 −388 src/joshua/decoder/chart_parser/DotChart.java
  35. +90 −90 src/joshua/decoder/chart_parser/ExhaustiveCombiner.java
  36. +200 −214 src/joshua/decoder/chart_parser/ManualConstraintsHandler.java
  37. +12 −12 src/joshua/decoder/chart_parser/Prunable.java
  38. +59 −64 src/joshua/decoder/chart_parser/SourcePath.java
  39. +35 −39 src/joshua/decoder/chart_parser/SuperNode.java
  40. +52 −57 src/joshua/decoder/ff/ArityPhrasePenaltyFF.java
  41. +79 −79 src/joshua/decoder/ff/DefaultStatefulFF.java
  42. +107 −104 src/joshua/decoder/ff/DefaultStatelessFF.java
  43. +71 −66 src/joshua/decoder/ff/FeatureFunction.java
  44. +27 −29 src/joshua/decoder/ff/OOVFF.java
  45. +56 −56 src/joshua/decoder/ff/PhraseModelFF.java
  46. +3 −3 src/joshua/decoder/ff/SourceDependentFF.java
  47. +28 −29 src/joshua/decoder/ff/SourcePathFF.java
  48. +33 −35 src/joshua/decoder/ff/WordPenaltyFF.java
  49. +106 −112 src/joshua/decoder/ff/lm/AbstractLM.java
  50. +222 −230 src/joshua/decoder/ff/lm/ArpaFile.java
  51. +58 −60 src/joshua/decoder/ff/lm/ArpaNgram.java
  52. +120 −129 src/joshua/decoder/ff/lm/DefaultNGramLanguageModel.java
  53. +364 −357 src/joshua/decoder/ff/lm/LanguageModelFF.java
  54. +79 −89 src/joshua/decoder/ff/lm/NGramLanguageModel.java
  55. +93 −89 src/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeley.java
  56. +57 −58 src/joshua/decoder/ff/lm/berkeley_lm/SymbolTableWrapper.java
  57. +193 −212 src/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilter.java
  58. +557 −593 src/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilterLanguageModel.java
  59. +711 −743 src/joshua/decoder/ff/lm/buildin_lm/LMGrammarJAVA.java
  60. +96 −102 src/joshua/decoder/ff/lm/kenlm/jni/KenLM.java
  61. +216 −219 src/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java
  62. +26 −28 src/joshua/decoder/ff/state_maintenance/DPState.java
  63. +126 −125 src/joshua/decoder/ff/state_maintenance/NgramDPState.java
  64. +148 −144 src/joshua/decoder/ff/state_maintenance/NgramStateComputer.java
  65. +22 −20 src/joshua/decoder/ff/state_maintenance/StateComputer.java
  66. +168 −183 src/joshua/decoder/ff/tm/AbstractGrammar.java
  67. +120 −124 src/joshua/decoder/ff/tm/BasicRuleCollection.java
  68. +32 −38 src/joshua/decoder/ff/tm/BatchGrammar.java
  69. +147 −165 src/joshua/decoder/ff/tm/BilingualRule.java
  70. +108 −128 src/joshua/decoder/ff/tm/Grammar.java
  71. +25 −31 src/joshua/decoder/ff/tm/GrammarFactory.java
  72. +130 −135 src/joshua/decoder/ff/tm/GrammarReader.java
  73. +314 −297 src/joshua/decoder/ff/tm/MonolingualRule.java
  74. +120 −114 src/joshua/decoder/ff/tm/Rule.java
  75. +62 −72 src/joshua/decoder/ff/tm/RuleCollection.java
  76. +65 −75 src/joshua/decoder/ff/tm/Trie.java
  77. +24 −28 src/joshua/decoder/ff/tm/UnsortedRuleCollectionException.java
  78. +144 −144 src/joshua/decoder/ff/tm/format/DiskHyperGraphFormatReader.java
  79. +129 −131 src/joshua/decoder/ff/tm/format/HieroFormatReader.java
  80. +145 −146 src/joshua/decoder/ff/tm/format/SamtFormatReader.java
  81. +270 −287 src/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
  82. +67 −69 src/joshua/decoder/ff/tm/hash_based/MemoryBasedRuleBin.java
  83. +89 −96 src/joshua/decoder/ff/tm/hash_based/MemoryBasedTrie.java
  84. +456 −481 src/joshua/decoder/ff/tm/packed/PackedGrammar.java
  85. +404 −387 src/joshua/decoder/hypergraph/DefaultInsideOutside.java
Sorry, we could not display the entire diff because it was too big.
281 .settings/org.eclipse.jdt.core.prefs
View
@@ -0,0 +1,281 @@
+eclipse.preferences.version=1
+org.eclipse.jdt.core.formatter.align_type_members_on_columns=false
+org.eclipse.jdt.core.formatter.alignment_for_arguments_in_allocation_expression=16|5|48
+org.eclipse.jdt.core.formatter.alignment_for_arguments_in_annotation=16|-1|16
+org.eclipse.jdt.core.formatter.alignment_for_arguments_in_enum_constant=16
+org.eclipse.jdt.core.formatter.alignment_for_arguments_in_explicit_constructor_call=16|5|48
+org.eclipse.jdt.core.formatter.alignment_for_arguments_in_method_invocation=16|5|48
+org.eclipse.jdt.core.formatter.alignment_for_arguments_in_qualified_allocation_expression=16|4|48
+org.eclipse.jdt.core.formatter.alignment_for_assignment=16
+org.eclipse.jdt.core.formatter.alignment_for_binary_expression=16
+org.eclipse.jdt.core.formatter.alignment_for_compact_if=16
+org.eclipse.jdt.core.formatter.alignment_for_conditional_expression=48
+org.eclipse.jdt.core.formatter.alignment_for_enum_constants=16|5|48
+org.eclipse.jdt.core.formatter.alignment_for_expressions_in_array_initializer=16|5|80
+org.eclipse.jdt.core.formatter.alignment_for_method_declaration=0
+org.eclipse.jdt.core.formatter.alignment_for_multiple_fields=16
+org.eclipse.jdt.core.formatter.alignment_for_parameters_in_constructor_declaration=16|5|48
+org.eclipse.jdt.core.formatter.alignment_for_parameters_in_method_declaration=16|5|48
+org.eclipse.jdt.core.formatter.alignment_for_resources_in_try=80
+org.eclipse.jdt.core.formatter.alignment_for_selector_in_method_invocation=16|3|49
+org.eclipse.jdt.core.formatter.alignment_for_superclass_in_type_declaration=32
+org.eclipse.jdt.core.formatter.alignment_for_superinterfaces_in_enum_declaration=32|4|81
+org.eclipse.jdt.core.formatter.alignment_for_superinterfaces_in_type_declaration=32|4|80
+org.eclipse.jdt.core.formatter.alignment_for_throws_clause_in_constructor_declaration=16|4|48
+org.eclipse.jdt.core.formatter.alignment_for_throws_clause_in_method_declaration=16|4|48
+org.eclipse.jdt.core.formatter.alignment_for_union_type_in_multicatch=16
+org.eclipse.jdt.core.formatter.blank_lines_after_imports=1
+org.eclipse.jdt.core.formatter.blank_lines_after_package=1
+org.eclipse.jdt.core.formatter.blank_lines_before_field=0
+org.eclipse.jdt.core.formatter.blank_lines_before_first_class_body_declaration=0
+org.eclipse.jdt.core.formatter.blank_lines_before_imports=0
+org.eclipse.jdt.core.formatter.blank_lines_before_member_type=0
+org.eclipse.jdt.core.formatter.blank_lines_before_method=1
+org.eclipse.jdt.core.formatter.blank_lines_before_new_chunk=1
+org.eclipse.jdt.core.formatter.blank_lines_before_package=0
+org.eclipse.jdt.core.formatter.blank_lines_between_import_groups=1
+org.eclipse.jdt.core.formatter.blank_lines_between_type_declarations=2
+org.eclipse.jdt.core.formatter.brace_position_for_annotation_type_declaration=end_of_line
+org.eclipse.jdt.core.formatter.brace_position_for_anonymous_type_declaration=end_of_line
+org.eclipse.jdt.core.formatter.brace_position_for_array_initializer=end_of_line
+org.eclipse.jdt.core.formatter.brace_position_for_block=end_of_line
+org.eclipse.jdt.core.formatter.brace_position_for_block_in_case=end_of_line
+org.eclipse.jdt.core.formatter.brace_position_for_constructor_declaration=end_of_line
+org.eclipse.jdt.core.formatter.brace_position_for_enum_constant=end_of_line
+org.eclipse.jdt.core.formatter.brace_position_for_enum_declaration=end_of_line
+org.eclipse.jdt.core.formatter.brace_position_for_method_declaration=end_of_line
+org.eclipse.jdt.core.formatter.brace_position_for_switch=end_of_line
+org.eclipse.jdt.core.formatter.brace_position_for_type_declaration=end_of_line
+org.eclipse.jdt.core.formatter.comment.clear_blank_lines_in_block_comment=false
+org.eclipse.jdt.core.formatter.comment.clear_blank_lines_in_javadoc_comment=false
+org.eclipse.jdt.core.formatter.comment.format_block_comments=true
+org.eclipse.jdt.core.formatter.comment.format_header=true
+org.eclipse.jdt.core.formatter.comment.format_html=true
+org.eclipse.jdt.core.formatter.comment.format_javadoc_comments=true
+org.eclipse.jdt.core.formatter.comment.format_line_comments=true
+org.eclipse.jdt.core.formatter.comment.format_source_code=false
+org.eclipse.jdt.core.formatter.comment.indent_parameter_description=false
+org.eclipse.jdt.core.formatter.comment.indent_root_tags=true
+org.eclipse.jdt.core.formatter.comment.insert_new_line_before_root_tags=insert
+org.eclipse.jdt.core.formatter.comment.insert_new_line_for_parameter=do not insert
+org.eclipse.jdt.core.formatter.comment.line_length=100
+org.eclipse.jdt.core.formatter.comment.new_lines_at_block_boundaries=true
+org.eclipse.jdt.core.formatter.comment.new_lines_at_javadoc_boundaries=true
+org.eclipse.jdt.core.formatter.comment.preserve_white_space_between_code_and_line_comments=false
+org.eclipse.jdt.core.formatter.compact_else_if=true
+org.eclipse.jdt.core.formatter.continuation_indentation=2
+org.eclipse.jdt.core.formatter.continuation_indentation_for_array_initializer=2
+org.eclipse.jdt.core.formatter.disabling_tag=@formatter\:off
+org.eclipse.jdt.core.formatter.enabling_tag=@formatter\:on
+org.eclipse.jdt.core.formatter.format_guardian_clause_on_one_line=false
+org.eclipse.jdt.core.formatter.format_line_comment_starting_on_first_column=true
+org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_annotation_declaration_header=true
+org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_enum_constant_header=true
+org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_enum_declaration_header=true
+org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_type_header=true
+org.eclipse.jdt.core.formatter.indent_breaks_compare_to_cases=true
+org.eclipse.jdt.core.formatter.indent_empty_lines=false
+org.eclipse.jdt.core.formatter.indent_statements_compare_to_block=true
+org.eclipse.jdt.core.formatter.indent_statements_compare_to_body=true
+org.eclipse.jdt.core.formatter.indent_switchstatements_compare_to_cases=true
+org.eclipse.jdt.core.formatter.indent_switchstatements_compare_to_switch=true
+org.eclipse.jdt.core.formatter.indentation.size=4
+org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_field=insert
+org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_local_variable=insert
+org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_method=insert
+org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_package=insert
+org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_parameter=do not insert
+org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_type=insert
+org.eclipse.jdt.core.formatter.insert_new_line_after_label=do not insert
+org.eclipse.jdt.core.formatter.insert_new_line_after_opening_brace_in_array_initializer=do not insert
+org.eclipse.jdt.core.formatter.insert_new_line_at_end_of_file_if_missing=insert
+org.eclipse.jdt.core.formatter.insert_new_line_before_catch_in_try_statement=do not insert
+org.eclipse.jdt.core.formatter.insert_new_line_before_closing_brace_in_array_initializer=do not insert
+org.eclipse.jdt.core.formatter.insert_new_line_before_else_in_if_statement=do not insert
+org.eclipse.jdt.core.formatter.insert_new_line_before_finally_in_try_statement=do not insert
+org.eclipse.jdt.core.formatter.insert_new_line_before_while_in_do_statement=do not insert
+org.eclipse.jdt.core.formatter.insert_new_line_in_empty_annotation_declaration=do not insert
+org.eclipse.jdt.core.formatter.insert_new_line_in_empty_anonymous_type_declaration=do not insert
+org.eclipse.jdt.core.formatter.insert_new_line_in_empty_block=do not insert
+org.eclipse.jdt.core.formatter.insert_new_line_in_empty_enum_constant=do not insert
+org.eclipse.jdt.core.formatter.insert_new_line_in_empty_enum_declaration=do not insert
+org.eclipse.jdt.core.formatter.insert_new_line_in_empty_method_body=do not insert
+org.eclipse.jdt.core.formatter.insert_new_line_in_empty_type_declaration=do not insert
+org.eclipse.jdt.core.formatter.insert_space_after_and_in_type_parameter=insert
+org.eclipse.jdt.core.formatter.insert_space_after_assignment_operator=insert
+org.eclipse.jdt.core.formatter.insert_space_after_at_in_annotation=do not insert
+org.eclipse.jdt.core.formatter.insert_space_after_at_in_annotation_type_declaration=do not insert
+org.eclipse.jdt.core.formatter.insert_space_after_binary_operator=insert
+org.eclipse.jdt.core.formatter.insert_space_after_closing_angle_bracket_in_type_arguments=do not insert
+org.eclipse.jdt.core.formatter.insert_space_after_closing_angle_bracket_in_type_parameters=insert
+org.eclipse.jdt.core.formatter.insert_space_after_closing_brace_in_block=insert
+org.eclipse.jdt.core.formatter.insert_space_after_closing_paren_in_cast=insert
+org.eclipse.jdt.core.formatter.insert_space_after_colon_in_assert=insert
+org.eclipse.jdt.core.formatter.insert_space_after_colon_in_case=insert
+org.eclipse.jdt.core.formatter.insert_space_after_colon_in_conditional=insert
+org.eclipse.jdt.core.formatter.insert_space_after_colon_in_for=insert
+org.eclipse.jdt.core.formatter.insert_space_after_colon_in_labeled_statement=insert
+org.eclipse.jdt.core.formatter.insert_space_after_comma_in_allocation_expression=insert
+org.eclipse.jdt.core.formatter.insert_space_after_comma_in_annotation=insert
+org.eclipse.jdt.core.formatter.insert_space_after_comma_in_array_initializer=insert
+org.eclipse.jdt.core.formatter.insert_space_after_comma_in_constructor_declaration_parameters=insert
+org.eclipse.jdt.core.formatter.insert_space_after_comma_in_constructor_declaration_throws=insert
+org.eclipse.jdt.core.formatter.insert_space_after_comma_in_enum_constant_arguments=insert
+org.eclipse.jdt.core.formatter.insert_space_after_comma_in_enum_declarations=insert
+org.eclipse.jdt.core.formatter.insert_space_after_comma_in_explicitconstructorcall_arguments=insert
+org.eclipse.jdt.core.formatter.insert_space_after_comma_in_for_increments=insert
+org.eclipse.jdt.core.formatter.insert_space_after_comma_in_for_inits=insert
+org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_declaration_parameters=insert
+org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_declaration_throws=insert
+org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_invocation_arguments=insert
+org.eclipse.jdt.core.formatter.insert_space_after_comma_in_multiple_field_declarations=insert
+org.eclipse.jdt.core.formatter.insert_space_after_comma_in_multiple_local_declarations=insert
+org.eclipse.jdt.core.formatter.insert_space_after_comma_in_parameterized_type_reference=insert
+org.eclipse.jdt.core.formatter.insert_space_after_comma_in_superinterfaces=insert
+org.eclipse.jdt.core.formatter.insert_space_after_comma_in_type_arguments=insert
+org.eclipse.jdt.core.formatter.insert_space_after_comma_in_type_parameters=insert
+org.eclipse.jdt.core.formatter.insert_space_after_ellipsis=insert
+org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_parameterized_type_reference=do not insert
+org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_type_arguments=do not insert
+org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_type_parameters=do not insert
+org.eclipse.jdt.core.formatter.insert_space_after_opening_brace_in_array_initializer=do not insert
+org.eclipse.jdt.core.formatter.insert_space_after_opening_bracket_in_array_allocation_expression=do not insert
+org.eclipse.jdt.core.formatter.insert_space_after_opening_bracket_in_array_reference=do not insert
+org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_annotation=do not insert
+org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_cast=do not insert
+org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_catch=do not insert
+org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_constructor_declaration=do not insert
+org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_enum_constant=do not insert
+org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_for=do not insert
+org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_if=do not insert
+org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_method_declaration=do not insert
+org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_method_invocation=do not insert
+org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_parenthesized_expression=do not insert
+org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_switch=do not insert
+org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_synchronized=do not insert
+org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_try=do not insert
+org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_while=do not insert
+org.eclipse.jdt.core.formatter.insert_space_after_postfix_operator=do not insert
+org.eclipse.jdt.core.formatter.insert_space_after_prefix_operator=do not insert
+org.eclipse.jdt.core.formatter.insert_space_after_question_in_conditional=insert
+org.eclipse.jdt.core.formatter.insert_space_after_question_in_wildcard=do not insert
+org.eclipse.jdt.core.formatter.insert_space_after_semicolon_in_for=insert
+org.eclipse.jdt.core.formatter.insert_space_after_semicolon_in_try_resources=insert
+org.eclipse.jdt.core.formatter.insert_space_after_unary_operator=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_and_in_type_parameter=insert
+org.eclipse.jdt.core.formatter.insert_space_before_assignment_operator=insert
+org.eclipse.jdt.core.formatter.insert_space_before_at_in_annotation_type_declaration=insert
+org.eclipse.jdt.core.formatter.insert_space_before_binary_operator=insert
+org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_parameterized_type_reference=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_type_arguments=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_type_parameters=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_closing_brace_in_array_initializer=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_closing_bracket_in_array_allocation_expression=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_closing_bracket_in_array_reference=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_annotation=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_cast=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_catch=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_constructor_declaration=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_enum_constant=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_for=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_if=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_method_declaration=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_method_invocation=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_parenthesized_expression=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_switch=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_synchronized=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_try=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_while=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_colon_in_assert=insert
+org.eclipse.jdt.core.formatter.insert_space_before_colon_in_case=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_colon_in_conditional=insert
+org.eclipse.jdt.core.formatter.insert_space_before_colon_in_default=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_colon_in_for=insert
+org.eclipse.jdt.core.formatter.insert_space_before_colon_in_labeled_statement=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_comma_in_allocation_expression=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_comma_in_annotation=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_comma_in_array_initializer=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_comma_in_constructor_declaration_parameters=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_comma_in_constructor_declaration_throws=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_comma_in_enum_constant_arguments=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_comma_in_enum_declarations=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_comma_in_explicitconstructorcall_arguments=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_comma_in_for_increments=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_comma_in_for_inits=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_declaration_parameters=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_declaration_throws=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_invocation_arguments=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_comma_in_multiple_field_declarations=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_comma_in_multiple_local_declarations=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_comma_in_parameterized_type_reference=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_comma_in_superinterfaces=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_comma_in_type_arguments=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_comma_in_type_parameters=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_ellipsis=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_parameterized_type_reference=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_type_arguments=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_type_parameters=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_annotation_type_declaration=insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_anonymous_type_declaration=insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_array_initializer=insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_block=insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_constructor_declaration=insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_enum_constant=insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_enum_declaration=insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_method_declaration=insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_switch=insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_type_declaration=insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_allocation_expression=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_reference=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_type_reference=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_annotation=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_annotation_type_member_declaration=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_catch=insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_constructor_declaration=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_enum_constant=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_for=insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_if=insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_method_declaration=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_method_invocation=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_parenthesized_expression=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_switch=insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_synchronized=insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_try=insert
+org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_while=insert
+org.eclipse.jdt.core.formatter.insert_space_before_parenthesized_expression_in_return=insert
+org.eclipse.jdt.core.formatter.insert_space_before_parenthesized_expression_in_throw=insert
+org.eclipse.jdt.core.formatter.insert_space_before_postfix_operator=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_prefix_operator=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_question_in_conditional=insert
+org.eclipse.jdt.core.formatter.insert_space_before_question_in_wildcard=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_semicolon=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_semicolon_in_for=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_semicolon_in_try_resources=do not insert
+org.eclipse.jdt.core.formatter.insert_space_before_unary_operator=do not insert
+org.eclipse.jdt.core.formatter.insert_space_between_brackets_in_array_type_reference=do not insert
+org.eclipse.jdt.core.formatter.insert_space_between_empty_braces_in_array_initializer=do not insert
+org.eclipse.jdt.core.formatter.insert_space_between_empty_brackets_in_array_allocation_expression=do not insert
+org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_annotation_type_member_declaration=do not insert
+org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_constructor_declaration=do not insert
+org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_enum_constant=do not insert
+org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_method_declaration=do not insert
+org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_method_invocation=do not insert
+org.eclipse.jdt.core.formatter.join_lines_in_comments=true
+org.eclipse.jdt.core.formatter.join_wrapped_lines=true
+org.eclipse.jdt.core.formatter.keep_else_statement_on_same_line=false
+org.eclipse.jdt.core.formatter.keep_empty_array_initializer_on_one_line=false
+org.eclipse.jdt.core.formatter.keep_imple_if_on_one_line=true
+org.eclipse.jdt.core.formatter.keep_then_statement_on_same_line=false
+org.eclipse.jdt.core.formatter.lineSplit=100
+org.eclipse.jdt.core.formatter.never_indent_block_comments_on_first_column=false
+org.eclipse.jdt.core.formatter.never_indent_line_comments_on_first_column=false
+org.eclipse.jdt.core.formatter.number_of_blank_lines_at_beginning_of_method_body=0
+org.eclipse.jdt.core.formatter.number_of_empty_lines_to_preserve=3
+org.eclipse.jdt.core.formatter.put_empty_statement_on_new_line=false
+org.eclipse.jdt.core.formatter.tabulation.char=space
+org.eclipse.jdt.core.formatter.tabulation.size=2
+org.eclipse.jdt.core.formatter.use_on_off_tags=false
+org.eclipse.jdt.core.formatter.use_tabs_only_for_leading_indentations=false
+org.eclipse.jdt.core.formatter.wrap_before_binary_operator=true
+org.eclipse.jdt.core.formatter.wrap_before_or_operator_multicatch=true
+org.eclipse.jdt.core.formatter.wrap_outer_expressions_when_nested=true
3  .settings/org.eclipse.jdt.ui.prefs
View
@@ -0,0 +1,3 @@
+eclipse.preferences.version=1
+formatter_profile=_GoogleStyle
+formatter_settings_version=12
386 examples/ZMERT/README_ZMERT.txt
View
@@ -1,193 +1,193 @@
-(1) Running Z-MERT:
--------------------
-
-((This first section is an expanded version of the Z-MERT section in trunk/README.txt))
-
-Joshua's MERT module, called Z-MERT, consists of a core MERT class, a generic
-EvaluationMetric class, and one class definition for each supported evaluation
-metric. The module is used by launching the driver program (ZMERT.java), which
-expects a config file as its main argument. This config file can be used to
-specify any subset of Z-MERT's 20-some parameters. For a full list of those
-parameters, and their default values, run ZMERT with a single -h argument as
-follows (assuming you're in the trunk folder):
-
- java -cp bin joshua.zmert.ZMERT -h
-
-So what does a Z-MERT config file look like?
-
-Examine the file ZMERT_example/ZMERT_config_ex2.txt. You will find that it
-specifies the following "main" MERT parameters:
-
- (*) -dir dirPrefix: working directory
- (*) -s sourceFile: source sentences (foreign sentences) of the MERT dataset
- (*) -r refFile: target sentences (reference translations) of the MERT dataset
- (*) -rps refsPerSen: number of reference translations per sentence
- (*) -p paramsFile: file containing parameter names, initial values, and ranges
- (*) -maxIt maxMERTIts: maximum number of MERT iterations
- (*) -ipi initsPerIt: number of intermediate initial points per iteration
- (*) -cmd commandFile: name of file containing commands to run the decoder (likely with "./" as a prefix under unix/linux, and must be executable (e.g. .bat) under Windows)
- (*) -decOut decoderOutFile: name of the output file produced by the decoder
- (*) -dcfg decConfigFile: name of decoder config file
- (*) -N N: size of N-best list (per sentence) generated in each MERT iteration
- (*) -v verbosity: output verbosity level (0-2; higher value => more verbose)
- (*) -seed seed: seed used to initialize the random number generator
-
-(Note that the -s parameter is only used if Z-MERT is running Joshua as an
- internal decoder. If Joshua is run as an external decoder, as is the case in
- this README, then this parameter is ignored.)
-
-To test Z-MERT on the 100-sentence test set of example2, provide this config
-file to Z-MERT as follows (assuming you're in the trunk folder):
-
- java -cp bin joshua.zmert.ZMERT ZMERT_example/ZMERT_config_ex2.txt > ZMERT_example/ZMERT.out
-
-This will run Z-MERT for a couple of iterations on the data from the example2
-folder. (Notice that we have made copies of the source and reference files
-from example2 and renamed them as src.txt and ref.* in the MERT_example folder,
-just to have all the files needed by Z-MERT in one place.) Once the Z-MERT run
-is complete, you should be able to inspect the log file to see what kinds of
-things it did. If everything goes well, the run should take a few minutes, of
-which more than 95% is time spent by Z-MERT waiting on Joshua to finish
-decoding the sentences (once per iteration).
-
-The output file you get should be equivalent to ZMERT.out.verbosity1. If you
-rerun the experiment with the verbosity (-v) argument set to 2 instead of 1,
-the output file you get should be equivalent to ZMERT.out.verbosity2, which has
-more interesting details about what Z-MERT does.
-
-Realistic experiments usually involve Z-MERT operating on a much larger dataset
-and for many more iterations, which means that Z-MERT would need a substantial
-amount of memory. If you have enough memory to run a decoder, then you
-probably have more than enough memory to support Z-MERT's needs, but you must
-ensure that Z-MERT is not taking up any memory *while* the decoder is producing
-translations. To do so, you should run ZMERT as follows:
-
- java -cp bin joshua.zmert.ZMERT -maxMem 500 ZMERT_example/ZMERT_config_ex2.txt > ZMERT_example/ZMERT.out
-
-Notice the additional -maxMem argument. It tells Z-MERT that it should not
-persist to use up memory while the decoder is running (during which time Z-MERT
-would be idle). The 500 tells Z-MERT that it can only use a maximum of 500 MB.
-For more details on this issue, see section (4).
-
-A quick note about Z-MERT's interaction with the decoder. If you examine the
-file decoder_command_ex2, which is provided as the commandFile (-cmd) argument
-in Z-MERT's config file, you'll find it contains the command one would use to
-run the decoder. Z-MERT launches the commandFile as an external process, and
-assumes that it will launch the decoder to produce translations. (Make sure
-decoder_command_ex2 is executable.) After launching this external process,
-Z-MERT waits for it to finish, then uses the resulting output file for
-parameter tuning (in addition to the output files from previous iterations).
-The command file here only has a single command, but your command file could
-have multiple lines. Just make sure the command file itself is executable.
-
-Notice that the Z-MERT arguments configFile and decoderOutFile (-cfg and
--decOut) must match the two Joshua arguments in the commandFile's (-cmd) single
-command. Also, the Z-MERT argument for N must match the value for top_n in
-Joshua's config file, indicated by the Z-MERT argument configFile (-cfg).
-
-
-(2) Z-MERT Iterations:
-----------------------
-
-Z-MERT alternates between producing an N-best candidate list and optimizing
-a weight vector (to yield a better score on that candidate list). In other
-words, each Z-MERT iteration starts with decoding the Z-MERT data set using
-some weight vector. How often should MERT redecode the sentences? You can
-instruct Z-MERT to either redecode once it changes any single weight, or to
-redecode once a local maximum has been reached on the current candidate list.
-That is, Z-MERT can be instructed to either change a single weight per
-iteration, or to "fully" optimize the weight vector in a single iteration.
-This can be specified using the oncePerIt argument (-opi). If it is set to 1,
-each Z-MERT iteration will make a single weight change (the one giving the most
-gain). If it is set to 0, each Z-MERT iteration will perform this process (of
-changing the weight giving the most gain) repeatedly until no weight change can
-improve the score on the current candidate list.
-
-
-(3) Escaping Local Optima:
---------------------------
-
-The error surface may not have a single best optimum. That is, a single Z-MERT
-run (as described so far) might yield a weight vector that is not the best
-globally. The natural way around this would be to run Z-MERT multiple times
-with different randomly chosen weight vectors provided as starting points.
-
-In practice, doing this is quite time-consuming, due to the amount of time
-needed to decode the sentences over and over again. An alternative approach is
-to generate a number of random weight vectors IN EACH MERT ITERATION and
-optimize each of them individually IN ADDITION TO optimizing the weight vector
-that generated the latest N-best list. The initsPerIt argument (-ipi) tells
-Z-MERT how many weight vectors should be used as starting points in each
-iteration, including the one surviving from the previous iteration. For
-instance, Z-MERT as described up until this section can be achieved by setting
--ipi to 1. If you set -ipi to 20 (its default value) then, in addition to
-optimizing the weight vector that generated the latest N-best list, each Z-MERT
-iteration will create 19 random weight vectors and optimize each of them
-individually. Of the 20 intermediate "final" vectors, the one giving the best
-score survives, and is the one used to redecode in the next iteration.
-
-For replicability purposes, all the random numbers in a Z-MERT run are
-generated by a single random number generator, and the seed used to initialize
-that generator can be provided using the -seed argument. This argument can be
-set either to "time" or some numerical value. If a numerical value is
-provided, that value will be used as the seed. If "time" is provided, the seed
-will be the value returned by a Java System.currentTimeMillis() call at the
-start of the Z-MERT run. Either way, Z-MERT will print out the seed as part of
-its output.
-
-
-(4) Z-MERT's Memory Usage:
---------------------------
-
-During a MERT iteration, there are several large data structures needed by the
-optimization process that require a decent amount of memory. However, chances
-are that if you have enough memory to be running a respectable decoder, then
-memory should not be a problem for you, since Z-MERT probably won't need any
-more memory than the decoder would.
-
-The problem is that you may not have enough memory to support *both* the
-decoder and Z-MERT at the same time. It might seem that this should not be an
-issue, since Z-MERT basically sits idle while the decoder is producing
-translations. So why should we be concerned about Z-MERT needing any memory
-while the decoder is running?
-
-The answer is that a Java process does not return the memory allocated to it
-until the process actually terminates, even if no data structures are allocated
-any memory "internally" by that process. In other words, even though no memory
-is needed by the end of a Z-MERT iteration, the Java process will not return
-any memory already allocated to it back to the OS. That is why, when the
-decoder is launched, it would be competing for memory with Z-MERT, since Z-MERT
-is already hogging up quite a bit of memory that it refuses to return.
-
-Indeed, if you rerun this command from section (1):
-
- java -cp bin joshua.zmert.ZMERT ZMERT_example/ZMERT_config_ex2.txt > ZMERT_example/ZMERT.out
-
-and monitor the memory consumption of Z-MERT, you will notice that the memory
-allocated to it never decreases, even when it is idle while the decoder is
-working. This is not a problem with such a small set (100 sentences) and such
-a small number of iterations (2), but could be problematic when scaling up.
-
-But fear not! When you run Z-MERT, you can instruct it to perform each
-iteration as a separate Java process. In other words, the Z-MERT driver would
-launch one external process per MERT iteration, and so the end of that
-iteration is indeed the end of that individual external process. This means
-that the memory required for that iteration will be returned to the OS just
-before the decoder is launched by the Z-MERT driver.
-
-To instruct Z-MERT to function this way, you should use the -maxMem argument:
-
- java -cp bin joshua.zmert.ZMERT -maxMem 500 ZMERT_example/ZMERT_config_ex2.txt > ZMERT_example/ZMERT.out
-
-The -maxMem argument tells Z-MERT to function as explained above, and the value
-tells it the maximum amount of memory (in MB) it is allowed during any single
-iteration. In this example, Z-MERT will see the -maxMem argument and recognize
-that each iteration should be launched as a separate Java process allowed at
-most 500 MB of memory.
-
-If you do run the above command and monitor memory consumption, you will notice
-that this time the only Java process that actually persists across iterations
-is the Z-MERT driver itself, which pretty much requires no memory at all. This
-way, when the decoder is launched, it will not have to compete with any other
-processes for memory.
-
+(1) Running Z-MERT:
+-------------------
+
+((This first section is an expanded version of the Z-MERT section in trunk/README.txt))
+
+Joshua's MERT module, called Z-MERT, consists of a core MERT class, a generic
+EvaluationMetric class, and one class definition for each supported evaluation
+metric. The module is used by launching the driver program (ZMERT.java), which
+expects a config file as its main argument. This config file can be used to
+specify any subset of Z-MERT's 20-some parameters. For a full list of those
+parameters, and their default values, run ZMERT with a single -h argument as
+follows (assuming you're in the trunk folder):
+
+ java -cp bin joshua.zmert.ZMERT -h
+
+So what does a Z-MERT config file look like?
+
+Examine the file ZMERT_example/ZMERT_config_ex2.txt. You will find that it
+specifies the following "main" MERT parameters:
+
+ (*) -dir dirPrefix: working directory
+ (*) -s sourceFile: source sentences (foreign sentences) of the MERT dataset
+ (*) -r refFile: target sentences (reference translations) of the MERT dataset
+ (*) -rps refsPerSen: number of reference translations per sentence
+ (*) -p paramsFile: file containing parameter names, initial values, and ranges
+ (*) -maxIt maxMERTIts: maximum number of MERT iterations
+ (*) -ipi initsPerIt: number of intermediate initial points per iteration
+ (*) -cmd commandFile: name of file containing commands to run the decoder (likely with "./" as a prefix under unix/linux, and must be executable (e.g. .bat) under Windows)
+ (*) -decOut decoderOutFile: name of the output file produced by the decoder
+ (*) -dcfg decConfigFile: name of decoder config file
+ (*) -N N: size of N-best list (per sentence) generated in each MERT iteration
+ (*) -v verbosity: output verbosity level (0-2; higher value => more verbose)
+ (*) -seed seed: seed used to initialize the random number generator
+
+(Note that the -s parameter is only used if Z-MERT is running Joshua as an
+ internal decoder. If Joshua is run as an external decoder, as is the case in
+ this README, then this parameter is ignored.)
+
+To test Z-MERT on the 100-sentence test set of example2, provide this config
+file to Z-MERT as follows (assuming you're in the trunk folder):
+
+ java -cp bin joshua.zmert.ZMERT ZMERT_example/ZMERT_config_ex2.txt > ZMERT_example/ZMERT.out
+
+This will run Z-MERT for a couple of iterations on the data from the example2
+folder. (Notice that we have made copies of the source and reference files
+from example2 and renamed them as src.txt and ref.* in the ZMERT_example folder,
+just to have all the files needed by Z-MERT in one place.) Once the Z-MERT run
+is complete, you should be able to inspect the log file to see what kinds of
+things it did. If everything goes well, the run should take a few minutes, of
+which more than 95% is time spent by Z-MERT waiting on Joshua to finish
+decoding the sentences (once per iteration).
+
+The output file you get should be equivalent to ZMERT.out.verbosity1. If you
+rerun the experiment with the verbosity (-v) argument set to 2 instead of 1,
+the output file you get should be equivalent to ZMERT.out.verbosity2, which has
+more interesting details about what Z-MERT does.
+
+Realistic experiments usually involve Z-MERT operating on a much larger dataset
+and for many more iterations, which means that Z-MERT would need a substantial
+amount of memory. If you have enough memory to run a decoder, then you
+probably have more than enough memory to support Z-MERT's needs, but you must
+ensure that Z-MERT is not taking up any memory *while* the decoder is producing
+translations. To do so, you should run ZMERT as follows:
+
+ java -cp bin joshua.zmert.ZMERT -maxMem 500 ZMERT_example/ZMERT_config_ex2.txt > ZMERT_example/ZMERT.out
+
+Notice the additional -maxMem argument. It tells Z-MERT that it should not
+continue to use up memory while the decoder is running (during which time Z-MERT
+would be idle). The 500 tells Z-MERT that it can only use a maximum of 500 MB.
+For more details on this issue, see section (4).
+
+A quick note about Z-MERT's interaction with the decoder. If you examine the
+file decoder_command_ex2, which is provided as the commandFile (-cmd) argument
+in Z-MERT's config file, you'll find it contains the command one would use to
+run the decoder. Z-MERT launches the commandFile as an external process, and
+assumes that it will launch the decoder to produce translations. (Make sure
+decoder_command_ex2 is executable.) After launching this external process,
+Z-MERT waits for it to finish, then uses the resulting output file for
+parameter tuning (in addition to the output files from previous iterations).
+The command file here only has a single command, but your command file could
+have multiple lines. Just make sure the command file itself is executable.
+
+Notice that the Z-MERT arguments decConfigFile and decoderOutFile (-dcfg and
+-decOut) must match the two Joshua arguments in the commandFile's (-cmd) single
+command. Also, the Z-MERT argument for N must match the value for top_n in
+Joshua's config file, indicated by the Z-MERT argument decConfigFile (-dcfg).
+
+
+(2) Z-MERT Iterations:
+----------------------
+
+Z-MERT alternates between producing an N-best candidate list and optimizing
+a weight vector (to yield a better score on that candidate list). In other
+words, each Z-MERT iteration starts with decoding the Z-MERT data set using
+some weight vector. How often should MERT redecode the sentences? You can
+instruct Z-MERT to either redecode once it changes any single weight, or to
+redecode once a local maximum has been reached on the current candidate list.
+That is, Z-MERT can be instructed to either change a single weight per
+iteration, or to "fully" optimize the weight vector in a single iteration.
+This can be specified using the oncePerIt argument (-opi). If it is set to 1,
+each Z-MERT iteration will make a single weight change (the one giving the most
+gain). If it is set to 0, each Z-MERT iteration will perform this process (of
+changing the weight giving the most gain) repeatedly until no weight change can
+improve the score on the current candidate list.
+
+
+(3) Escaping Local Optima:
+--------------------------
+
+The error surface may not have a single best optimum. That is, a single Z-MERT
+run (as described so far) might yield a weight vector that is not the best
+globally. The natural way around this would be to run Z-MERT multiple times
+with different randomly chosen weight vectors provided as starting points.
+
+In practice, doing this is quite time-consuming, due to the amount of time
+needed to decode the sentences over and over again. An alternative approach is
+to generate a number of random weight vectors IN EACH MERT ITERATION and
+optimize each of them individually IN ADDITION TO optimizing the weight vector
+that generated the latest N-best list. The initsPerIt argument (-ipi) tells
+Z-MERT how many weight vectors should be used as starting points in each
+iteration, including the one surviving from the previous iteration. For
+instance, Z-MERT as described up until this section can be achieved by setting
+-ipi to 1. If you set -ipi to 20 (its default value) then, in addition to
+optimizing the weight vector that generated the latest N-best list, each Z-MERT
+iteration will create 19 random weight vectors and optimize each of them
+individually. Of the 20 intermediate "final" vectors, the one giving the best
+score survives, and is the one used to redecode in the next iteration.
+
+For replicability purposes, all the random numbers in a Z-MERT run are
+generated by a single random number generator, and the seed used to initialize
+that generator can be provided using the -seed argument. This argument can be
+set either to "time" or some numerical value. If a numerical value is
+provided, that value will be used as the seed. If "time" is provided, the seed
+will be the value returned by a Java System.currentTimeMillis() call at the
+start of the Z-MERT run. Either way, Z-MERT will print out the seed as part of
+its output.
+
+
+(4) Z-MERT's Memory Usage:
+--------------------------
+
+During a MERT iteration, there are several large data structures needed by the
+optimization process that require a decent amount of memory. However, chances
+are that if you have enough memory to be running a respectable decoder, then
+memory should not be a problem for you, since Z-MERT probably won't need any
+more memory than the decoder would.
+
+The problem is that you may not have enough memory to support *both* the
+decoder and Z-MERT at the same time. It might seem that this should not be an
+issue, since Z-MERT basically sits idle while the decoder is producing
+translations. So why should we be concerned about Z-MERT needing any memory
+while the decoder is running?
+
+The answer is that a Java process does not return the memory allocated to it
+until the process actually terminates, even if no data structures are allocated
+any memory "internally" by that process. In other words, even though no memory
+is needed by the end of a Z-MERT iteration, the Java process will not return
+any memory already allocated to it back to the OS. That is why, when the
+decoder is launched, it would be competing for memory with Z-MERT, since Z-MERT
+is already hogging up quite a bit of memory that it refuses to return.
+
+Indeed, if you rerun this command from section (1):
+
+ java -cp bin joshua.zmert.ZMERT ZMERT_example/ZMERT_config_ex2.txt > ZMERT_example/ZMERT.out
+
+and monitor the memory consumption of Z-MERT, you will notice that the memory
+allocated to it never decreases, even when it is idle while the decoder is
+working. This is not a problem with such a small set (100 sentences) and such
+a small number of iterations (2), but could be problematic when scaling up.
+
+But fear not! When you run Z-MERT, you can instruct it to perform each
+iteration as a separate Java process. In other words, the Z-MERT driver would
+launch one external process per MERT iteration, and so the end of that
+iteration is indeed the end of that individual external process. This means
+that the memory required for that iteration will be returned to the OS just
+before the decoder is launched by the Z-MERT driver.
+
+To instruct Z-MERT to function this way, you should use the -maxMem argument:
+
+ java -cp bin joshua.zmert.ZMERT -maxMem 500 ZMERT_example/ZMERT_config_ex2.txt > ZMERT_example/ZMERT.out
+
+The -maxMem argument tells Z-MERT to function as explained above, and the value
+tells it the maximum amount of memory (in MB) it is allowed during any single
+iteration. In this example, Z-MERT will see the -maxMem argument and recognize
+that each iteration should be launched as a separate Java process allowed at
+most 500 MB of memory.
+
+If you do run the above command and monitor memory consumption, you will notice
+that this time the only Java process that actually persists across iterations
+is the Z-MERT driver itself, which pretty much requires no memory at all. This
+way, when the decoder is launched, it will not have to compete with any other
+processes for memory.
+
60 examples/ZMERT/ZMERT_config_ex2.txt
View
@@ -1,30 +1,30 @@
-### Commonly used parameters
--dir ZMERT_example # working directory (i.e. location of relevant files)
-#-s src.txt # source sentences file name
--r ref # target sentences file name (in this case, file name prefix)
--rps 4 # references per sentence
--p params.txt # parameter file
--m BLEU 4 closest # evaluation metric and its options
--maxIt 2 # maximum MERT iterations
--ipi 20 # number of intermediate initial points per iteration
--cmd ./decoder_command_ex2 # file containing commands to run decoder
--decOut nbest_ex2.out # file prodcued by decoder
--dcfg config_ex2.txt # decoder config file
--N 300 # size of N-best list generated each iteration
--v 1 # verbosity level (0-2; higher value => more verbose)
--seed 12341234 # random number generator seed
-
-# Notice that comments are allowed
-
-### Other parameters (run "ZMERT -h" for default values)
-#-txtNrm # text normalization method
-#-fin # output file for final values
-#-prevIt # previous MERT iterations from which to consider candidates (in addition to the current iteration)
-#-minIt # minimum MERT iterations before considering an early exit
-#-stopIt # number of consecutive iterations an early exit criterion must be satisfied before actually exiting
-#-stopSig # value over which a weight change is "significant" (for early exit purposes)
-#-save # should MERT save intermediate config files or decoder output files? (or both? or neither?)
-#-opi # should MERT modify at most one parameter per iteration?
-#-rand # should first initial point (of first iteration) be initialized randomly?
-#-decExit # return value by decoder indicating success
-#-decV # should decoder output be printed?
+### Commonly used parameters
+-dir ZMERT_example # working directory (i.e. location of relevant files)
+#-s src.txt # source sentences file name
+-r ref # target sentences file name (in this case, file name prefix)
+-rps 4 # references per sentence
+-p params.txt # parameter file
+-m BLEU 4 closest # evaluation metric and its options
+-maxIt 2 # maximum MERT iterations
+-ipi 20 # number of intermediate initial points per iteration
+-cmd ./decoder_command_ex2 # file containing commands to run decoder
+-decOut nbest_ex2.out # file produced by decoder
+-dcfg config_ex2.txt # decoder config file
+-N 300 # size of N-best list generated each iteration
+-v 1 # verbosity level (0-2; higher value => more verbose)
+-seed 12341234 # random number generator seed
+
+# Notice that comments are allowed
+
+### Other parameters (run "ZMERT -h" for default values)
+#-txtNrm # text normalization method
+#-fin # output file for final values
+#-prevIt # previous MERT iterations from which to consider candidates (in addition to the current iteration)
+#-minIt # minimum MERT iterations before considering an early exit
+#-stopIt # number of consecutive iterations an early exit criterion must be satisfied before actually exiting
+#-stopSig # value over which a weight change is "significant" (for early exit purposes)
+#-save # should MERT save intermediate config files or decoder output files? (or both? or neither?)
+#-opi # should MERT modify at most one parameter per iteration?
+#-rand # should first initial point (of first iteration) be initialized randomly?
+#-decExit # return value by decoder indicating success
+#-decV # should decoder output be printed?
140 examples/ZMERT/config_ex2.txt
View
@@ -1,70 +1,70 @@
-lm_file=example2/example2.4gram.lm.gz
-
-tm_file=example2/example2.hiero.tm.gz
-tm_format=hiero
-
-glue_file=grammars/hiero.glue
-glue_format=hiero
-
-#lm config
-use_srilm=true
-lm_ceiling_cost=100
-use_left_equivalent_state=false
-use_right_equivalent_state=false
-order=4
-
-
-#tm config
-span_limit=10
-phrase_owner=pt
-mono_owner=mono
-begin_mono_owner=begin_mono
-default_non_terminal=X
-goalSymbol=S
-
-#pruning config
-fuzz1=0.1
-fuzz2=0.1
-max_n_items=30
-relative_threshold=10.0
-max_n_rules=50
-rule_relative_threshold=10.0
-
-#nbest config
-use_unique_nbest=true
-use_tree_nbest=false
-add_combined_cost=true
-top_n=300
-
-
-#remoter lm server config,we should first prepare remote_symbol_tbl before starting any jobs
-use_remote_lm_server=false
-remote_symbol_tbl=./voc.remote.sym
-num_remote_lm_servers=4
-f_remote_server_list=./remote.lm.server.list
-remote_lm_server_port=9000
-
-
-#parallel deocoder: it cannot be used together with remote lm
-num_parallel_decoders=1
-parallel_files_prefix=/tmp/
-
-
-###### model weights
-#lm order weight
-lm 1.0
-
-#phrasemodel owner column(0-indexed) weight
-phrasemodel pt 0 1.066893
-phrasemodel pt 1 0.752247
-phrasemodel pt 2 0.589793
-
-#arityphrasepenalty owner start_arity end_arity weight
-#arityphrasepenalty pt 0 0 1.0
-#arityphrasepenalty pt 1 2 -1.0
-
-#phrasemodel mono 0 0.5
-
-#wordpenalty weight
-wordpenalty -2.844814
-
+lm_file=example2/example2.4gram.lm.gz
+
+tm_file=example2/example2.hiero.tm.gz
+tm_format=hiero
+
+glue_file=grammars/hiero.glue
+glue_format=hiero
+
+#lm config
+use_srilm=true
+lm_ceiling_cost=100
+use_left_equivalent_state=false
+use_right_equivalent_state=false
+order=4
+
+
+#tm config
+span_limit=10
+phrase_owner=pt
+mono_owner=mono
+begin_mono_owner=begin_mono
+default_non_terminal=X
+goalSymbol=S
+
+#pruning config
+fuzz1=0.1
+fuzz2=0.1
+max_n_items=30
+relative_threshold=10.0
+max_n_rules=50
+rule_relative_threshold=10.0
+
+#nbest config
+use_unique_nbest=true
+use_tree_nbest=false
+add_combined_cost=true
+top_n=300
+
+
+#remote lm server config; we should first prepare remote_symbol_tbl before starting any jobs
+use_remote_lm_server=false
+remote_symbol_tbl=./voc.remote.sym
+num_remote_lm_servers=4
+f_remote_server_list=./remote.lm.server.list
+remote_lm_server_port=9000
+
+
+#parallel decoder: it cannot be used together with remote lm
+num_parallel_decoders=1
+parallel_files_prefix=/tmp/
+
+
+###### model weights
+#lm order weight
+lm 1.0
+
+#phrasemodel owner column(0-indexed) weight
+phrasemodel pt 0 1.066893
+phrasemodel pt 1 0.752247
+phrasemodel pt 2 0.589793
+
+#arityphrasepenalty owner start_arity end_arity weight
+#arityphrasepenalty pt 0 0 1.0
+#arityphrasepenalty pt 1 2 -1.0
+
+#phrasemodel mono 0 0.5
+
+#wordpenalty weight
+wordpenalty -2.844814
+
12 examples/ZMERT/params.txt
View
@@ -1,6 +1,6 @@
-lm ||| 1.000000 Opt 0.1 +Inf +0.5 +1.5
-phrasemodel pt 0 ||| 1.066893 Opt -Inf +Inf -1 +1
-phrasemodel pt 1 ||| 0.752247 Opt -Inf +Inf -1 +1
-phrasemodel pt 2 ||| 0.589793 Opt -Inf +Inf -1 +1
-wordpenalty ||| -2.844814 Opt -Inf +Inf -5 0
-normalization = absval 1 lm
+lm ||| 1.000000 Opt 0.1 +Inf +0.5 +1.5
+phrasemodel pt 0 ||| 1.066893 Opt -Inf +Inf -1 +1
+phrasemodel pt 1 ||| 0.752247 Opt -Inf +Inf -1 +1
+phrasemodel pt 2 ||| 0.589793 Opt -Inf +Inf -1 +1
+wordpenalty ||| -2.844814 Opt -Inf +Inf -5 0
+normalization = absval 1 lm
148 examples/example/example.config.bloomfilterlm
View
@@ -1,74 +1,74 @@
-lm_file=example/example.bloomfilter.lm.gz
-
-tm_file=example/example.hiero.tm.gz
-tm_format=hiero
-
-glue_file=grammars/hiero.glue
-glue_format=hiero
-
-#lm config
-use_srilm=false
-use_bloomfilter_lm=true
-lm_ceiling_cost=100
-use_left_euqivalent_state=false
-use_right_euqivalent_state=false
-order=3
-
-
-#tm config
-span_limit=10
-phrase_owner=pt
-mono_owner=mono
-begin_mono_owner=begin_mono
-default_non_terminal=X
-goalSymbol=S
-
-#pruning config
-fuzz1=0.1
-fuzz2=0.1
-max_n_items=30
-relative_threshold=10.0
-max_n_rules=50
-rule_relative_threshold=10.0
-
-#nbest config
-use_unique_nbest=true
-use_tree_nbest=false
-add_combined_cost=true
-top_n=300
-
-
-#remoter lm server config,we should first prepare remote_symbol_tbl before starting any jobs
-use_remote_lm_server=false
-remote_symbol_tbl=./voc.remote.sym
-num_remote_lm_servers=4
-f_remote_server_list=./remote.lm.server.list
-remote_lm_server_port=9000
-
-
-#parallel deocoder: it cannot be used together with remote lm
-num_parallel_decoders=1
-parallel_files_prefix=.
-
-#disk hg
-save_disk_hg=false
-
-###### model weights
-#lm order weight
-lm 1.000000
-
-#phrasemodel owner column(0-indexed) weight
-phrasemodel pt 0 1.066893
-phrasemodel pt 1 0.752247
-phrasemodel pt 2 0.589793
-
-#arityphrasepenalty owner start_arity end_arity weight
-#arityphrasepenalty pt 0 0 1.0
-#arityphrasepenalty pt 1 2 -1.0
-
-#phrasemodel mono 0 0.5
-
-#wordpenalty weight
-wordpenalty -2.844814
-#latticecost 1.0
-
+lm_file=example/example.bloomfilter.lm.gz
+
+tm_file=example/example.hiero.tm.gz
+tm_format=hiero
+
+glue_file=grammars/hiero.glue
+glue_format=hiero
+
+#lm config
+use_srilm=false
+use_bloomfilter_lm=true
+lm_ceiling_cost=100
+use_left_euqivalent_state=false
+use_right_euqivalent_state=false
+order=3
+
+
+#tm config
+span_limit=10
+phrase_owner=pt
+mono_owner=mono
+begin_mono_owner=begin_mono
+default_non_terminal=X
+goalSymbol=S
+
+#pruning config
+fuzz1=0.1
+fuzz2=0.1
+max_n_items=30
+relative_threshold=10.0
+max_n_rules=50
+rule_relative_threshold=10.0
+
+#nbest config
+use_unique_nbest=true
+use_tree_nbest=false
+add_combined_cost=true
+top_n=300
+
+
+#remote lm server config; we should first prepare remote_symbol_tbl before starting any jobs
+use_remote_lm_server=false
+remote_symbol_tbl=./voc.remote.sym
+num_remote_lm_servers=4
+f_remote_server_list=./remote.lm.server.list
+remote_lm_server_port=9000
+
+
+#parallel decoder: it cannot be used together with remote lm
+num_parallel_decoders=1
+parallel_files_prefix=.
+
+#disk hg
+save_disk_hg=false
+
+###### model weights
+#lm order weight
+lm 1.000000
+
+#phrasemodel owner column(0-indexed) weight
+phrasemodel pt 0 1.066893
+phrasemodel pt 1 0.752247
+phrasemodel pt 2 0.589793
+
+#arityphrasepenalty owner start_arity end_arity weight
+#arityphrasepenalty pt 0 0 1.0
+#arityphrasepenalty pt 1 2 -1.0
+
+#phrasemodel mono 0 0.5
+
+#wordpenalty weight
+wordpenalty -2.844814
+#latticecost 1.0
+
245 src/joshua/corpus/AbstractPhrase.java
View
@@ -1,143 +1,132 @@
-/* This file is part of the Joshua Machine Translation System.
+/*
+ * This file is part of the Joshua Machine Translation System.
*
- * Joshua is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as
- * published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Joshua is free software; you can redistribute it and/or modify it under the terms of the GNU
+ * Lesser General Public License as published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
+ * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free
- * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
- * MA 02111-1307 USA
+ *
+ * You should have received a copy of the GNU Lesser General Public License along with this library;
+ * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
*/
package joshua.corpus;
/**
- * This class provides a skeletal implementation of the base methods
- * likely to be common to most or all implementations of the
- * <code>Phrase</code> interface.
+ * This class provides a skeletal implementation of the base methods likely to be common to most or
+ * all implementations of the <code>Phrase</code> interface.
*
* @author Lane Schwartz
* @author Chris Callison-Burch
*/
public abstract class AbstractPhrase implements Phrase {
- //===============================================================
- // Constants
- //===============================================================
-
- /** seed used in hash code generation */
- public static final int HASH_SEED = 17;
-
- /** offset used in has code generation */
- public static final int HASH_OFFSET = 37;
-
- /**
- * Splits a sentence (on white space), then looks up the
- * integer representations of each word using the supplied
- * symbol table.
- *
- * @param sentence White-space separated String of words.
- *
- * @return Array of integers corresponding to the words in
- * the sentence.
- */
- protected int[] splitSentence(String sentence) {
- String[] w = sentence.split("\\s+");
- int[] words = new int[w.length];
- for (int i = 0; i < w.length; i++)
- words[i] = Vocabulary.id(w[i]);
- return words;
- }
-
- /**
- * Uses the standard java approach of calculating hashCode.
- * Start with a seed, add in every value multiplying the
- * exsiting hash times an offset.
- *
- * @return int hashCode for the list
- */
- public int hashCode() {
- int result = HASH_SEED;
- for (int i=0; i < size(); i++) {
- result = HASH_OFFSET*result + getWordID(i);
- }
- return result;
- }
-
-
- /**
- * Two phrases are their word IDs are the same. Note that
- * this could give a false positive if their Vocabularies
- * were different but their IDs were somehow the same.
- */
- public boolean equals(Object o) {
-
- if (o instanceof Phrase) {
- Phrase other = (Phrase) o;
-
- if(this.size() != other.size()) return false;
- for (int i=0; i < size(); i++) {
- if(this.getWordID(i) != other.getWordID(i)) return false;
- }
- return true;
- } else {
- return false;
- }
-
- }
-
-
- /**
- * Compares the two strings based on the lexicographic order
- * of words defined in the Vocabulary.
- *
- * @param other the object to compare to
- * @return -1 if this object is less than the parameter, 0
- * if equals, 1 if greater
- * @exception ClassCastException if the passed object is
- * not of type Phrase
- */
- public int compareTo(Phrase other) {
- int length = size();
- int otherLength = other.size();
- for (int i = 0; i < length; i++) {
- if (i < otherLength) {
- int difference = getWordID(i) - other.getWordID(i);
- if (difference != 0) return difference;
- } else {
- //same but other is shorter, so we are after
- return 1;
- }
- }
- if (length < otherLength) {
- return -1;
- } else {
- return 0;
- }
- }
-
- /**
- * Returns a string representation of the phrase.
- *
- * @return a space-delimited string of the words in the
- * phrase.
- */
- public String toString() {
- StringBuffer buf = new StringBuffer();
- for (int i=0; i<size(); i++) {
- String word = Vocabulary.word(getWordID(i));
- if (i != 0) buf.append(' ');
- buf.append(word);
- }
- return buf.toString();
- }
-
+ // ===============================================================
+ // Constants
+ // ===============================================================
+
+ /** seed used in hash code generation */
+ public static final int HASH_SEED = 17;
+
+ /** offset used in hash code generation */
+ public static final int HASH_OFFSET = 37;
+
+ /**
+ * Splits a sentence (on white space), then looks up the integer representations of each word
+ * using the supplied symbol table.
+ *
+ * @param sentence White-space separated String of words.
+ *
+ * @return Array of integers corresponding to the words in the sentence.
+ */
+ protected int[] splitSentence(String sentence) {
+ String[] w = sentence.split("\\s+");
+ int[] words = new int[w.length];
+ for (int i = 0; i < w.length; i++)
+ words[i] = Vocabulary.id(w[i]);
+ return words;
+ }
+
+ /**
+ * Uses the standard java approach of calculating hashCode. Start with a seed, add in every value
+ * multiplying the existing hash times an offset.
+ *
+ * @return int hashCode for the list
+ */
+ public int hashCode() {
+ int result = HASH_SEED;
+ for (int i = 0; i < size(); i++) {
+ result = HASH_OFFSET * result + getWordID(i);
+ }
+ return result;
+ }
+
+
+ /**
+ * Two phrases are equal if their word IDs are the same. Note that this could give a false
+ * positive if their Vocabularies were different but their IDs were somehow the same.
+ */
+ public boolean equals(Object o) {
+
+ if (o instanceof Phrase) {
+ Phrase other = (Phrase) o;
+
+ if (this.size() != other.size()) return false;
+ for (int i = 0; i < size(); i++) {
+ if (this.getWordID(i) != other.getWordID(i)) return false;
+ }
+ return true;
+ } else {
+ return false;
+ }
+
+ }
+
+
+ /**
+ * Compares the two strings based on the lexicographic order of words defined in the Vocabulary.
+ *
+ * @param other the object to compare to
+ * @return negative if this phrase is less than the parameter, 0 if equal, positive if greater
+ * @exception ClassCastException if the passed object is not of type Phrase
+ */
+ public int compareTo(Phrase other) {
+ int length = size();
+ int otherLength = other.size();
+ for (int i = 0; i < length; i++) {
+ if (i < otherLength) {
+ int difference = getWordID(i) - other.getWordID(i);
+ if (difference != 0) return difference;
+ } else {
+ // same but other is shorter, so we are after
+ return 1;
+ }
+ }
+ if (length < otherLength) {
+ return -1;
+ } else {
+ return 0;
+ }
+ }
+
+ /**
+ * Returns a string representation of the phrase.
+ *
+ * @return a space-delimited string of the words in the phrase.
+ */
+ public String toString() {
+ StringBuffer buf = new StringBuffer();
+ for (int i = 0; i < size(); i++) {
+ String word = Vocabulary.word(getWordID(i));
+ if (i != 0) buf.append(' ');
+ buf.append(word);
+ }
+ return buf.toString();
+ }
+
}
204 src/joshua/corpus/BasicPhrase.java
View
@@ -1,103 +1,101 @@
-/* This file is part of the Joshua Machine Translation System.
- *
- * Joshua is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as
- * published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free
- * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
- * MA 02111-1307 USA
- */
-/*
- * This file is based on the edu.umd.clip.mt.Phrase class from the
- * University of Maryland's umd-hadoop-mt-0.01 project. That project
- * is released under the terms of the Apache License 2.0, but with
- * special permission for the Joshua Machine Translation System to
- * release modifications under the LGPL version 2.1. LGPL version
- * 3 requires no special permission since it is compatible with
- * Apache License 2.0
- */
-package joshua.corpus;
-
-import java.util.ArrayList;
-
-/**
- * The simplest concrete implementation of Phrase.
- *
- * @author wren ng thornton <wren@users.sourceforge.net>
- * @version $LastChangedDate$
- */
-public class BasicPhrase extends AbstractPhrase {
- private byte language;
- private int[] words;
-
-
- public BasicPhrase(byte language, String sentence) {
- this.language = language;
- this.words = splitSentence(sentence);
- }
-
- private BasicPhrase() {}
-
- public int[] getWordIDs() {
- return words;
- }
-
- /* See Javadoc for Phrase interface. */
- public BasicPhrase subPhrase(int start, int end) {
- BasicPhrase that = new BasicPhrase();
- that.language = this.language;
- that.words = new int[end-start+1];
- System.arraycopy(this.words, start, that.words, 0, end-start+1);
- return that;
- }
-
- /* See Javadoc for Phrase interface. */
- public ArrayList<Phrase> getSubPhrases() {
- return this.getSubPhrases(this.size());
- }
-
- /* See Javadoc for Phrase interface. */
- public ArrayList<Phrase> getSubPhrases(int maxLength) {
- ArrayList<Phrase> phrases = new ArrayList<Phrase>();
- int len = this.size();
- for (int n = 1; n <= maxLength; n++)
- for (int i = 0; i <= len-n; i++)
- phrases.add(this.subPhrase(i, i + n - 1));
- return phrases;
- }
-
- /* See Javadoc for Phrase interface. */
- public int size() { return (words == null ? 0 : words.length); }
-
- /* See Javadoc for Phrase interface. */
- public int getWordID(int position) { return words[position]; }
-
- /**
- * Returns a human-readable String representation of the
- * phrase.
- * <p>
- * The implementation of this method is slightly more
- * efficient than that inherited from <code>AbstractPhrase</code>.
- *
- * @return a human-readable String representation of the
- * phrase.
- */
- public String toString() {
- StringBuffer sb = new StringBuffer();
- if (words != null) {
- for (int i = 0; i < words.length; ++i) {
- if (i != 0) sb.append(' ');
- sb.append(Vocabulary.word(words[i]));
- }
- }
- return sb.toString();
- }
-}
+/*
+ * This file is part of the Joshua Machine Translation System.
+ *
+ * Joshua is free software; you can redistribute it and/or modify it under the terms of the GNU
+ * Lesser General Public License as published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
+ * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License along with this library;
+ * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+/*
+ * This file is based on the edu.umd.clip.mt.Phrase class from the University of Maryland's
+ * umd-hadoop-mt-0.01 project. That project is released under the terms of the Apache License 2.0,
+ * but with special permission for the Joshua Machine Translation System to release modifications
+ * under the LGPL version 2.1. LGPL version 3 requires no special permission since it is compatible
+ * with Apache License 2.0
+ */
+package joshua.corpus;
+
+import java.util.ArrayList;
+
+/**
+ * The simplest concrete implementation of Phrase.
+ *
+ * @author wren ng thornton <wren@users.sourceforge.net>
+ * @version $LastChangedDate$
+ */
+public class BasicPhrase extends AbstractPhrase {
+ private byte language;
+ private int[] words;
+
+
+ public BasicPhrase(byte language, String sentence) {
+ this.language = language;
+ this.words = splitSentence(sentence);
+ }
+
+ private BasicPhrase() {}
+
+ public int[] getWordIDs() {
+ return words;
+ }
+
+ /* See Javadoc for Phrase interface. */
+ public BasicPhrase subPhrase(int start, int end) {
+ BasicPhrase that = new BasicPhrase();
+ that.language = this.language;
+ that.words = new int[end - start + 1];
+ System.arraycopy(this.words, start, that.words, 0, end - start + 1);
+ return that;
+ }
+
+ /* See Javadoc for Phrase interface. */
+ public ArrayList<Phrase> getSubPhrases() {
+ return this.getSubPhrases(this.size());
+ }
+
+ /* See Javadoc for Phrase interface. */
+ public ArrayList<Phrase> getSubPhrases(int maxLength) {
+ ArrayList<Phrase> phrases = new ArrayList<Phrase>();
+ int len = this.size();
+ for (int n = 1; n <= maxLength; n++)
+ for (int i = 0; i <= len - n; i++)
+ phrases.add(this.subPhrase(i, i + n - 1));
+ return phrases;
+ }
+
+ /* See Javadoc for Phrase interface. */
+ public int size() {
+ return (words == null ? 0 : words.length);
+ }
+
+ /* See Javadoc for Phrase interface. */
+ public int getWordID(int position) {
+ return words[position];
+ }
+
+ /**
+ * Returns a human-readable String representation of the phrase.
+ * <p>
+ * The implementation of this method is slightly more efficient than that inherited from
+ * <code>AbstractPhrase</code>.
+ *
+ * @return a human-readable String representation of the phrase.
+ */
+ public String toString() {
+ StringBuffer sb = new StringBuffer();
+ if (words != null) {
+ for (int i = 0; i < words.length; ++i) {
+ if (i != 0) sb.append(' ');
+ sb.append(Vocabulary.word(words[i]));
+ }
+ }
+ return sb.toString();
+ }
+}
344 src/joshua/corpus/ContiguousPhrase.java
View
@@ -1,19 +1,17 @@
-/* This file is part of the Joshua Machine Translation System.
+/*
+ * This file is part of the Joshua Machine Translation System.
*
- * Joshua is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as
- * published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Joshua is free software; you can redistribute it and/or modify it under the terms of the GNU
+ * Lesser General Public License as published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
+ * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free
- * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
- * MA 02111-1307 USA
+ *
+ * You should have received a copy of the GNU Lesser General Public License along with this library;
+ * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
*/
package joshua.corpus;
@@ -23,172 +21,164 @@
/**
- * ContiguousPhrase implements the Phrase interface by linking into
- * indices within a corpus. This is intended to be a very low-memory
- * implementation of the class.
- *
+ * ContiguousPhrase implements the Phrase interface by linking into indices within a corpus. This is
+ * intended to be a very low-memory implementation of the class.
+ *
* @author Chris Callison-Burch
- * @since 29 May 2008
+ * @since 29 May 2008
* @version $LastChangedDate:2008-09-18 12:47:23 -0500 (Thu, 18 Sep 2008) $
*/
public class ContiguousPhrase extends AbstractPhrase {
-//===============================================================
-// Constants
-//===============================================================
-
-//===============================================================
-// Member variables
-//===============================================================
-
- protected int startIndex;
- protected int endIndex;
- protected Corpus corpusArray;
-
-//===============================================================
-// Constructor(s)
-//===============================================================
-
- public ContiguousPhrase(int startIndex, int endIndex, Corpus corpusArray) {
- this.startIndex = startIndex;
- this.endIndex = endIndex;
- this.corpusArray = corpusArray;
- }
-
-
-//===============================================================
-// Public
-//===============================================================
-
- //===========================================================
- // Accessor methods (set/get)
- //===========================================================
-
- /**
- * This method copies the phrase into an array of ints.
- * This method should be avoided if possible.
- *
- * @return an int[] corresponding to the ID of each word
- * in the phrase
- */
- public int[] getWordIDs() {
- int[] words = new int[endIndex-startIndex];
- for (int i = startIndex; i < endIndex; i++) {
- words[i-startIndex] = corpusArray.getWordID(i); //corpusArray.corpus[i];
- }
- return words;
- }
-
-
- public int getWordID(int position) {
- return corpusArray.getWordID(startIndex+position);
-// return corpusArray.corpus[startIndex+position];
- }
-
-
- public int size() {
- return endIndex-startIndex;
- }
-
-
- //===========================================================
- // Methods
- //===========================================================
-
-
- /**
- * Gets all possible subphrases of this phrase, up to and
- * including the phrase itself. For example, the phrase "I
- * like cheese ." would return the following:
- * <ul>
- * <li>I
- * <li>like
- * <li>cheese
- * <li>.
- * <li>I like
- * <li>like cheese
- * <li>cheese .
- * <li>I like cheese
- * <li>like cheese .
- * <li>I like cheese .
- * </ul>
- *
- * @return ArrayList of all possible subphrases.
- */
- public List<Phrase> getSubPhrases() {
- return getSubPhrases(size());
- }
-
-
- /**
- * Returns a list of subphrases only of length
- * <code>maxLength</code> or smaller.
- *
- * @param maxLength the maximum length phrase to return.
- * @return ArrayList of all possible subphrases of length
- * maxLength or less
- * @see #getSubPhrases()
- */
- public List<Phrase> getSubPhrases(int maxLength) {
- if (maxLength > size()) return getSubPhrases(size());
- List<Phrase> phrases=new ArrayList<Phrase>();
- for (int i = 0; i < size(); i++) {
- for (int j=i+1; (j <= size()) && (j-i <= maxLength); j++) {
- Phrase subPhrase = subPhrase(i,j);
- phrases.add(subPhrase);
- }
- }
- return phrases;
- }
-
-
- /**
- * creates a new phrase object from the indexes provided.
- * <P>
- * NOTE: subList merely creates a "view" of the existing
- * Phrase object. Memory taken up by other Words in the
- * Phrase is not freed since the underlying subList object
- * still points to the complete Phrase List.
- *
- * @see ArrayList#subList(int, int)
- */
- public Phrase subPhrase(int start, int end) {
- return new ContiguousPhrase(startIndex+start, startIndex+end, corpusArray);
- }
-
-
-//===============================================================
-// Protected
-//===============================================================
-
- //===============================================================
- // Methods
- //===============================================================
-
-
-//===============================================================
-// Private
-//===============================================================
-
- //===============================================================
- // Methods
- //===============================================================
-
-
-//===============================================================
-// Static
-//===============================================================
-
-
-//===============================================================
-// Main
-//===============================================================
-
- /**
- * Main contains test code
- */
- public static void main(String[] args) {
-
- }
-}
+ // ===============================================================
+ // Constants
+ // ===============================================================
+
+ // ===============================================================
+ // Member variables
+ // ===============================================================
+
+ protected int startIndex;
+ protected int endIndex;
+ protected Corpus corpusArray;
+
+ // ===============================================================
+ // Constructor(s)
+ // ===============================================================
+
+ public ContiguousPhrase(int startIndex, int endIndex, Corpus corpusArray) {
+ this.startIndex = startIndex;
+ this.endIndex = endIndex;
+ this.corpusArray = corpusArray;
+ }
+
+
+ // ===============================================================
+ // Public
+ // ===============================================================
+
+ // ===========================================================
+ // Accessor methods (set/get)
+ // ===========================================================
+
+ /**
+ * This method copies the phrase into an array of ints. This method should be avoided if possible.
+ *
+ * @return an int[] corresponding to the ID of each word in the phrase
+ */
+ public int[] getWordIDs() {
+ int[] words = new int[endIndex - startIndex];
+ for (int i = startIndex; i < endIndex; i++) {
+ words[i - startIndex] = corpusArray.getWordID(i); // corpusArray.corpus[i];
+ }
+ return words;
+ }
+
+
+ public int getWordID(int position) {
+ return corpusArray.getWordID(startIndex + position);
+ // return corpusArray.corpus[startIndex+position];
+ }
+
+
+ public int size() {
+ return endIndex - startIndex;
+ }
+
+
+ // ===========================================================
+ // Methods
+ // ===========================================================
+
+ /**
+ * Gets all possible subphrases of this phrase, up to and including the phrase itself. For
+ * example, the phrase "I like cheese ." would return the following:
+ * <ul>
+ * <li>I
+ * <li>like
+ * <li>cheese
+ * <li>.
+ * <li>I like
+ * <li>like cheese
+ * <li>cheese .
+ * <li>I like cheese
+ * <li>like cheese .
+ * <li>I like cheese .
+ * </ul>
+ *
+ * @return ArrayList of all possible subphrases.
+ */
+ public List<Phrase> getSubPhrases() {
+ return getSubPhrases(size());
+ }
+
+
+ /**
+ * Returns a list of subphrases only of length <code>maxLength</code> or smaller.
+ *
+ * @param maxLength the maximum length phrase to return.
+ * @return ArrayList of all possible subphrases of length maxLength or less
+ * @see #getSubPhrases()
+ */
+ public List<Phrase> getSubPhrases(int maxLength) {
+ if (maxLength > size()) return getSubPhrases(size());
+ List<Phrase> phrases = new ArrayList<Phrase>();
+ for (int i = 0; i < size(); i++) {
+ for (int j = i + 1; (j <= size()) && (j - i <= maxLength); j++) {
+ Phrase subPhrase = subPhrase(i, j);
+ phrases.add(subPhrase);
+ }
+ }
+ return phrases;
+ }
+
+
+ /**
+ * creates a new phrase object from the indexes provided.
+ * <P>
+ * NOTE: subList merely creates a "view" of the existing Phrase object. Memory taken up by other
+ * Words in the Phrase is not freed since the underlying subList object still points to the
+ * complete Phrase List.
+ *
+ * @see ArrayList#subList(int, int)
+ */
+ public Phrase subPhrase(int start, int end) {
+ return new ContiguousPhrase(startIndex + start, startIndex + end, corpusArray);
+ }
+
+
+ // ===============================================================
+ // Protected
+ // ===============================================================
+
+ // ===============================================================
+ // Methods
+ // ===============================================================
+
+
+ // ===============================================================
+ // Private
+ // ===============================================================
+
+ // ===============================================================
+ // Methods
+ // ===============================================================
+
+
+ // ===============================================================
+ // Static
+ // ===============================================================
+
+
+ // ===============================================================
+ // Main
+ // ===============================================================
+
+ /**
+ * Main contains test code
+ */
+ public static void main(String[] args) {
+
+ }
+}
313 src/joshua/corpus/Corpus.java
View
@@ -1,183 +1,156 @@
-/* This file is part of the Joshua Machine Translation System.
+/*
+ * This file is part of the Joshua Machine Translation System.
*
- * Joshua is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as
- * published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Joshua is free software; you can redistribute it and/or modify it under the terms of the GNU
+ * Lesser General Public License as published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
+ * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free
- * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
- * MA 02111-1307 USA
+ *
+ * You should have received a copy of the GNU Lesser General Public License along with this library;
+ * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
*/
package joshua.corpus;
/**
- * Corpus is an interface that contains methods for accessing the
- * information within a monolingual corpus.
- *
+ * Corpus is an interface that contains methods for accessing the information within a monolingual
+ * corpus.
+ *
* @author Chris Callison-Burch
- * @since 7 February 2005
+ * @since 7 February 2005
* @version $LastChangedDate:2008-07-30 17:15:52 -0400 (Wed, 30 Jul 2008) $
*/
-public interface Corpus { //extends Externalizable {
-
-//===============================================================
-// Attribute definitions
-//===============================================================
-
- /**
- * @return the integer representation of the Word at the
- * specified position in the corpus.
- */
- int getWordID(int position);
-
-
- /**
- * Gets the sentence index associated with the specified
- * position in the corpus.
- *
- * @param position Index into the corpus
- * @return the sentence index associated with the specified
- * position in the corpus.
- */
- int getSentenceIndex(int position);
-
-
- /**
- * Gets the sentence index of each specified position.
- *
- * @param position Index into the corpus
- * @return array of the sentence indices associated
- * with the specified positions in the corpus.
- */
- int[] getSentenceIndices(int[] positions);
-
- /**
- * Gets the position in the corpus of the first word of
- * the specified sentence. If the sentenceID is
- * outside of the bounds of the sentences, then it
- * returns the last position in the corpus + 1.
- *
- * @return the position in the corpus of the first word of
- * the specified sentence. If the sentenceID is
- * outside of the bounds of the sentences, then it
- * returns the last position in the corpus + 1.
- */
- int getSentencePosition(int sentenceID);
-
- /**
- * Gets the exclusive end position of a sentence in the
- * corpus.
- *
- * @return the position in the corpus one past the last
- * word of the specified sentence. If the sentenceID
- * is outside of the bounds of the sentences, then
- * it returns one past the last position in the
- * corpus.
- */
- int getSentenceEndPosition(int sentenceID);
-
- /**
- * Gets the specified sentence as a phrase.
- *
- * @param sentenceIndex Zero-based sentence index
- * @return the sentence, or null if the specified sentence
- * number doesn't exist
- */
- Phrase getSentence(int sentenceIndex);
-
-
- /**
- * Gets the number of words in the corpus.
- *
- * @return the number of words in the corpus.
- */
- int size();
-
-
- /**
- * Gets the number of sentences in the corpus.
- *
- * @return the number of sentences in the corpus.
- */
- int getNumSentences();
-
-
- //===========================================================
- // Methods
- //===========================================================
-
-
- /**
- * Compares the phrase that starts at position start with
- * the subphrase indicated by the start and end points of
- * the phrase.
- *
- * @param corpusStart the point in the corpus where the
- * comparison begins
- * @param phrase the superphrase that the comparsion
- * phrase is drawn from
- * @param phraseStart the point in the phrase where the
- * comparison begins (inclusive)
- * @param phraseEnd the point in the phrase where the
- * comparison ends (exclusive)
- * @return an int that follows the conventions of
- * java.util.Comparator.compareTo()
- */
- int comparePhrase(int corpusStart, Phrase phrase, int phraseStart, int phraseEnd);
-
-
- /**
- * Compares the phrase that starts at position start with
- * the phrase passed in. Compares the entire phrase.
- *
- * @param corpusStart
- * @param phrase
- * @return
- */
- int comparePhrase(int corpusStart, Phrase phrase);
-
- /**
- * Compares the suffixes starting a positions index1 and
- * index2.
- *
- * @param position1 the position in the corpus where the
- * first suffix begins
- * @param position2 the position in the corpus where the
- * second suffix begins
- * @param maxComparisonLength a cutoff point to stop the
- * comparison
- * @return an int that follows the conventions of
- * java.util.Comparator.compareTo()
- */
- int compareSuffixes(int position1, int position2, int maxComparisonLength);
-
- /**
- *
- * @param startPosition
- * @param endPosition
- * @return
- */
- ContiguousPhrase getPhrase(int startPosition, int endPosition);
-
- /**
- * Gets an object capable of iterating
- * over all positions in the corpus, in order.
- *
- * @return An object capable of iterating
- * over all positions in the corpus, in order.
- */
- Iterable<Integer> corpusPositions();
-
-// void write(String corpusFilename, String vocabFilename, String charset) throws IOException;
-}
+public interface Corpus { // extends Externalizable {
+
+ // ===============================================================
+ // Attribute definitions
+ // ===============================================================
+
+ /**
+ * @return the integer representation of the Word at the specified position in the corpus.
+ */
+ int getWordID(int position);
+
+
+ /**
+ * Gets the sentence index associated with the specified position in the corpus.
+ *
+ * @param position Index into the corpus
+ * @return the sentence index associated with the specified position in the corpus.
+ */
+ int getSentenceIndex(int position);
+
+
+ /**
+ * Gets the sentence index of each specified position.
+ *
+ * @param positions Indices into the corpus
+ * @return array of the sentence indices associated with the specified positions in the corpus.
+ */
+ int[] getSentenceIndices(int[] positions);
+
+ /**
+ * Gets the position in the corpus of the first word of the specified sentence. If the sentenceID
+ * is outside of the bounds of the sentences, then it returns the last position in the corpus + 1.
+ *
+ * @return the position in the corpus of the first word of the specified sentence. If the
+ * sentenceID is outside of the bounds of the sentences, then it returns the last position
+ * in the corpus + 1.
+ */
+ int getSentencePosition(int sentenceID);
+
+ /**
+ * Gets the exclusive end position of a sentence in the corpus.
+ *
+ * @return the position in the corpus one past the last word of the specified sentence. If the
+ * sentenceID is outside of the bounds of the sentences, then it returns one past the last
+ * position in the corpus.
+ */
+ int getSentenceEndPosition(int sentenceID);
+
+ /**
+ * Gets the specified sentence as a phrase.
+ *
+ * @param sentenceIndex Zero-based sentence index
+ * @return the sentence, or null if the specified sentence number doesn't exist
+ */
+ Phrase getSentence(int sentenceIndex);
+
+ /**
+ * Gets the number of words in the corpus.
+ *
+ * @return the number of words in the corpus.
+ */
+ int size();
+
+
+ /**
+ * Gets the number of sentences in the corpus.
+ *
+ * @return the number of sentences in the corpus.
+ */
+ int getNumSentences();
+
+
+ // ===========================================================
+ // Methods
+ // ===========================================================
+
+
+ /**
+ * Compares the phrase that starts at position start with the subphrase indicated by the start and
+ * end points of the phrase.
+ *
+ * @param corpusStart the point in the corpus where the comparison begins
+ * @param phrase the superphrase that the comparison phrase is drawn from
+ * @param phraseStart the point in the phrase where the comparison begins (inclusive)
+ * @param phraseEnd the point in the phrase where the comparison ends (exclusive)
+ * @return an int that follows the conventions of java.util.Comparator.compare()
+ */
+ int comparePhrase(int corpusStart, Phrase phrase, int phraseStart, int phraseEnd);
+
+
+ /**
+ * Compares the phrase that starts at position start with the phrase passed in. Compares the
+ * entire phrase.
+ *
+ * @param corpusStart
+ * @param phrase
+ * @return
+ */
+ int comparePhrase(int corpusStart, Phrase phrase);
+
+ /**
+ * Compares the suffixes starting at positions position1 and position2.
+ *
+ * @param position1 the position in the corpus where the first suffix begins
+ * @param position2 the position in the corpus where the second suffix begins
+ * @param maxComparisonLength a cutoff point to stop the comparison
+ * @return an int that follows the conventions of java.util.Comparator.compare()
+ */
+ int compareSuffixes(int position1, int position2, int maxComparisonLength);
+
+ /**
+ *
+ * @param startPosition
+ * @param endPosition
+ * @return
+ */
+ ContiguousPhrase getPhrase(int startPosition, int endPosition);
+
+ /**
+ * Gets an object capable of iterating over all positions in the corpus, in order.
+ *
+ * @return An object capable of iterating over all positions in the corpus, in order.
+ */
+ Iterable<Integer> corpusPositions();
+
+ // void write(String corpusFilename, String vocabFilename, String charset) throws IOException;
+}
203 src/joshua/corpus/Phrase.java
View
@@ -1,19 +1,17 @@
-/* This file is part of the Joshua Machine Translation System.
+/*
+ * This file is part of the Joshua Machine Translation System.
*
- * Joshua is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as
- * published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Joshua is free software; you can redistribute it and/or modify it under the terms of the GNU
+ * Lesser General Public License as published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
+ * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free
- * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
- * MA 02111-1307 USA
+ *
+ * You should have received a copy of the GNU Lesser General Public License along with this library;
+ * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
*/
package joshua.corpus;
@@ -23,102 +21,91 @@
/**
* Representation of a sequence of tokens.
- *
+ *
* @version $LastChangedDate:2008-09-18 10:31:54 -0500 (Thu, 18 Sep 2008) $
*/
public interface Phrase extends Comparable<Phrase> {
- /**
- * This method gets the integer IDs of the phrase as an
- * array of ints.
- *
- * @return an int[] corresponding to the ID of each word
- * in the phrase
- */
- public int[] getWordIDs();
-
- /**
- * Returns the integer word id of the word at the specified
- * position.
- *
- * @param position Index of a word in this phrase.
- * @return the integer word id of the word at the specified
- * position.
- */
- int getWordID(int position);
-
-
- /**
- * Returns the number of words in this phrase.
- *
- * @return the number of words in this phrase.
- */
- int size();
-
-
-
- /**
- * Gets all possible subphrases of this phrase, up to and
- * including the phrase itself. For example, the phrase "I
- * like cheese ." would return the following:
- * <ul>
- * <li>I
- * <li>like
- * <li>cheese
- * <li>.
- * <li>I like
- * <li>like cheese
- * <li>cheese .
- * <li>I like cheese
- * <li>like cheese .
- * <li>I like cheese .
- * </ul>
- * @return List of all possible subphrases.
- */
- List<Phrase> getSubPhrases();
-
-
- /**
- * Returns a list of subphrases only of length
- * <code>maxLength</code> or smaller.
- *
- * @param maxLength the maximum length phrase to return.
- * @return List of all possible subphrases of length maxLength
- * or less
- * @see #getSubPhrases()
- */
- List<Phrase> getSubPhrases(int maxLength);
-
-
- /**
- * creates a new phrase object from the indexes provided.
- * <P>
- * NOTE: subList merely creates a "view" of the existing
- * Phrase object. Memory taken up by other Words in the
- * Phrase is not freed since the underlying subList object
- * still points to the complete Phrase List.
- *
- * @see ArrayList#subList(int, int)
- */
- Phrase subPhrase(int start, int end);
-
-
- /**
- * Compares the two strings based on the lexicographic order
- * of words defined in the Vocabulary.
- *
- * @param other the object to compare to
- * @return -1 if this object is less than the parameter, 0
- * if equals, 1 if greater
- */
- int compareTo(Phrase other);
-
- /**
- * Returns a human-readable String representation of the
- * phrase.
- *
- * @return a human-readable String representation of the
- * phrase.
- */
- String toString();
+ /**
+ * This method gets the integer IDs of the phrase as an array of ints.
+ *
+ * @return an int[] corresponding to the ID of each word in the phrase
+ */
+ public int[] getWordIDs();
+
+ /**
+ * Returns the integer word id of the word at the specified position.
+ *
+ * @param position Index of a word in this phrase.
+ * @return the integer word id of the word at the specified position.
+ */
+ int getWordID(int position);
+
+
+ /**
+ * Returns the number of words in this phrase.
+ *
+ * @return the number of words in this phrase.
+ */
+ int size();
+
+
+
+ /**
+ * Gets all possible subphrases of this phrase, up to and including the phrase itself. For
+ * example, the phrase "I like cheese ." would return the following:
+ * <ul>
+ * <li>I
+ * <li>like
+ * <li>cheese
+ * <li>.
+ * <li>I like
+ * <li>like cheese
+ * <li>cheese .
+ * <li>I like cheese
+ * <li>like cheese .
+ * <li>I like cheese .
+ * </ul>
+ *
+ * @return List of all possible subphrases.
+ */
+ List<Phrase> getSubPhrases();
+
+
+ /**
+ * Returns a list of subphrases only of length <code>maxLength</code> or smaller.
+ *
+ * @param maxLength the maximum length phrase to return.
+ * @return List of all possible subphrases of length maxLength or less
+ * @see #getSubPhrases()
+ */
+ List<Phrase> getSubPhrases(int maxLength);
+
+
+ /**
+ * Creates a new phrase object from the indexes provided.
+ * <P>
+ * NOTE: subList merely creates a "view" of the existing Phrase object. Memory taken up by other
+ * Words in the Phrase is not freed since the underlying subList object still points to the
+ * complete Phrase List.
+ *
+ * @see ArrayList#subList(int, int)
+ */
+ Phrase subPhrase(int start, int end);
+
+
+ /**
+ * Compares the two phrases based on the lexicographic order of words defined in the Vocabulary.
+ *
+ * @param other the object to compare to
+ * @return -1 if this object is less than the parameter, 0 if equal, 1 if greater
+ */
+ int compareTo(Phrase other);
+
+ /**
+ * Returns a human-readable String representation of the phrase.
+ *
+ * @return a human-readable String representation of the phrase.
+ */
+ String toString();
}
321 src/joshua/corpus/Span.java
View
@@ -1,19 +1,17 @@
-/* This file is part of the Joshua Machine Translation System.
+/*
+ * This file is part of the Joshua Machine Translation System.
*
- * Joshua is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as
- * published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Joshua is free software; you can redistribute it and/or modify it under the terms of the GNU
+ * Lesser General Public License as published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
+ * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free
- * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
- * MA 02111-1307 USA
+ *
+ * You should have received a copy of the GNU Lesser General Public License along with this library;
+ * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
*/
package joshua.corpus;
@@ -22,160 +20,153 @@
import java.util.List;
/**
- * Represents a span with an inclusive starting index and an exclusive
- * ending index.
- *
+ * Represents a span with an inclusive starting index and an exclusive ending index.
+ *
* @author Lane Schwartz
* @version $LastChangedDate:2008-09-18 12:47:23 -0500 (Thu, 18 Sep 2008) $
*/
public class Span implements Iterable<Integer>, Comparable<Span> {
- /** Inclusive starting index of this span. */
- public int start;
-
- /** Exclusive ending index of this span. */
- public int end;
-
-
- /**
- * Constructs a new span with the given inclusive starting
- * and exclusive ending indices.
- *
- * @param start Inclusive starting index of this span.
- * @param end Exclusive ending index of this span.
- */
- public Span(int start, int end) {
- this.start = start;
- this.end = end;
- }
-
-
- /**
- * Returns the length of the span.
- *
- * @return the length of the span; this is equivalent to
- * <code>span.end - span.start</code>.
- */
- public int size() {
- return end-start;
- }
-
- /**
- * Returns all subspans of the given Span.
- *
- * @return a list of all subspans.
- */
- public List<Span> getSubSpans()
- {
- return getSubSpans(size());
- }
-
- /**
- * Returns all subspans of the given Span, up to a specified Span size.
- *
- * @param max the maximum Span size to return
- * @return a list all subspans up to the given size
- */
- public List<Span> getSubSpans(int max)
- {
- int spanSize = size();
- ArrayList<Span> result = new ArrayList<Span>(max * spanSize);
- for (int len = max; len > 0; len--) {
- for (int i = start; i < end - len + 1; i++) {
- result.add(new Span(i, i + len));
- }
- }
- return result;
- }
-
- public boolean strictlyContainedIn(Span o)
- {
- return (start >= o.start) && (end <= o.end) && !(start == o.start && end == o.end);
- }
-
- public boolean disjointFrom(Span o)
- {