
Merge remote branch 'joshua-main/devel' into devel

Conflicts:
	.gitignore
2 parents 645fd76 + d640337; commit cf78d1c0816e34ec979be47208fffc7683963678; @lukeorland committed Jun 15, 2012
Showing with 1,035 additions and 16,790 deletions.
  1. +2 −0 .gitignore
  2. +5 −0 examples/README
  3. +1 −0 examples/grammars/README
  4. +7 −21 examples/lattice/config.test
  5. +1 −1 examples/lattice/test.plf
  6. BIN examples/packed-grammar/hiero_tm_quantized/chunk_00000.data
  7. BIN examples/packed-grammar/hiero_tm_quantized/chunk_00000.source
  8. BIN examples/packed-grammar/hiero_tm_quantized/chunk_00000.target
  9. BIN examples/packed-grammar/hiero_tm_quantized/chunk_00001.data
  10. BIN examples/packed-grammar/hiero_tm_quantized/chunk_00001.source
  11. BIN examples/packed-grammar/hiero_tm_quantized/chunk_00001.target
  12. BIN examples/packed-grammar/hiero_tm_quantized/chunk_00002.data
  13. BIN examples/packed-grammar/hiero_tm_quantized/chunk_00002.source
  14. BIN examples/packed-grammar/hiero_tm_quantized/chunk_00002.target
  15. BIN examples/packed-grammar/hiero_tm_quantized/chunk_00003.data
  16. BIN examples/packed-grammar/hiero_tm_quantized/chunk_00003.source
  17. BIN examples/packed-grammar/hiero_tm_quantized/chunk_00003.target
  18. BIN examples/packed-grammar/hiero_tm_quantized/slice_00000.features
  19. BIN examples/packed-grammar/hiero_tm_quantized/slice_00000.source
  20. BIN examples/packed-grammar/hiero_tm_quantized/slice_00000.target
  21. BIN examples/packed-grammar/hiero_tm_quantized/slice_00000.target.lookup
  22. BIN examples/packed-grammar/hiero_tm_uncompressed/chunk_00000.data
  23. BIN examples/packed-grammar/hiero_tm_uncompressed/chunk_00000.source
  24. BIN examples/packed-grammar/hiero_tm_uncompressed/chunk_00000.target
  25. BIN examples/packed-grammar/hiero_tm_uncompressed/chunk_00000.target.lookup
  26. BIN examples/packed-grammar/hiero_tm_uncompressed/chunk_00001.data
  27. BIN examples/packed-grammar/hiero_tm_uncompressed/chunk_00001.source
  28. BIN examples/packed-grammar/hiero_tm_uncompressed/chunk_00001.target
  29. BIN examples/packed-grammar/hiero_tm_uncompressed/chunk_00001.target.lookup
  30. BIN examples/packed-grammar/hiero_tm_uncompressed/chunk_00002.data
  31. BIN examples/packed-grammar/hiero_tm_uncompressed/chunk_00002.source
  32. BIN examples/packed-grammar/hiero_tm_uncompressed/chunk_00002.target
  33. BIN examples/packed-grammar/hiero_tm_uncompressed/chunk_00002.target.lookup
  34. BIN examples/packed-grammar/hiero_tm_uncompressed/chunk_00003.data
  35. BIN examples/packed-grammar/hiero_tm_uncompressed/chunk_00003.source
  36. BIN examples/packed-grammar/hiero_tm_uncompressed/chunk_00003.target
  37. BIN examples/packed-grammar/hiero_tm_uncompressed/chunk_00003.target.lookup
  38. BIN examples/packed-grammar/hiero_tm_uncompressed/slice_00000.features
  39. BIN examples/packed-grammar/hiero_tm_uncompressed/slice_00000.source
  40. BIN examples/packed-grammar/hiero_tm_uncompressed/slice_00000.target
  41. BIN examples/packed-grammar/hiero_tm_uncompressed/slice_00000.target.lookup
  42. +1 −1 examples/packed-grammar/packer.quantized
  43. +1 −1 examples/packed-grammar/packer.uncompressed
  44. +1 −0 examples/parser/README
  45. +103 −69 scripts/support/bbn2plf.pl
  46. +5 −0 scripts/training/build-vocab.pl
  47. +8 −5 scripts/training/normalize-punctuation.pl
  48. +1 −1 scripts/training/parallelize/LocalConfig.pm
  49. +501 −494 scripts/training/pipeline.pl
  50. +31 −0 scripts/training/unmap-html.pl
  51. +1 −3 src/joshua/decoder/DecoderThread.java
  52. +2 −1 src/joshua/decoder/JoshuaDecoder.java
  53. +2 −2 src/joshua/decoder/Translation.java
  54. +11 −1 src/joshua/decoder/chart_parser/Chart.java
  55. +5 −17 src/joshua/decoder/chart_parser/ComputeNodeResult.java
  56. +5 −3 src/joshua/decoder/ff/DefaultStatelessFF.java
  57. +3 −0 src/joshua/decoder/ff/FeatureFunction.java
  58. +1 −0 src/joshua/decoder/ff/SourcePathFF.java
  59. +111 −23 src/joshua/decoder/ff/lm/LanguageModelFF.java
  60. +1 −1 src/joshua/decoder/ff/lm/kenlm/Makefile
  61. +6 −0 src/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java
  62. +1 −1 src/joshua/decoder/ff/state_maintenance/NgramStateComputer.java
  63. +202 −2 src/joshua/decoder/ff/tm/packed/PackedGrammar.java
  64. +1 −1 src/joshua/decoder/hypergraph/HyperEdge.java
  65. +1 −2 src/joshua/decoder/hypergraph/KBestExtractor.java
  66. +1 −1 src/joshua/decoder/segment_file/LatticeInput.java
  67. +1 −1 src/joshua/decoder/segment_file/Sentence.java
  68. +10 −0 src/joshua/tools/GrammarPacker.java
  69. +0 −16,136 test/bn-en/packed/output
  70. 0 test/packed/test.sh
  71. 0 test/scripts/tokenization/test.sh
  72. +1 −1 test/test-all.sh
  73. +1 −1 thrax
2 .gitignore
@@ -66,3 +66,5 @@ test/bn-en/samt/reference.en.all
*.class
doxygen_*.tmp
*.so
+test/scripts/tokenization/diff
+test/scripts/tokenization/output
5 examples/README
@@ -0,0 +1,5 @@
+The examples in this directory are in various states of functionality.
+
+If you want to see working code and examples of usage, please consult the
+online documentation (joshua-decoder.org/userdocs). You should also find
+some useful examples in the pipeline script (the recommended way to use Joshua).
1 examples/grammars/README
@@ -0,0 +1 @@
+The grammars in this directory are used by a variety of the examples.
28 examples/lattice/config.test
@@ -1,40 +1,27 @@
-lm_file=test.lm
-
tm_file=grammar.test
tm_format=hiero
glue_file=../grammars/hiero.glue
glue_format=hiero
#lm config
-use_srilm=true
-lm_ceiling_cost=100
-use_left_equivalent_state=false
-use_right_equivalent_state=false
-order=3
-
+lm = berkeleylm 3 false false 100 test.lm
#tm config
span_limit=50
phrase_owner=pt
-mono_owner=mono
-begin_mono_owner=begin_mono
default_non_terminal=X
goalSymbol=S
#pruning config
-fuzz1=0.1
-fuzz2=0.1
-max_n_items=500
-relative_threshold=10.0
-max_n_rules=500
-rule_relative_threshold=10.0
+pop-limit = 100
#nbest config
-use_unique_nbest=false
-use_tree_nbest=false
+use_unique_nbest = true
+use_tree_nbest = false
+include-align-index = false
add_combined_cost=true
-top_n=300
+top_n = 300
#parallel decoder: it cannot be used together with remote lm
num_parallel_decoders=1
@@ -43,7 +30,7 @@ parallel_files_prefix=/tmp/
###### model weights
#lm order weight
-lm 1.0
+lm 0 1.0
#phrasemodel owner column(0-indexed) weight
phrasemodel pt 0 1.0
@@ -59,4 +46,3 @@ phrasemodel pt 2 0.5
#wordpenalty weight
wordpenalty -1.0
latticecost 1.0
-
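Note (not part of the commit): judging from the options it replaces, the new consolidated line appears to pack the old settings positionally, roughly

    lm = <type> <order> <use_left_equivalent_state> <use_right_equivalent_state> <ceiling_cost> <file>

so 'lm = berkeleylm 3 false false 100 test.lm' subsumes the removed use_srilm, order, use_*_equivalent_state, lm_ceiling_cost, and lm_file keys, and the weight line now carries an LM index ('lm 0 1.0'), presumably so that multiple language models can be weighted independently.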
2 examples/lattice/test.plf
@@ -1 +1 @@
-((('ein',0.1,1),('dieses',0.2,1),('haus',0.4,2),),(('haus',0.8,1),),)
+((('ein',0.21,1),('dieses',0.31,1),('haus',0.51,2),),(('haus',0.71,1),),)
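Note (not part of the commit): a PLF lattice is one Perl-ish expression per sentence; each top-level group is a node, and each (word, score, offset) triple is an arc whose head node is the current node plus the offset. A minimal sketch that decodes the updated test line, assuming well-formed input:

    #!/usr/bin/env perl
    # Hedged sketch: enumerate the arcs of a one-line PLF lattice.
    use strict;
    use warnings;

    my $plf = "((('ein',0.21,1),('dieses',0.31,1),('haus',0.51,2),),(('haus',0.71,1),),)";

    my $node = 0;
    # each match is one node: a parenthesized run of ('word',score,offset), triples
    while ($plf =~ /\(((?:\('(?:[^'\\]|\\.)*',[0-9.]+,[0-9]+\),)+)\)/g) {
        my $arcs = $1;
        # within a node, pull out each arc's word, score, and head-node offset
        while ($arcs =~ /\('((?:[^'\\]|\\.)*)',([0-9.]+),([0-9]+)\)/g) {
            printf "node %d -> node %d: '%s' (score %s)\n", $node, $node + $3, $1, $2;
        }
        $node++;
    }

On the updated test.plf line this prints four arcs, e.g. "node 0 -> node 2: 'haus' (score 0.51)".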
BIN examples/packed-grammar/hiero_tm_quantized/* and examples/packed-grammar/hiero_tm_uncompressed/*
Binary files not shown. (All 36 packed-grammar binaries itemized in the file list above changed; the diff view has no content to show for binary files.)
2 examples/packed-grammar/packer.quantized
@@ -1,3 +1,3 @@
-chunk_size 4000
+slice_size 400000
quantizer 8bit p(e|f) lex(e|f) lex(f|e)
2 examples/packed-grammar/packer.uncompressed
@@ -1,3 +1,3 @@
-chunk_size 4000
+slice_size 400000
quantizer float p(e|f) lex(e|f) lex(f|e)
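Note (not part of the commit): both packer configs move from chunk_size 4000 to slice_size 400000, which lines up with the chunk_* binaries above being re-emitted alongside slice_* files; the quantizer line appears to name the storage type (8bit vs. raw float) followed by the features it applies to, here p(e|f), lex(e|f), and lex(f|e).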
1 examples/parser/README
@@ -0,0 +1 @@
+Someone needs to write this README file.
172 scripts/support/bbn2plf.pl
@@ -1,6 +1,7 @@
#!/usr/bin/env perl
# Converts BBN FSMs (text format) to Lane Schwartz's PLF format.
+# Usage: cat BBN-FILE | bbn2plf.pl > PLF-FILE
# FORMAT:
# - optional comments (#) and blank lines
@@ -12,89 +13,122 @@
# J=0 s=X E=X W=[word] v=? a=? l=? s=[double]
# ...
# J=L-1 ...
+#
+# where the Is list states and the Js enumerate edges
+
+use strict;
my @lines;
+my @states;
+my $head;
+{
+
+# read in one utterance at a time
+local $/ = 'UTTERANCE';
while (<>) {
chomp();
- # print "LINE($_)\n";
-
+ my $utterance = $_;
# skip comments and blank lines
- next if /^#/ or /^\s*$/;
-
- # new utterance
- if (/^UTTERANCE/) {
- convert_utterance(@lines) unless $first;
- @lines = ();
- $first = 0;
- }
+ $utterance =~ s/#.*//gm;
+ $utterance =~ s/^\s*$//gm;
+ # print "LINE($_)\n";
+ # print STDERR "utterance: '$utterance'\n";
+ unless($utterance =~ /^=/){next}
+ @lines = split /\n/, $utterance;
+ convert_utterance(@lines);
+}
- push(@lines, $_);
}
-convert_utterance(@lines);
+# this function prepends backslashes in front of single-quotes
+sub escape {
+ my $arg = shift;
+ $arg =~ s/'/\\'/g;
+ return $arg;
+}
sub convert_utterance {
my @lines = @_;
-
- for (;;) {
- my $line = shift(@lines);
- my ($label,$id) = split('=', $line);
- die unless ($label eq "UTTERANCE");
-
- # read in the number of states and edges
- $line = shift(@lines);
- my ($N,$L);
- ($label,$N,undef,$L) = split(/ =/, $line);
- die unless ($label eq "N");
-
- # pass over the nodes, reading what are (I think) priors or state costs
- for (my $n = 0; $n < $N; $n++) {
- $line = shift(@lines);
- my ($label,$stateno,undef,$prior) = split(/ =/, $line);
- die unless $label eq "I";
- die unless $stateno == $n;
- $states[$n] = $prior;
- }
-
- # pass over the edges
- for (my $l = 0; $l < $L; $l++) {
- $line = shift(@lines);
- my ($label,$edgeno,undef,$from,undef,$to,undef,$word,@crap) = split(/ =/, $line);
- die unless $label eq "J" and $edgeno == $l;
- my $score = pop(@crap);
- $arcs[$from][$to] = [$word,$score];
- }
+ my @arcs;
+
+ # loop until we've read everything
+ while(@lines > 0){
+ # the first line better be an utterance marker (sanity check)
+ # which, in this case, means it starts with '=' (we stripped
+ # off the 'UTTERANCE' while reading in)
+ my $numlines = @lines;
+ my $line = shift(@lines);
+ #clean up any lingering comments or blank lines
+ while($line =~ /^\s*$/ or $line =~ /^#/){$line = shift(@lines)}
+ #clean up any remaining 'UTTERANCE's
+ chomp $line;
+ die "Failed sanity check: first line ('$line') is not an utterance\n" unless $line =~ /^=/;
+
+ my (undef, $id) = split('=', $line);
+
+ # read in the number of states and edges
+ $line = shift(@lines);
+ my ($label,$N,undef,$L) = split(/[ =]/, $line);
+ die "Problem reading states and edges: '$label' is not 'N' in '$line'\n" unless ($label eq "N");
+
+ # pass over the nodes, reading what are (I think) priors or state costs
+ for (my $n = 0; $n < $N; $n++) {
+ $line = shift(@lines);
+ while($line =~ /^\s*$/ or $line =~ /^#/) {$line = shift(@lines)}
+ my ($label,$stateno,undef,$prior) = split(/[ =]/, $line);
+ die "Problem reading node '$line': '$label' != 'I'\n" unless $label eq "I";
+ die "Problem reading node '$line': '$stateno' != '$n'\n" unless $stateno == $n;
+ $states[$n] = $prior;
+ }
+
+ # pass over the edges. arcs is a two-level table marking (from,to) pairs
+ for (my $l = 0; $l < $L; $l++) {
+ $line = shift(@lines);
+ while($line =~ /^\s*$/ or $line =~ /^#/) {$line = shift(@lines)}
+ my ($label,$edgeno,undef,$from,undef,$to,undef,$word,@crap) = split(/[ =]/, $line);
+ die "Problem reading edge '$line': '$label' != 'J' or '$edgeno' != '$l'\n" unless $label eq "J" and $edgeno == $l;
+ my $score = pop(@crap);
+ my @pair = ($word, $score);
+ if( $arcs[$from][$to]){
+ push @{$arcs[$from][$to]}, \@pair;
+ }
+ else{
+ my @pairslist = (\@pair);
+ $arcs[$from][$to] = \@pairslist;
+ }
+ }
}
- my ($i,$j,undef,$label,$score) = split(' ',$_);
-
- if (defined $j) {
- if ($j < $i) {
- print "* FATAL: $j < $i\n";
- exit;
+#turn the 'to's into offsets
+my @newarcs;
+for(my $i=0; $i<@arcs; $i++){
+ for(my $j=$i; $j<@{$arcs[$i]}; $j++){
+ if (defined $arcs[$i][$j]){
+ foreach my $pair (@{$arcs[$i][$j]}){
+ my $newj = $j-$i;
+ push @{$newarcs[$i][$newj]}, $pair;
+ }
+ }
}
- push @{$arcs[$i][$j-$i]}, [$label,$score];
- }
}
-
-print "(\n";
-foreach my $i (0..$#arcs) {
- print " (\n";
- foreach my $j (0..$#{$arcs[$i]}) {
- if (defined $arcs[$i][$j]) {
- foreach my $arc (@{$arcs[$i][$j]}) {
- my ($label,$score) = @$arc;
- $head = $j;
- print " ('".escape($label)."', $score, $head),\n";
- }
+@arcs = @newarcs;
+
+ # now print out the lattices
+ print "(\n";
+ foreach my $i (0..@arcs) {
+ if (defined $arcs[$i]){
+ print " (\n";
+ foreach my $j (0..@{$arcs[$i]}) {
+ if (defined $arcs[$i][$j]) {
+ foreach my $arc (@{$arcs[$i][$j]}) {
+ my ($label,$score) = @$arc;
+ $head = $j;
+ print " ('".escape($label)."', $head, $score),";
+ }
+ }
+ }
+ print "\n ),\n";
+ }
}
- }
- print " ),\n";
-}
-print ")\n";
-
-sub escape {
- my $arg = shift;
- $arg =~ s/'/\\'/g;
- return $arg;
+ print ")\n";
}
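Note (not part of the commit): the rewritten reader relies on Perl's input record separator to split the stream on the UTTERANCE keyword rather than reading line by line. A self-contained sketch of just that idiom:

    #!/usr/bin/env perl
    # Hedged sketch: read one 'UTTERANCE' block per <> call via $/.
    use strict;
    use warnings;

    my $input = "UTTERANCE=utt1\nN=2 L=1\nUTTERANCE=utt2\nN=3 L=2\n";
    open my $fh, '<', \$input or die "open: $!";

    local $/ = 'UTTERANCE';          # records now end at the keyword
    while (my $chunk = <$fh>) {
        chomp $chunk;                # chomp strips $/, i.e. the keyword itself
        next unless $chunk =~ /^=/;  # skip the empty record before the first keyword
        print "record: $chunk";
    }

As in the script above, each surviving record starts with '=<id>' and carries the node and edge lines that follow it.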
5 scripts/training/build-vocab.pl
@@ -5,6 +5,11 @@
#
# ID WORD COUNT
+use utf8;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
while (<>) {
chomp;
split;
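Note (not part of the commit): only the top of the committed script is shown here; a hedged, self-contained equivalent with the same 'ID WORD COUNT' output format and the same UTF-8 handling might look like:

    #!/usr/bin/env perl
    # Hedged sketch of a minimal vocabulary builder (not the committed code).
    use strict;
    use warnings;

    binmode(STDIN,  ":utf8");
    binmode(STDOUT, ":utf8");

    my %count;
    while (my $line = <STDIN>) {
        chomp $line;
        $count{$_}++ for split ' ', $line;
    }

    # sort order is an assumption: by descending frequency, ties alphabetical
    my $id = 1;
    for my $word (sort { $count{$b} <=> $count{$a} or $a cmp $b } keys %count) {
        print join(" ", $id++, $word, $count{$word}), "\n";
    }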
13 scripts/training/normalize-punctuation.pl
@@ -6,6 +6,9 @@
use strict;
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
my ($language) = @ARGV;
while(<STDIN>) {
@@ -54,24 +57,24 @@
# English "quotation," followed by comma, style
if ($language eq "en") {
- s/\"([,\.]+)/$1\"/g;
+ s/\"([,\.]+)/$1\"/g;
}
# Czech is confused
elsif ($language eq "cs" || $language eq "cz") {
}
# German/Spanish/French "quotation", followed by comma, style
else {
- s/,\"/\",/g;
- s/(\.+)\"(\s*[^<])/\"$1$2/g; # don't fix period at end of sentence
+ s/,\"/\",/g;
+ s/(\.+)\"(\s*[^<])/\"$1$2/g; # don't fix period at end of sentence
}
print STDERR $_ if //;
if ($language eq "de" || $language eq "es" || $language eq "cz" || $language eq "cs" || $language eq "fr") {
- s/(\d) (\d)/$1,$2/g;
+ s/(\d) (\d)/$1,$2/g;
}
else {
- s/(\d) (\d)/$1.$2/g;
+ s/(\d) (\d)/$1.$2/g;
}
print $_;
}
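Note (not part of the commit): the digit rule at the end of this hunk is language-keyed, so, assuming the rest of the script leaves the text otherwise unchanged,

    echo "es kostet 175 000 Euro" | normalize-punctuation.pl de

would come out as 'es kostet 175,000 Euro', while the default (e.g. English) branch joins the digits with a period instead.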
2 scripts/training/parallelize/LocalConfig.pm
@@ -37,7 +37,7 @@ my $CCONFIG = {
'QSubMemFlag' => '-l pmem=',
},
'HLTCOE' => {
- 'HOST_REGEXP' => qr/(test1|test2|test3|test4|r\d+n\d+|hltcoe\.jhu\.edu)/,
+ 'HOST_REGEXP' => qr/(test1|test2|test3|test4|himem|r\d+n\d+|hltcoe\.jhu\.edu)/,
'QSubMemFlag' => '-l num_proc=1,h_rt=24:00:00,mem_free=16g,h_vmem=',
# 'QSubQueue' => '-q '
# 'QSubQueue' => '-q mem.q'
995 scripts/training/pipeline.pl
@@ -9,12 +9,12 @@
BEGIN {
if (! exists $ENV{JOSHUA} || $ENV{JOSHUA} eq "" ||
! exists $ENV{JAVA_HOME} || $ENV{JAVA_HOME} eq "") {
- print "Several environment variables must be set before running the pipeline. Please set:\n";
- print "* \$JOSHUA to the root of the Joshua source code.\n"
- if (! exists $ENV{JOSHUA} || $ENV{JOSHUA} eq "");
- print "* \$JAVA_HOME to the directory of your local java installation. \n"
- if (! exists $ENV{JAVA_HOME} || $ENV{JAVA_HOME} eq "");
- exit;
+ print "Several environment variables must be set before running the pipeline. Please set:\n";
+ print "* \$JOSHUA to the root of the Joshua source code.\n"
+ if (! exists $ENV{JOSHUA} || $ENV{JOSHUA} eq "");
+ print "* \$JAVA_HOME to the directory of your local java installation. \n"
+ if (! exists $ENV{JAVA_HOME} || $ENV{JAVA_HOME} eq "");
+ exit;
}
$JOSHUA = $ENV{JOSHUA};
unshift(@INC,"$JOSHUA/scripts/training/cachepipe");
@@ -72,7 +72,7 @@ BEGIN
'mert.config' => "$TUNECONFDIR/mert.config",
'pro.config' => "$TUNECONFDIR/pro.config",
'params.txt' => "$TUNECONFDIR/params.txt",
-);
+ );
my $DO_MBR = 1;
@@ -149,35 +149,35 @@ BEGIN
my $PARSED_CORPUS = undef;
my $retval = GetOptions(
- "corpus=s" => \@CORPORA,
+ "corpus=s" => \@CORPORA,
"parsed-corpus=s" => \$PARSED_CORPUS,
- "tune=s" => \$TUNE,
+ "tune=s" => \$TUNE,
"test=s" => \$TEST,
"prepare!" => \$DO_PREPARE_CORPORA,
"data-dir=s" => \$DATA_DIR,
"name=s" => \$NAME,
"aligner=s" => \$ALIGNER,
- "alignment=s" => \$ALIGNMENT,
+ "alignment=s" => \$ALIGNMENT,
"giza-merge=s" => \$GIZA_MERGE,
"aligner-mem=s" => \$ALIGNER_MEM,
- "source=s" => \$SOURCE,
- "target=s" => \$TARGET,
- "rundir=s" => \$RUNDIR,
+ "source=s" => \$SOURCE,
+ "target=s" => \$TARGET,
+ "rundir=s" => \$RUNDIR,
"filter-tm!" => \$DO_FILTER_TM,
"filter-lm!" => \$DO_FILTER_LM,
"lm=s" => \$LM_TYPE,
- "lmfile=s" => \@LMFILES,
+ "lmfile=s" => \@LMFILES,
"lm-gen=s" => \$LM_GEN,
"corpus-lm!" => \$DO_BUILD_LM_FROM_CORPUS,
- "witten-bell!" => \$WITTEN_BELL,
+ "witten-bell!" => \$WITTEN_BELL,
"tune-grammar=s" => \$TUNE_GRAMMAR_FILE,
"test-grammar=s" => \$TEST_GRAMMAR_FILE,
- "grammar=s" => \$GRAMMAR_FILE,
- "glue-grammar=s" => \$GLUE_GRAMMAR_FILE,
+ "grammar=s" => \$GRAMMAR_FILE,
+ "glue-grammar=s" => \$GLUE_GRAMMAR_FILE,
"mbr!" => \$DO_MBR,
- "type=s" => \$GRAMMAR_TYPE,
- "maxlen=i" => \$MAXLEN,
- "tokenizer=s" => \$TOKENIZER,
+ "type=s" => \$GRAMMAR_TYPE,
+ "maxlen=i" => \$MAXLEN,
+ "tokenizer=s" => \$TOKENIZER,
"joshua-config=s" => \$TUNEFILES{'joshua.config'},
"joshua-mem=s" => \$JOSHUA_MEM,
"hadoop-mem=s" => \$HADOOP_MEM,
@@ -189,15 +189,15 @@ BEGIN
"thrax-conf=s" => \$THRAX_CONF_FILE,
"jobs=i" => \$NUM_JOBS,
"threads=i" => \$NUM_THREADS,
- "subsample!" => \$DO_SUBSAMPLE,
- "qsub-args=s" => \$QSUB_ARGS,
- "first-step=s" => \$FIRST_STEP,
- "last-step=s" => \$LAST_STEP,
+ "subsample!" => \$DO_SUBSAMPLE,
+ "qsub-args=s" => \$QSUB_ARGS,
+ "first-step=s" => \$FIRST_STEP,
+ "last-step=s" => \$LAST_STEP,
"aligner-chunk-size=s" => \$ALIGNER_BLOCKSIZE,
"hadoop=s" => \$HADOOP,
"omit-cmd!" => \$OMIT_CMD,
"optimizer-runs=i" => \$OPTIMIZER_RUNS,
-);
+ );
if (! $retval) {
print "Invalid usage, quitting\n";
@@ -208,7 +208,7 @@ BEGIN
train => "$DATA_DIR/train",
tune => "$DATA_DIR/tune",
test => "$DATA_DIR/test",
-);
+ );
if (defined $NAME) {
map { $DATA_DIRS{$_} .= "/$NAME" } (keys %DATA_DIRS);
@@ -226,7 +226,7 @@ BEGIN
# determining to run a command. Note that this is not backwards
# compatible!
$cachepipe->omit_cmd()
- if ($OMIT_CMD);
+ if ($OMIT_CMD);
$SIG{INT} = sub {
print "* Got C-c, quitting\n";
@@ -244,8 +244,8 @@ BEGIN
# make sure the LMs exist
foreach my $lmfile (@LMFILES) {
if (! -e $lmfile) {
- print "* FATAL: couldn't find language model file '$lmfile'\n";
- exit 1;
+ print "* FATAL: couldn't find language model file '$lmfile'\n";
+ exit 1;
}
}
@@ -270,30 +270,30 @@ BEGIN
# make sure a tuning corpus was provided if we're doing tuning
if (! defined $TUNE and ($STEPS{$FIRST_STEP} <= $STEPS{TUNE}
- and $STEPS{$LAST_STEP} >= $STEPS{TUNE})) {
+ and $STEPS{$LAST_STEP} >= $STEPS{TUNE})) {
print "* FATAL: need a tuning set (--tune)\n";
exit 1;
}
# make sure a test corpus was provided if we're decoding a test set
if (! defined $TEST and ($STEPS{$FIRST_STEP} <= $STEPS{TEST}
- and $STEPS{$LAST_STEP} >= $STEPS{TEST})) {
+ and $STEPS{$LAST_STEP} >= $STEPS{TEST})) {
print "* FATAL: need a test set (--test)\n";
exit 1;
}
# make sure a grammar file was given if we're skipping training
if (! defined $GRAMMAR_FILE) {
if ($STEPS{$FIRST_STEP} >= $STEPS{TEST}) {
- if (! defined $TEST_GRAMMAR_FILE) {
- print "* FATAL: need a grammar (--grammar or --test-grammar) if you're skipping to testing\n";
- exit 1;
- }
+ if (! defined $TEST_GRAMMAR_FILE) {
+ print "* FATAL: need a grammar (--grammar or --test-grammar) if you're skipping to testing\n";
+ exit 1;
+ }
} elsif ($STEPS{$FIRST_STEP} >= $STEPS{TUNE}) {
- if (! defined $TUNE_GRAMMAR_FILE) {
- print "* FATAL: need a grammar (--grammar or --tune-grammar) if you're skipping grammar learning\n";
- exit 1;
- }
+ if (! defined $TUNE_GRAMMAR_FILE) {
+ print "* FATAL: need a grammar (--grammar or --tune-grammar) if you're skipping grammar learning\n";
+ exit 1;
+ }
}
}
@@ -328,10 +328,10 @@ BEGIN
foreach my $corpus (@CORPORA) {
foreach my $ext ($TARGET,$SOURCE) {
- if (! -e "$corpus.$ext") {
- print "* FATAL: can't find '$corpus.$ext'";
- exit 1;
- }
+ if (! -e "$corpus.$ext") {
+ print "* FATAL: can't find '$corpus.$ext'";
+ exit 1;
+ }
}
}
@@ -393,27 +393,27 @@ BEGIN
if ($FIRST_STEP ne "FIRST") {
if (@CORPORA > 1) {
- print "* FATAL: you can't skip steps if you specify more than one --corpus\n";
- exit(1);
+ print "* FATAL: you can't skip steps if you specify more than one --corpus\n";
+ exit(1);
}
if (eval { goto $FIRST_STEP }) {
- print "* Skipping to step $FIRST_STEP\n";
- goto $FIRST_STEP;
+ print "* Skipping to step $FIRST_STEP\n";
+ goto $FIRST_STEP;
} else {
- print "* No such step $FIRST_STEP\n";
- exit 1;
+ print "* No such step $FIRST_STEP\n";
+ exit 1;
}
}
## STEP 1: filter and preprocess corpora #############################
FIRST:
-if (defined $ALIGNMENT) {
- print "* FATAL: it doesn't make sense to provide an alignment and then do\n";
- print " tokenization. Either remove --alignment or specify a first step\n";
- print " of Thrax (--first-step THRAX)\n";
- exit 1;
+ if (defined $ALIGNMENT) {
+ print "* FATAL: it doesn't make sense to provide an alignment and then do\n";
+ print " tokenization. Either remove --alignment or specify a first step\n";
+ print " of Thrax (--first-step THRAX)\n";
+ exit 1;
}
if (@CORPORA == 0) {
@@ -426,7 +426,7 @@ BEGIN
TRAIN => 0,
TUNE => 0,
TEST => 0
-);
+ );
if ($DO_PREPARE_CORPORA) {
@@ -437,7 +437,7 @@ BEGIN
$TRAIN{prefix} = "$DATA_DIRS{train}/corpus";
foreach my $lang ($SOURCE,$TARGET) {
- system("ln -sf $prefixes->{lowercased}.$lang $DATA_DIRS{train}/corpus.$lang");
+ system("ln -sf $prefixes->{lowercased}.$lang $DATA_DIRS{train}/corpus.$lang");
}
$TRAIN{source} = "$DATA_DIRS{train}/corpus.$SOURCE";
$TRAIN{target} = "$DATA_DIRS{train}/corpus.$TARGET";
@@ -466,32 +466,32 @@ BEGIN
SUBSAMPLE:
# subsample
-if ($DO_SUBSAMPLE) {
- mkdir("$DATA_DIRS{train}/subsampled") unless -d "$DATA_DIRS{train}/subsampled";
-
- $cachepipe->cmd("subsample-manifest",
- "echo corpus > $DATA_DIRS{train}/subsampled/manifest",
- "$DATA_DIRS{train}/subsampled/manifest");
-
- $cachepipe->cmd("subsample-testdata",
- "cat $TUNE{source} $TEST{source} > $DATA_DIRS{train}/subsampled/test-data",
- $TUNE{source},
- $TEST{source},
- "$DATA_DIRS{train}/subsampled/test-data");
-
- $cachepipe->cmd("subsample",
- "java -Xmx4g -Dfile.encoding=utf8 -cp $JOSHUA/bin:$JOSHUA/lib/commons-cli-2.0-SNAPSHOT.jar joshua.subsample.Subsampler -e $TARGET -f $SOURCE -epath $DATA_DIRS{train}/ -fpath $DATA_DIRS{train}/ -output $DATA_DIRS{train}/subsampled/subsampled.$MAXLEN -ratio 1.04 -test $DATA_DIRS{train}/subsampled/test-data -training $DATA_DIRS{train}/subsampled/manifest",
- "$DATA_DIRS{train}/subsampled/manifest",
- "$DATA_DIRS{train}/subsampled/test-data",
- $TRAIN{source},
- $TRAIN{target},
- "$DATA_DIRS{train}/subsampled/subsampled.$MAXLEN.$TARGET",
- "$DATA_DIRS{train}/subsampled/subsampled.$MAXLEN.$SOURCE");
-
- # rewrite the symlinks to point to the subsampled corpus
- foreach my $lang ($TARGET,$SOURCE) {
- system("ln -sf subsampled/subsampled.$MAXLEN.$lang $DATA_DIRS{train}/corpus.$lang");
- }
+ if ($DO_SUBSAMPLE) {
+ mkdir("$DATA_DIRS{train}/subsampled") unless -d "$DATA_DIRS{train}/subsampled";
+
+ $cachepipe->cmd("subsample-manifest",
+ "echo corpus > $DATA_DIRS{train}/subsampled/manifest",
+ "$DATA_DIRS{train}/subsampled/manifest");
+
+ $cachepipe->cmd("subsample-testdata",
+ "cat $TUNE{source} $TEST{source} > $DATA_DIRS{train}/subsampled/test-data",
+ $TUNE{source},
+ $TEST{source},
+ "$DATA_DIRS{train}/subsampled/test-data");
+
+ $cachepipe->cmd("subsample",
+ "java -Xmx4g -Dfile.encoding=utf8 -cp $JOSHUA/bin:$JOSHUA/lib/commons-cli-2.0-SNAPSHOT.jar joshua.subsample.Subsampler -e $TARGET -f $SOURCE -epath $DATA_DIRS{train}/ -fpath $DATA_DIRS{train}/ -output $DATA_DIRS{train}/subsampled/subsampled.$MAXLEN -ratio 1.04 -test $DATA_DIRS{train}/subsampled/test-data -training $DATA_DIRS{train}/subsampled/manifest",
+ "$DATA_DIRS{train}/subsampled/manifest",
+ "$DATA_DIRS{train}/subsampled/test-data",
+ $TRAIN{source},
+ $TRAIN{target},
+ "$DATA_DIRS{train}/subsampled/subsampled.$MAXLEN.$TARGET",
+ "$DATA_DIRS{train}/subsampled/subsampled.$MAXLEN.$SOURCE");
+
+ # rewrite the symlinks to point to the subsampled corpus
+ foreach my $lang ($TARGET,$SOURCE) {
+ system("ln -sf subsampled/subsampled.$MAXLEN.$lang $DATA_DIRS{train}/corpus.$lang");
+ }
}
maybe_quit("SUBSAMPLE");
@@ -503,18 +503,18 @@ BEGIN
# This basically means that we've skipped tokenization, in which case
# we still want to move the input files into the canonical place
-if ($FIRST_STEP eq "ALIGN") {
- if (defined $ALIGNMENT) {
- print "* FATAL: It doesn't make sense to provide an alignment\n";
- print " but not to skip the tokenization and subsampling steps\n";
- exit 1;
- }
+ if ($FIRST_STEP eq "ALIGN") {
+ if (defined $ALIGNMENT) {
+ print "* FATAL: It doesn't make sense to provide an alignment\n";
+ print " but not to skip the tokenization and subsampling steps\n";
+ exit 1;
+ }
- # TODO: copy the files into the canonical place
+ # TODO: copy the files into the canonical place
- # Jumping straight to alignment is probably the same thing as
- # skipping tokenization, and might also be implemented by a
- # --no-tokenization flag
+ # Jumping straight to alignment is probably the same thing as
+ # skipping tokenization, and might also be implemented by a
+ # --no-tokenization flag
}
# skip this step if an alignment was provided
@@ -524,8 +524,8 @@ BEGIN
system("mkdir","-p","$DATA_DIRS{train}/splits") unless -d "$DATA_DIRS{train}/splits";
$cachepipe->cmd("source-numlines",
- "cat $TRAIN{source} | wc -l",
- $TRAIN{source});
+ "cat $TRAIN{source} | wc -l",
+ $TRAIN{source});
my $numlines = $cachepipe->stdout();
my $numchunks = ceil($numlines / $ALIGNER_BLOCKSIZE);
@@ -534,26 +534,26 @@ BEGIN
my $lastchunk = -1;
while (my $target = <TARGET>) {
- my $source = <SOURCE>;
-
- # We want to prevent a very small last chunk, which we accomplish
- # by folding the last chunk into the penultimate chunk.
- my $chunk = ($numchunks <= 2)
- ? 0
- : min($numchunks - 2,
- int( (${.} - 1) / $ALIGNER_BLOCKSIZE ));
-
- if ($chunk != $lastchunk) {
- close CHUNK_SOURCE;
- close CHUNK_TARGET;
- open CHUNK_SOURCE, ">", "$DATA_DIRS{train}/splits/corpus.$SOURCE.$chunk" or die;
- open CHUNK_TARGET, ">", "$DATA_DIRS{train}/splits/corpus.$TARGET.$chunk" or die;
+ my $source = <SOURCE>;
+
+ # We want to prevent a very small last chunk, which we accomplish
+ # by folding the last chunk into the penultimate chunk.
+ my $chunk = ($numchunks <= 2)
+ ? 0
+ : min($numchunks - 2,
+ int( (${.} - 1) / $ALIGNER_BLOCKSIZE ));
+
+ if ($chunk != $lastchunk) {
+ close CHUNK_SOURCE;
+ close CHUNK_TARGET;
+ open CHUNK_SOURCE, ">", "$DATA_DIRS{train}/splits/corpus.$SOURCE.$chunk" or die;
+ open CHUNK_TARGET, ">", "$DATA_DIRS{train}/splits/corpus.$TARGET.$chunk" or die;
- $lastchunk = $chunk;
- }
+ $lastchunk = $chunk;
+ }
- print CHUNK_SOURCE $source;
- print CHUNK_TARGET $target;
+ print CHUNK_SOURCE $source;
+ print CHUNK_TARGET $target;
}
close CHUNK_SOURCE;
close CHUNK_TARGET;
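Note (not part of the patch, which only re-indents this block): the chunk formula folds what would be a tiny final chunk into the penultimate one. A standalone sketch of just that rule:

    #!/usr/bin/env perl
    # Hedged sketch: the chunk-folding rule in isolation.
    use strict;
    use warnings;
    use POSIX qw(ceil);
    use List::Util qw(min);

    my $blocksize = 3;
    my $numlines  = 8;
    my $numchunks = ceil($numlines / $blocksize);   # 3 nominal chunks

    for my $lineno (1 .. $numlines) {
        my $chunk = ($numchunks <= 2)
            ? 0
            : min($numchunks - 2, int(($lineno - 1) / $blocksize));
        print "line $lineno -> chunk $chunk\n";
    }
    # lines 1-3 -> chunk 0; lines 4-8 -> chunk 1 (the 2-line chunk 2 is folded in)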
@@ -563,44 +563,44 @@ BEGIN
# my $max_aligner_threads = $NUM_THREADS;
# if ($ALIGNER eq "giza" and $max_aligner_threads > 1) {
- # $max_aligner_threads /= 2;
+ # $max_aligner_threads /= 2;
# }
# # With multi-threading, we can use a pool to set up concurrent GIZA jobs on the chunks.
# my $pool = new Thread::Pool(Min => 1, Max => $max_aligner_threads);
for (my $chunkno = 0; $chunkno <= $lastchunk; $chunkno++) {
- # create the alignment subdirectory
- my $chunkdir = "alignments/$chunkno";
- system("mkdir","-p", $chunkdir);
-
- if ($ALIGNER eq "giza") {
- run_giza($chunkdir, $chunkno, $NUM_THREADS > 1);
- # $pool->enqueue(\&run_giza, $chunkdir, $chunkno, $NUM_THREADS > 1);
-
- } elsif ($ALIGNER eq "berkeley") {
- run_berkeley_aligner($chunkdir, $chunkno);
- # $pool->enqueue(\&run_berkeley_aligner, $chunkdir, $chunkno);
- }
+ # create the alignment subdirectory
+ my $chunkdir = "alignments/$chunkno";
+ system("mkdir","-p", $chunkdir);
+
+ if ($ALIGNER eq "giza") {
+ run_giza($chunkdir, $chunkno, $NUM_THREADS > 1);
+ # $pool->enqueue(\&run_giza, $chunkdir, $chunkno, $NUM_THREADS > 1);
+
+ } elsif ($ALIGNER eq "berkeley") {
+ run_berkeley_aligner($chunkdir, $chunkno);
+ # $pool->enqueue(\&run_berkeley_aligner, $chunkdir, $chunkno);
+ }
}
# wait for all the threads to finish
# $pool->join();
if ($ALIGNER eq "giza") {
- # combine the alignments
- $cachepipe->cmd("giza-aligner-combine",
- "cat alignments/*/model/aligned.grow-diag-final > alignments/training.align",
- "alignments/$lastchunk/model/aligned.grow-diag-final",
- "alignments/training.align");
+ # combine the alignments
+ $cachepipe->cmd("giza-aligner-combine",
+ "cat alignments/*/model/aligned.grow-diag-final > alignments/training.align",
+ "alignments/$lastchunk/model/aligned.grow-diag-final",
+ "alignments/training.align");
} elsif ($ALIGNER eq "berkeley") {
- # combine the alignments
- $cachepipe->cmd("berkeley-aligner-combine",
- "cat alignments/*/training.align > alignments/training.align",
- "alignments/$lastchunk/training.align",
- "alignments/training.align");
+ # combine the alignments
+ $cachepipe->cmd("berkeley-aligner-combine",
+ "cat alignments/*/training.align > alignments/training.align",
+ "alignments/$lastchunk/training.align",
+ "alignments/training.align");
}
$ALIGNMENT = "alignments/training.align";
@@ -613,46 +613,46 @@ BEGIN
PARSE:
-if ($GRAMMAR_TYPE eq "samt") {
-
- # If the user passed in the already-parsed corpus, use that (after copying it into place)
- if (defined $TRAIN{parsed} && -e $TRAIN{parsed}) {
- # copy and adjust the location of the file to its canonical location
- system("cp $TRAIN{parsed} $DATA_DIRS{train}/corpus.parsed.$TARGET");
- $TRAIN{parsed} = "$DATA_DIRS{train}/corpus.parsed.$TARGET";
- } else {
-
- $cachepipe->cmd("build-vocab",
- "cat $TRAIN{target} | $SCRIPTDIR/training/build-vocab.pl > $DATA_DIRS{train}/vocab.$TARGET",
- $TRAIN{target},
- "$DATA_DIRS{train}/vocab.$TARGET");
-
- if ($NUM_JOBS > 1) {
- # the black-box parallelizer model doesn't work with multiple
- # threads, so we're always spawning single-threaded instances here
-
- # open PARSE, ">parse.sh" or die;
- # print PARSE "cat $TRAIN{target} | $JOSHUA/scripts/training/parallelize/parallelize.pl --jobs $NUM_JOBS --qsub-args \"$QSUB_ARGS\" -- java -d64 -Xmx${PARSER_MEM} -jar $JOSHUA/lib/BerkeleyParser.jar -gr $JOSHUA/lib/eng_sm6.gr -nThreads 1 | sed 's/^\(/\(TOP/' | tee $DATA_DIRS{train}/corpus.$TARGET.parsed.mc | perl -pi -e 's/(\\S+)\\)/lc(\$1).\")\"/ge' | tee $DATA_DIRS{train}/corpus.$TARGET.parsed | perl $SCRIPTDIR/training/add-OOVs.pl $DATA_DIRS{train}/vocab.$TARGET > $DATA_DIRS{train}/corpus.parsed.$TARGET\n";
- # close PARSE;
- # chmod 0755, "parse.sh";
- # $cachepipe->cmd("parse",
- # "setsid ./parse.sh",
- # "$TRAIN{target}",
- # "$DATA_DIRS{train}/corpus.parsed.$TARGET");
-
- $cachepipe->cmd("parse",
- "$CAT $TRAIN{mixedcase} | $JOSHUA/scripts/training/parallelize/parallelize.pl --jobs $NUM_JOBS --qsub-args \"$QSUB_ARGS\" -p 8g -- java -d64 -Xmx${PARSER_MEM} -jar $JOSHUA/lib/BerkeleyParser.jar -gr $JOSHUA/lib/eng_sm6.gr -nThreads 1 | sed 's/^\(/\(TOP/' | perl $SCRIPTDIR/training/add-OOVs.pl $DATA_DIRS{train}/vocab.$TARGET | tee $DATA_DIRS{train}/corpus.$TARGET.parsed | $SCRIPTDIR/training/lowercase-leaves.pl > $DATA_DIRS{train}/corpus.parsed.$TARGET",
- "$TRAIN{target}",
- "$DATA_DIRS{train}/corpus.parsed.$TARGET");
- } else {
- $cachepipe->cmd("parse",
- "$CAT $TRAIN{mixedcase} | $JOSHUA/scripts/training/parallelize/parallelize.pl --jobs $NUM_THREADS --use-fork -- java -d64 -Xmx${PARSER_MEM} -jar $JOSHUA/lib/BerkeleyParser.jar -gr $JOSHUA/lib/eng_sm6.gr -nThreads 1 | sed 's/^\(/\(TOP/' | perl $SCRIPTDIR/training/add-OOVs.pl $DATA_DIRS{train}/vocab.$TARGET | tee $DATA_DIRS{train}/corpus.$TARGET.parsed | $SCRIPTDIR/training/lowercase-leaves.pl > $DATA_DIRS{train}/corpus.parsed.$TARGET",
- "$TRAIN{target}",
- "$DATA_DIRS{train}/corpus.parsed.$TARGET");
- }
-
- $TRAIN{parsed} = "$DATA_DIRS{train}/corpus.parsed.$TARGET";
- }
+ if ($GRAMMAR_TYPE eq "samt") {
+
+ # If the user passed in the already-parsed corpus, use that (after copying it into place)
+ if (defined $TRAIN{parsed} && -e $TRAIN{parsed}) {
+ # copy and adjust the location of the file to its canonical location
+ system("cp $TRAIN{parsed} $DATA_DIRS{train}/corpus.parsed.$TARGET");
+ $TRAIN{parsed} = "$DATA_DIRS{train}/corpus.parsed.$TARGET";
+ } else {
+
+ $cachepipe->cmd("build-vocab",
+ "cat $TRAIN{target} | $SCRIPTDIR/training/build-vocab.pl > $DATA_DIRS{train}/vocab.$TARGET",
+ $TRAIN{target},
+ "$DATA_DIRS{train}/vocab.$TARGET");
+
+ if ($NUM_JOBS > 1) {
+ # the black-box parallelizer model doesn't work with multiple
+ # threads, so we're always spawning single-threaded instances here
+
+ # open PARSE, ">parse.sh" or die;
+ # print PARSE "cat $TRAIN{target} | $JOSHUA/scripts/training/parallelize/parallelize.pl --jobs $NUM_JOBS --qsub-args \"$QSUB_ARGS\" -- java -d64 -Xmx${PARSER_MEM} -jar $JOSHUA/lib/BerkeleyParser.jar -gr $JOSHUA/lib/eng_sm6.gr -nThreads 1 | sed 's/^\(/\(TOP/' | tee $DATA_DIRS{train}/corpus.$TARGET.parsed.mc | perl -pi -e 's/(\\S+)\\)/lc(\$1).\")\"/ge' | tee $DATA_DIRS{train}/corpus.$TARGET.parsed | perl $SCRIPTDIR/training/add-OOVs.pl $DATA_DIRS{train}/vocab.$TARGET > $DATA_DIRS{train}/corpus.parsed.$TARGET\n";
+ # close PARSE;
+ # chmod 0755, "parse.sh";
+ # $cachepipe->cmd("parse",
+ # "setsid ./parse.sh",
+ # "$TRAIN{target}",
+ # "$DATA_DIRS{train}/corpus.parsed.$TARGET");
+
+ $cachepipe->cmd("parse",
+ "$CAT $TRAIN{mixedcase} | $JOSHUA/scripts/training/parallelize/parallelize.pl --jobs $NUM_JOBS --qsub-args \"$QSUB_ARGS\" -p 8g -- java -d64 -Xmx${PARSER_MEM} -jar $JOSHUA/lib/BerkeleyParser.jar -gr $JOSHUA/lib/eng_sm6.gr -nThreads 1 | sed 's/^\(/\(TOP/' | perl $SCRIPTDIR/training/add-OOVs.pl $DATA_DIRS{train}/vocab.$TARGET | tee $DATA_DIRS{train}/corpus.$TARGET.parsed | $SCRIPTDIR/training/lowercase-leaves.pl > $DATA_DIRS{train}/corpus.parsed.$TARGET",
+ "$TRAIN{target}",
+ "$DATA_DIRS{train}/corpus.parsed.$TARGET");
+ } else {
+ $cachepipe->cmd("parse",
+ "$CAT $TRAIN{mixedcase} | $JOSHUA/scripts/training/parallelize/parallelize.pl --jobs $NUM_THREADS --use-fork -- java -d64 -Xmx${PARSER_MEM} -jar $JOSHUA/lib/BerkeleyParser.jar -gr $JOSHUA/lib/eng_sm6.gr -nThreads 1 | sed 's/^\(/\(TOP/' | perl $SCRIPTDIR/training/add-OOVs.pl $DATA_DIRS{train}/vocab.$TARGET | tee $DATA_DIRS{train}/corpus.$TARGET.parsed | $SCRIPTDIR/training/lowercase-leaves.pl > $DATA_DIRS{train}/corpus.parsed.$TARGET",
+ "$TRAIN{target}",
+ "$DATA_DIRS{train}/corpus.parsed.$TARGET");
+ }
+
+ $TRAIN{parsed} = "$DATA_DIRS{train}/corpus.parsed.$TARGET";
+ }
}
maybe_quit("PARSE");
@@ -661,47 +661,47 @@ BEGIN
THRAX:
-system("mkdir -p $DATA_DIRS{train}") unless -d $DATA_DIRS{train};
+ system("mkdir -p $DATA_DIRS{train}") unless -d $DATA_DIRS{train};
if ($GRAMMAR_TYPE eq "samt") {
# if we jumped right here, $TRAIN{target} should be parsed
if (exists $TRAIN{parsed}) {
- # parsing step happened in-script or a parsed corpus was passed in explicitly, all is well
+ # parsing step happened in-script or a parsed corpus was passed in explicitly, all is well
} elsif (already_parsed($TRAIN{target})) {
- # skipped straight to this step, passing a parsed corpus
+ # skipped straight to this step, passing a parsed corpus
- $TRAIN{parsed} = "$DATA_DIRS{train}/corpus.parsed.$TARGET";
-
- $cachepipe->cmd("cp-train-$TARGET",
- "cp $TRAIN{target} $TRAIN{parsed}",
- $TRAIN{target},
- $TRAIN{parsed});
-
- $TRAIN{target} = "$DATA_DIRS{train}/corpus.$TARGET";
-
- # now extract the leaves of the parsed corpus
- $cachepipe->cmd("extract-leaves",
- "cat $TRAIN{parsed} | perl -pe 's/\\(.*?(\\S\+)\\)\+?/\$1/g' | perl -pe 's/\\)//g' > $TRAIN{target}",
- $TRAIN{parsed},
- $TRAIN{target});
-
- if ($TRAIN{source} ne "$DATA_DIRS{train}/corpus.$SOURCE") {
- $cachepipe->cmd("cp-train-$SOURCE",
- "cp $TRAIN{source} $DATA_DIRS{train}/corpus.$SOURCE",
- $TRAIN{source}, "$DATA_DIRS{train}/corpus.$SOURCE");
- $TRAIN{source} = "$DATA_DIRS{train}/corpus.$SOURCE";
- }
+ $TRAIN{parsed} = "$DATA_DIRS{train}/corpus.parsed.$TARGET";
+
+ $cachepipe->cmd("cp-train-$TARGET",
+ "cp $TRAIN{target} $TRAIN{parsed}",
+ $TRAIN{target},
+ $TRAIN{parsed});
+
+ $TRAIN{target} = "$DATA_DIRS{train}/corpus.$TARGET";
+
+ # now extract the leaves of the parsed corpus
+ $cachepipe->cmd("extract-leaves",
+ "cat $TRAIN{parsed} | perl -pe 's/\\(.*?(\\S\+)\\)\+?/\$1/g' | perl -pe 's/\\)//g' > $TRAIN{target}",
+ $TRAIN{parsed},
+ $TRAIN{target});
+
+ if ($TRAIN{source} ne "$DATA_DIRS{train}/corpus.$SOURCE") {
+ $cachepipe->cmd("cp-train-$SOURCE",
+ "cp $TRAIN{source} $DATA_DIRS{train}/corpus.$SOURCE",
+ $TRAIN{source}, "$DATA_DIRS{train}/corpus.$SOURCE");
+ $TRAIN{source} = "$DATA_DIRS{train}/corpus.$SOURCE";
+ }
} else {
- print "* FATAL: You requested to build an SAMT grammar, but provided an\n";
- print " unparsed corpus. Please re-run the pipeline and begin no later\n";
- print " than the PARSE step (--first-step PARSE), or pass in a parsed corpus\n";
- print " using --parsed-corpus CORPUS.\n";
- exit 1;
+ print "* FATAL: You requested to build an SAMT grammar, but provided an\n";
+ print " unparsed corpus. Please re-run the pipeline and begin no later\n";
+ print " than the PARSE step (--first-step PARSE), or pass in a parsed corpus\n";
+ print " using --parsed-corpus CORPUS.\n";
+ exit 1;
}
-
+
}
# we may have skipped directly to this step, in which case we need to
@@ -715,64 +715,64 @@ BEGIN
if (! -e "grammar.gz") {
- # create the input file
- my $target_file = ($GRAMMAR_TYPE eq "hiero")
- ? $TRAIN{target} : $TRAIN{parsed};
- $cachepipe->cmd("thrax-input-file",
- "paste $TRAIN{source} $target_file $ALIGNMENT | perl -pe 's/\\t/ ||| /g' | grep -v '()' | grep -v '||| \\+\$' > $DATA_DIRS{train}/thrax-input-file",
- $TRAIN{source}, $target_file, $ALIGNMENT,
- "$DATA_DIRS{train}/thrax-input-file");
+ # create the input file
+ my $target_file = ($GRAMMAR_TYPE eq "hiero")
+ ? $TRAIN{target} : $TRAIN{parsed};
+ $cachepipe->cmd("thrax-input-file",
+ "paste $TRAIN{source} $target_file $ALIGNMENT | perl -pe 's/\\t/ ||| /g' | grep -v '()' | grep -v '||| \\+\$' > $DATA_DIRS{train}/thrax-input-file",
+ $TRAIN{source}, $target_file, $ALIGNMENT,
+ "$DATA_DIRS{train}/thrax-input-file");
- # rollout the hadoop cluster if needed
- start_hadoop_cluster() unless defined $HADOOP;
+ # rollout the hadoop cluster if needed
+ start_hadoop_cluster() unless defined $HADOOP;
- # put the hadoop files in place
- my $THRAXDIR;
- my $thrax_input;
- if ($HADOOP eq "hadoop") {
- $THRAXDIR = "thrax";
+ # put the hadoop files in place
+ my $THRAXDIR;
+ my $thrax_input;
+ if ($HADOOP eq "hadoop") {
+ $THRAXDIR = "thrax";
- $thrax_input = "$DATA_DIRS{train}/thrax-input-file"
+ $thrax_input = "$DATA_DIRS{train}/thrax-input-file"
- } else {
- $THRAXDIR = "pipeline-$SOURCE-$TARGET-$GRAMMAR_TYPE-$RUNDIR";
- $THRAXDIR =~ s#/#_#g;
+ } else {
+ $THRAXDIR = "pipeline-$SOURCE-$TARGET-$GRAMMAR_TYPE-$RUNDIR";
+ $THRAXDIR =~ s#/#_#g;
- $cachepipe->cmd("thrax-prep",
- "$HADOOP/bin/hadoop fs -rmr $THRAXDIR; $HADOOP/bin/hadoop fs -mkdir $THRAXDIR; $HADOOP/bin/hadoop fs -put $DATA_DIRS{train}/thrax-input-file $THRAXDIR/input-file",
- "$DATA_DIRS{train}/thrax-input-file",
- "grammar.gz");
+ $cachepipe->cmd("thrax-prep",
+ "$HADOOP/bin/hadoop fs -rmr $THRAXDIR; $HADOOP/bin/hadoop fs -mkdir $THRAXDIR; $HADOOP/bin/hadoop fs -put $DATA_DIRS{train}/thrax-input-file $THRAXDIR/input-file",
+ "$DATA_DIRS{train}/thrax-input-file",
+ "grammar.gz");
- $thrax_input = "$THRAXDIR/input-file";
- }
+ $thrax_input = "$THRAXDIR/input-file";
+}
- # copy the thrax config file
- my $thrax_file = "thrax-$GRAMMAR_TYPE.conf";
- system("grep -v ^input-file $THRAX_CONF_FILE > $thrax_file.tmp");
- system("echo input-file $thrax_input >> $thrax_file.tmp");
- system("mv $thrax_file.tmp $thrax_file");
+ # copy the thrax config file
+ my $thrax_file = "thrax-$GRAMMAR_TYPE.conf";
+ system("grep -v ^input-file $THRAX_CONF_FILE > $thrax_file.tmp");
+ system("echo input-file $thrax_input >> $thrax_file.tmp");
+ system("mv $thrax_file.tmp $thrax_file");
- $cachepipe->cmd("thrax-run",
- "$HADOOP/bin/hadoop jar $THRAX/bin/thrax.jar -D mapred.child.java.opts='-Xmx$HADOOP_MEM' $thrax_file $THRAXDIR > thrax.log 2>&1; rm -f grammar grammar.gz; $HADOOP/bin/hadoop fs -getmerge $THRAXDIR/final/ grammar; $HADOOP/bin/hadoop fs -rmr $THRAXDIR; gzip -9nf grammar",
- "$DATA_DIRS{train}/thrax-input-file",
- $thrax_file,
- "grammar.gz");
+ $cachepipe->cmd("thrax-run",
+ "$HADOOP/bin/hadoop jar $THRAX/bin/thrax.jar -D mapred.child.java.opts='-Xmx$HADOOP_MEM' $thrax_file $THRAXDIR > thrax.log 2>&1; rm -f grammar grammar.gz; $HADOOP/bin/hadoop fs -getmerge $THRAXDIR/final/ grammar; $HADOOP/bin/hadoop fs -rmr $THRAXDIR; gzip -9nf grammar",
+ "$DATA_DIRS{train}/thrax-input-file",
+ $thrax_file,
+ "grammar.gz");
#perl -pi -e 's/\.?0+\b//g' grammar;
- stop_hadoop_cluster() if $HADOOP eq "hadoop";
+ stop_hadoop_cluster() if $HADOOP eq "hadoop";
- # cache the thrax-prep step, which depends on grammar.gz
- if ($HADOOP ne "hadoop") {
- $cachepipe->cmd("thrax-prep", "--cache-only");
- }
+ # cache the thrax-prep step, which depends on grammar.gz
+ if ($HADOOP ne "hadoop") {
+ $cachepipe->cmd("thrax-prep", "--cache-only");
+ }
- # clean up
- # TODO: clean up real hadoop clusters too
- if ($HADOOP eq "hadoop") {
- system("rm -rf $THRAXDIR hadoop hadoop-0.20.2");
- }
+ # clean up
+ # TODO: clean up real hadoop clusters too
+ if ($HADOOP eq "hadoop") {
+ system("rm -rf $THRAXDIR hadoop hadoop-0.20.2");
+ }
}
# set the grammar file
@@ -783,11 +783,11 @@ BEGIN
## TUNING ##############################################################
TUNE:
- ;
+ ;
MERT:
- ;
+ ;
PRO:
- ;
+ ;
# prep the tuning data, unless already prepped
if (! $PREPPED{TUNE} and $DO_PREPARE_CORPORA) {
my $prefixes = prepare_data("tune",[$TUNE]);
@@ -801,32 +801,32 @@ BEGIN
# make sure the training data is prepped
if (! $PREPPED{TRAIN} and $DO_PREPARE_CORPORA) {
- my $prefixes = prepare_data("train",\@CORPORA,$MAXLEN);
-
- $TRAIN{prefix} = "$DATA_DIRS{train}/corpus";
- foreach my $lang ($SOURCE,$TARGET) {
- system("ln -sf $prefixes->{lowercased}.$lang $DATA_DIRS{train}/corpus.$lang");
- }
- $TRAIN{source} = "$DATA_DIRS{train}/corpus.$SOURCE";
- $TRAIN{target} = "$DATA_DIRS{train}/corpus.$TARGET";
- $PREPPED{TRAIN} = 1;
+ my $prefixes = prepare_data("train",\@CORPORA,$MAXLEN);
+
+ $TRAIN{prefix} = "$DATA_DIRS{train}/corpus";
+ foreach my $lang ($SOURCE,$TARGET) {
+ system("ln -sf $prefixes->{lowercased}.$lang $DATA_DIRS{train}/corpus.$lang");
+ }
+ $TRAIN{source} = "$DATA_DIRS{train}/corpus.$SOURCE";
+ $TRAIN{target} = "$DATA_DIRS{train}/corpus.$TARGET";
+ $PREPPED{TRAIN} = 1;
}
if (! -e $TRAIN{target}) {
- print "* FATAL: I need either a language model (--lmfile) or a training corpus to build it from (--corpus)\n";
- exit(1);
+ print "* FATAL: I need either a language model (--lmfile) or a training corpus to build it from (--corpus)\n";
+ exit(1);
}
my $lmfile = "lm.gz";
if ($LM_GEN eq "srilm") {
- my $smoothing = ($WITTEN_BELL) ? "-wbdiscount" : "-kndiscount";
- $cachepipe->cmd("srilm",
- "$SRILM -interpolate $smoothing -order $LM_ORDER -text $TRAIN{target} -unk -lm lm.gz",
- $lmfile);
+ my $smoothing = ($WITTEN_BELL) ? "-wbdiscount" : "-kndiscount";
+ $cachepipe->cmd("srilm",
+ "$SRILM -interpolate $smoothing -order $LM_ORDER -text $TRAIN{target} -unk -lm lm.gz",
+ $lmfile);
} else {
- $cachepipe->cmd("berkeleylm",
- "java -ea -mx$BUILDLM_MEM -server -cp $JOSHUA/lib/berkeleylm.jar edu.berkeley.nlp.lm.io.MakeKneserNeyArpaFromText $LM_ORDER lm.gz $TRAIN{target}",
- $lmfile);
+ $cachepipe->cmd("berkeleylm",
+ "java -ea -mx$BUILDLM_MEM -server -cp $JOSHUA/lib/berkeleylm.jar edu.berkeley.nlp.lm.io.MakeKneserNeyArpaFromText $LM_ORDER lm.gz $TRAIN{target}",
+ $lmfile);
}
push (@LMFILES, $lmfile);
@@ -845,40 +845,40 @@ BEGIN
}
if ($numrefs > 1) {
for my $i (0..$numrefs-1) {
- if (! -e "$TUNE{target}.$i") {
- print STDERR "* FATAL: couldn't find tuning reference file '$TUNE{target}.$i'\n";
- exit 1;
- }
+ if (! -e "$TUNE{target}.$i") {
+ print STDERR "* FATAL: couldn't find tuning reference file '$TUNE{target}.$i'\n";
+ exit 1;
+ }
}
} else {
if (! -e $TUNE{target}) {
- print STDERR "* FATAL: couldn't find tuning reference file '$TUNE{target}'\n";
- exit 1;
+ print STDERR "* FATAL: couldn't find tuning reference file '$TUNE{target}'\n";
+ exit 1;
}
}
# filter the tuning grammar
my $TUNE_GRAMMAR = (defined $TUNE_GRAMMAR_FILE)
- ? $TUNE_GRAMMAR_FILE
- : $GRAMMAR_FILE;
+ ? $TUNE_GRAMMAR_FILE
+ : $GRAMMAR_FILE;
if ($DO_FILTER_TM and ! defined $TUNE_GRAMMAR_FILE) {
$TUNE_GRAMMAR = "$DATA_DIRS{tune}/grammar.filtered.gz";
$cachepipe->cmd("filter-tune",
- "$CAT $GRAMMAR_FILE | java -Xmx2g -Dfile.encoding=utf8 -cp $THRAX/bin/thrax.jar edu.jhu.thrax.util.TestSetFilter -v $TUNE{source} | $SCRIPTDIR/training/remove-unary-abstract.pl | gzip -9n > $TUNE_GRAMMAR",
- $GRAMMAR_FILE,
- $TUNE{source},
- $TUNE_GRAMMAR);
+ "$CAT $GRAMMAR_FILE | java -Xmx2g -Dfile.encoding=utf8 -cp $THRAX/bin/thrax.jar edu.jhu.thrax.util.TestSetFilter -v $TUNE{source} | $SCRIPTDIR/training/remove-unary-abstract.pl | gzip -9n > $TUNE_GRAMMAR",
+ $GRAMMAR_FILE,
+ $TUNE{source},
+ $TUNE_GRAMMAR);
}
# create the glue grammars
if (! defined $GLUE_GRAMMAR_FILE) {
$cachepipe->cmd("glue-tune",
- "$CAT $TUNE_GRAMMAR | java -Xmx2g -cp $THRAX/bin/thrax.jar:$JOSHUA/lib/hadoop-core-0.20.203.0.jar:$JOSHUA/lib/commons-logging-1.1.1.jar edu.jhu.thrax.util.CreateGlueGrammar $THRAX_CONF_FILE > $DATA_DIRS{tune}/grammar.glue",
- $TUNE_GRAMMAR,
- "$DATA_DIRS{tune}/grammar.glue");
+ "$CAT $TUNE_GRAMMAR | java -Xmx2g -cp $THRAX/bin/thrax.jar:$JOSHUA/lib/hadoop-core-0.20.203.0.jar:$JOSHUA/lib/commons-logging-1.1.1.jar edu.jhu.thrax.util.CreateGlueGrammar $THRAX_CONF_FILE > $DATA_DIRS{tune}/grammar.glue",
+ $TUNE_GRAMMAR,
+ "$DATA_DIRS{tune}/grammar.glue");
$GLUE_GRAMMAR_FILE = "$DATA_DIRS{tune}/grammar.glue";
} else {
# just create a symlink to it
@@ -912,62 +912,62 @@ BEGIN
system("mkdir -p $tunedir") unless -d $tunedir;
foreach my $key (keys %TUNEFILES) {
- my $file = $TUNEFILES{$key};
- open FROM, $file or die "can't find file '$file'";
- open TO, ">$tunedir/$key" or die "can't write to file '$tunedir/$key'";
- while (<FROM>) {
- s/<INPUT>/$TUNE{source}/g;
- s/<SOURCE>/$SOURCE/g;
- s/<RUNDIR>/$RUNDIR/g;
- s/<TARGET>/$TARGET/g;
- s/<LMLINES>/$lmlines/g;
- s/<LMWEIGHTS>/$lmweights/g;
- s/<LMPARAMS>/$lmparams/g;
- s/<LMFILE>/$LMFILES[0]/g;
- s/<LMTYPE>/$LM_TYPE/g;
- s/<MEM>/$JOSHUA_MEM/g;
- s/<GRAMMAR_TYPE>/$GRAMMAR_TYPE/g;
- s/<GRAMMAR_FILE>/$TUNE_GRAMMAR/g;
- s/<GLUE_GRAMMAR>/$GLUE_GRAMMAR_FILE/g;
- s/<OOV>/$OOV/g;
- s/<NUMJOBS>/$NUM_JOBS/g;
- s/<NUMTHREADS>/$NUM_THREADS/g;
- s/<QSUB_ARGS>/$QSUB_ARGS/g;
- s/<OUTPUT>/$tunedir\/tune.output.nbest/g;
- s/<REF>/$TUNE{target}/g;
- s/<JOSHUA>/$JOSHUA/g;
- s/<NUMREFS>/$numrefs/g;
- s/<CONFIG>/$tunedir\/joshua.config/g;
- s/<LOG>/$tunedir\/joshua.log/g;
- s/<TUNEDIR>/$tunedir/g;
- s/<MERTDIR>/$tunedir/g; # for backwards compatibility
- s/use_sent_specific_tm=.*/use_sent_specific_tm=0/g;
- print TO;
- }
- close(FROM);
- close(TO);
+ my $file = $TUNEFILES{$key};
+ open FROM, $file or die "can't find file '$file'";
+ open TO, ">$tunedir/$key" or die "can't write to file '$tunedir/$key'";
+ while (<FROM>) {
+ s/<INPUT>/$TUNE{source}/g;
+ s/<SOURCE>/$SOURCE/g;
+ s/<RUNDIR>/$RUNDIR/g;
+ s/<TARGET>/$TARGET/g;
+ s/<LMLINES>/$lmlines/g;
+ s/<LMWEIGHTS>/$lmweights/g;
+ s/<LMPARAMS>/$lmparams/g;
+ s/<LMFILE>/$LMFILES[0]/g;
+ s/<LMTYPE>/$LM_TYPE/g;
+ s/<MEM>/$JOSHUA_MEM/g;
+ s/<GRAMMAR_TYPE>/$GRAMMAR_TYPE/g;
+ s/<GRAMMAR_FILE>/$TUNE_GRAMMAR/g;
+ s/<GLUE_GRAMMAR>/$GLUE_GRAMMAR_FILE/g;
+ s/<OOV>/$OOV/g;
+ s/<NUMJOBS>/$NUM_JOBS/g;
+ s/<NUMTHREADS>/$NUM_THREADS/g;
+ s/<QSUB_ARGS>/$QSUB_ARGS/g;
+ s/<OUTPUT>/$tunedir\/tune.output.nbest/g;
+ s/<REF>/$TUNE{target}/g;
+ s/<JOSHUA>/$JOSHUA/g;
+ s/<NUMREFS>/$numrefs/g;
+ s/<CONFIG>/$tunedir\/joshua.config/g;
+ s/<LOG>/$tunedir\/joshua.log/g;
+ s/<TUNEDIR>/$tunedir/g;
+ s/<MERTDIR>/$tunedir/g; # for backwards compatibility
+ s/use_sent_specific_tm=.*/use_sent_specific_tm=0/g;
+ print TO;
+ }
+ close(FROM);
+ close(TO);
}
chmod(0755,"$tunedir/decoder_command");
# tune
if ($TUNER eq "mert") {
- $cachepipe->cmd("mert-$run",
- "java -d64 -Xmx2g -cp $JOSHUA/bin joshua.zmert.ZMERT -maxMem 4500 $tunedir/mert.config > $tunedir/mert.log 2>&1",
- $TUNE_GRAMMAR,
- "$tunedir/joshua.config.ZMERT.final",
- "$tunedir/decoder_command",
- "$tunedir/mert.config",
- "$tunedir/params.txt");
- system("ln -sf joshua.config.ZMERT.final $tunedir/joshua.config.final");
+ $cachepipe->cmd("mert-$run",
+ "java -d64 -Xmx2g -cp $JOSHUA/bin joshua.zmert.ZMERT -maxMem 4500 $tunedir/mert.config > $tunedir/mert.log 2>&1",
+ $TUNE_GRAMMAR,
+ "$tunedir/joshua.config.ZMERT.final",
+ "$tunedir/decoder_command",
+ "$tunedir/mert.config",
+ "$tunedir/params.txt");
+ system("ln -sf joshua.config.ZMERT.final $tunedir/joshua.config.final");
} elsif ($TUNER eq "pro") {
- $cachepipe->cmd("pro-$run",
- "java -d64 -Xmx2g -cp $JOSHUA/bin joshua.pro.PRO -maxMem 4500 $tunedir/pro.config > $tunedir/pro.log 2>&1",
- $TUNE_GRAMMAR,
- "$tunedir/joshua.config.PRO.final",
- "$tunedir/decoder_command",
- "$tunedir/pro.config",
- "$tunedir/params.txt");
- system("ln -sf joshua.config.PRO.final $tunedir/joshua.config.final");
+ $cachepipe->cmd("pro-$run",
+ "java -d64 -Xmx2g -cp $JOSHUA/bin joshua.pro.PRO -maxMem 4500 $tunedir/pro.config > $tunedir/pro.log 2>&1",
+ $TUNE_GRAMMAR,
+ "$tunedir/joshua.config.PRO.final",
+ "$tunedir/decoder_command",
+ "$tunedir/pro.config",
+ "$tunedir/params.txt");
+ system("ln -sf joshua.config.PRO.final $tunedir/joshua.config.final");
}
}
@@ -992,32 +992,32 @@ BEGIN
$TEST_GRAMMAR = $GRAMMAR_FILE;
if ($DO_FILTER_TM) {
- $TEST_GRAMMAR = "$DATA_DIRS{test}/grammar.filtered.gz";
+ $TEST_GRAMMAR = "$DATA_DIRS{test}/grammar.filtered.gz";
- $cachepipe->cmd("filter-test",
- "$SCRIPTDIR/training/scat $GRAMMAR_FILE | java -Xmx2g -Dfile.encoding=utf8 -cp $THRAX/bin/thrax.jar edu.jhu.thrax.util.TestSetFilter -v $TEST{source} | $SCRIPTDIR/training/remove-unary-abstract.pl | gzip -9n > $TEST_GRAMMAR",
- $GRAMMAR_FILE,
- $TEST{source},
- $TEST_GRAMMAR);
+ $cachepipe->cmd("filter-test",
+ "$SCRIPTDIR/training/scat $GRAMMAR_FILE | java -Xmx2g -Dfile.encoding=utf8 -cp $THRAX/bin/thrax.jar edu.jhu.thrax.util.TestSetFilter -v $TEST{source} | $SCRIPTDIR/training/remove-unary-abstract.pl | gzip -9n > $TEST_GRAMMAR",
+ $GRAMMAR_FILE,
+ $TEST{source},
+ $TEST_GRAMMAR);
}
}
# create the glue file
if (! defined $GLUE_GRAMMAR_FILE) {
$cachepipe->cmd("glue-test",
- "$SCRIPTDIR/training/scat $TEST_GRAMMAR | java -Xmx1g -cp $THRAX/bin/thrax.jar:$JOSHUA/lib/hadoop-core-0.20.203.0.jar:$JOSHUA/lib/commons-logging-1.1.1.jar edu.jhu.thrax.util.CreateGlueGrammar $THRAX_CONF_FILE > $DATA_DIRS{test}/grammar.glue",
- $TEST_GRAMMAR,
- "$DATA_DIRS{test}/grammar.glue");
+ "$SCRIPTDIR/training/scat $TEST_GRAMMAR | java -Xmx1g -cp $THRAX/bin/thrax.jar:$JOSHUA/lib/hadoop-core-0.20.203.0.jar:$JOSHUA/lib/commons-logging-1.1.1.jar edu.jhu.thrax.util.CreateGlueGrammar $THRAX_CONF_FILE > $DATA_DIRS{test}/grammar.glue",
+ $TEST_GRAMMAR,
+ "$DATA_DIRS{test}/grammar.glue");
$GLUE_GRAMMAR_FILE = "$DATA_DIRS{test}/grammar.glue";
} else {
# just create a symlink to it
my $filename = $DATA_DIRS{test} . "/" . basename($GLUE_GRAMMAR_FILE);
if ($GLUE_GRAMMAR_FILE =~ /^\//) {
- system("ln -sf $GLUE_GRAMMAR_FILE $filename");
+ system("ln -sf $GLUE_GRAMMAR_FILE $filename");
} else {
- system("ln -sf ../../$GLUE_GRAMMAR_FILE $filename");
+ system("ln -sf ../../$GLUE_GRAMMAR_FILE $filename");
}
}
@@ -1028,75 +1028,75 @@ BEGIN
system("mkdir -p $testrun") unless -d $testrun;
foreach my $key (qw(decoder_command)) {
- my $file = $TUNEFILES{$key};
- open FROM, $file or die "can't find file '$file'";
- open TO, ">$testrun/$key" or die "can't write to '$testrun/$key'";
- while (<FROM>) {
- s/<INPUT>/$TEST{source}/g;
- s/<NUMJOBS>/$NUM_JOBS/g;
- s/<NUMTHREADS>/$NUM_THREADS/g;
- s/<QSUB_ARGS>/$QSUB_ARGS/g;
- s/<OUTPUT>/$testrun\/test.output.nbest/g;
- s/<JOSHUA>/$JOSHUA/g;
- s/<NUMREFS>/$numrefs/g;
- s/<SOURCE>/$SOURCE/g;
- s/<TARGET>/$TARGET/g;
- s/<RUNDIR>/$TARGET/g;
- s/<LMFILE>/$LMFILES[0]/g;
- s/<MEM>/$JOSHUA_MEM/g;
- s/<GRAMMAR_TYPE>/$GRAMMAR_TYPE/g;
- s/<GRAMMAR_FILE>/$TEST_GRAMMAR/g;
- s/<GLUE_GRAMMAR>/$GLUE_GRAMMAR_FILE/g;
- s/<OOV>/$OOV/g;
- s/<CONFIG>/$testrun\/joshua.config/g;
- s/<LOG>/$testrun\/joshua.log/g;
-
- print TO;
- }
- close(FROM);
- close(TO);
+ my $file = $TUNEFILES{$key};
+ open FROM, $file or die "can't find file '$file'";
+ open TO, ">$testrun/$key" or die "can't write to '$testrun/$key'";
+ while (<FROM>) {
+ s/<INPUT>/$TEST{source}/g;
+ s/<NUMJOBS>/$NUM_JOBS/g;
+ s/<NUMTHREADS>/$NUM_THREADS/g;
+ s/<QSUB_ARGS>/$QSUB_ARGS/g;
+ s/<OUTPUT>/$testrun\/test.output.nbest/g;
+ s/<JOSHUA>/$JOSHUA/g;
+ s/<NUMREFS>/$numrefs/g;
+ s/<SOURCE>/$SOURCE/g;
+ s/<TARGET>/$TARGET/g;
+ s/<RUNDIR>/$TARGET/g;
+ s/<LMFILE>/$LMFILES[0]/g;
+ s/<MEM>/$JOSHUA_MEM/g;
+ s/<GRAMMAR_TYPE>/$GRAMMAR_TYPE/g;
+ s/<GRAMMAR_FILE>/$TEST_GRAMMAR/g;
+ s/<GLUE_GRAMMAR>/$GLUE_GRAMMAR_FILE/g;
+ s/<OOV>/$OOV/g;
+ s/<CONFIG>/$testrun\/joshua.config/g;
+ s/<LOG>/$testrun\/joshua.log/g;
+
+ print TO;
+ }
+ close(FROM);
+ close(TO);
}
chmod(0755,"$testrun/decoder_command");
# copy the config file over
my $tunedir = (defined $NAME) ? "tune/$NAME/$run" : "tune/$run";
$cachepipe->cmd("test-joshua-config-from-tune-$run",
- "cat $tunedir/joshua.config.final | perl -pe 's#tune/#test/#; s/mark_oovs=false/mark_oovs=true/; s/use_sent_specific_tm=.*/use_sent_specific_tm=0/; s/keep_sent_specific_tm=true/keep_sent_specific_tm=false/' > $testrun/joshua.config",
- "$tunedir/joshua.config.final",
- "$testrun/joshua.config");
+ "cat $tunedir/joshua.config.final | perl -pe 's#tune/#test/#; s/mark_oovs=false/mark_oovs=true/; s/use_sent_specific_tm=.*/use_sent_specific_tm=0/; s/keep_sent_specific_tm=true/keep_sent_specific_tm=false/' > $testrun/joshua.config",
+ "$tunedir/joshua.config.final",
+ "$testrun/joshua.config");
$cachepipe->cmd("test-decode-$run",
- "./$testrun/decoder_command",
- "$testrun/decoder_command",
- "$DATA_DIRS{test}/grammar.glue",
- $TEST_GRAMMAR,
- "$testrun/test.output.nbest");
+ "./$testrun/decoder_command",
+ "$testrun/decoder_command",
+ "$DATA_DIRS{test}/grammar.glue",
+ $TEST_GRAMMAR,
+ "$testrun/test.output.nbest");
$cachepipe->cmd("remove-oov-$run",
- "cat $testrun/test.output.nbest | perl -pe 's/_OOV//g' > $testrun/test.output.nbest.noOOV",
- "$testrun/test.output.nbest",
- "$testrun/test.output.nbest.noOOV");
+ "cat $testrun/test.output.nbest | perl -pe 's/_OOV//g' > $testrun/test.output.nbest.noOOV",
+ "$testrun/test.output.nbest",
+ "$testrun/test.output.nbest.noOOV");
if ($DO_MBR) {
- my $numlines = `cat $TEST{source} | wc -l`;
- $numlines--;
+ my $numlines = `cat $TEST{source} | wc -l`;
+ $numlines--;
- $cachepipe->cmd("test-onebest-parmbr-$run",
- "cat $testrun/test.output.nbest.noOOV | java -Xmx1700m -cp $JOSHUA/bin -Dfile.encoding=utf8 joshua.decoder.NbestMinRiskReranker false 1 > $testrun/test.output.1best",
- "$testrun/test.output.nbest.noOOV",
- "$testrun/test.output.1best");
+ $cachepipe->cmd("test-onebest-parmbr-$run",
+ "cat $testrun/test.output.nbest.noOOV | java -Xmx1700m -cp $JOSHUA/bin -Dfile.encoding=utf8 joshua.decoder.NbestMinRiskReranker false 1 > $testrun/test.output.1best",
+ "$testrun/test.output.nbest.noOOV",
+ "$testrun/test.output.1best");
} else {
- $cachepipe->cmd("test-extract-onebest-$run",
- "java -Xmx500m -cp $JOSHUA/bin -Dfile.encoding=utf8 joshua.util.ExtractTopCand $testrun/test.output.nbest $testrun/test.output.1best",
- "$testrun/test.output.nbest.noOOV",
- "$testrun/test.output.1best");
+ $cachepipe->cmd("test-extract-onebest-$run",
+ "java -Xmx500m -cp $JOSHUA/bin -Dfile.encoding=utf8 joshua.util.ExtractTopCand $testrun/test.output.nbest $testrun/test.output.1best",
+ "$testrun/test.output.nbest.noOOV",
+ "$testrun/test.output.1best");
}
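Both branches reduce the n-best list to one hypothesis per sentence: the MBR path reranks, the plain path takes the top candidate. A minimal stand-in for the latter (the pipeline itself uses joshua.util.ExtractTopCand), assuming the usual "id ||| hypothesis ||| features ||| score" n-best layout with candidates grouped by sentence id, best first:

    use strict; use warnings;

    # print the first (top-scoring) hypothesis for each sentence id
    my %seen;
    while (<>) {
      chomp;
      my ($id, $hyp) = (split /\s+\|\|\|\s+/)[0, 1];
      print "$hyp\n" unless $seen{$id}++;
    }

Run as: perl extract_onebest.pl test.output.nbest.noOOV > test.output.1best.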
$numrefs = get_numrefs($TEST{target});
$cachepipe->cmd("test-bleu-$run",
- "java -cp $JOSHUA/bin -Dfile.encoding=utf8 -Djava.library.path=lib -Xmx1000m -Xms1000m -Djava.util.logging.config.file=logging.properties joshua.util.JoshuaEval -cand $testrun/test.output.1best -ref $TEST{target} -rps $numrefs -m BLEU 4 closest > $testrun/test.output.1best.bleu",
- "$testrun/test.output.1best",
- "$testrun/test.output.1best.bleu");
+ "java -cp $JOSHUA/bin -Dfile.encoding=utf8 -Djava.library.path=lib -Xmx1000m -Xms1000m -Djava.util.logging.config.file=logging.properties joshua.util.JoshuaEval -cand $testrun/test.output.1best -ref $TEST{target} -rps $numrefs -m BLEU 4 closest > $testrun/test.output.1best.bleu",
+ "$testrun/test.output.1best",
+ "$testrun/test.output.1best.bleu");
# system("cat $testrun/test.output.1best.bleu");
}
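get_numrefs() is one of the subroutines defined further down; a plausible sketch of its behavior, assuming the convention that multiple references live in numbered files named $target.0, $target.1, and so on:

    sub get_numrefs {
      my ($prefix) = @_;
      if (-e "$prefix.0") {          # numbered reference files present
        my $numrefs = 0;
        $numrefs++ while -e "$prefix.$numrefs";
        return $numrefs;
      }
      return 1;                      # otherwise a single reference file
    }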
@@ -1126,7 +1126,7 @@ BEGIN
TEST:
-system("mkdir -p $DATA_DIRS{test}") unless -d $DATA_DIRS{test};
+ system("mkdir -p $DATA_DIRS{test}") unless -d $DATA_DIRS{test};
if (! defined $NAME) {
print "* FATAL: for direct tests, you must specify a unique run name\n";
@@ -1157,22 +1157,22 @@ BEGIN
$TEST_GRAMMAR = $GRAMMAR_FILE;
if ($DO_FILTER_TM) {
- $TEST_GRAMMAR = "$DATA_DIRS{test}/grammar.filtered.gz";
+ $TEST_GRAMMAR = "$DATA_DIRS{test}/grammar.filtered.gz";
- $cachepipe->cmd("filter-test-$NAME",
- "$CAT $GRAMMAR_FILE | java -Xmx2g -Dfile.encoding=utf8 -cp $THRAX/bin/thrax.jar edu.jhu.thrax.util.TestSetFilter -v $TEST{source} | $SCRIPTDIR/training/remove-unary-abstract.pl | gzip -9n > $TEST_GRAMMAR",
- $GRAMMAR_FILE,
- $TEST{source},
- $TEST_GRAMMAR);
+ $cachepipe->cmd("filter-test-$NAME",
+ "$CAT $GRAMMAR_FILE | java -Xmx2g -Dfile.encoding=utf8 -cp $THRAX/bin/thrax.jar edu.jhu.thrax.util.TestSetFilter -v $TEST{source} | $SCRIPTDIR/training/remove-unary-abstract.pl | gzip -9n > $TEST_GRAMMAR",
+ $GRAMMAR_FILE,
+ $TEST{source},
+ $TEST_GRAMMAR);
}
}
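TestSetFilter prunes the grammar to rules that can actually apply to the test input, which keeps decoding memory manageable. The gist, as a coarse vocabulary-level approximation (the real filter matches source phrases against the test sentences and is considerably more precise), assuming Thrax's "LHS ||| source ||| target ||| features" rule format:

    use strict; use warnings;

    # usage: filter_grammar.pl TESTFILE < grammar > grammar.filtered
    my $testfile = shift @ARGV or die "usage: $0 TESTFILE < grammar\n";
    my %vocab;
    open my $fh, '<', $testfile or die "can't read $testfile: $!";
    while (<$fh>) { $vocab{$_} = 1 for split }
    close $fh;

    RULE: while (<STDIN>) {
      my $source = (split /\s+\|\|\|\s+/)[1];
      for my $tok (split ' ', $source) {
        next if $tok =~ /^\[.+\]$/;      # nonterminal such as [X,1]
        next RULE unless $vocab{$tok};   # unseen terminal: drop the rule
      }
      print;
    }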
# build the glue grammar if needed
if (! defined $GLUE_GRAMMAR_FILE) {
$cachepipe->cmd("glue-test-$NAME",
- "$CAT $TEST_GRAMMAR | java -Xmx2g -cp $THRAX/bin/thrax.jar:$JOSHUA/lib/hadoop-core-0.20.203.0.jar:$JOSHUA/lib/commons-logging-1.1.1.jar edu.jhu.thrax.util.CreateGlueGrammar $THRAX_CONF_FILE > $DATA_DIRS{test}/grammar.glue",
- $TEST_GRAMMAR,
- "$DATA_DIRS{test}/grammar.glue");
+ "$CAT $TEST_GRAMMAR | java -Xmx2g -cp $THRAX/bin/thrax.jar:$JOSHUA/lib/hadoop-core-0.20.203.0.jar:$JOSHUA/lib/commons-logging-1.1.1.jar edu.jhu.thrax.util.CreateGlueGrammar $THRAX_CONF_FILE > $DATA_DIRS{test}/grammar.glue",
+ $TEST_GRAMMAR,
+ "$DATA_DIRS{test}/grammar.glue");
$GLUE_GRAMMAR_FILE = "$DATA_DIRS{test}/grammar.glue";
}
@@ -1193,43 +1193,43 @@ BEGIN
# decode
$cachepipe->cmd("test-$NAME-decode-run",
- "./$testrun/decoder_command",
- "$testrun/decoder_command",
- $TEST_GRAMMAR,
- $GLUE_GRAMMAR_FILE,
- "$testrun/test.output.nbest");
+ "./$testrun/decoder_command",
+ "$testrun/decoder_command",
+ $TEST_GRAMMAR,
+ $GLUE_GRAMMAR_FILE,
+ "$testrun/test.output.nbest");
$cachepipe->cmd("test-$NAME-remove-oov",
- "cat $testrun/test.output.nbest | perl -pe 's/_OOV//g' > $testrun/test.output.nbest.noOOV",
- "$testrun/test.output.nbest",
- "$testrun/test.output.nbest.noOOV");
+ "cat $testrun/test.output.nbest | perl -pe 's/_OOV//g' > $testrun/test.output.nbest.noOOV",
+ "$testrun/test.output.nbest",
+ "$testrun/test.output.nbest.noOOV");
if ($DO_MBR) {
$cachepipe->cmd("test-$NAME-onebest-parmbr",
- "cat $testrun/test.output.nbest.noOOV | java -Xmx1700m -cp $JOSHUA/bin -Dfile.encoding=utf8 joshua.decoder.NbestMinRiskReranker false 1 > $testrun/test.output.1best",
- "$testrun/test.output.nbest.noOOV",
- "$testrun/test.output.1best");
+ "cat $testrun/test.output.nbest.noOOV | java -Xmx1700m -cp $JOSHUA/bin -Dfile.encoding=utf8 joshua.decoder.NbestMinRiskReranker false 1 > $testrun/test.output.1best",
+ "$testrun/test.output.nbest.noOOV",
+ "$testrun/test.output.1best");
} else {
$cachepipe->cmd("test-$NAME-extract-onebest",
- "java -Xmx500m -cp $JOSHUA/bin -Dfile.encoding=utf8 joshua.util.ExtractTopCand $testrun/test.output.nbest $testrun/test.output.1best",
- "$testrun/test.output.nbest.noOOV",
- "$testrun/test.output.1best");
+ "java -Xmx500m -cp $JOSHUA/bin -Dfile.encoding=utf8 joshua.util.ExtractTopCand $testrun/test.output.nbest $testrun/test.output.1best",
+ "$testrun/test.output.nbest.noOOV",
+ "$testrun/test.output.1best");
}
$numrefs = get_numrefs($TEST{target});
$cachepipe->cmd("$NAME-test-bleu",
- "java -cp $JOSHUA/bin -Dfile.encoding=utf8 -Djava.library.path=lib -Xmx1000m -Xms1000m -Djava.util.logging.config.file=logging.properties joshua.util.JoshuaEval -cand $testrun/test.output.1best -ref $TEST{target} -rps $numrefs -m BLEU 4 closest > $testrun/test.output.1best.bleu",
- "$testrun/test.output.1best",
- "$testrun/test.output.1best.bleu");
+ "java -cp $JOSHUA/bin -Dfile.encoding=utf8 -Djava.library.path=lib -Xmx1000m -Xms1000m -Djava.util.logging.config.file=logging.properties joshua.util.JoshuaEval -cand $testrun/test.output.1best -ref $TEST{target} -rps $numrefs -m BLEU 4 closest > $testrun/test.output.1best.bleu",
+ "$testrun/test.output.1best",
+ "$testrun/test.output.1best.bleu");
system("cat $testrun/test.output.1best.bleu");
-
+
######################################################################
## SUBROUTINES #######################################################
######################################################################
LAST:
-1;
+ 1;
# Does tokenization and normalization of training, tuning, and test data.
# $label: one of train, tune, or test
@@ -1245,63 +1245,70 @@ sub prepare_data {
my %prefixes;
# copy the data from its original location to our location
+ my $numlines = -1;
foreach my $ext ($TARGET,$SOURCE,"$TARGET.0","$TARGET.1","$TARGET.2","$TARGET.3") {
- # append each extension to the corpora prefixes
- my @files = map { "$_.$ext" } @$corpora;
- # a list of all the files (in case of multiple corpora prefixes)
- my $files = join(" ",@files);
- if (-e $files[0]) {
- $cachepipe->cmd("$label-copy-$ext",
- "cat $files | gzip -9n > $DATA_DIRS{$label}/$label.$ext.gz",
- @files, "$DATA_DIRS{$label}/$label.$ext.gz");
- }
+ # append each extension to the corpora prefixes
+ my @files = map { "$_.$ext" } @$corpora;
+ # a list of all the files (in case of multiple corpora prefixes)
+ my $files = join(" ",@files);
+ if (-e $files[0]) {
+ $cachepipe->cmd("$label-copy-$ext",
+ "cat $files | gzip -9n > $DATA_DIRS{$label}/$label.$ext.gz",
+ @files, "$DATA_DIRS{$label}/$label.$ext.gz");
+ # check line counts only for files that exist; the numbered reference
+ # extensions ($TARGET.0 through $TARGET.3) are optional, and counting a
+ # missing file would report a spurious mismatch
+ chomp(my $lines = `$CAT $DATA_DIRS{$label}/$label.$ext.gz | wc -l`);
+ $numlines = $lines if ($numlines == -1);
+ if ($lines != $numlines) {
+ print STDERR "* FATAL: $DATA_DIRS{$label}/$label.$ext.gz has a different number of lines ($lines) than a 'parallel' file that preceded it ($numlines)\n";
+ exit(1);
+ }
+ }
}
my $prefix = "$label";
# tokenize the data
foreach my $lang ($TARGET,$SOURCE,"$TARGET.0","$TARGET.1","$TARGET.2","$TARGET.3") {
- if (-e "$DATA_DIRS{$label}/$prefix.$lang.gz") {
- if (is_lattice("$DATA_DIRS{$label}/$prefix.$lang.gz")) {
- system("cp $DATA_DIRS{$label}/$prefix.$lang.gz $DATA_DIRS{$label}/$prefix.tok.$lang.gz");
- } else {
- $cachepipe->cmd("$label-tokenize-$lang",
- "$CAT $DATA_DIRS{$label}/$prefix.$lang.gz | $NORMALIZER $lang | $TOKENIZER -l $lang 2> /dev/null | gzip -9n > $DATA_DIRS{$label}/$prefix.tok.$lang.gz",
- "$DATA_DIRS{$label}/$prefix.$lang.gz", "$DATA_DIRS{$label}/$prefix.tok.$lang.gz");
- }
-
- }
+ if (-e "$DATA_DIRS{$label}/$prefix.$lang.gz") {
+ if (is_lattice("$DATA_DIRS{$label}/$prefix.$lang.gz")) {
+ system("cp $DATA_DIRS{$label}/$prefix.$lang.gz $DATA_DIRS{$label}/$prefix.tok.$lang.gz");
+ } else {
+ $cachepipe->cmd("$label-tokenize-$lang",
+ "$CAT $DATA_DIRS{$label}/$prefix.$lang.gz | $NORMALIZER $lang | $TOKENIZER -l $lang 2> /dev/null | gzip -9n > $DATA_DIRS{$label}/$prefix.tok.$lang.gz",
+ "$DATA_DIRS{$label}/$prefix.$lang.gz", "$DATA_DIRS{$label}/$prefix.tok.$lang.gz");
+ }
+
+ }
}
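Lattice inputs are copied through untouched because the tokenizer would mangle the PLF syntax. is_lattice() is defined elsewhere in this script; an illustrative reimplementation, assuming PLF lattices are recognizable by a leading "(((":

    sub is_lattice {
      my ($file) = @_;
      # gzip -f passes plain (non-gzipped) files through unchanged
      open my $fh, "gzip -cdf $file |" or die "can't read $file: $!";
      my $line = <$fh>;
      close $fh;
      return defined($line) && $line =~ /^\s*\(\(\(/;
    }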
# extend the prefix
$prefix .= ".tok";
$prefixes{tokenized} = $prefix;
if ($label eq "train" and $maxlen > 0) {
- # trim training data
- $cachepipe->cmd("train-trim",
- "paste <(gzip -cd $DATA_DIRS{$label}/$prefix.$TARGET.gz) <(gzip -cd $DATA_DIRS{$label}/$prefix.$SOURCE.gz) | $SCRIPTDIR/training/trim_parallel_corpus.pl $maxlen | $SCRIPTDIR/training/split2files.pl $DATA_DIRS{$label}/$prefix.$maxlen.$TARGET.gz $DATA_DIRS{$label}/$prefix.$maxlen.$SOURCE.gz",
- "$DATA_DIRS{$label}/$prefix.$TARGET.gz",
- "$DATA_DIRS{$label}/$prefix.$SOURCE.gz",
- "$DATA_DIRS{$label}/$prefix.$maxlen.$TARGET.gz",
- "$DATA_DIRS{$label}/$prefix.$maxlen.$SOURCE.gz",
- );
- $prefix .= ".$maxlen";
+ # trim training data
+ $cachepipe->cmd("train-trim",
+ "paste <(gzip -cd $DATA_DIRS{$label}/$prefix.$TARGET.gz) <(gzip -cd $DATA_DIRS{$label}/$prefix.$SOURCE.gz) | $SCRIPTDIR/training/trim_parallel_corpus.pl $maxlen | $SCRIPTDIR/training/split2files.pl $DATA_DIRS{$label}/$prefix.$maxlen.$TARGET.gz $DATA_DIRS{$label}/$prefix.$maxlen.$SOURCE.gz",
+ "$DATA_DIRS{$label}/$prefix.$TARGET.gz",
+ "$DATA_DIRS{$label}/$prefix.$SOURCE.gz",
+ "$DATA_DIRS{$label}/$prefix.$maxlen.$TARGET.gz",
+ "$DATA_DIRS{$label}/$prefix.$maxlen.$SOURCE.gz",
+ );
+ $prefix .= ".$maxlen";
}
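The train-trim step pastes the two sides of the corpus together, drops every pair where either side exceeds $maxlen tokens, and splits the survivors back into per-language files. The length filter in the middle as a self-contained sketch (trim_parallel_corpus.pl itself may differ), reading the tab-separated pairs that paste produces:

    use strict; use warnings;

    # usage: paste target source | trim.pl MAXLEN
    my $maxlen = shift @ARGV or die "usage: $0 MAXLEN\n";
    while (<STDIN>) {
      chomp;
      my ($target, $source) = split /\t/, $_, 2;
      next unless defined $source;         # skip malformed lines
      my @t = split ' ', $target;
      my @s = split ' ', $source;
      print "$_\n" if @t <= $maxlen && @s <= $maxlen;
    }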
# record this whether we shortened or not
$prefixes{shortened} = $prefix;
# lowercase