Browse files

Added alignment storage. Changed PackedGrammar to use IntBuffer views on source and target. Some cleanup.
  • Loading branch information...
1 parent ef4b0da commit 98acd2bdc56947b6a892dee3590723f348a44398 @jganitkevitch jganitkevitch committed Mar 7, 2012
Showing with 492 additions and 320 deletions.
  1. +2 −11 logging.properties
  2. +135 −86 src/joshua/decoder/ff/tm/packed/PackedGrammar.java
  3. +278 −186 src/joshua/tools/GrammarPacker.java
  4. BIN test/bn-en/packed/grammar_packed/chunk_00000.data
  5. BIN test/bn-en/packed/grammar_packed/chunk_00000.source
  6. BIN test/bn-en/packed/grammar_packed/chunk_00000.target
  7. BIN test/bn-en/packed/grammar_packed/chunk_00000.target.lookup
  8. BIN test/bn-en/packed/grammar_packed/chunk_00001.data
  9. BIN test/bn-en/packed/grammar_packed/chunk_00001.source
  10. BIN test/bn-en/packed/grammar_packed/chunk_00001.target
  11. BIN test/bn-en/packed/grammar_packed/chunk_00001.target.lookup
  12. BIN test/bn-en/packed/grammar_packed/chunk_00002.data
  13. BIN test/bn-en/packed/grammar_packed/chunk_00002.source
  14. BIN test/bn-en/packed/grammar_packed/chunk_00002.target
  15. BIN test/bn-en/packed/grammar_packed/chunk_00002.target.lookup
  16. BIN test/bn-en/packed/grammar_packed/chunk_00003.data
  17. BIN test/bn-en/packed/grammar_packed/chunk_00003.source
  18. BIN test/bn-en/packed/grammar_packed/chunk_00003.target
  19. BIN test/bn-en/packed/grammar_packed/chunk_00003.target.lookup
  20. BIN test/bn-en/packed/grammar_packed/chunk_00004.data
  21. BIN test/bn-en/packed/grammar_packed/chunk_00004.source
  22. BIN test/bn-en/packed/grammar_packed/chunk_00004.target
  23. BIN test/bn-en/packed/grammar_packed/chunk_00004.target.lookup
  24. BIN test/bn-en/packed/grammar_packed/slice_00000.features
  25. BIN test/bn-en/packed/grammar_packed/slice_00000.source
  26. BIN test/bn-en/packed/grammar_packed/slice_00000.target
  27. BIN test/bn-en/packed/grammar_packed/slice_00000.target.lookup
  28. +1 −1 test/bn-en/packed/packer.quantized
  29. +1 −1 test/bn-en/packed/packer.uncompressed
  30. +75 −35 test/packed-grammar/PrintRules.java
View
13 logging.properties
@@ -33,19 +33,13 @@ java.util.logging.FileHandler.limit=1073741824
# To turn off all logging for a class, set to INFO; to turn all on, set to FINEST
joshua.corpus.lexprob.SampledLexProbs.level=INFO
joshua.corpus.lexprob.LexProbs.level=INFO
-joshua.corpus.suffix_array.FrequentMatches.level=INFO
-joshua.corpus.suffix_array.FrequentPhrases.level=INFO
-joshua.corpus.suffix_array.AbstractSuffixArray.level=INFO
-joshua.corpus.suffix_array.SuffixArray.level=INFO
-joshua.corpus.vocab.SrilmSymbol.level=INFO
-joshua.corpus.vocab.Vocabulary.level=INFO
joshua.decoder.InputHandler.level=INFO
joshua.decoder.DecoderThread.level=INFO
joshua.decoder.JoshuaDecoder.level=INFO
joshua.decoder.chart_parser.DotChart.level=INFO
joshua.decoder.chart_parser.Chart.level=INFO
-joshua.decoder.chart_parser.Cell.level=FINEST
-joshua.decoder.hypergraph.HGNode.level=FINEST
+joshua.decoder.chart_parser.Cell.level=INFO
+joshua.decoder.hypergraph.HGNode.level=INFO
joshua.decoder.hypergraph.KBestExtractor.level=INFO
joshua.decoder.Decoder.level=INFO
joshua.decoder.ff.PhraseModelFF.level=INFO
@@ -59,9 +53,6 @@ joshua.decoder.ff.tm.MonolingualRule.level=INFO
joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar.level=INFO
joshua.decoder.ff.tm.TMGrammar_Memory.level=INFO
joshua.lattice.Lattice.level=INFO
-joshua.prefix_tree.PrefixTree.level=FINE
-joshua.prefix_tree.ExtractRules.level=INFO
-joshua.prefix_tree.HierarchicalRuleExtractor.level=INFO
joshua.ui.alignment.GridPanel.level=INFO
joshua.ui.alignment.GridScrollPanelHeader.level=INFO
joshua.ui.alignment.GridViewer.level=INFO
View
221 src/joshua/decoder/ff/tm/packed/PackedGrammar.java
@@ -1,10 +1,13 @@
package joshua.decoder.ff.tm.packed;
+import java.io.BufferedInputStream;
+import java.io.DataInputStream;
import java.io.File;
+import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
-import java.nio.ByteBuffer;
+import java.nio.IntBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.FileChannel.MapMode;
@@ -43,6 +46,8 @@
private PackedRoot root;
private ArrayList<PackedSlice> slices;
+
+ private final float maxId;
public PackedGrammar(String grammar_directory, int span_limit)
throws FileNotFoundException, IOException {
@@ -52,7 +57,8 @@ public PackedGrammar(String grammar_directory, int span_limit)
logger.info("Reading vocabulary: " +
grammar_directory + File.separator + "vocabulary");
Vocabulary.read(grammar_directory + File.separator + "vocabulary");
-
+ maxId = (float) Vocabulary.size();
+
// Read the quantizer setup.
logger.info("Reading quantization configuration: " +
grammar_directory + File.separator + "quantization");
@@ -69,7 +75,7 @@ public PackedGrammar(String grammar_directory, int span_limit)
String[] listing = new File(grammar_directory).list();
slices = new ArrayList<PackedSlice>();
for (int i = 0; i < listing.length; i++) {
- if (listing[i].startsWith("chunk_") && listing[i].endsWith(".source"))
+ if (listing[i].startsWith("slice_") && listing[i].endsWith(".source"))
slices.add(new PackedSlice(grammar_directory + File.separator +
listing[i].substring(0, 11)));
}
@@ -123,7 +129,7 @@ public boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength) {
public int getNumRules() {
int num_rules = 0;
for (PackedSlice ps : slices)
- num_rules += ps.dataSize;
+ num_rules += ps.featureSize;
return num_rules;
}
@@ -172,22 +178,22 @@ public PackedTrie(PackedSlice grammar, int position, int[] parent_src,
arity++;
}
- public Trie match(int token_id) {
- int num_children = grammar.source.getInt(position);
+ public final Trie match(int token_id) {
+ int num_children = grammar.source.get(position);
if (num_children == 0)
return null;
- if (num_children == 1 && token_id == grammar.source.getInt(position + 4))
- return new PackedTrie(grammar, grammar.source.getInt(position + 8),
+ if (num_children == 1 && token_id == grammar.source.get(position + 1))
+ return new PackedTrie(grammar, grammar.source.get(position + 2),
src, arity, token_id);
int top = 0;
int bottom = num_children - 1;
while (true) {
int candidate = (top + bottom) / 2;
- int candidate_position = position + 4 + 8 * candidate;
- int read_token = grammar.source.getInt(candidate_position);
+ int candidate_position = position + 1 + 2 * candidate;
+ int read_token = grammar.source.get(candidate_position);
if (read_token == token_id) {
return new PackedTrie(grammar,
- grammar.source.getInt(candidate_position + 4),
+ grammar.source.get(candidate_position + 1),
src, arity, token_id);
} else if (top == bottom) {
return null;
@@ -200,88 +206,128 @@ public Trie match(int token_id) {
return null;
}
}
+
+// public final Trie match(final int token_id) {
+// final int num_children = grammar.source.get(position);
+// final int offset = position + 1;
+//
+// if (num_children == 0)
+// return null;
+// if (num_children == 1 && token_id == grammar.source.get(position + 1))
+// return new PackedTrie(grammar, grammar.source.get(position + 2),
+// src, arity, token_id);
+// int top = 0;
+// int bottom = num_children - 1;
+//
+// int top_token, bottom_token;
+// int candidate, candidate_position, candidate_token;
+// while (true) {
+// top_token = grammar.source.get(offset + 2 * top);
+// bottom_token = grammar.source.get(offset + 2 * bottom);
+// candidate = (int) ((bottom_token - token_id) / (float) (top_token - bottom_token)) * (bottom - top);
+// candidate_position = offset + 2 * candidate;
+// candidate_token = grammar.source.get(candidate_position);
+//
+// logger.info("[" + top + " - " + candidate + " - " + bottom + "]");
+// logger.info("{" + top_token + " - " + candidate_token + " - " + bottom_token + "}");
+//
+// if (candidate_token == token_id) {
+// return new PackedTrie(grammar,
+// grammar.source.get(candidate_position + 1),
+// src, arity, token_id);
+// } else if (top == bottom) {
+// return null;
+// } else if (candidate_token > token_id) {
+// top = candidate + 1;
+// } else {
+// bottom = candidate - 1;
+// }
+// if (bottom < top)
+// return null;
+// }
+// }
public boolean hasExtensions() {
- return (grammar.source.getInt(position) != 0);
+ return (grammar.source.get(position) != 0);
}
public Collection<? extends Trie> getExtensions() {
- int num_children = grammar.source.getInt(position);
+ int num_children = grammar.source.get(position);
ArrayList<PackedTrie> tries = new ArrayList<PackedTrie>(num_children);
for (int i = 0; i < num_children; i++) {
- int symbol = grammar.source.getInt(position + 4 + 8 * i);
- int address = grammar.source.getInt(position + 8 + 8 * i);
+ int symbol = grammar.source.get(position + 1 + 2 * i);
+ int address = grammar.source.get(position + 2 + 2 * i);
tries.add(new PackedTrie(grammar, address, src, arity, symbol));
}
return tries;
}
public boolean hasRules() {
- int num_children = grammar.source.getInt(position);
- return (grammar.source.getInt(position + 4 + 8 * num_children) != 0);
+ int num_children = grammar.source.get(position);
+ return (grammar.source.get(position + 1 + 2 * num_children) != 0);
}
public RuleCollection getRuleCollection() {
return this;
}
public List<Rule> getRules() {
- int num_children = grammar.source.getInt(position);
- int rule_position = position + 8 * (num_children + 1);
- int num_rules = grammar.source.getInt(rule_position - 4);
+ int num_children = grammar.source.get(position);
+ int rule_position = position + 2 * (num_children + 1);
+ int num_rules = grammar.source.get(rule_position - 1);
ArrayList<Rule> rules = new ArrayList<Rule>(num_rules);
for (int i = 0; i < num_rules; i++)
- rules.add(grammar.assembleRule(rule_position + 12 * i, src, arity));
+ rules.add(grammar.assembleRule(rule_position + 3 * i, src, arity));
return rules;
}
@Override
public void sortRules(List<FeatureFunction> models) {
- int num_children = grammar.source.getInt(position);
- int rule_position = position + 8 * (num_children + 1);
- int num_rules = grammar.source.getInt(rule_position - 4);
+ int num_children = grammar.source.get(position);
+ int rule_position = position + 2 * (num_children + 1);
+ int num_rules = grammar.source.get(rule_position - 1);
Integer[] rules = new Integer[num_rules];
int target_address;
int block_id;
for (int i = 0; i < num_rules; i++) {
- target_address = grammar.source.getInt(rule_position + 4 + 12 * i);
- rules[i] = rule_position + 8 + 12 * i;
- block_id = grammar.source.getInt(rules[i]);
+ target_address = grammar.source.get(rule_position + 1 + 3 * i);
+ rules[i] = rule_position + 2 + 3 * i;
+ block_id = grammar.source.get(rules[i]);
BilingualRule rule = new BilingualRule(
- grammar.source.getInt(rule_position + 12 * i),
+ grammar.source.get(rule_position + 3 * i),
src,
grammar.getTarget(target_address),
grammar.getFeatures(block_id),
arity,
owner,
0,
- rule_position + 12 * i);
+ rule_position + 3 * i);
grammar.cache[block_id] = rule.estimateRuleCost(models);
}
Arrays.sort(rules, new Comparator<Integer>() {
public int compare(Integer a, Integer b) {
- float a_cost = grammar.cache[grammar.source.getInt(a)];
- float b_cost = grammar.cache[grammar.source.getInt(b)];
+ float a_cost = grammar.cache[grammar.source.get(a)];
+ float b_cost = grammar.cache[grammar.source.get(b)];
if (a_cost == b_cost)
return 0;
return (a_cost > b_cost ? 1 : -1);
}
});
- byte[] backing = new byte[12 * num_rules];
- ByteBuffer sorted = ByteBuffer.wrap(backing);
+ int[] backing = new int[3 * num_rules];
+ IntBuffer sorted = IntBuffer.wrap(backing);
for (int i = 0; i < rules.length; i++) {
int address = rules[i];
- sorted.putInt(grammar.source.getInt(address - 8));
- sorted.putInt(grammar.source.getInt(address - 4));
- sorted.putInt(grammar.source.getInt(address));
+ sorted.put(grammar.source.get(address - 2));
+ sorted.put(grammar.source.get(address - 1));
+ sorted.put(grammar.source.get(address));
}
for (int i = 0; i < backing.length; i++)
grammar.source.put(rule_position + i, backing[i]);
@@ -303,17 +349,17 @@ public int getArity() {
}
}
- public class PackedRoot implements Trie {
+ public final class PackedRoot implements Trie {
private HashMap<Integer, PackedSlice> lookup;
public PackedRoot(PackedGrammar grammar) {
lookup = new HashMap<Integer, PackedSlice>();
for (PackedSlice ps : grammar.slices) {
- int num_children = ps.source.getInt(0);
+ int num_children = ps.source.get(0);
for (int i = 0; i < num_children; i++)
- lookup.put(ps.source.getInt(4 + i * 8), ps);
+ lookup.put(ps.source.get(2 * i + 1), ps);
}
}
@@ -351,17 +397,19 @@ public RuleCollection getRuleCollection() {
}
}
- public class PackedSlice {
+ public final class PackedSlice {
private String name;
- private MappedByteBuffer source;
+ private MappedByteBuffer byteSource;
+ private IntBuffer source;
- private MappedByteBuffer target;
+ private MappedByteBuffer byteTarget;
+ private IntBuffer target;
private int[] targetLookup;
- private MappedByteBuffer data;
- int dataSize;
- private int[] dataLookup;
+ private MappedByteBuffer features;
+ int featureSize;
+ private int[] featureLookup;
private float[] cache;
@@ -371,86 +419,87 @@ public PackedSlice(String prefix) throws IOException {
File source_file = new File(prefix + ".source");
File target_file = new File(prefix + ".target");
File target_lookup_file = new File(prefix + ".target.lookup");
- File data_file = new File(prefix + ".data");
+ File feature_file = new File(prefix + ".features");
// Get the channels etc.
FileChannel source_channel =
new RandomAccessFile(source_file, "rw").getChannel();
int source_size = (int) source_channel.size();
FileChannel target_channel =
- new RandomAccessFile(target_file, "rw").getChannel();
+ new RandomAccessFile(target_file, "r").getChannel();
int target_size = (int) target_channel.size();
- // TODO: change this to read-only.
- FileChannel target_lookup_channel =
- new RandomAccessFile(target_lookup_file, "rw").getChannel();
- int target_lookup_size = (int) target_channel.size();
-
- FileChannel data_channel =
- new RandomAccessFile(data_file, "rw").getChannel();
- int data_size = (int) data_channel.size();
+ FileChannel feature_channel =
+ new RandomAccessFile(feature_file, "r").getChannel();
+ int feature_size = (int) feature_channel.size();
- source = source_channel.map(MapMode.READ_WRITE, 0, source_size);
- target = target_channel.map(MapMode.READ_ONLY, 0, target_size);
- data = data_channel.map(MapMode.READ_ONLY, 0, data_size);
+ byteSource = source_channel.map(MapMode.PRIVATE, 0, source_size);
+ byteSource.load();
+ source = byteSource.asIntBuffer();
+
+ byteTarget = target_channel.map(MapMode.READ_ONLY, 0, target_size);
+ byteTarget.load();
+ target = byteTarget.asIntBuffer();
+
+ features = feature_channel.map(MapMode.READ_ONLY, 0, feature_size);
+ features.load();
- int num_blocks = data.getInt(0);
- dataLookup = new int[num_blocks];
+ int num_blocks = features.getInt(0);
+ featureLookup = new int[num_blocks];
cache = new float[num_blocks];
- dataSize = data.getInt(4);
+ featureSize = features.getInt(4);
for (int i = 0; i < num_blocks; i++)
- dataLookup[i] = data.getInt(8 + 4 * i);
+ featureLookup[i] = features.getInt(8 + 4 * i);
- MappedByteBuffer target_lookup =
- target_lookup_channel.map(MapMode.READ_ONLY, 0, target_lookup_size);
- targetLookup = new int[target_lookup.getInt()];
+ DataInputStream target_lookup_stream = new DataInputStream(
+ new BufferedInputStream(new FileInputStream(target_lookup_file)));
+ targetLookup = new int[target_lookup_stream.readInt()];
for (int i = 0; i < targetLookup.length; i++)
- targetLookup[i] = target_lookup.getInt(4 * (i + 1));
+ targetLookup[i] = target_lookup_stream.readInt();
}
- private int[] getTarget(int pointer) {
+ private final int[] getTarget(int pointer) {
// Figure out level.
int tgt_length = 1;
while (tgt_length < (targetLookup.length + 1)
&& targetLookup[tgt_length] <= pointer)
tgt_length++;
-
int[] tgt = new int[tgt_length];
int index = 0;
int parent;
do {
- parent = target.getInt(pointer);
+ parent = target.get(pointer);
if (parent != -1)
- tgt[index++] = target.getInt(pointer + 4);
+ tgt[index++] = target.get(pointer + 1);
pointer = parent;
} while (pointer != -1);
return tgt;
}
- private float[] getFeatures(int block_id, float[] features) {
- int data_position = dataLookup[block_id];
- int num_features = data.getInt(data_position);
- data_position += 4;
+ private final float[] getFeatures(int block_id, float[] feature_vector) {
+ int feature_position = featureLookup[block_id];
+ int num_features = features.getInt(feature_position);
+ feature_position += 4;
for (int i = 0; i < num_features; i++) {
- int feature_id = data.getInt(data_position);
+ int feature_id = features.getInt(feature_position);
Quantizer quantizer = quantization.get(feature_id);
- features[featureNameMap.get(feature_id)] = quantizer.read(data,
- data_position);
- data_position += 4 + quantizer.size();
+ feature_vector[featureNameMap.get(feature_id)] = quantizer.read(features,
+ feature_position);
+ feature_position += 4 + quantizer.size();
}
- return features;
+ return feature_vector;
}
- private float[] getFeatures(int block_id) {
- float[] features = new float[JoshuaConfiguration.num_phrasal_features];
- return getFeatures(block_id, features);
+ private final float[] getFeatures(int block_id) {
+ float[] feature_vector = new float[JoshuaConfiguration.num_phrasal_features];
+ return getFeatures(block_id, feature_vector);
}
- private Rule assembleRule(int address, int[] src, int arity) {
- int lhs = source.getInt(address);
- int tgt_address = source.getInt(address + 4);
- int data_block = source.getInt(address + 8);
+ private final Rule assembleRule(int address, int[] src, int arity) {
+ int lhs = source.get(address);
+ int tgt_address = source.get(address + 1);
+ int data_block = source.get(address + 2);
BilingualRule rule = new BilingualRule(lhs,
src,
getTarget(tgt_address),
View
464 src/joshua/tools/GrammarPacker.java
@@ -5,10 +5,7 @@
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
-import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
-import java.nio.channels.FileChannel;
-import java.nio.channels.FileChannel.MapMode;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
@@ -20,39 +17,51 @@
import joshua.corpus.Vocabulary;
import joshua.util.FormatUtils;
import joshua.util.io.LineReader;
+import joshua.util.quantization.Quantizer;
import joshua.util.quantization.QuantizerConfiguration;
public class GrammarPacker {
private static final Logger logger =
Logger.getLogger(GrammarPacker.class.getName());
- private static int FANOUT;
- private static int CHUNK_SIZE;
+ private static int SLICE_SIZE;
private static int DATA_SIZE_LIMIT;
- private static int AVERAGE_NUM_FEATURES;
+ private static int DATA_SIZE_ESTIMATE;
private static String WORKING_DIRECTORY;
- private List<String> grammars;
+ private String grammar;
+
+ private boolean have_alignments;
+ private String alignments;
private QuantizerConfiguration quantization;
static {
- FANOUT = 2;
- CHUNK_SIZE = 100000;
+ SLICE_SIZE = 5000000;
DATA_SIZE_LIMIT = (int) (Integer.MAX_VALUE * 0.8);
- AVERAGE_NUM_FEATURES = 20;
+ DATA_SIZE_ESTIMATE = 20;
WORKING_DIRECTORY = System.getProperty("user.dir")
+ File.separator + "packed";
}
- public GrammarPacker(String config_filename,
- List<String> grammars) throws IOException {
- this.grammars = grammars;
+ public GrammarPacker(String config_filename,
+ String grammar_filename,
+ String alignments_filename) throws IOException {
+ this.grammar = grammar_filename;
this.quantization = new QuantizerConfiguration();
-
+
+ this.alignments = alignments_filename;
+ have_alignments = (alignments != null);
+ if (!have_alignments) {
+ logger.info("No alignments file specified, skipping.");
+ } else if (!new File(alignments_filename).exists()) {
+      logger.severe("Alignments file does not exist: " + alignments);
+ System.exit(0);
+ }
+
readConfig(config_filename);
}
@@ -71,13 +80,9 @@ private void readConfig(String config_filename) throws IOException {
logger.severe("Incomplete line in config.");
System.exit(0);
}
- if ("chunk_size".equals(fields[0])) {
+ if ("slice_size".equals(fields[0])) {
// Number of records to concurrently load into memory for sorting.
- CHUNK_SIZE = Integer.parseInt(fields[1]);
-
- } else if ("fanout".equals(fields[0])) {
- // Number of sorted chunks to concurrently merge.
- FANOUT = Integer.parseInt(fields[1]);
+ SLICE_SIZE = Integer.parseInt(fields[1]);
} else if ("quantizer".equals(fields[0])) {
// Adding a quantizer to the mix.
@@ -108,46 +113,39 @@ private void readConfig(String config_filename) throws IOException {
*/
public void pack() throws IOException {
logger.info("Beginning exploration pass.");
-
+ LineReader grammar_reader = null;
+ LineReader alignment_reader = null;
+
quantization.initialize();
// Explore pass. Learn vocabulary and quantizer histograms.
- for (String grammar_file : grammars) {
- logger.info("Exploring: " + grammar_file);
- LineReader reader = new LineReader(grammar_file);
- explore(reader);
- }
+ logger.info("Exploring: " + grammar);
+ grammar_reader = new LineReader(grammar);
+ explore(grammar_reader);
logger.info("Exploration pass complete. Freezing vocabulary and " +
"finalizing quantizers.");
quantization.finalize();
-
quantization.write(WORKING_DIRECTORY + File.separator + "quantization");
Vocabulary.freeze();
Vocabulary.write(WORKING_DIRECTORY + File.separator + "vocabulary");
+ // Read previously written quantizer configuration to match up to changed
+ // vocabulary id's.
quantization.read(WORKING_DIRECTORY + File.separator + "quantization");
- logger.info("Beginning chunking pass.");
- Queue<PackingFileTuple> chunks = new PriorityQueue<PackingFileTuple>();
- // Chunking pass. Split and binarize source, target and features into
- for (String grammar_file : grammars) {
- LineReader reader = new LineReader(grammar_file);
- binarize(reader, chunks);
- }
- logger.info("Chunking pass complete.");
+ logger.info("Beginning packing pass.");
+ Queue<PackingFileTuple> slices = new PriorityQueue<PackingFileTuple>();
+ // Actual binarization pass. Slice and pack source, target and data.
+ grammar_reader = new LineReader(grammar);
+
+ if (have_alignments)
+ alignment_reader = new LineReader(alignments);
+ binarize(grammar_reader, alignment_reader, slices);
+ logger.info("Packing complete.");
logger.info("Packed grammar in: " + WORKING_DIRECTORY);
-
- // logger.info("Beginning merge phase.");
- // // Merge loop.
- // while (chunks.size() > 1) {
- // List<PackingFileTuple> to_merge = new ArrayList<PackingFileTuple>();
- // while (to_merge.size() < FANOUT && !chunks.isEmpty())
- // to_merge.add(chunks.poll());
- // chunks.add(merge(to_merge));
- // }
- // logger.info("Merge phase complete.");
+ logger.info("Done.");
}
// TODO: add javadoc.
@@ -191,26 +189,30 @@ private void explore(LineReader grammar) {
}
}
- private void binarize(LineReader grammar, Queue<PackingFileTuple> chunks)
- throws IOException {
+ private void binarize(LineReader grammar_reader, LineReader alignment_reader,
+ Queue<PackingFileTuple> slices) throws IOException {
int counter = 0;
- int chunk_counter = 0;
- int num_chunks = 0;
+ int slice_counter = 0;
+ int num_slices = 0;
boolean ready_to_flush = false;
String first_source_word = null;
PackingTrie<SourceValue> source_trie = new PackingTrie<SourceValue>();
PackingTrie<TargetValue> target_trie = new PackingTrie<TargetValue>();
- PackingBuffer data_buffer = new PackingBuffer();
+ FeatureBuffer feature_buffer = new FeatureBuffer();
+
+ AlignmentBuffer alignment_buffer = null;
+ if (have_alignments)
+ alignment_buffer = new AlignmentBuffer();
TreeMap<Integer, Float> features = new TreeMap<Integer, Float>();
- while (grammar.hasNext()) {
- String line = grammar.next().trim();
+ while (grammar_reader.hasNext()) {
+ String grammar_line = grammar_reader.next().trim();
counter++;
- chunk_counter++;
+ slice_counter++;
- String[] fields = line.split("\\s\\|{3}\\s");
+ String[] fields = grammar_line.split("\\s\\|{3}\\s");
if (fields.length < 4) {
logger.warning("Incomplete grammar line at line " + counter);
continue;
@@ -219,24 +221,51 @@ private void binarize(LineReader grammar, Queue<PackingFileTuple> chunks)
String[] source_words = fields[1].split("\\s");
String[] target_words = fields[2].split("\\s");
String[] feature_entries = fields[3].split("\\s");
-
- // Reached chunk limit size, indicate that we're closing up.
- if (!ready_to_flush
- && (chunk_counter > CHUNK_SIZE || data_buffer.overflowing())) {
+
+ // Reached slice limit size, indicate that we're closing up.
+ if (!ready_to_flush && (slice_counter > SLICE_SIZE
+ || feature_buffer.overflowing()
+ || (have_alignments && alignment_buffer.overflowing()))) {
ready_to_flush = true;
first_source_word = source_words[0];
}
// Finished closing up.
if (ready_to_flush && !first_source_word.equals(source_words[0])) {
- chunks.add(flush(source_trie, target_trie, data_buffer, num_chunks));
+ slices.add(flush(source_trie, target_trie, feature_buffer,
+ alignment_buffer, num_slices));
source_trie.clear();
target_trie.clear();
- data_buffer.clear();
+ feature_buffer.clear();
+ if (have_alignments)
+ alignment_buffer.clear();
- num_chunks++;
- chunk_counter = 0;
+ num_slices++;
+ slice_counter = 0;
ready_to_flush = false;
}
+
+ int alignment_index = -1;
+ // If present, process alignments.
+ if (have_alignments) {
+ if (!alignment_reader.hasNext()) {
+ logger.severe("No more alignments starting in line " + counter);
+ throw new RuntimeException("No more alignments starting in line "
+ + counter);
+ } else {
+ String alignment_line = alignment_reader.next().trim();
+ String[] alignment_entries = alignment_line.split("\\s");
+ byte[] alignments = new byte[alignment_entries.length * 2];
+ if (alignment_entries.length != 0) {
+ for (int i = 0; i < alignment_entries.length; i++) {
+ String[] parts = alignment_entries[i].split("-");
+ alignments[2 * i] = Byte.parseByte(parts[0]);
+ alignments[2 * i + 1] = Byte.parseByte(parts[1]);
+ }
+ }
+ alignment_index = alignment_buffer.add(alignments);
+ }
+ }
+
// Process features.
// Implicitly sort via TreeMap, write to data buffer, remember position
// to pass on to the source trie node.
@@ -248,7 +277,15 @@ private void binarize(LineReader grammar, Queue<PackingFileTuple> chunks)
if (feature_value != 0)
features.put(feature_id, feature_value);
}
- int features_index = data_buffer.add(features);
+ int features_index = feature_buffer.add(features);
+
+ // Sanity check on the data block index.
+ if (have_alignments && features_index != alignment_index) {
+ logger.severe("Block index mismatch between features ("
+ + features_index + ") and alignments ("
+ + alignment_index + ").");
+ throw new RuntimeException("Data block index mismatch.");
+ }
// Process source side.
SourceValue sv = new SourceValue(Vocabulary.id(lhs_word), features_index);
@@ -275,7 +312,8 @@ private void binarize(LineReader grammar, Queue<PackingFileTuple> chunks)
}
target_trie.add(target, tv);
}
- chunks.add(flush(source_trie, target_trie, data_buffer, num_chunks));
+ slices.add(flush(source_trie, target_trie, feature_buffer,
+ alignment_buffer, num_slices));
}
/**
@@ -289,21 +327,24 @@ private void binarize(LineReader grammar, Queue<PackingFileTuple> chunks)
*
* @param source_trie
* @param target_trie
- * @param data_buffer
+ * @param feature_buffer
* @param id
* @throws IOException
*/
private PackingFileTuple flush(PackingTrie<SourceValue> source_trie,
- PackingTrie<TargetValue> target_trie, PackingBuffer data_buffer,
+ PackingTrie<TargetValue> target_trie,
+ FeatureBuffer feature_buffer,
+ AlignmentBuffer alignment_buffer,
int id) throws IOException {
- // Make a chunk object for this piece of the grammar.
- PackingFileTuple chunk = new PackingFileTuple("chunk_"
+ // Make a slice object for this piece of the grammar.
+ PackingFileTuple slice = new PackingFileTuple("slice_"
+ String.format("%05d", id));
// Pull out the streams for source, target and data output.
- DataOutputStream source_stream = chunk.getSourceOutput();
- DataOutputStream target_stream = chunk.getTargetOutput();
- DataOutputStream target_lookup_stream = chunk.getTargetLookupOutput();
- DataOutputStream data_stream = chunk.getDataOutput();
+ DataOutputStream source_stream = slice.getSourceOutput();
+ DataOutputStream target_stream = slice.getTargetOutput();
+ DataOutputStream target_lookup_stream = slice.getTargetLookupOutput();
+ DataOutputStream feature_stream = slice.getFeatureOutput();
+ DataOutputStream alignment_stream = slice.getAlignmentOutput();
Queue<PackingTrie<TargetValue>> target_queue;
Queue<PackingTrie<SourceValue>> source_queue;
@@ -366,8 +407,10 @@ private PackingFileTuple flush(PackingTrie<SourceValue> source_trie,
source_position = source_trie.size(true, false);
source_trie.address = target_position;
- // Ready data buffer for writing.
- data_buffer.initialize();
+ // Ready data buffers for writing.
+ feature_buffer.initialize();
+ if (have_alignments)
+ alignment_buffer.initialize();
// Packing loop for downwards-pointing source trie.
while (!source_queue.isEmpty()) {
@@ -392,56 +435,74 @@ private PackingFileTuple flush(PackingTrie<SourceValue> source_trie,
source_stream.writeInt(node.values.size());
// Write lhs and links to target and data.
for (SourceValue sv : node.values) {
- sv.data = data_buffer.write(sv.data);
+ int feature_block_index = feature_buffer.write(sv.data);
+ if (have_alignments) {
+ int alignment_block_index = alignment_buffer.write(sv.data);
+ if (alignment_block_index != feature_block_index) {
+ logger.severe("Block index mismatch.");
+ throw new RuntimeException("Block index mismatch: alignment ("
+ + alignment_block_index +") and features ("
+ + feature_block_index + ") don't match.");
+ }
+ }
source_stream.writeInt(sv.lhs);
source_stream.writeInt(sv.target);
- source_stream.writeInt(sv.data);
+ source_stream.writeInt(feature_block_index);
}
}
// Flush the data stream.
- data_buffer.flush(data_stream);
+ feature_buffer.flush(feature_stream);
+ if (have_alignments)
+ alignment_buffer.flush(alignment_stream);
target_stream.close();
source_stream.close();
- data_stream.close();
-
- return chunk;
- }
-
- // TODO: Evaluate whether an implementation of this is necessary.
- private PackingFileTuple merge(List<PackingFileTuple> chunks) {
- return null;
+ feature_stream.close();
+ if (have_alignments)
+ alignment_stream.close();
+
+ return slice;
}
public static void main(String[] args) throws IOException {
- if (args.length == 3) {
- String config_filename = args[0];
-
- WORKING_DIRECTORY = args[1];
-
- if (new File(WORKING_DIRECTORY).exists()) {
- System.err.println("File or directory already exists: "
- + WORKING_DIRECTORY);
- System.err.println("Will not overwrite.");
- return;
+ String config_filename = null;
+ String grammar_filename = null;
+ String alignments_filename = null;
+
+ for (int i = 0; i < args.length; i++) {
+ if ("-g".equals(args[i]) && (i < args.length - 1)) {
+ grammar_filename = args[++i];
+ } else if ("-p".equals(args[i]) && (i < args.length - 1)) {
+ WORKING_DIRECTORY = args[++i];
+ } else if ("-c".equals(args[i]) && (i < args.length - 1)) {
+ config_filename = args[++i];
+ } else if ("-a".equals(args[i]) && (i < args.length - 1)) {
+ alignments_filename = args[++i];
}
-
- List<String> grammar_files = new ArrayList<String>();
-
- // Currently not supporting more than one grammar file due to our
- // assumption of sortedness.
- // for (int i = 2; i < args.length; i++)
- // grammar_files.add(args[i]);
-
- grammar_files.add(args[2]);
- GrammarPacker packer = new GrammarPacker(config_filename, grammar_files);
- packer.pack();
-
- } else {
- System.err.println("Expecting three arguments: ");
- System.err.println("\tjoshua.tools.GrammarPacker config_file " +
- "output_directory sorted_grammar_file");
}
+ if (grammar_filename == null) {
+ logger.severe("Grammar file not specified.");
+ return;
+ }
+ if (!new File(grammar_filename).exists()) {
+ logger.severe("Grammar file not found: " + grammar_filename);
+ }
+ if (config_filename == null) {
+ logger.severe("Config file not specified.");
+ return;
+ }
+ if (!new File(config_filename).exists()) {
+ logger.severe("Config file not found: " + config_filename);
+ }
+ if (new File(WORKING_DIRECTORY).exists()) {
+ logger.severe("File or directory already exists: " + WORKING_DIRECTORY);
+ logger.severe("Will not overwrite.");
+ return;
+ }
+
+ GrammarPacker packer = new GrammarPacker(config_filename, grammar_filename,
+ alignments_filename);
+ packer.pack();
}
/**
@@ -495,7 +556,7 @@ private void add(int[] path, int index, D value) {
}
/**
- * Calculate the size (in bytes) of a packed trie node. Distinguishes
+ * Calculate the size (in ints) of a packed trie node. Distinguishes
* downwards pointing (parent points to children) from upwards pointing
* (children point to parent) tries, as well as skeletal (no data, just the
* labeled links) and non-skeletal (nodes have a data block) packing.
@@ -523,7 +584,7 @@ int size(boolean downwards, boolean skeletal) {
if (!skeletal && !values.isEmpty())
size += values.size() * values.get(0).size();
- return size * 4;
+ return size;
}
void clear() {
@@ -570,16 +631,13 @@ public int size() {
}
}
- // TODO: abstract away from features, use generic type for in-memory
- // structure. PackingBuffers are to implement structure to in-memory bytes
- // and flushing in-memory bytes to on-disk bytes.
- class PackingBuffer {
+ abstract class PackingBuffer<T> {
private byte[] backing;
- private ByteBuffer buffer;
+ protected ByteBuffer buffer;
- private ArrayList<Integer> memoryLookup;
- private int totalSize;
- private ArrayList<Integer> onDiskOrder;
+ protected ArrayList<Integer> memoryLookup;
+ protected int totalSize;
+ protected ArrayList<Integer> onDiskOrder;
PackingBuffer() throws IOException {
allocate();
@@ -588,14 +646,16 @@ public int size() {
totalSize = 0;
}
+ abstract int add(T item);
+
// Allocate a reasonably-sized buffer for the feature data.
private void allocate() {
- backing = new byte[CHUNK_SIZE * AVERAGE_NUM_FEATURES];
+ backing = new byte[SLICE_SIZE * DATA_SIZE_ESTIMATE];
buffer = ByteBuffer.wrap(backing);
}
// Reallocate the backing array and buffer, copies data over.
- private void reallocate() {
+ protected void reallocate() {
if (backing.length == Integer.MAX_VALUE)
return;
long attempted_length = backing.length * 2l;
@@ -615,41 +675,6 @@ private void reallocate() {
}
/**
- * Add a block of features to the buffer.
- *
- * @param features
- * TreeMap with the features for one rule.
- * @return The index of the resulting data block.
- */
- int add(TreeMap<Integer, Float> features) {
- int data_position = buffer.position();
-
- // Over-estimate how much room this addition will need: 12 bytes per
- // feature (4 for label, "upper bound" of 8 for the value), plus 4 for
- // the number of features. If this won't fit, reallocate the buffer.
- int size_estimate = 12 * features.size() + 4;
- if (buffer.capacity() - buffer.position() <= size_estimate)
- reallocate();
-
- // Write features to buffer.
- buffer.putInt(features.size());
- for (Integer k : features.descendingKeySet()) {
- float v = features.get(k);
- // Sparse features.
- if (v != 0.0) {
- buffer.putInt(k);
- quantization.get(k).write(buffer, v);
- }
- }
- // Store position the block was written to.
- memoryLookup.add(data_position);
- // Update total size (in bytes).
- totalSize = buffer.position();
- // Return block index.
- return memoryLookup.size() - 1;
- }
-
- /**
* Prepare the data buffer for disk writing.
*/
void initialize() {
@@ -722,11 +747,88 @@ private int blockSize(int block_index) {
}
}
+ class FeatureBuffer extends PackingBuffer<TreeMap<Integer, Float>> {
+
+ FeatureBuffer() throws IOException {
+ super();
+ }
+
+ /**
+ * Add a block of features to the buffer.
+ *
+ * @param features
+ * TreeMap with the features for one rule.
+ * @return The index of the resulting data block.
+ */
+ int add(TreeMap<Integer, Float> features) {
+ int data_position = buffer.position();
+
+ // Over-estimate how much room this addition will need: 12 bytes per
+ // feature (4 for label, "upper bound" of 8 for the value), plus 4 for
+ // the number of features. If this won't fit, reallocate the buffer.
+ int size_estimate = 12 * features.size() + 4;
+ if (buffer.capacity() - buffer.position() <= size_estimate)
+ reallocate();
+
+ // Write features to buffer.
+ buffer.putInt(features.size());
+ for (Integer k : features.descendingKeySet()) {
+ float v = features.get(k);
+ // Sparse features.
+ if (v != 0.0) {
+ buffer.putInt(k);
+ quantization.get(k).write(buffer, v);
+ }
+ }
+ // Store position the block was written to.
+ memoryLookup.add(data_position);
+ // Update total size (in bytes).
+ totalSize = buffer.position();
+
+ // Return block index.
+ return memoryLookup.size() - 1;
+ }
+ }
+
+ class AlignmentBuffer extends PackingBuffer<byte[]> {
+
+ AlignmentBuffer() throws IOException {
+ super();
+ }
+
+ /**
+ * Add a rule alignments to the buffer.
+ *
+ * @param alignments
+ * a byte array with the alignment points for one rule.
+ * @return The index of the resulting data block.
+ */
+ int add(byte[] alignments) {
+ int data_position = buffer.position();
+ int size_estimate = alignments.length + 1;
+ if (buffer.capacity() - buffer.position() <= size_estimate)
+ reallocate();
+
+ // Write alignment points to buffer.
+ buffer.put((byte) (alignments.length / 2));
+ buffer.put(alignments);
+
+ // Store position the block was written to.
+ memoryLookup.add(data_position);
+ // Update total size (in bytes).
+ totalSize = buffer.position();
+ // Return block index.
+ return memoryLookup.size() - 1;
+ }
+ }
+
class PackingFileTuple implements Comparable<PackingFileTuple> {
private File sourceFile;
private File targetLookupFile;
private File targetFile;
- private File dataFile;
+
+ private File featureFile;
+ private File alignmentFile;
PackingFileTuple(String prefix) {
sourceFile = new File(WORKING_DIRECTORY + File.separator
@@ -735,10 +837,15 @@ private int blockSize(int block_index) {
+ prefix + ".target");
targetLookupFile = new File(WORKING_DIRECTORY + File.separator
+ prefix + ".target.lookup");
- dataFile = new File(WORKING_DIRECTORY + File.separator
- + prefix + ".data");
+ featureFile = new File(WORKING_DIRECTORY + File.separator
+ + prefix + ".features");
+ if (have_alignments)
+ alignmentFile = new File(WORKING_DIRECTORY + File.separator
+ + prefix + ".alignments");
+ else
+ alignmentFile = null;
- logger.info("Allocated chunk: " + sourceFile.getAbsolutePath());
+ logger.info("Allocated slice: " + sourceFile.getAbsolutePath());
}
DataOutputStream getSourceOutput() throws IOException {
@@ -753,8 +860,14 @@ DataOutputStream getTargetLookupOutput() throws IOException {
return getOutput(targetLookupFile);
}
- DataOutputStream getDataOutput() throws IOException {
- return getOutput(dataFile);
+ DataOutputStream getFeatureOutput() throws IOException {
+ return getOutput(featureFile);
+ }
+
+ DataOutputStream getAlignmentOutput() throws IOException {
+ if (alignmentFile != null)
+ return getOutput(alignmentFile);
+ return null;
}
private DataOutputStream getOutput(File file) throws IOException {
@@ -766,29 +879,8 @@ private DataOutputStream getOutput(File file) throws IOException {
}
}
- ByteBuffer getSourceBuffer() throws IOException {
- return getBuffer(sourceFile);
- }
-
- ByteBuffer getTargetBuffer() throws IOException {
- return getBuffer(targetFile);
- }
-
- ByteBuffer getDataBuffer() throws IOException {
- return getBuffer(dataFile);
- }
-
- private ByteBuffer getBuffer(File file) throws IOException {
- if (file.exists()) {
- FileChannel channel = new RandomAccessFile(file, "rw").getChannel();
- return channel.map(MapMode.READ_WRITE, 0, channel.size());
- } else {
- throw new RuntimeException("File doesn't exist: " + file.getName());
- }
- }
-
long getSize() {
- return sourceFile.length() + targetFile.length() + dataFile.length();
+ return sourceFile.length() + targetFile.length() + featureFile.length();
}
@Override
View
BIN test/bn-en/packed/grammar_packed/chunk_00000.data
Binary file not shown.
View
BIN test/bn-en/packed/grammar_packed/chunk_00000.source
Binary file not shown.
View
BIN test/bn-en/packed/grammar_packed/chunk_00000.target
Binary file not shown.
View
BIN test/bn-en/packed/grammar_packed/chunk_00000.target.lookup
Binary file not shown.
View
BIN test/bn-en/packed/grammar_packed/chunk_00001.data
Binary file not shown.
View
BIN test/bn-en/packed/grammar_packed/chunk_00001.source
Binary file not shown.
View
BIN test/bn-en/packed/grammar_packed/chunk_00001.target
Binary file not shown.
View
BIN test/bn-en/packed/grammar_packed/chunk_00001.target.lookup
Binary file not shown.
View
BIN test/bn-en/packed/grammar_packed/chunk_00002.data
Binary file not shown.
View
BIN test/bn-en/packed/grammar_packed/chunk_00002.source
Binary file not shown.
View
BIN test/bn-en/packed/grammar_packed/chunk_00002.target
Binary file not shown.
View
BIN test/bn-en/packed/grammar_packed/chunk_00002.target.lookup
Binary file not shown.
View
BIN test/bn-en/packed/grammar_packed/chunk_00003.data
Binary file not shown.
View
BIN test/bn-en/packed/grammar_packed/chunk_00003.source
Binary file not shown.
View
BIN test/bn-en/packed/grammar_packed/chunk_00003.target
Binary file not shown.
View
BIN test/bn-en/packed/grammar_packed/chunk_00003.target.lookup
Binary file not shown.
View
BIN test/bn-en/packed/grammar_packed/chunk_00004.data
Binary file not shown.
View
BIN test/bn-en/packed/grammar_packed/chunk_00004.source
Binary file not shown.
View
BIN test/bn-en/packed/grammar_packed/chunk_00004.target
Binary file not shown.
View
BIN test/bn-en/packed/grammar_packed/chunk_00004.target.lookup
Binary file not shown.
View
BIN test/bn-en/packed/grammar_packed/slice_00000.features
Binary file not shown.
View
BIN test/bn-en/packed/grammar_packed/slice_00000.source
Binary file not shown.
View
BIN test/bn-en/packed/grammar_packed/slice_00000.target
Binary file not shown.
View
BIN test/bn-en/packed/grammar_packed/slice_00000.target.lookup
Binary file not shown.
View
2 test/bn-en/packed/packer.quantized
@@ -1,3 +1,3 @@
-chunk_size 40000
+slice_size 40000
quantizer 8bit 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
View
2 test/bn-en/packed/packer.uncompressed
@@ -1,3 +1,3 @@
-chunk_size 40000
+slice_size 40000
quantizer float 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
View
110 test/packed-grammar/PrintRules.java
@@ -1,6 +1,7 @@
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
+import java.nio.IntBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.FileChannel.MapMode;
@@ -20,17 +21,24 @@
private QuantizerConfiguration quantization;
- private MappedByteBuffer source;
- private MappedByteBuffer target;
- private MappedByteBuffer data;
+ private IntBuffer source;
+ private IntBuffer target;
+ private MappedByteBuffer features;
+ private MappedByteBuffer alignments;
- private int[] lookup;
+ private int[] featureLookup;
+ private int[] alignmentLookup;
+
+ private boolean have_alignments;
public PrintRules(String dir) throws IOException {
- File source_file = new File(dir + "/chunk_00000.source");
- File target_file = new File(dir + "/chunk_00000.target");
- File data_file = new File(dir + "/chunk_00000.data");
+ File source_file = new File(dir + "/slice_00000.source");
+ File target_file = new File(dir + "/slice_00000.target");
+ File feature_file = new File(dir + "/slice_00000.features");
+ File alignment_file = new File(dir + "/slice_00000.alignments");
+ have_alignments = alignment_file.exists();
+
// Read the vocabulary.
Vocabulary.read(dir + "/vocabulary");
@@ -41,23 +49,40 @@ public PrintRules(String dir) throws IOException {
// Get the channels etc.
FileChannel source_channel = new FileInputStream(source_file).getChannel();
int source_size = (int) source_channel.size();
+ source = source_channel.map(MapMode.READ_ONLY, 0,
+ source_size).asIntBuffer();
+
FileChannel target_channel = new FileInputStream(target_file).getChannel();
int target_size = (int) target_channel.size();
- FileChannel data_channel = new FileInputStream(data_file).getChannel();
- int data_size = (int) data_channel.size();
-
- source = source_channel.map(MapMode.READ_ONLY, 0,
- source_size);
- target = target_channel.map(MapMode.READ_ONLY, 0,
- target_size);
- data = data_channel.map(MapMode.READ_ONLY, 0, data_size);
-
- int num_blocks = data.getInt();
- lookup = new int[num_blocks];
+ target = target_channel.map(MapMode.READ_ONLY, 0,
+ target_size).asIntBuffer();
+
+ FileChannel feature_channel = new FileInputStream(feature_file).getChannel();
+ int feature_size = (int) feature_channel.size();
+ features = feature_channel.map(MapMode.READ_ONLY, 0, feature_size);
+
+ if (have_alignments) {
+ FileChannel alignment_channel = new FileInputStream(alignment_file).getChannel();
+ int alignment_size = (int) alignment_channel.size();
+ alignments = alignment_channel.map(MapMode.READ_ONLY, 0, alignment_size);
+ }
+
+ int num_feature_blocks = features.getInt();
+ featureLookup = new int[num_feature_blocks];
+ // Read away data size.
+ features.getInt();
+ for (int i = 0; i < num_feature_blocks; i++)
+ featureLookup[i] = features.getInt();
+
+ int num_alignment_blocks = alignments.getInt();
+ alignmentLookup = new int[num_alignment_blocks];
// Read away data size.
- data.getInt();
- for (int i = 0; i < num_blocks; i++)
- lookup[i] = data.getInt();
+ alignments.getInt();
+ for (int i = 0; i < num_alignment_blocks; i++)
+ alignmentLookup[i] = alignments.getInt();
+
+ if (num_alignment_blocks != num_feature_blocks)
+ throw new RuntimeException("Number of blocks doesn't match up.");
}
public void traverse() {
@@ -66,18 +91,18 @@ public void traverse() {
private void traverse(int position, String src_side) {
source.position(position);
- int num_children = source.getInt();
+ int num_children = source.get();
int[] addresses = new int[num_children];
int[] symbols = new int[num_children];
for (int i = 0; i < num_children; i++) {
- symbols[i] = source.getInt();
- addresses[i] = source.getInt();
+ symbols[i] = source.get();
+ addresses[i] = source.get();
}
- int num_rules = source.getInt();
+ int num_rules = source.get();
for (int i = 0; i < num_rules; i++) {
- int lhs = source.getInt();
- int tgt_address = source.getInt();
- int data_address = source.getInt();
+ int lhs = source.get();
+ int tgt_address = source.get();
+ int data_address = source.get();
printRule(src_side, lhs, tgt_address, data_address);
}
for (int i = 0; i < num_children; i++) {
@@ -89,9 +114,9 @@ private String getTarget(int pointer) {
StringBuilder sb = new StringBuilder();
do {
target.position(pointer);
- pointer = target.getInt();
+ pointer = target.get();
if (pointer != -1) {
- int symbol = target.getInt();
+ int symbol = target.get();
if (symbol < 0)
sb.append(" ").append("NT" + symbol);
else
@@ -104,25 +129,40 @@ private String getTarget(int pointer) {
private String getFeatures(int block_id) {
StringBuilder sb = new StringBuilder();
- int data_position = lookup[block_id];
- int num_features = data.getInt(data_position);
+ int data_position = featureLookup[block_id];
+ int num_features = features.getInt(data_position);
data_position += 4;
for (int i = 0; i < num_features; i++) {
- int feature_id = data.getInt(data_position);
+ int feature_id = features.getInt(data_position);
Quantizer quantizer = quantization.get(feature_id);
sb.append(" " + Vocabulary.word(feature_id) + "=" +
- quantizer.read(data, data_position));
+ quantizer.read(features, data_position));
data_position += 4 + quantizer.size();
}
return sb.toString();
}
+ private String getAlignments(int block_id) {
+ StringBuilder sb = new StringBuilder();
+
+ int data_position = alignmentLookup[block_id];
+ byte num_points = alignments.get(data_position);
+ for (int i = 0; i < num_points; i++) {
+ byte src = alignments.get(data_position + 1 + 2 * i);
+ byte tgt = alignments.get(data_position + 2 + 2 * i);
+
+ sb.append(" " + src + "-" + tgt);
+ }
+ return sb.toString();
+ }
+
private void printRule(String src_side, int lhs, int tgt_address,
int data_address) {
System.out.println(Vocabulary.word(lhs) + " |||" +
src_side + " |||" +
getTarget(tgt_address) + " |||" +
- getFeatures(data_address));
+ getFeatures(data_address) +
+ (have_alignments ? " |||" + getAlignments(data_address) : ""));
}
public static void main(String args[]) throws IOException {

0 comments on commit 98acd2b

Please sign in to comment.