Permalink
Browse files

Switch to reading collection frequencies from tab-separated text files (instead of a SequenceFile) at collectionFrequencyPath

  • Loading branch information...
1 parent: 99faa3c · commit 7f2da4c65065ea43addc8d4aee071684a3bed2ed · @jakemannix committed May 8, 2012
@@ -1,13 +1,18 @@
package org.apache.mahout.clustering.lda.cvb;
+import java.io.IOException;
+
import com.google.common.base.Preconditions;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Counter;
+import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.FileLineIterable;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Matrix;
@@ -59,11 +64,18 @@ public void setConf(Configuration conf) {
}
CVBConfig config = new CVBConfig().read(conf);
minCfRatio = config.getCfSparsificationThreshold();
- SequenceFileIterable<IntWritable, DoubleWritable> cfIterator =
- new SequenceFileIterable<IntWritable, DoubleWritable>(config.getCollectionFrequencyPath(), conf);
double[] collectionFrequencies = new double[config.getNumTerms()];
- for(Pair<IntWritable, DoubleWritable> cfPair : cfIterator) {
- collectionFrequencies[cfPair.getFirst().get()] = cfPair.getSecond().get();
+ try {
+ Iterable<String> lines = new FileLineIterable(HadoopUtil.openStream(
+ config.getCollectionFrequencyPath(), conf));
+ for (String line : lines) {
+ String[] split = line.split("\t");
+ int featureId = Integer.parseInt(split[0]);
+ double cf = Double.parseDouble(split[1]);
+ collectionFrequencies[featureId] = cf;
+ }
+ } catch (Exception e) {
+ throw new RuntimeException("Could not configure VectorSparsifier", e);
}
setCollectionFrequencies(new DenseVector(collectionFrequencies));
}

0 comments on commit 7f2da4c

Please sign in to comment.