Permalink
Browse files

Merge branch 'refactoring'

  • Loading branch information...
2 parents 0cdcfd3 + fc95059 commit 831c725f798f5c9fe6d24154af29242c1ae9047a @lintool committed Mar 31, 2012
@@ -25,7 +25,6 @@
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.InputFormat;
@@ -59,22 +58,16 @@
* @author fangyue
* @author metzler
*/
-
public class ExtractHTMLFieldCollection extends PowerTool {
private static final Logger LOG = Logger.getLogger(ExtractHTMLFieldCollection.class);
- public static class MyMapper extends Mapper<LongWritable, Indexable, LongWritable, TextDocument>
- {
-
+ public static class MyMapper extends Mapper<LongWritable, Indexable, LongWritable, TextDocument> {
// TODO: allow this to support user-defined regular expressions, not just the "heading" one pre-defined here
- public static class HeadingTagFilter implements NodeFilter
- {
+ public static class HeadingTagFilter implements NodeFilter {
private static final long serialVersionUID = 3848416345122090905L;
+ private final Pattern pattern = Pattern.compile("h[123456]", Pattern.CASE_INSENSITIVE);
- Pattern pattern = Pattern.compile("h[123456]", Pattern.CASE_INSENSITIVE);
-
- public boolean accept(Node node)
- {
+ public boolean accept(Node node) {
return (pattern.matcher(node.getText()).matches());
}
}
@@ -164,7 +157,7 @@ public ExtractHTMLFieldCollection(Configuration conf) {
super(conf);
}
- @SuppressWarnings({ "rawtypes", "unchecked" })
+ @SuppressWarnings("unchecked")
@Override
public int runTool() throws Exception {
Configuration conf = getConf();
@@ -0,0 +1,5 @@
+package edu.umd.cloud9.collection;
+
+public abstract class WebDocument extends Indexable {
+ public abstract String getURL();
+}
@@ -1,8 +0,0 @@
-package edu.umd.cloud9.collection.generic;
-
-import edu.umd.cloud9.collection.Indexable;
-
-public abstract class WebDocument extends Indexable
-{
- public abstract String getURL();
-}
@@ -25,7 +25,7 @@
import com.google.common.base.Preconditions;
import edu.umd.cloud9.collection.Indexable;
-import edu.umd.cloud9.collection.generic.WebDocument;
+import edu.umd.cloud9.collection.WebDocument;
/**
* Object representing a TREC document.
@@ -25,10 +25,10 @@
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import edu.umd.cloud9.collection.IndexableFileInputFormat;
+import edu.umd.cloud9.collection.WebDocument;
import edu.umd.cloud9.collection.XMLInputFormatOld;
import edu.umd.cloud9.collection.XMLInputFormat.XMLRecordReader;
-import edu.umd.cloud9.collection.generic.WebDocument;
public class TrecDocumentInputFormat extends
IndexableFileInputFormat<LongWritable, WebDocument> {
@@ -24,7 +24,7 @@
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.WritableUtils;
-import edu.umd.cloud9.collection.generic.WebDocument;
+import edu.umd.cloud9.collection.WebDocument;
public class TrecWebDocument extends WebDocument {
@@ -9,10 +9,10 @@
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import edu.umd.cloud9.collection.IndexableFileInputFormat;
+import edu.umd.cloud9.collection.WebDocument;
import edu.umd.cloud9.collection.XMLInputFormatOld;
import edu.umd.cloud9.collection.XMLInputFormat.XMLRecordReader;
-import edu.umd.cloud9.collection.generic.WebDocument;
public class TrecWebDocumentInputFormat extends
IndexableFileInputFormat<LongWritable, WebDocument> {
@@ -50,10 +50,7 @@
* @author Nima Asadi
*
*/
-
-@SuppressWarnings("deprecation")
public class BuildReverseWebGraph extends PowerTool {
-
private static final Logger LOG = Logger.getLogger(BuildReverseWebGraph.class);
public static class Reduce extends MapReduceBase implements
@@ -48,10 +48,7 @@
* @author Nima Asadi
*
*/
-
-@SuppressWarnings("deprecation")
public class BuildWebGraph extends PowerTool {
-
private static final Logger LOG = Logger.getLogger(BuildWebGraph.class);
public static class Map extends MapReduceBase implements
Oops, something went wrong.

0 comments on commit 831c725

Please sign in to comment.