diff --git a/src/java/org/apache/nutch/crawl/GeneratorMapper.java b/src/java/org/apache/nutch/crawl/GeneratorMapper.java
index 29c00c79f2..aafebd7ba4 100644
--- a/src/java/org/apache/nutch/crawl/GeneratorMapper.java
+++ b/src/java/org/apache/nutch/crawl/GeneratorMapper.java
@@ -17,6 +17,7 @@
 package org.apache.nutch.crawl;
 
 import java.io.IOException;
+import java.net.MalformedURLException;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.crawl.GeneratorJob.SelectorEntry;
@@ -62,6 +63,9 @@ public void map(String reversedUrl, WebPage page,
     } catch (URLFilterException e) {
       GeneratorJob.LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage() + ")");
       return;
+    } catch (MalformedURLException e) {
+      GeneratorJob.LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage() + ")");
+      return;
     }
 
     // check fetch schedule
diff --git a/src/java/org/apache/nutch/crawl/GeneratorReducer.java b/src/java/org/apache/nutch/crawl/GeneratorReducer.java
index fce196258b..c3ebb923cb 100644
--- a/src/java/org/apache/nutch/crawl/GeneratorReducer.java
+++ b/src/java/org/apache/nutch/crawl/GeneratorReducer.java
@@ -20,6 +20,8 @@
 import java.util.HashMap;
 import java.util.Map;
 
+import java.net.MalformedURLException;
+
 import org.apache.avro.util.Utf8;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.crawl.GeneratorJob.SelectorEntry;
@@ -73,7 +75,11 @@ protected void reduce(SelectorEntry key, Iterable<WebPage> values,
       }
       Mark.GENERATE_MARK.putMark(page, batchId);
-      context.write(TableUtil.reverseUrl(key.url), page);
+      try {
+        context.write(TableUtil.reverseUrl(key.url), page);
+      } catch (MalformedURLException e) {
+        continue;
+      }
       context.getCounter("Generator", "GENERATE_MARK").increment(1);
       count++;
     }
diff --git a/src/java/org/apache/nutch/fetcher/FetcherJob.java b/src/java/org/apache/nutch/fetcher/FetcherJob.java
index 64e452611c..86d819a9ca 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherJob.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherJob.java
@@ -154,6 +154,7 @@ public Map<String,Object> run(Map<String,Object> args) throws Exception {
     Integer threads = (Integer)args.get(Nutch.ARG_THREADS);
     Boolean shouldResume = (Boolean)args.get(Nutch.ARG_RESUME);
     Integer numTasks = (Integer)args.get(Nutch.ARG_NUMTASKS);
+    Boolean parse = (Boolean)args.get(Nutch.ARG_PARSE);
 
     if (threads != null && threads > 0) {
       getConf().setInt(THREADS_KEY, threads);
@@ -166,6 +167,10 @@
       getConf().setBoolean(RESUME_KEY, shouldResume);
     }
 
+    if (parse != null) {
+      getConf().setBoolean(PARSE_KEY, parse);
+    }
+
     // set the actual time for the timelimit relative
     // to the beginning of the whole job and not of a specific task
     // otherwise it keeps trying again if a task fails
@@ -201,7 +206,7 @@
    * @return 0 on success
    * @throws Exception
    */
-  public int fetch(String batchId, int threads, boolean shouldResume, int numTasks)
+  public int fetch(String batchId, int threads, boolean shouldResume, int numTasks, boolean parse)
       throws Exception {
     LOG.info("FetcherJob: starting");
 
@@ -219,7 +224,8 @@
         Nutch.ARG_BATCH, batchId,
         Nutch.ARG_THREADS, threads,
         Nutch.ARG_RESUME, shouldResume,
-        Nutch.ARG_NUMTASKS, numTasks));
+        Nutch.ARG_NUMTASKS, numTasks,
+        Nutch.ARG_PARSE, parse));
     LOG.info("FetcherJob: done");
     return 0;
   }
@@ -261,6 +267,7 @@ void checkConfiguration() {
 
   public int run(String[] args) throws Exception {
     int threads = -1;
     boolean shouldResume = false;
+    boolean parse = false;
     String batchId;
     String usage = "Usage: FetcherJob (<batchId> | -all) [-crawlId <id>] " +
@@ -292,10 +299,12 @@ public int run(String[] args) throws Exception {
         numTasks = Integer.parseInt(args[++i]);
       } else if ("-crawlId".equals(args[i])) {
         getConf().set(Nutch.CRAWL_ID_KEY, args[++i]);
+      } else if ("-parse".equals(args[i])) {
+        parse = true;
       }
     }
 
-    int fetchcode = fetch(batchId, threads, shouldResume, numTasks); // run the Fetcher
+    int fetchcode = fetch(batchId, threads, shouldResume, numTasks, parse); // run the Fetcher
 
     return fetchcode;
   }
diff --git a/src/java/org/apache/nutch/metadata/Nutch.java b/src/java/org/apache/nutch/metadata/Nutch.java
index a857c5f917..767063798f 100644
--- a/src/java/org/apache/nutch/metadata/Nutch.java
+++ b/src/java/org/apache/nutch/metadata/Nutch.java
@@ -110,6 +110,8 @@
   public static final String ARG_CLASS = "class";
   /** Depth (number of cycles) of a crawl. */
   public static final String ARG_DEPTH = "depth";
+  /** Parse */
+  public static final String ARG_PARSE = "parse";
 
   // short constants for status / results fields
   /** Status / result message. */
diff --git a/src/java/org/apache/nutch/parse/ParseUtil.java b/src/java/org/apache/nutch/parse/ParseUtil.java
index f9f5a0951e..8338316012 100644
--- a/src/java/org/apache/nutch/parse/ParseUtil.java
+++ b/src/java/org/apache/nutch/parse/ParseUtil.java
@@ -204,7 +204,7 @@ public URLWebPage process(String key, WebPage page) {
     } catch (MalformedURLException e) {
       return redirectedPage;
     }
-    if (newUrl == null || newUrl.equals(url)) {
+    if (newUrl != null && !newUrl.equals(url)) {
       String reprUrl = URLUtil.chooseRepr(url, newUrl,
           refreshTime < FetcherJob.PERM_REFRESH_TIME);
       WebPage newWebPage = new WebPage();
diff --git a/src/java/org/apache/nutch/util/TableUtil.java b/src/java/org/apache/nutch/util/TableUtil.java
index b790fbf291..39f5d009bf 100644
--- a/src/java/org/apache/nutch/util/TableUtil.java
+++ b/src/java/org/apache/nutch/util/TableUtil.java
@@ -119,11 +119,13 @@ public static String getReversedHost(String reversedUrl) {
   }
 
   private static void reverseAppendSplits(String[] splits, StringBuilder buf) {
-    for (int i = splits.length - 1; i > 0; i--) {
-      buf.append(splits[i]);
-      buf.append('.');
+    if (splits.length > 0) {
+      for (int i = splits.length - 1; i > 0; i--) {
+        buf.append(splits[i]);
+        buf.append('.');
+      }
+      buf.append(splits[0]);
     }
-    buf.append(splits[0]);
   }
 
   /**