diff --git a/engine/src/main/java/org/archive/crawler/reporting/HostsReport.java b/engine/src/main/java/org/archive/crawler/reporting/HostsReport.java index 4370e4257..a781c1aa7 100644 --- a/engine/src/main/java/org/archive/crawler/reporting/HostsReport.java +++ b/engine/src/main/java/org/archive/crawler/reporting/HostsReport.java @@ -20,7 +20,7 @@ package org.archive.crawler.reporting; import java.io.PrintWriter; -import java.util.Map; +import java.util.Collection; import java.util.logging.Level; import java.util.logging.Logger; @@ -36,33 +36,81 @@ public class HostsReport extends Report { private final static Logger logger = Logger.getLogger(HostsReport.class.getName()); - - @Override + + int maxSortSize = -1; + public int getMaxSortSize() { + return maxSortSize; + } + /** + * The maximum number of hosts allowed in a report while still sorting it. If the number of hosts exceeds + * this value, the generated report will not be sorted. A negative value signifies no limit (always sort). + * A value of zero means never sort. Default -1, always sort. This matches the behavior before this + * parameter was introduced. + * + * This value cannot be overridden by a sheet. It may be safely edited at runtime. + * + * @param maxSortSize + */ + public void setMaxSortSize(int maxSortSize) { + this.maxSortSize = maxSortSize; + } + + boolean suppressEmptyHosts = false; + public boolean isSuppressEmptyHosts() { + return suppressEmptyHosts; + } + /** + * If true, hosts for which no URLs have been fetched will be suppressed in this report. + * Such hosts are recorded when the crawler encounters a URL for a host but has not yet (and may never) + * processed any URL for the host. This can happen for many reasons, related to scoping and queue budgeting + * among others. + * Default behavior is to include these non-crawled hosts. + * + * This value cannot be overridden by a sheet. It may be safely edited at runtime. 
+ * + * @param suppressEmptyHosts + */ + public void setSuppressEmptyHosts(boolean suppressEmptyHosts) { + this.suppressEmptyHosts = suppressEmptyHosts; + } + + @Override public void write(final PrintWriter writer, StatisticsTracker stats) { - // TODO: only perform sorting on manageable number of hosts - DisposableStoredSortedMap hd = stats.calcReverseSortedHostsDistribution(); + Collection keys = null; + DisposableStoredSortedMap hd = null; + if (maxSortSize<0 || maxSortSize>stats.serverCache.hostKeys().size()) { + hd = stats.calcReverseSortedHostsDistribution(); + keys = hd.values(); + } else { + keys = stats.serverCache.hostKeys(); + } writer.print("[#urls] [#bytes] [host] [#robots] [#remaining] [#novel-urls] [#novel-bytes] [#dup-by-hash-urls] [#dup-by-hash-bytes] [#not-modified-urls] [#not-modified-bytes]\n"); - for (Map.Entry entry : hd.entrySet()) { + for (String key : keys) { // key is -count, value is hostname try { - CrawlHost host = stats.serverCache.getHostFor(entry.getValue()); - writeReportLine(writer, - host.getSubstats().getFetchSuccesses(), - host.getSubstats().getTotalBytes(), - host.fixUpName(), - host.getSubstats().getRobotsDenials(), - host.getSubstats().getRemaining(), - host.getSubstats().getNovelUrls(), - host.getSubstats().getNovelBytes(), - host.getSubstats().getDupByHashUrls(), - host.getSubstats().getDupByHashBytes(), - host.getSubstats().getNotModifiedUrls(), - host.getSubstats().getNotModifiedBytes()); + CrawlHost host = stats.serverCache.getHostFor(key); + long fetchSuccesses = host.getSubstats().getFetchSuccesses(); + if (!suppressEmptyHosts || fetchSuccesses>0) { + writeReportLine(writer, + fetchSuccesses, + host.getSubstats().getTotalBytes(), + host.fixUpName(), + host.getSubstats().getRobotsDenials(), + host.getSubstats().getRemaining(), + host.getSubstats().getNovelUrls(), + host.getSubstats().getNovelBytes(), + host.getSubstats().getDupByHashUrls(), + host.getSubstats().getDupByHashBytes(), + 
host.getSubstats().getNotModifiedUrls(), + host.getSubstats().getNotModifiedBytes()); + } } catch (Exception e) { - logger.log(Level.WARNING, "unable to tally host stats for " + entry.getValue(), e); + logger.log(Level.WARNING, "unable to tally host stats for " + key, e); } } - hd.dispose(); + if (hd!=null) { + hd.dispose(); + } } protected void writeReportLine(PrintWriter writer, Object ... fields) { diff --git a/engine/src/main/java/org/archive/crawler/reporting/StatisticsTracker.java b/engine/src/main/java/org/archive/crawler/reporting/StatisticsTracker.java index 09af0b479..a8bb2e8ea 100644 --- a/engine/src/main/java/org/archive/crawler/reporting/StatisticsTracker.java +++ b/engine/src/main/java/org/archive/crawler/reporting/StatisticsTracker.java @@ -50,7 +50,6 @@ import org.archive.crawler.framework.CrawlController; import org.archive.crawler.framework.Engine; import org.archive.crawler.util.CrawledBytesHistotable; -import org.archive.crawler.util.TopNSet; import org.archive.modules.CrawlURI; import org.archive.modules.net.CrawlHost; import org.archive.modules.net.ServerCache; @@ -139,7 +138,7 @@ public class StatisticsTracker Checkpointable, BeanNameAware { @SuppressWarnings("unused") - private static final long serialVersionUID = 5L; + private static final long serialVersionUID = 6L; protected SeedModule seeds; public SeedModule getSeeds() { @@ -289,11 +288,6 @@ public CrawledBytesHistotable getCrawledBytes() { protected ConcurrentHashMap> sourceHostDistribution = new ConcurrentHashMap>(); - /* Keep track of 'top' hosts for live reports */ - protected TopNSet hostsDistributionTop; - protected TopNSet hostsBytesTop; - protected TopNSet hostsLastFinishedTop; - /** * Record of seeds and latest results */ @@ -350,10 +344,6 @@ public void start() { this.processedSeedsRecords = bdb.getObjectCache("processedSeedsRecords", isRecover, SeedRecord.class); - this.hostsDistributionTop = new TopNSet(getLiveHostReportSize()); - this.hostsBytesTop = new 
TopNSet(getLiveHostReportSize()); - this.hostsLastFinishedTop = new TopNSet(getLiveHostReportSize()); - if(isRecover) { JSONObject json = recoveryCheckpoint.loadJson(beanName); @@ -363,19 +353,6 @@ public void start() { crawlPauseStarted = json.getLong("crawlPauseStarted"); tallyCurrentPause(); - JSONUtils.putAllLongs( - hostsDistributionTop.getTopSet(), - json.getJSONObject("hostsDistributionTop")); - hostsDistributionTop.updateBounds(); - JSONUtils.putAllLongs( - hostsBytesTop.getTopSet(), - json.getJSONObject("hostsBytesTop")); - hostsBytesTop.updateBounds(); - JSONUtils.putAllLongs( - hostsLastFinishedTop.getTopSet(), - json.getJSONObject("hostsLastFinishedTop")); - hostsLastFinishedTop.updateBounds(); - JSONUtils.putAllAtomicLongs( mimeTypeDistribution, json.getJSONObject("mimeTypeDistribution")); @@ -758,11 +735,7 @@ public void crawledURISuccessful(CrawlURI curi) { incrementMapCount(mimeTypeDistribution, mime); incrementMapCount(mimeTypeBytes, mime, curi.getContentSize()); - // Save hosts stats. ServerCache sc = serverCache; - saveHostStats(sc.getHostFor(curi.getUURI()).getHostName(), - curi.getContentSize()); - if (getTrackSources() && curi.getData().containsKey(A_SOURCE_TAG)) { saveSourceStats((String)curi.getData().get(A_SOURCE_TAG), sc.getHostFor(curi.getUURI()). 
@@ -783,22 +756,6 @@ protected void saveSourceStats(String source, String hostname) { } - /** - * Update some running-stats based on a URI success - * - * @param hostname - * @param size - */ - protected void saveHostStats(String hostname, long size) { - // TODO: consider moving 'top' accounting elsewhere, such - // as the frontier or ServerCache itself - - CrawlHost host = serverCache.getHostFor(hostname); - hostsDistributionTop.update(hostname, host.getSubstats().getFetchSuccesses()); - hostsBytesTop.update(hostname, host.getSubstats().getSuccessBytes()); - hostsLastFinishedTop.update(hostname, host.getSubstats().getLastSuccessTime()); - } - public void crawledURINeedRetry(CrawlURI curi) { handleSeed(curi,"Failed to crawl seed, will retry"); } @@ -1074,10 +1031,6 @@ public void doCheckpoint(Checkpoint checkpointInProgress) throws IOException { json.put("crawlPauseStarted",virtualCrawlPauseStarted); json.put("crawlTotalPausedTime",crawlTotalPausedTime); - json.put("hostsDistributionTop", hostsDistributionTop.getTopSet()); - json.put("hostsBytesTop", hostsBytesTop.getTopSet()); - json.put("hostsLastFinishedTop", hostsLastFinishedTop.getTopSet()); - json.put("mimeTypeDistribution", mimeTypeDistribution); json.put("mimeTypeBytes", mimeTypeBytes); json.put("statusCodeDistribution", statusCodeDistribution); diff --git a/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml b/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml index eb7d8550c..589646e44 100644 --- a/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml +++ b/engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml @@ -628,7 +628,10 @@ http://example.example/example - + + + +