Merge pull request #123 from kris-sigur/HostsReport
Hosts report
nlevitt committed Jan 15, 2016
2 parents 70c688a + 3495851 commit cc97f01
Showing 3 changed files with 74 additions and 70 deletions.
90 changes: 69 additions & 21 deletions engine/src/main/java/org/archive/crawler/reporting/HostsReport.java
@@ -20,7 +20,7 @@
 package org.archive.crawler.reporting;
 
 import java.io.PrintWriter;
-import java.util.Map;
+import java.util.Collection;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 
@@ -36,33 +36,81 @@ public class HostsReport extends Report {
 
     private final static Logger logger =
             Logger.getLogger(HostsReport.class.getName());
-
-    @Override
+
+    int maxSortSize = -1;
+    public int getMaxSortSize() {
+        return maxSortSize;
+    }
+    /**
+     * The maximum number of hosts allowed in a report while still sorting it. If the number of hosts
+     * exceeds this value, the generated report will not be sorted. A negative value signifies no limit
+     * (always sort). A value of zero means never sort. The default is -1 (always sort), matching the
+     * behavior before this parameter was introduced.
+     *
+     * This value cannot be overridden by a sheet. It may be safely edited at runtime.
+     *
+     * @param maxSortSize
+     */
+    public void setMaxSortSize(int maxSortSize) {
+        this.maxSortSize = maxSortSize;
+    }
+
+    boolean suppressEmptyHosts = false;
+    public boolean isSuppressEmptyHosts() {
+        return suppressEmptyHosts;
+    }
+    /**
+     * If true, hosts for which no URLs have been fetched are suppressed from this report.
+     * Such hosts are recorded when the crawler encounters a URL for a host but has not yet (and may
+     * never) processed any URL for that host. This can happen for many reasons, related among other
+     * things to scoping and queue budgeting.
+     * The default is to include these non-crawled hosts.
+     *
+     * This value cannot be overridden by a sheet. It may be safely edited at runtime.
+     *
+     * @param suppressEmptyHosts
+     */
+    public void setSuppressEmptyHosts(boolean suppressEmptyHosts) {
+        this.suppressEmptyHosts = suppressEmptyHosts;
+    }
+
+    @Override
     public void write(final PrintWriter writer, StatisticsTracker stats) {
-        // TODO: only perform sorting on manageable number of hosts
-        DisposableStoredSortedMap<Long,String> hd = stats.calcReverseSortedHostsDistribution();
+        Collection<String> keys = null;
+        DisposableStoredSortedMap<Long, String> hd = null;
+        if (maxSortSize < 0 || maxSortSize > stats.serverCache.hostKeys().size()) {
+            hd = stats.calcReverseSortedHostsDistribution();
+            keys = hd.values();
+        } else {
+            keys = stats.serverCache.hostKeys();
+        }
         writer.print("[#urls] [#bytes] [host] [#robots] [#remaining] [#novel-urls] [#novel-bytes] [#dup-by-hash-urls] [#dup-by-hash-bytes] [#not-modified-urls] [#not-modified-bytes]\n");
-        for (Map.Entry<Long,String> entry : hd.entrySet()) {
+        for (String key : keys) {
             // key is -count, value is hostname
             try {
-                CrawlHost host = stats.serverCache.getHostFor(entry.getValue());
-                writeReportLine(writer,
-                        host.getSubstats().getFetchSuccesses(),
-                        host.getSubstats().getTotalBytes(),
-                        host.fixUpName(),
-                        host.getSubstats().getRobotsDenials(),
-                        host.getSubstats().getRemaining(),
-                        host.getSubstats().getNovelUrls(),
-                        host.getSubstats().getNovelBytes(),
-                        host.getSubstats().getDupByHashUrls(),
-                        host.getSubstats().getDupByHashBytes(),
-                        host.getSubstats().getNotModifiedUrls(),
-                        host.getSubstats().getNotModifiedBytes());
+                CrawlHost host = stats.serverCache.getHostFor(key);
+                long fetchSuccesses = host.getSubstats().getFetchSuccesses();
+                if (!suppressEmptyHosts || fetchSuccesses > 0) {
+                    writeReportLine(writer,
+                            fetchSuccesses,
+                            host.getSubstats().getTotalBytes(),
+                            host.fixUpName(),
+                            host.getSubstats().getRobotsDenials(),
+                            host.getSubstats().getRemaining(),
+                            host.getSubstats().getNovelUrls(),
+                            host.getSubstats().getNovelBytes(),
+                            host.getSubstats().getDupByHashUrls(),
+                            host.getSubstats().getDupByHashBytes(),
+                            host.getSubstats().getNotModifiedUrls(),
+                            host.getSubstats().getNotModifiedBytes());
+                }
             } catch (Exception e) {
-                logger.log(Level.WARNING, "unable to tally host stats for " + entry.getValue(), e);
+                logger.log(Level.WARNING, "unable to tally host stats for " + key, e);
             }
         }
-        hd.dispose();
+        if (hd != null) {
+            hd.dispose();
+        }
     }
 
     protected void writeReportLine(PrintWriter writer, Object ... fields) {
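The javadoc above notes that both new settings may be safely edited at runtime. A minimal sketch of doing that from the Heritrix scripting console, assuming the console's standard appCtx binding, BeanShell (plain Java) syntax, and the hostsReport bean id used in the profile configuration below; the values are illustrative only:

    // Fetch the report bean from the running job's application context.
    // "hostsReport" is the bean id from the profile cxml in this commit.
    org.archive.crawler.reporting.HostsReport report =
            (org.archive.crawler.reporting.HostsReport) appCtx.getBean("hostsReport");

    // Skip sorting once more than 100,000 hosts are known, and omit
    // hosts with zero successful fetches from the generated report.
    report.setMaxSortSize(100000);
    report.setSuppressEmptyHosts(true);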
49 changes: 1 addition & 48 deletions engine/src/main/java/org/archive/crawler/reporting/StatisticsTracker.java
@@ -50,7 +50,6 @@
 import org.archive.crawler.framework.CrawlController;
 import org.archive.crawler.framework.Engine;
 import org.archive.crawler.util.CrawledBytesHistotable;
-import org.archive.crawler.util.TopNSet;
 import org.archive.modules.CrawlURI;
 import org.archive.modules.net.CrawlHost;
 import org.archive.modules.net.ServerCache;
@@ -139,7 +138,7 @@ public class StatisticsTracker
         Checkpointable,
         BeanNameAware {
     @SuppressWarnings("unused")
-    private static final long serialVersionUID = 5L;
+    private static final long serialVersionUID = 6L;
 
     protected SeedModule seeds;
     public SeedModule getSeeds() {
@@ -295,11 +294,6 @@ public CrawledBytesHistotable getCrawledBytes() {
     protected ConcurrentHashMap<String, CrawledBytesHistotable> statsBySource =
             new ConcurrentHashMap<String, CrawledBytesHistotable>();
 
-    /* Keep track of 'top' hosts for live reports */
-    protected TopNSet hostsDistributionTop;
-    protected TopNSet hostsBytesTop;
-    protected TopNSet hostsLastFinishedTop;
-
     /**
      * Record of seeds and latest results
      */
@@ -356,10 +350,6 @@ public void start() {
         this.processedSeedsRecords = bdb.getObjectCache("processedSeedsRecords",
                 isRecover, SeedRecord.class);
 
-        this.hostsDistributionTop = new TopNSet(getLiveHostReportSize());
-        this.hostsBytesTop = new TopNSet(getLiveHostReportSize());
-        this.hostsLastFinishedTop = new TopNSet(getLiveHostReportSize());
-
         if(isRecover) {
             JSONObject json = recoveryCheckpoint.loadJson(beanName);
 
@@ -369,19 +359,6 @@ public void start() {
             crawlPauseStarted = json.getLong("crawlPauseStarted");
             tallyCurrentPause();
 
-            JSONUtils.putAllLongs(
-                    hostsDistributionTop.getTopSet(),
-                    json.getJSONObject("hostsDistributionTop"));
-            hostsDistributionTop.updateBounds();
-            JSONUtils.putAllLongs(
-                    hostsBytesTop.getTopSet(),
-                    json.getJSONObject("hostsBytesTop"));
-            hostsBytesTop.updateBounds();
-            JSONUtils.putAllLongs(
-                    hostsLastFinishedTop.getTopSet(),
-                    json.getJSONObject("hostsLastFinishedTop"));
-            hostsLastFinishedTop.updateBounds();
-
             JSONUtils.putAllAtomicLongs(
                     mimeTypeDistribution,
                     json.getJSONObject("mimeTypeDistribution"));
@@ -780,11 +757,7 @@ public void crawledURISuccessful(CrawlURI curi) {
         incrementMapCount(mimeTypeDistribution, mime);
         incrementMapCount(mimeTypeBytes, mime, curi.getContentSize());
 
-        // Save hosts stats.
         ServerCache sc = serverCache;
-        saveHostStats(sc.getHostFor(curi.getUURI()).getHostName(),
-                curi.getContentSize());
-
         if (getTrackSources() && curi.getData().containsKey(A_SOURCE_TAG)) {
             saveSourceStats(curi.getSourceTag(),
                     sc.getHostFor(curi.getUURI()).getHostName());
@@ -814,22 +787,6 @@ protected void tallySourceStats(CrawlURI curi) {
         sourceStats.accumulate(curi);
     }
 
-    /**
-     * Update some running-stats based on a URI success
-     *
-     * @param hostname
-     * @param size
-     */
-    protected void saveHostStats(String hostname, long size) {
-        // TODO: consider moving 'top' accounting elsewhere, such
-        // as the frontier or ServerCache itself
-
-        CrawlHost host = serverCache.getHostFor(hostname);
-        hostsDistributionTop.update(hostname, host.getSubstats().getFetchSuccesses());
-        hostsBytesTop.update(hostname, host.getSubstats().getSuccessBytes());
-        hostsLastFinishedTop.update(hostname, host.getSubstats().getLastSuccessTime());
-    }
-
     public void crawledURINeedRetry(CrawlURI curi) {
         handleSeed(curi,"Failed to crawl seed, will retry");
     }
@@ -1105,10 +1062,6 @@ public void doCheckpoint(Checkpoint checkpointInProgress) throws IOException {
         json.put("crawlPauseStarted",virtualCrawlPauseStarted);
         json.put("crawlTotalPausedTime",crawlTotalPausedTime);
 
-        json.put("hostsDistributionTop", hostsDistributionTop.getTopSet());
-        json.put("hostsBytesTop", hostsBytesTop.getTopSet());
-        json.put("hostsLastFinishedTop", hostsLastFinishedTop.getTopSet());
-
         json.put("mimeTypeDistribution", mimeTypeDistribution);
         json.put("mimeTypeBytes", mimeTypeBytes);
         json.put("statusCodeDistribution", statusCodeDistribution);
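With the TopNSet bookkeeping removed, StatisticsTracker no longer maintains live "top hosts" tables or persists them in checkpoints (hostsDistributionTop, hostsBytesTop and hostsLastFinishedTop disappear from the checkpoint JSON), which also accounts for the serialVersionUID bump from 5L to 6L; the hosts report now reads per-host substats directly from the ServerCache instead.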
5 changes: 4 additions & 1 deletion engine/src/main/resources/org/archive/crawler/restlet/profile-crawler-beans.cxml
@@ -628,7 +628,10 @@ http://example.example/example
         <list>
          <bean id="crawlSummaryReport" class="org.archive.crawler.reporting.CrawlSummaryReport" />
          <bean id="seedsReport" class="org.archive.crawler.reporting.SeedsReport" />
-         <bean id="hostsReport" class="org.archive.crawler.reporting.HostsReport" />
+         <bean id="hostsReport" class="org.archive.crawler.reporting.HostsReport">
+          <property name="maxSortSize" value="-1" />
+          <property name="suppressEmptyHosts" value="false" />
+         </bean>
          <bean id="sourceTagsReport" class="org.archive.crawler.reporting.SourceTagsReport" />
          <bean id="mimetypesReport" class="org.archive.crawler.reporting.MimetypesReport" />
          <bean id="responseCodeReport" class="org.archive.crawler.reporting.ResponseCodeReport" />
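Both property values spelled out in the template (maxSortSize of -1, suppressEmptyHosts of false) match the built-in defaults documented in the new javadoc, so the shipped profile keeps the report's pre-existing behavior: always sorted, with never-fetched hosts included.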
