Hosts report #123

Merged: 5 commits, Jan 15, 2016
engine/src/main/java/org/archive/crawler/reporting/HostsReport.java (69 additions, 21 deletions)
@@ -20,7 +20,7 @@
 package org.archive.crawler.reporting;
 
 import java.io.PrintWriter;
-import java.util.Map;
+import java.util.Collection;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 
@@ -36,33 +36,81 @@ public class HostsReport extends Report {

     private final static Logger logger = 
             Logger.getLogger(HostsReport.class.getName());
 
-    @Override
+    int maxSortSize = -1;
+    public int getMaxSortSize() {
+        return maxSortSize;
+    }
+    /**
+     * The maximum number of hosts allowed in a report while still sorting it. If the number of
+     * hosts exceeds this value, the generated report will not be sorted. A negative value
+     * signifies no limit (always sort); zero means never sort. The default is -1 (always sort),
+     * which matches the behavior before this parameter was introduced.
+     *
+     * This value cannot be overridden by a sheet. It may be safely edited at runtime.
+     *
+     * @param maxSortSize
+     */
+    public void setMaxSortSize(int maxSortSize) {
+        this.maxSortSize = maxSortSize;
+    }
+
+    boolean suppressEmptyHosts = false;
+    public boolean isSuppressEmptyHosts() {
+        return suppressEmptyHosts;
+    }
+    /**
+     * If true, hosts for which no URLs have been fetched are suppressed in this report. Such
+     * hosts are recorded when the crawler encounters a URL for a host but has not yet (and may
+     * never have) processed any URL for that host. This can happen for many reasons, related to
+     * scoping and queue budgeting among others. The default behavior is to include these
+     * non-crawled hosts.
+     *
+     * This value cannot be overridden by a sheet. It may be safely edited at runtime.
+     *
+     * @param suppressEmptyHosts
+     */
+    public void setSuppressEmptyHosts(boolean suppressEmptyHosts) {
+        this.suppressEmptyHosts = suppressEmptyHosts;
+    }
+
+    @Override
     public void write(final PrintWriter writer, StatisticsTracker stats) {
-        // TODO: only perform sorting on manageable number of hosts
-        DisposableStoredSortedMap<Long,String> hd = stats.calcReverseSortedHostsDistribution();
+        Collection<String> keys = null;
+        DisposableStoredSortedMap<Long, String> hd = null;
+        if (maxSortSize<0 || maxSortSize>stats.serverCache.hostKeys().size()) {
+            hd = stats.calcReverseSortedHostsDistribution();
+            keys = hd.values();
+        } else {
+            keys = stats.serverCache.hostKeys();
+        }
         writer.print("[#urls] [#bytes] [host] [#robots] [#remaining] [#novel-urls] [#novel-bytes] [#dup-by-hash-urls] [#dup-by-hash-bytes] [#not-modified-urls] [#not-modified-bytes]\n");
-        for (Map.Entry<Long,String> entry : hd.entrySet()) {
-            // key is -count, value is hostname
+        for (String key : keys) {
             try {
-                CrawlHost host = stats.serverCache.getHostFor(entry.getValue());
-                writeReportLine(writer,
-                        host.getSubstats().getFetchSuccesses(),
-                        host.getSubstats().getTotalBytes(),
-                        host.fixUpName(),
-                        host.getSubstats().getRobotsDenials(),
-                        host.getSubstats().getRemaining(),
-                        host.getSubstats().getNovelUrls(),
-                        host.getSubstats().getNovelBytes(),
-                        host.getSubstats().getDupByHashUrls(),
-                        host.getSubstats().getDupByHashBytes(),
-                        host.getSubstats().getNotModifiedUrls(),
-                        host.getSubstats().getNotModifiedBytes());
+                CrawlHost host = stats.serverCache.getHostFor(key);
+                long fetchSuccesses = host.getSubstats().getFetchSuccesses();
+                if (!suppressEmptyHosts || fetchSuccesses>0) {
+                    writeReportLine(writer,
+                            fetchSuccesses,
+                            host.getSubstats().getTotalBytes(),
+                            host.fixUpName(),
+                            host.getSubstats().getRobotsDenials(),
+                            host.getSubstats().getRemaining(),
+                            host.getSubstats().getNovelUrls(),
+                            host.getSubstats().getNovelBytes(),
+                            host.getSubstats().getDupByHashUrls(),
+                            host.getSubstats().getDupByHashBytes(),
+                            host.getSubstats().getNotModifiedUrls(),
+                            host.getSubstats().getNotModifiedBytes());
+                }
             } catch (Exception e) {
-                logger.log(Level.WARNING, "unable to tally host stats for " + entry.getValue(), e);
+                logger.log(Level.WARNING, "unable to tally host stats for " + key, e);
             }
         }
-        hd.dispose();
+        if (hd!=null) {
+            hd.dispose();
+        }
     }
 
     protected void writeReportLine(PrintWriter writer, Object ... fields) {
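For readers skimming the diff, here is a minimal, self-contained sketch of the ordering and suppression rules the new write() implements. It substitutes a plain in-memory map for Heritrix's serverCache and the BDB-backed DisposableStoredSortedMap, so all class and variable names below are illustrative rather than Heritrix API:

import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class HostsReportSketch {

    // Mirror of the new threshold rule: sort when maxSortSize is negative
    // (no limit) or the host count is within budget; otherwise fall back
    // to the cache's unsorted iteration order.
    static Collection<String> reportOrder(Map<String, Long> fetchesByHost, int maxSortSize) {
        if (maxSortSize < 0 || maxSortSize > fetchesByHost.size()) {
            List<String> hosts = new ArrayList<>(fetchesByHost.keySet());
            hosts.sort((a, b) -> Long.compare(fetchesByHost.get(b), fetchesByHost.get(a)));
            return hosts; // most-fetched hosts first
        }
        return fetchesByHost.keySet(); // too many hosts: skip the sort
    }

    public static void main(String[] args) {
        Map<String, Long> fetches = new LinkedHashMap<>();
        fetches.put("example.com", 42L);
        fetches.put("never-crawled.example", 0L); // seen in scoping, never fetched
        fetches.put("archive.org", 7L);

        boolean suppressEmptyHosts = true;
        for (String host : reportOrder(fetches, -1)) {
            long fetchSuccesses = fetches.get(host);
            // suppressEmptyHosts drops hosts with zero fetch successes
            if (!suppressEmptyHosts || fetchSuccesses > 0) {
                System.out.println(fetchSuccesses + " " + host);
            }
        }
        // prints: 42 example.com, then 7 archive.org
    }
}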
engine/src/main/java/org/archive/crawler/reporting/StatisticsTracker.java

@@ -50,7 +50,6 @@
 import org.archive.crawler.framework.CrawlController;
 import org.archive.crawler.framework.Engine;
 import org.archive.crawler.util.CrawledBytesHistotable;
-import org.archive.crawler.util.TopNSet;
 import org.archive.modules.CrawlURI;
 import org.archive.modules.net.CrawlHost;
 import org.archive.modules.net.ServerCache;
@@ -139,7 +138,7 @@ public class StatisticsTracker
         Checkpointable, 
         BeanNameAware {
     @SuppressWarnings("unused")
-    private static final long serialVersionUID = 5L;
+    private static final long serialVersionUID = 6L;
 
     protected SeedModule seeds;
     public SeedModule getSeeds() {
@@ -289,11 +288,6 @@ public CrawledBytesHistotable getCrawledBytes() {
     protected ConcurrentHashMap<String, ConcurrentMap<String, AtomicLong>> sourceHostDistribution = 
             new ConcurrentHashMap<String, ConcurrentMap<String,AtomicLong>>();
 
-    /* Keep track of 'top' hosts for live reports */
-    protected TopNSet hostsDistributionTop;
-    protected TopNSet hostsBytesTop;
-    protected TopNSet hostsLastFinishedTop;
-
     /**
      * Record of seeds and latest results
      */
@@ -350,10 +344,6 @@ public void start() {
         this.processedSeedsRecords = bdb.getObjectCache("processedSeedsRecords",
                 isRecover, SeedRecord.class);
 
-        this.hostsDistributionTop = new TopNSet(getLiveHostReportSize());
-        this.hostsBytesTop = new TopNSet(getLiveHostReportSize());
-        this.hostsLastFinishedTop = new TopNSet(getLiveHostReportSize());
-
         if(isRecover) {
             JSONObject json = recoveryCheckpoint.loadJson(beanName);
 
@@ -363,19 +353,6 @@
             crawlPauseStarted = json.getLong("crawlPauseStarted");
             tallyCurrentPause();
 
-            JSONUtils.putAllLongs(
-                    hostsDistributionTop.getTopSet(),
-                    json.getJSONObject("hostsDistributionTop"));
-            hostsDistributionTop.updateBounds();
-            JSONUtils.putAllLongs(
-                    hostsBytesTop.getTopSet(),
-                    json.getJSONObject("hostsBytesTop"));
-            hostsBytesTop.updateBounds();
-            JSONUtils.putAllLongs(
-                    hostsLastFinishedTop.getTopSet(),
-                    json.getJSONObject("hostsLastFinishedTop"));
-            hostsLastFinishedTop.updateBounds();
-
             JSONUtils.putAllAtomicLongs(
                     mimeTypeDistribution,
                     json.getJSONObject("mimeTypeDistribution"));
@@ -758,11 +735,7 @@ public void crawledURISuccessful(CrawlURI curi) {
         incrementMapCount(mimeTypeDistribution, mime);
         incrementMapCount(mimeTypeBytes, mime, curi.getContentSize());
 
-        // Save hosts stats.
         ServerCache sc = serverCache;
-        saveHostStats(sc.getHostFor(curi.getUURI()).getHostName(), 
-                curi.getContentSize());
-
         if (getTrackSources() && curi.getData().containsKey(A_SOURCE_TAG)) {
             saveSourceStats((String)curi.getData().get(A_SOURCE_TAG),
                     sc.getHostFor(curi.getUURI()).
@@ -783,22 +756,6 @@ protected void saveSourceStats(String source, String hostname) {

     }
 
-    /**
-     * Update some running-stats based on a URI success 
-     * 
-     * @param hostname
-     * @param size
-     */
-    protected void saveHostStats(String hostname, long size) {
-        // TODO: consider moving 'top' accounting elsewhere, such
-        // as the frontier or ServerCache itself
-
-        CrawlHost host = serverCache.getHostFor(hostname);
-        hostsDistributionTop.update(hostname, host.getSubstats().getFetchSuccesses());
-        hostsBytesTop.update(hostname, host.getSubstats().getSuccessBytes());
-        hostsLastFinishedTop.update(hostname, host.getSubstats().getLastSuccessTime());
-    }
-
     public void crawledURINeedRetry(CrawlURI curi) {
         handleSeed(curi,"Failed to crawl seed, will retry");
     }
@@ -1074,10 +1031,6 @@ public void doCheckpoint(Checkpoint checkpointInProgress) throws IOException {
json.put("crawlPauseStarted",virtualCrawlPauseStarted);
json.put("crawlTotalPausedTime",crawlTotalPausedTime);

json.put("hostsDistributionTop", hostsDistributionTop.getTopSet());
json.put("hostsBytesTop", hostsBytesTop.getTopSet());
json.put("hostsLastFinishedTop", hostsLastFinishedTop.getTopSet());

json.put("mimeTypeDistribution", mimeTypeDistribution);
json.put("mimeTypeBytes", mimeTypeBytes);
json.put("statusCodeDistribution", statusCodeDistribution);
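The checkpoint edits above follow the tracker's existing pattern: each tally is written into a JSONObject in doCheckpoint() and poured back into the live map on recovery via JSONUtils helpers, so removing the three TopNSet fields also removes their three entries from that JSON (and the serialVersionUID bump signals the changed serialized shape). Below is a rough, self-contained sketch of that round trip; the save/restore helpers here are simplified stand-ins for the real JSONUtils methods, not Heritrix API:

import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;

import org.json.JSONObject;

public class CheckpointRoundTripSketch {

    // Serialize a tally map into a checkpoint JSON object.
    static JSONObject save(Map<String, AtomicLong> tally) {
        JSONObject json = new JSONObject();
        for (Map.Entry<String, AtomicLong> e : tally.entrySet()) {
            json.put(e.getKey(), e.getValue().get());
        }
        return json;
    }

    // Restore the tally from JSON on recovery (simplified stand-in for
    // JSONUtils.putAllAtomicLongs).
    static void restore(Map<String, AtomicLong> tally, JSONObject json) {
        for (Iterator<String> it = json.keys(); it.hasNext();) {
            String key = it.next();
            tally.put(key, new AtomicLong(json.getLong(key)));
        }
    }

    public static void main(String[] args) {
        Map<String, AtomicLong> mimeTypeDistribution = new ConcurrentHashMap<>();
        mimeTypeDistribution.put("text/html", new AtomicLong(10));

        JSONObject checkpoint = save(mimeTypeDistribution);
        Map<String, AtomicLong> recovered = new ConcurrentHashMap<>();
        restore(recovered, checkpoint);
        System.out.println(recovered.get("text/html")); // 10
    }
}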
crawler-beans.cxml (crawl job configuration profile)

@@ -628,7 +628,10 @@ http://example.example/example
<list>
<bean id="crawlSummaryReport" class="org.archive.crawler.reporting.CrawlSummaryReport" />
<bean id="seedsReport" class="org.archive.crawler.reporting.SeedsReport" />
<bean id="hostsReport" class="org.archive.crawler.reporting.HostsReport" />
<bean id="hostsReport" class="org.archive.crawler.reporting.HostsReport">
<property name="maxSortSize" value="-1" />
<property name="suppressEmptyHosts" value="false" />
</bean>
<bean id="sourceTagsReport" class="org.archive.crawler.reporting.SourceTagsReport" />
<bean id="mimetypesReport" class="org.archive.crawler.reporting.MimetypesReport" />
<bean id="responseCodeReport" class="org.archive.crawler.reporting.ResponseCodeReport" />
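For a job that needs non-default behavior, the same two properties can be overridden on the hostsReport bean above, or set programmatically through the setters this PR adds. A small illustrative sketch (the values are arbitrary examples, not recommendations):

import org.archive.crawler.reporting.HostsReport;

public class ConfigureHostsReportExample {
    public static void main(String[] args) {
        HostsReport hostsReport = new HostsReport();
        hostsReport.setMaxSortSize(500000);      // skip sorting beyond 500k hosts
        hostsReport.setSuppressEmptyHosts(true); // omit never-fetched hosts
        System.out.println(hostsReport.getMaxSortSize() + " "
                + hostsReport.isSuppressEmptyHosts());
    }
}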