Skip to content

Commit

Permalink
DEPENDENCY: Use heritrix-commons latest snapshot 3.1.2-SNAPSHOT, and …
Browse files Browse the repository at this point in the history
…LaxHttpParser in warc/arc loading

Change code to explicitly reference the heritrix mods to these, allowing standard httpclient 3.1 to be loaded
alongside the heritrix mods without conflict.
  • Loading branch information
ikreymer committed Mar 28, 2013
1 parent 8792320 commit de92848
Show file tree
Hide file tree
Showing 6 changed files with 23 additions and 35 deletions.
21 changes: 2 additions & 19 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -278,16 +278,6 @@
<artifactId>wayback-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>wayback-hadoop-java</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>wayback-hadoop</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>wayback-webapp</artifactId>
Expand All @@ -304,24 +294,17 @@
<dependency>
<groupId>org.archive.heritrix</groupId>
<artifactId>heritrix-commons</artifactId>
<version>3.1.1</version>
<version>3.1.2-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.archive</groupId>
<artifactId>ia-web-commons</artifactId>
<version>1.0-SNAPSHOT</version>
</dependency>
<!--
<dependency>
<groupId>org.archive.heritrix</groupId>
<artifactId>heritrix-modules</artifactId>
<version>3.1.0-SNAPSHOT</version>
</dependency>
-->
<dependency>
<groupId>org.archive.access-control</groupId>
<artifactId>access-control</artifactId>
<version>0.0.1-SNAPSHOT</version>
<version>0.1.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.mozilla</groupId>
Expand Down
6 changes: 6 additions & 0 deletions wayback-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,12 @@
<dependency>
<groupId>org.archive.heritrix</groupId>
<artifactId>heritrix-commons</artifactId>
<exclusions>
<exclusion>
<groupId>org.archive.overlays</groupId>
<artifactId>archive-overlay-commons-httpclient</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.archive</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpConnection;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpState;
import org.apache.commons.httpclient.SimpleHttpConnectionManager;
Expand All @@ -50,7 +49,7 @@
import org.archive.io.RecordingInputStream;
import org.archive.io.ReplayInputStream;
import org.archive.io.arc.ARCWriter;
import org.archive.net.LaxURI;
import org.archive.url.LaxURI;
import org.archive.util.Recorder;
import org.archive.wayback.util.ByteOp;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import java.io.IOException;

import org.archive.format.gzip.zipnum.TimestampCustomDedupIterator;
import org.archive.format.gzip.zipnum.TimestampBestPickDedupIterator;
import org.archive.format.gzip.zipnum.ZipNumCluster;
import org.archive.format.gzip.zipnum.ZipNumParams;
import org.archive.util.iterator.CloseableIterator;
Expand Down Expand Up @@ -61,7 +61,7 @@ public CloseableIterator<CaptureSearchResult> getPrefixIterator(
}

if (timestampDedupLength > 0) {
cdxIter = new TimestampCustomDedupIterator(cdxIter, timestampDedupLength);
cdxIter = new TimestampBestPickDedupIterator(cdxIter, timestampDedupLength);
}

return new AdaptedIterator<String,CaptureSearchResult>(cdxIter, this);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
import java.util.logging.Logger;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpParser;
import org.apache.commons.httpclient.StatusLine;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.util.EncodingUtil;
Expand All @@ -33,6 +32,7 @@
import org.archive.io.arc.ARCConstants;
import org.archive.io.warc.WARCConstants;
import org.archive.io.warc.WARCRecord;
import org.archive.util.LaxHttpParser;
import org.archive.wayback.UrlCanonicalizer;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.resourceindex.filters.WARCRevisitAnnotationFilter;
Expand Down Expand Up @@ -97,7 +97,7 @@ private CaptureSearchResult adaptInner(WARCRecord rec) throws IOException {

CaptureSearchResult result = genericResult(rec);

if(type.equals(WARCConstants.RESPONSE)) {
if(type.equals(WARCConstants.WARCRecordType.RESPONSE)) {
String mime = annotater.transformHTTPMime(header.getMimetype());
if(mime != null && mime.equals("text/dns")) {
// close to complete reading, then the digest is legit
Expand All @@ -108,27 +108,27 @@ private CaptureSearchResult adaptInner(WARCRecord rec) throws IOException {
} else {
result = adaptWARCHTTPResponse(result,rec);
}
} else if(type.equals(WARCConstants.REVISIT)) {
} else if(type.equals(WARCConstants.WARCRecordType.REVISIT)) {
// also set the mime type:
result.setMimeType("warc/revisit");

} else if(type.equals(WARCConstants.REQUEST)) {
} else if(type.equals(WARCConstants.WARCRecordType.REQUEST)) {

if(processAll) {
// also set the mime type:
result.setMimeType("warc/request");
} else {
result = null;
}
} else if(type.equals(WARCConstants.METADATA)) {
} else if(type.equals(WARCConstants.WARCRecordType.METADATA)) {

if(processAll) {
// also set the mime type:
result.setMimeType("warc/metadata");
} else {
result = null;
}
} else if(type.equals(WARCConstants.WARCINFO)) {
} else if(type.equals(WARCConstants.WARCRecordType.WARCINFO)) {

result.setMimeType(WARC_FILEDESC_VERSION);

Expand Down Expand Up @@ -169,7 +169,7 @@ private CaptureSearchResult genericResult(WARCRecord rec) {
String origUrl = header.getUrl();
if(origUrl == null) {
String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
if(type.equals(WARCConstants.WARCINFO)) {
if(type.equals(WARCConstants.WARCRecordType.WARCINFO)) {
String filename = header.getHeaderValue(
WARCConstants.HEADER_KEY_FILENAME).toString();
result.setOriginalUrl("filedesc:"+filename);
Expand Down Expand Up @@ -268,7 +268,7 @@ private CaptureSearchResult adaptWARCHTTPResponse(CaptureSearchResult result,
// need to parse the documents HTTP message and headers here: WARCReader
// does not implement this... yet..

byte [] statusBytes = HttpParser.readRawLine(rec);
byte [] statusBytes = LaxHttpParser.readRawLine(rec);
int eolCharCount = getEolCharsCount(statusBytes);
if (eolCharCount <= 0) {
throw new RecoverableIOException("Failed to read http status where one " +
Expand All @@ -284,7 +284,7 @@ private CaptureSearchResult adaptWARCHTTPResponse(CaptureSearchResult result,
StatusLine status = new StatusLine(statusLine);
result.setHttpCode(String.valueOf(status.getStatusCode()));

Header[] headers = HttpParser.parseHeaders(rec,
Header[] headers = LaxHttpParser.parseHeaders(rec,
ARCConstants.DEFAULT_ENCODING);


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,14 @@
import java.util.Map;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpParser;
import org.apache.commons.httpclient.StatusLine;
import org.apache.commons.httpclient.util.EncodingUtil;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.RecoverableIOException;
import org.archive.io.arc.ARCConstants;
import org.archive.io.warc.WARCRecord;
import org.archive.util.LaxHttpParser;
import org.archive.wayback.core.Resource;
import org.archive.wayback.replay.HttpHeaderOperation;

Expand Down Expand Up @@ -71,7 +71,7 @@ public void parseHeaders() throws IOException {
return;
}

byte [] statusBytes = HttpParser.readRawLine(rec);
byte [] statusBytes = LaxHttpParser.readRawLine(rec);
int eolCharCount = getEolCharsCount(statusBytes);
if (eolCharCount <= 0) {
throw new RecoverableIOException("Failed to read http status where one " +
Expand All @@ -87,7 +87,7 @@ public void parseHeaders() throws IOException {

this.status = statusLine.getStatusCode();

Header[] tmpHeaders = HttpParser.parseHeaders(rec,
Header[] tmpHeaders = LaxHttpParser.parseHeaders(rec,
ARCConstants.DEFAULT_ENCODING);
headers = new Hashtable<String,String>();
this.setInputStream(rec);
Expand Down

0 comments on commit de92848

Please sign in to comment.