Skip to content

Commit

Permalink
Merge pull request #126 from internetarchive/content-encoding-deflate
Browse files Browse the repository at this point in the history
Add support of "deflate" content-encoding to TextReplayRenderer.
  • Loading branch information
kngenie committed Aug 2, 2016
2 parents e3435f6 + 987ed06 commit 9344cdc
Show file tree
Hide file tree
Showing 5 changed files with 272 additions and 48 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
* This file is part of the Wayback archival access software
* (http://archive-access.sourceforge.net/projects/wayback/).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.archive.wayback.replay;

import java.io.IOException;
import java.util.Map;

import org.archive.wayback.core.Resource;

/**
 * Base class for a wrapper {@link Resource} that decodes content encoded with
 * {@code Content-Encoding}.
 *
 * <p>Everything that is not content-related (record length, HTTP headers,
 * status code, revisit references and {@link #close()}) is delegated unchanged
 * to the wrapped resource. This class does not set up any content stream
 * itself; that is left to concrete subclasses.</p>
 */
public abstract class DecodingResource extends Resource {

    /** The wrapped (still encoded) resource; assigned once in the constructor. */
    private final Resource source;

    /**
     * @param source resource whose metadata accessors and {@code close()}
     *        this wrapper delegates to
     */
    protected DecodingResource(Resource source) {
        this.source = source;
    }

    /** @return the record length reported by the wrapped resource */
    @Override
    public long getRecordLength() {
        return source.getRecordLength();
    }

    /** @return the HTTP headers of the wrapped resource (same map instance it returns) */
    @Override
    public Map<String, String> getHttpHeaders() {
        return source.getHttpHeaders();
    }

    /**
     * Closes the wrapped resource.
     *
     * @throws IOException if closing the wrapped resource fails
     */
    @Override
    public void close() throws IOException {
        source.close();
    }

    /** @return the status code reported by the wrapped resource */
    @Override
    public int getStatusCode() {
        return source.getStatusCode();
    }

    /** @return the refers-to target URI reported by the wrapped resource */
    @Override
    public String getRefersToTargetURI() {
        return source.getRefersToTargetURI();
    }

    /** @return the refers-to date reported by the wrapped resource */
    @Override
    public String getRefersToDate() {
        return source.getRefersToDate();
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,36 +18,30 @@
* limitations under the License.
*/

/**
*
* Provide a wrapper for a Resource that is gzip encoded, that is,
* Resources that have the header:
* Content-Type: gzip
*
* Used by TextReplayRenderers and other ReplayRenderers that add content to the resulting output
*
*/

package org.archive.wayback.replay;

import java.io.IOException;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.zip.GZIPInputStream;

import org.archive.wayback.core.Resource;

public class GzipDecodingResource extends Resource {
/**
* Provide a wrapper for a Resource that is gzip encoded, that is,
* Resources that have the header:
* Content-Encoding: gzip
*
* Used by TextReplayRenderers and other ReplayRenderers that add content to the resulting output
*/
public class GzipDecodingResource extends DecodingResource {

private static final Logger LOGGER = Logger.getLogger(GzipDecodingResource.class.getName());

public static final String GZIP = "gzip";

private Resource source;

public GzipDecodingResource(Resource source) {
this.source = source;
super(source);
// 2 for GZIP MAGIC bytes.
source.mark(2);
try {
Expand All @@ -70,33 +64,4 @@ public GzipDecodingResource(Resource source) {
}
}

@Override
public long getRecordLength() {
return source.getRecordLength();
}

@Override
public Map<String, String> getHttpHeaders() {
return source.getHttpHeaders();
}

@Override
public void close() throws IOException {
source.close();
}

@Override
public int getStatusCode() {
return source.getStatusCode();
}

@Override
public String getRefersToTargetURI() {
return source.getRefersToTargetURI();
}

@Override
public String getRefersToDate() {
return source.getRefersToDate();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
/*
* This file is part of the Wayback archival access software
* (http://archive-access.sourceforge.net/projects/wayback/).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/**
*
* Provide a wrapper for a Resource that is deflate encoded, that is,
* Resources that have the header:
* Content-Encoding: deflate
*
* Used by TextReplayRenderers and other ReplayRenderers that add content to the resulting output
*
*/

package org.archive.wayback.replay;

import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;

import org.archive.wayback.core.Resource;

/**
 * A wrapper Resource that decodes content encoded with {@code deflate}.
 * Supports both standard deflate encoding (with zlib headers)
 * and non-standard encoding (without zlib headers).
 *
 * <p>The variant is chosen by peeking at the first two bytes of the source:
 * if they form a valid RFC 1950 zlib header the stream is inflated with
 * header processing, otherwise it is inflated as a raw RFC 1951 stream.
 * If fewer than two bytes are available, or peeking fails with an
 * {@link IOException}, the source is passed through undecoded.</p>
 */
public class InflatingResource extends DecodingResource {

    private static final Logger LOGGER = Logger.getLogger(InflatingResource.class.getName());

    /** value of {@code Content-Encoding} header */
    public static final String CONTENT_ENCODING_NAME = "deflate";

    /** Number of bytes in a zlib header (CMF + FLG). */
    private static final int ZLIB_HEADER_LENGTH = 2;

    /**
     * Wraps {@code source}, installing an input stream that inflates its content.
     *
     * @param source resource whose content is {@code deflate} encoded
     */
    public InflatingResource(Resource source) {
        super(source);
        // Some implementation of "deflate" just implement RFC-1951 (compression
        // algorithm), ignoring RFC-1950 (zlib header). As InflaterInputStream does
        // not check header at instantiation, we need to check the header ourselves.
        source.mark(ZLIB_HEADER_LENGTH);
        byte[] zlibHeader = new byte[ZLIB_HEADER_LENGTH];
        try {
            // A single read() may legally return fewer bytes than requested, so
            // keep reading until both header bytes are in or the stream ends.
            int n = 0;
            while (n < ZLIB_HEADER_LENGTH) {
                int r = source.read(zlibHeader, n, ZLIB_HEADER_LENGTH - n);
                if (r < 0) {
                    break;
                }
                n += r;
            }
            try {
                source.reset();
            } catch (IOException ex) {
                LOGGER.log(Level.WARNING, "reset() failed after peeking first two bytes",
                        ex);
            }
            if (n < ZLIB_HEADER_LENGTH) {
                // unlikely be zlib compressed.
                setInputStream(source);
                return;
            }
            if (!isZlibHeader(zlibHeader[0], zlibHeader[1])) {
                // assume header-less deflate
                setInputStream(new InflaterInputStream(source, new Inflater(true)));
                return;
            }
            // deflate with zlib header
            setInputStream(new InflaterInputStream(source));
        } catch (IOException ex) {
            // Best effort: fall back to the undecoded source, but leave a trace.
            LOGGER.log(Level.WARNING,
                    "failed to peek first two bytes; passing content through undecoded",
                    ex);
            setInputStream(source);
        }
    }

    /**
     * Tests whether two bytes form a valid RFC 1950 zlib header: compression
     * method 8 (deflate), window size exponent at most 7, and the 16-bit
     * big-endian value {@code CMF*256 + FLG} being a multiple of 31.
     * This accepts every window size, not only the 32K window
     * ({@code CMF == 0x78}).
     *
     * @param cmf first byte of the stream (CMF)
     * @param flg second byte of the stream (FLG)
     * @return {@code true} if the bytes look like a zlib header
     */
    private static boolean isZlibHeader(byte cmf, byte flg) {
        int c = cmf & 0xff;
        int f = flg & 0xff;
        boolean deflateMethod = (c & 0x0f) == 8;
        boolean validWindow = (c >> 4) <= 7;
        boolean checkOk = ((c << 8) | f) % 31 == 0;
        return deflateMethod && validWindow && checkOk;
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,15 @@ public static Resource decodeResource(Resource headersResource,
String encoding = HttpHeaderOperation.getHeaderValue(headers,
HttpHeaderOperation.HTTP_CONTENT_ENCODING);
if (encoding != null) {
if (encoding.toLowerCase().equals(GzipDecodingResource.GZIP)) {
final String lcEncoding = encoding.toLowerCase();
Resource decodingResource = null;
if (lcEncoding.equals(GzipDecodingResource.GZIP)) {
decodingResource = new GzipDecodingResource(payloadResource);
} else if (lcEncoding.equals(InflatingResource.CONTENT_ENCODING_NAME)) {
decodingResource = new InflatingResource(payloadResource);
}

if (decodingResource != null) {
// if headersResource (revisit) has Content-Encoding, set it aside.
Map<String, String> revHeaders = headersResource.getHttpHeaders();
String revEncoding = HttpHeaderOperation.getHeaderValue(
Expand All @@ -287,10 +295,8 @@ public static Resource decodeResource(Resource headersResource,
HttpHeaderOperation.HTTP_TRANSFER_ENC_HEADER);
}

return new GzipDecodingResource(payloadResource);
return decodingResource;
}

// TODO: check for other encodings?
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
/*
* This file is part of the Wayback archival access software
* (http://archive-access.sourceforge.net/projects/wayback/).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.archive.wayback.replay;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.zip.Deflater;
import java.util.zip.DeflaterOutputStream;

import junit.framework.TestCase;

import org.archive.io.warc.TestWARCReader;
import org.archive.io.warc.TestWARCRecordInfo;
import org.archive.io.warc.WARCRecord;
import org.archive.wayback.core.Resource;
import org.archive.wayback.resourcestore.resourcefile.WarcResource;

/**
*
*/
public class InflatingResourceTest extends TestCase {

/* (non-Javadoc)
* @see junit.framework.TestCase#setUp()
*/
protected void setUp() throws Exception {
super.setUp();
}

protected byte[] deflateContent(String content, boolean noHeader) throws IOException {
ByteArrayOutputStream deflated = new ByteArrayOutputStream();
DeflaterOutputStream deflator = new DeflaterOutputStream(deflated,
new Deflater(5, noHeader));
deflator.write(content.getBytes("UTF-8"));
deflator.close();
return deflated.toByteArray();
}

protected Resource testResource(String content, boolean noHeader) throws IOException {
final byte[] payload = deflateContent(content, false);
ByteArrayOutputStream blockbuf = new ByteArrayOutputStream();
Writer w = new OutputStreamWriter(blockbuf);
w.write("HTTP/1.0 200 OK\r\n");
w.write("Content-Length: " + payload.length + "\r\n");
w.write("Content-Encoding: deflate\r\n");
w.write("\r\n");
w.flush();
blockbuf.write(payload);
TestWARCRecordInfo recinfo = new TestWARCRecordInfo(blockbuf.toByteArray());
TestWARCReader ar = new TestWARCReader(recinfo);
WARCRecord rec = (WARCRecord)ar.get(0);
Resource resource = new WarcResource(rec, ar);
resource.parseHeaders();
return resource;
}

public void testStandardDeflatedContent() throws Exception {
Resource resource = testResource("ABCDEFG", false);
InflatingResource wrapped = new InflatingResource(resource);

byte[] content = new byte[80];
int n = wrapped.read(content);
wrapped.close();
String text = new String(content, 0, n, "UTF-8");
assertEquals("ABCDEFG", text);
}

public void testHeaderlessDeflatedContent() throws Exception {
Resource resource = testResource("ABCDEFG", true);
InflatingResource wrapped = new InflatingResource(resource);

byte[] content = new byte[80];
int n = wrapped.read(content);
wrapped.close();
String text = new String(content, 0, n, "UTF-8");
assertEquals("ABCDEFG", text);
}

}

0 comments on commit 9344cdc

Please sign in to comment.