forked from iipc/openwayback
-
Notifications
You must be signed in to change notification settings - Fork 133
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #126 from internetarchive/content-encoding-deflate
Add support of "deflate" content-encoding to TextReplayRenderer.
- Loading branch information
Showing
5 changed files
with
272 additions
and
48 deletions.
There are no files selected for viewing
69 changes: 69 additions & 0 deletions
69
wayback-core/src/main/java/org/archive/wayback/replay/DecodingResource.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
/* | ||
* This file is part of the Wayback archival access software | ||
* (http://archive-access.sourceforge.net/projects/wayback/). | ||
* | ||
* Licensed to the Internet Archive (IA) by one or more individual | ||
* contributors. | ||
* | ||
* The IA licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.archive.wayback.replay; | ||
|
||
import java.io.IOException; | ||
import java.util.Map; | ||
|
||
import org.archive.wayback.core.Resource; | ||
|
||
/** | ||
* Base class for a wrapper Resource that decodes content encoded with | ||
* {@code Content-Encoding}. | ||
*/ | ||
public abstract class DecodingResource extends Resource { | ||
|
||
private Resource source; | ||
|
||
protected DecodingResource(Resource source) { | ||
this.source = source; | ||
} | ||
|
||
@Override | ||
public long getRecordLength() { | ||
return source.getRecordLength(); | ||
} | ||
|
||
@Override | ||
public Map<String, String> getHttpHeaders() { | ||
return source.getHttpHeaders(); | ||
} | ||
|
||
@Override | ||
public void close() throws IOException { | ||
source.close(); | ||
} | ||
|
||
@Override | ||
public int getStatusCode() { | ||
return source.getStatusCode(); | ||
} | ||
|
||
@Override | ||
public String getRefersToTargetURI() { | ||
return source.getRefersToTargetURI(); | ||
} | ||
|
||
@Override | ||
public String getRefersToDate() { | ||
return source.getRefersToDate(); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
85 changes: 85 additions & 0 deletions
85
wayback-core/src/main/java/org/archive/wayback/replay/InflatingResource.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
/* | ||
* This file is part of the Wayback archival access software | ||
* (http://archive-access.sourceforge.net/projects/wayback/). | ||
* | ||
* Licensed to the Internet Archive (IA) by one or more individual | ||
* contributors. | ||
* | ||
* The IA licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
/** | ||
* | ||
* Provide a wrapper for a Resource that is deflate encoded, that is, | ||
* Resources that have the header: | ||
* Content-Encoding: deflate | ||
* | ||
* Used by TextReplayRenderers and other ReplayRenderers that add content to the resulting output | ||
* | ||
*/ | ||
|
||
package org.archive.wayback.replay; | ||
|
||
import java.io.IOException; | ||
import java.util.logging.Level; | ||
import java.util.logging.Logger; | ||
import java.util.zip.Inflater; | ||
import java.util.zip.InflaterInputStream; | ||
|
||
import org.archive.wayback.core.Resource; | ||
|
||
/** | ||
* A wrapper Resource that decodes content encoded with {@code deflate}. | ||
* Supports both standard deflate encoding (with zlib headers) | ||
* and non-standard encoding (without zlib headers). | ||
*/ | ||
public class InflatingResource extends DecodingResource { | ||
|
||
private static final Logger LOGGER = Logger.getLogger(InflatingResource.class.getName()); | ||
|
||
/** value of {@code Content-Encoding} header */ | ||
public static final String CONTENT_ENCODING_NAME = "deflate"; | ||
|
||
public InflatingResource(Resource source) { | ||
super(source); | ||
// Some implementation of "deflate" just implement RFC-1951 (compression | ||
// algorithm), ignoring RFC-1950 (zlib header). As InflaterInputStream does | ||
// not check header at instantiation, we need to check the header ourselves. | ||
source.mark(2); | ||
byte[] zlibHeader = new byte[2]; | ||
try { | ||
int n = source.read(zlibHeader); | ||
try { | ||
source.reset(); | ||
} catch (IOException ex) { | ||
LOGGER.log(Level.WARNING, "reset() failed after peeking first two bytes", | ||
ex); | ||
} | ||
if (n < 2) { | ||
// unlikely be zlib compressed. | ||
setInputStream(source); | ||
return; | ||
} | ||
if (zlibHeader[0] != (byte)0x78) { | ||
// assume header-less deflate | ||
setInputStream(new InflaterInputStream(source, new Inflater(true))); | ||
return; | ||
} | ||
// deflate with zlib header | ||
setInputStream(new InflaterInputStream(source)); | ||
} catch (IOException ex) { | ||
setInputStream(source); | ||
} | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
99 changes: 99 additions & 0 deletions
99
wayback-core/src/test/java/org/archive/wayback/replay/InflatingResourceTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
/* | ||
* This file is part of the Wayback archival access software | ||
* (http://archive-access.sourceforge.net/projects/wayback/). | ||
* | ||
* Licensed to the Internet Archive (IA) by one or more individual | ||
* contributors. | ||
* | ||
* The IA licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.archive.wayback.replay; | ||
|
||
import java.io.ByteArrayOutputStream; | ||
import java.io.IOException; | ||
import java.io.OutputStreamWriter; | ||
import java.io.Writer; | ||
import java.util.zip.Deflater; | ||
import java.util.zip.DeflaterOutputStream; | ||
|
||
import junit.framework.TestCase; | ||
|
||
import org.archive.io.warc.TestWARCReader; | ||
import org.archive.io.warc.TestWARCRecordInfo; | ||
import org.archive.io.warc.WARCRecord; | ||
import org.archive.wayback.core.Resource; | ||
import org.archive.wayback.resourcestore.resourcefile.WarcResource; | ||
|
||
/** | ||
* | ||
*/ | ||
public class InflatingResourceTest extends TestCase { | ||
|
||
/* (non-Javadoc) | ||
* @see junit.framework.TestCase#setUp() | ||
*/ | ||
protected void setUp() throws Exception { | ||
super.setUp(); | ||
} | ||
|
||
protected byte[] deflateContent(String content, boolean noHeader) throws IOException { | ||
ByteArrayOutputStream deflated = new ByteArrayOutputStream(); | ||
DeflaterOutputStream deflator = new DeflaterOutputStream(deflated, | ||
new Deflater(5, noHeader)); | ||
deflator.write(content.getBytes("UTF-8")); | ||
deflator.close(); | ||
return deflated.toByteArray(); | ||
} | ||
|
||
protected Resource testResource(String content, boolean noHeader) throws IOException { | ||
final byte[] payload = deflateContent(content, false); | ||
ByteArrayOutputStream blockbuf = new ByteArrayOutputStream(); | ||
Writer w = new OutputStreamWriter(blockbuf); | ||
w.write("HTTP/1.0 200 OK\r\n"); | ||
w.write("Content-Length: " + payload.length + "\r\n"); | ||
w.write("Content-Encoding: deflate\r\n"); | ||
w.write("\r\n"); | ||
w.flush(); | ||
blockbuf.write(payload); | ||
TestWARCRecordInfo recinfo = new TestWARCRecordInfo(blockbuf.toByteArray()); | ||
TestWARCReader ar = new TestWARCReader(recinfo); | ||
WARCRecord rec = (WARCRecord)ar.get(0); | ||
Resource resource = new WarcResource(rec, ar); | ||
resource.parseHeaders(); | ||
return resource; | ||
} | ||
|
||
public void testStandardDeflatedContent() throws Exception { | ||
Resource resource = testResource("ABCDEFG", false); | ||
InflatingResource wrapped = new InflatingResource(resource); | ||
|
||
byte[] content = new byte[80]; | ||
int n = wrapped.read(content); | ||
wrapped.close(); | ||
String text = new String(content, 0, n, "UTF-8"); | ||
assertEquals("ABCDEFG", text); | ||
} | ||
|
||
public void testHeaderlessDeflatedContent() throws Exception { | ||
Resource resource = testResource("ABCDEFG", true); | ||
InflatingResource wrapped = new InflatingResource(resource); | ||
|
||
byte[] content = new byte[80]; | ||
int n = wrapped.read(content); | ||
wrapped.close(); | ||
String text = new String(content, 0, n, "UTF-8"); | ||
assertEquals("ABCDEFG", text); | ||
} | ||
|
||
} |