Skip to content

Commit

Permalink
Add mime-type detection to SelectorReplayDispatcher,
Browse files Browse the repository at this point in the history
Implement SimpleMimeTypeDetector (experimental, needs more tests with real-world examples).
  • Loading branch information
kngenie committed Sep 8, 2014
1 parent e9756f0 commit 65dfc40
Show file tree
Hide file tree
Showing 20 changed files with 3,197 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ public class CompositeResource extends Resource {
private final Resource payloadResource;
/**
* constructor.
* @param headersResource Resource providing HTTP heeaders
* @param headersResource Resource providing HTTP headers
* (revisit record).
* @param payloadResource Resource providing HTTP response entity
* (revisited original record).
Expand Down Expand Up @@ -51,6 +51,8 @@ public int getStatusCode() {
*/
@Override
public long getRecordLength() {
    // Reports the length of the payload record only. It is an open
    // question whether this is right, or whether this method should be
    // part of the public interface of Resource at all.
    final long payloadLength = payloadResource.getRecordLength();
    return payloadLength;
}

Expand All @@ -59,18 +61,27 @@ public long getRecordLength() {
*/
@Override
public Map<String, String> getHttpHeaders() {
    // Revisit records had no HTTP headers in early days. A zero record
    // length on the headers resource is treated as that case, and the
    // headers of the payload (revisited original) resource are returned
    // instead.
    if (headersResource.getRecordLength() == 0) {
        return payloadResource.getHttpHeaders();
    }
    return headersResource.getHttpHeaders();
}
/**
 * Parses headers of both wrapped resources, the headers resource first
 * and the payload resource second.
 *
 * @throws IOException if either delegated {@code parseHeaders()} call fails
 */
@Override
public void parseHeaders() throws IOException {
// Currently this is not supposed to be used: it is assumed that
// parseHeaders() has already been called on each wrapped Resource.
// If it is called anyway, simply delegate to both resources.
headersResource.parseHeaders();
payloadResource.parseHeaders();
}
@Override
public String getHeader(String headerName) {
    // Revisit records had no HTTP headers in early days. A zero record
    // length on the headers resource is treated as that case, and the
    // header is looked up in the payload (revisited original) resource
    // instead.
    if (headersResource.getRecordLength() == 0) {
        return payloadResource.getHeader(headerName);
    }
    return headersResource.getHeader(headerName);
}
@Override
public void setChunkedEncoding() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,40 +30,104 @@
import org.archive.wayback.exception.BetterRequestException;
import org.archive.wayback.memento.MementoConstants;
import org.archive.wayback.memento.MementoUtils;
import org.archive.wayback.replay.mimetype.MimeTypeDetector;
import org.archive.wayback.webapp.AccessPoint;

/**
* ReplayDispatcher instance which uses a configurable ClosestResultSelector
* to find the best result to show from a given set, and a list of
* ReplayRendererSelector to determine how best to replay that result to a user.
*
* <p>Optionally it can be configured with {@link MimeTypeDetector}s used for
* overriding unknown ({@code "unk"}) or often-misused ({@code "text/html"})
* value of {@link CaptureSearchResult#getMimeType()}.</p>
*
* @author brad
* @version $Date$, $Revision$
*/
public class SelectorReplayDispatcher implements ReplayDispatcher {
private List<ReplayRendererSelector> selectors = null;
private List<MimeTypeDetector> mimeTypeDetectors = null;
private ClosestResultSelector closestSelector = null;

@Override
public ReplayRenderer getRenderer(WaybackRequest wbRequest,
CaptureSearchResult result, Resource resource) {
return getRenderer(wbRequest, result, resource, resource);
/**
 * Checks whether mime-type detection is suggested for {@code mimeType}.
 * <p>Detection is currently suggested for any value starting with
 * {@code text/html}. The comparison ignores case, because media type
 * names are case-insensitive and the value may come verbatim from an
 * archived {@code Content-Type} header (e.g. {@code Text/HTML}).</p>
 * @param mimeType mime-type to test (must not be null/empty/"unk")
 * @return {@code true} if mime-type should be determined
 * by looking into Resource.
 */
protected boolean shouldDetectMimeType(String mimeType) {
    // TODO: want to make this configurable?
    final String prefix = "text/html";
    // regionMatches returns false (rather than throwing) when mimeType
    // is shorter than the prefix.
    return mimeType.regionMatches(true, 0, prefix, 0, prefix.length());
}

@Override
public ReplayRenderer getRenderer(WaybackRequest wbRequest,
CaptureSearchResult result, Resource httpHeadersResource,
Resource payloadResource) {
CaptureSearchResult result, Resource resource) {
// if content-type is already specified, don't override it.
if (wbRequest.getForcedContentType() == null) {
String mimeType = result.getMimeType();
// TODO: this code should be encapsulated in CaptureSearchResult.getMimeType()
if (AccessPoint.REVISIT_STR.equals(mimeType)) {
if (result.getDuplicatePayload() != null) {
mimeType = result.getDuplicatePayload().getMimeType();
} else {
// let following code get it from resource
mimeType = null;
}
}
// Many old ARCs have "unk" or "no-type" in ARC header even though
// HTTP response has valid Content-Type header. CDX writer does not fix
// it (although it's capable of fixing it internally). If CaptureSearchResult
// says mimeType is "unk", try reading Content-Type header from the resource.
if (mimeType == null || mimeType.isEmpty() || "unk".equals(mimeType)) {
mimeType = resource.getHeader("Content-Type");
}
// "unk" and "" are changed to Content-Type header value (or null if in fact missing)
// so null test is enough.
if (mimeType == null || shouldDetectMimeType(mimeType)) {
if (mimeTypeDetectors != null) {
for (MimeTypeDetector detector : mimeTypeDetectors) {
String detected = detector.sniff(resource);
if (detected != null) {
// detected mimeType is communicated to Selectors
// through forcedContentType. better way? replace
// CaptureSearchResult.mimeType?
wbRequest.setForcedContentType(detected);
}
}
}
} else {
// hmm, now CaptureSearchResult.mimeType can be set to
// forcedContentType - it should work, but this may
// be a bad design.
wbRequest.setForcedContentType(mimeType);
}
}

if (selectors != null) {
for (ReplayRendererSelector selector : selectors) {
if (selector.canHandle(wbRequest, result, httpHeadersResource,
payloadResource)) {
if (selector.canHandle(wbRequest, result, resource, resource)) {
return selector.getRenderer();
}
}
}
return null;
}

@Override
public ReplayRenderer getRenderer(WaybackRequest wbRequest,
        CaptureSearchResult result, Resource httpHeadersResource,
        Resource payloadResource) {
    // When headers and payload come from the very same object there is
    // nothing to combine; otherwise wrap the pair so the single-resource
    // overload sees them as one Resource.
    final Resource combined = (httpHeadersResource == payloadResource)
            ? httpHeadersResource
            : new CompositeResource(httpHeadersResource, payloadResource);
    return getRenderer(wbRequest, result, combined);
}

public CaptureSearchResult getClosest(WaybackRequest wbRequest,
CaptureSearchResults results) throws BetterRequestException {

Expand Down Expand Up @@ -91,12 +155,22 @@ public CaptureSearchResult getClosest(WaybackRequest wbRequest,
public List<ReplayRendererSelector> getSelectors() {
    // plain accessor: hands back the configured list as-is (may be null)
    return this.selectors;
}

/**
 * Sets the selectors consulted by {@code getRenderer}. They are asked in
 * list order and the renderer of the first one whose {@code canHandle}
 * returns {@code true} is used.
 * @param selectors the List of ReplayRendererSelector to use; if
 * {@code null}, {@code getRenderer} returns {@code null}
 */
public void setSelectors(List<ReplayRendererSelector> selectors) {
this.selectors = selectors;
}

/**
 * @return the List of MimeTypeDetector in use, or {@code null}
 * if none has been configured
 */
public List<MimeTypeDetector> getMimeTypeDetectors() {
    return this.mimeTypeDetectors;
}

/**
 * Sets the detectors that {@code getRenderer} consults when the
 * capture's mime-type is unknown or is one for which
 * {@code shouldDetectMimeType} returns {@code true}.
 * @param sniffers the List of MimeTypeDetector to use; if {@code null},
 * no mime-type detection is performed
 */
public void setMimeTypeDetectors(List<MimeTypeDetector> sniffers) {
this.mimeTypeDetectors = sniffers;
}

/**
* @param closestSelector the closestSelector to set
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ public class StandardCharsetDetector extends CharsetDetector {
@Override
public String getCharset(Resource httpHeadersResource,
Resource payloadResource, WaybackRequest wbRequest) throws IOException {
Resource resource = new CompositeResource(httpHeadersResource,
payloadResource);
Resource resource = httpHeadersResource != payloadResource ? new CompositeResource(
httpHeadersResource, payloadResource) : payloadResource;
for (EncodingSniffer sniffer : SNIFFERS) {
String charset = sniffer.sniff(resource);
if (charset != null)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*
* This file is part of the Wayback archival access software
* (http://archive-access.sourceforge.net/projects/wayback/).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.wayback.replay.mimetype;

import org.archive.wayback.core.Resource;

/**
 * MimeTypeDetector looks into capture resource content
 * to determine the actual content-type, which takes precedence
 * over the content-type declared in the HTTP header.
 * <p>Used by {@code SelectorReplayDispatcher}, which can be configured
 * with a list of detectors.</p>
 */
public interface MimeTypeDetector {
    /**
     * Inspects {@code resource} and reports the content-type it appears
     * to hold.
     * @param resource capture resource to examine
     * @return detected content-type; callers treat {@code null} as
     * "could not be determined"
     */
    String sniff(Resource resource);
}
Loading

0 comments on commit 65dfc40

Please sign in to comment.