Skip to content

Commit

Permalink
Merge pull request #41 from sebastian-nagel/extract-tool
Browse files Browse the repository at this point in the history
Tool to extract a WARC record (or its headers or payload)
  • Loading branch information
ato committed May 29, 2020
2 parents c26fdef + 70f9cd6 commit bcaad8b
Show file tree
Hide file tree
Showing 3 changed files with 166 additions and 0 deletions.
6 changes: 6 additions & 0 deletions src/org/netpreserve/jwarc/IOUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,12 @@ public static void copy(InputStream inputStream, OutputStream outputStream) thro
}
}

/**
 * Wraps the given channel in a {@link GunzipChannel}.
 *
 * <p>NOTE(review): judging by the names, the returned channel yields the decompressed
 * form of the gzip data read from {@code gzipped} - confirm against GunzipChannel.
 *
 * @param gzipped the channel to wrap
 * @return a new {@link GunzipChannel} backed by {@code gzipped} and an 8192 byte buffer
 * @throws IOException declared for callers; presumably raised by the GunzipChannel constructor
 */
public static ReadableByteChannel gunzipChannel(ReadableByteChannel gzipped) throws IOException {
    ByteBuffer emptyBuffer = ByteBuffer.allocate(8192);
    // Flipping a freshly allocated buffer sets limit = position = 0, so the buffer is
    // handed over with nothing remaining to be read from it.
    emptyBuffer.flip();
    return new GunzipChannel(gzipped, emptyBuffer);
}

static Socket connect(String scheme, String host, int port) throws IOException {
Objects.requireNonNull(host);
if ("http".equalsIgnoreCase(scheme)) {
Expand Down
156 changes: 156 additions & 0 deletions src/org/netpreserve/jwarc/tools/ExtractTool.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
/*
* SPDX-License-Identifier: Apache-2.0
* Copyright (C) 2020 National Library of Australia and the jwarc contributors
*/

package org.netpreserve.jwarc.tools;

import org.netpreserve.jwarc.*;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.WritableByteChannel;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.List;
import java.util.Optional;

import static java.nio.charset.StandardCharsets.UTF_8;

/**
 * Command line tool which prints a single WARC record, located by its byte offset in a
 * WARC file, to standard output. Depending on the selected option the whole record, only
 * its headers, or only its payload is written.
 */
public class ExtractTool {

    /** What part of the record is written to standard output. */
    private enum ExtractAction { RECORD, HEADERS, PAYLOAD }

    /**
     * Writes every remaining byte of {@code data} to {@code out}.
     *
     * <p>A single {@link WritableByteChannel#write} call is allowed to write fewer bytes
     * than remain in the buffer, so the call is repeated until the buffer is drained.
     */
    private static void writeFully(WritableByteChannel out, ByteBuffer data) throws IOException {
        while (data.hasRemaining()) {
            out.write(data);
        }
    }

    /**
     * Writes the WARC version line and the WARC headers of the record, followed by the
     * empty line which terminates the header block.
     */
    private static void writeWarcHeaders(WritableByteChannel out, WarcRecord record) throws IOException {
        StringBuilder sb = new StringBuilder();
        sb.append(record.version().toString()).append("\r\n");
        record.headers().appendTo(sb);
        sb.append("\r\n");
        writeFully(out, ByteBuffer.wrap(sb.toString().getBytes(UTF_8)));
    }

    /**
     * Writes the serialized HTTP header of a response or request record.
     * Records of any other type produce no output.
     */
    private static void writeHttpHeaders(WritableByteChannel out, WarcRecord record) throws IOException {
        if (record instanceof WarcResponse) {
            HttpResponse response = ((WarcResponse) record).http();
            writeFully(out, ByteBuffer.wrap(response.serializeHeader()));
        } else if (record instanceof WarcRequest) {
            HttpRequest request = ((WarcRequest) record).http();
            writeFully(out, ByteBuffer.wrap(request.serializeHeader()));
        }
    }

    /**
     * Writes the payload of the record: the HTTP message body for response and request
     * records, the record body otherwise.
     *
     * <p>A single {@code Content-Encoding} of {@code gzip} or {@code x-gzip} is decoded
     * while writing. For any other encoding, or for more than one encoding, a message
     * is printed to standard error and no payload is written.
     */
    private static void writePayload(WritableByteChannel out, WarcRecord record) throws IOException {
        MessageBody payload;
        List<String> contentEncodings = Collections.emptyList();
        if (record instanceof WarcResponse) {
            HttpResponse response = ((WarcResponse) record).http();
            payload = response.body();
            contentEncodings = response.headers().all("Content-Encoding");
        } else if (record instanceof WarcRequest) {
            HttpRequest request = ((WarcRequest) record).http();
            payload = request.body();
            contentEncodings = request.headers().all("Content-Encoding");
        } else {
            payload = record.body();
        }

        if (contentEncodings.isEmpty()) {
            writeBody(out, payload);
            return;
        }
        if (contentEncodings.size() > 1) {
            System.err.println("Multiple Content-Encodings not supported: " + contentEncodings);
            return;
        }
        String encoding = contentEncodings.get(0);
        if (encoding.equalsIgnoreCase("gzip") || encoding.equalsIgnoreCase("x-gzip")) {
            writeBody(out, IOUtils.gunzipChannel(payload));
        } else {
            System.err.println("Content-Encoding not supported: " + encoding);
        }
    }

    /**
     * Copies {@code body} to {@code out} until the end of {@code body} is reached.
     *
     * <p>Each chunk is written completely before the next read, so no bytes are left
     * behind in the buffer when the end of the input is reached, even if the output
     * channel accepts only part of a chunk per write call.
     */
    private static void writeBody(WritableByteChannel out, ReadableByteChannel body) throws IOException {
        ByteBuffer buffer = ByteBuffer.allocate(8192);
        while (body.read(buffer) > -1) {
            buffer.flip();
            writeFully(out, buffer);
            // everything has been written, the whole buffer is free for the next read
            buffer.clear();
        }
    }

    /** Prints the usage message to standard error and exits with the given status. */
    private static void usage(int exitValue) {
        System.err.println("");
        System.err.println("ExtractTool [-h] [--payload | --headers] filename offset");
        System.err.println("");
        System.err.println("Options:");
        System.err.println("");
        System.err.println(" --headers\toutput only record (and HTTP) headers");
        System.err.println(" --payload\toutput only record payload, if necessary");
        System.err.println(" \tdecode transfer and/or content encoding");
        System.exit(exitValue);
    }

    /**
     * Entry point. Expects a WARC file name and a non-negative record offset, optionally
     * combined with {@code --headers} or {@code --payload}.
     *
     * @param args command line arguments, see {@link #usage(int)}
     * @throws IOException if reading the WARC file or writing to standard output fails
     */
    public static void main(String[] args) throws IOException {
        ExtractAction action = ExtractAction.RECORD;
        Path warcFile = null;
        long offset = -1; // negative means "not given yet"
        for (String arg : args) {
            switch (arg) {
                case "-h":
                case "--help":
                    usage(0);
                    return; // usage() exits, the return only makes the missing fall-through explicit
                case "--headers":
                    action = ExtractAction.HEADERS;
                    break;
                case "--payload":
                    action = ExtractAction.PAYLOAD;
                    break;
                default:
                    if (warcFile == null) {
                        warcFile = Paths.get(arg);
                        if (!warcFile.toFile().canRead()) {
                            System.err.println("Cannot read WARC file: " + warcFile);
                            usage(1);
                            return;
                        }
                    } else if (offset < 0) {
                        try {
                            offset = Long.parseLong(arg);
                        } catch (NumberFormatException e) {
                            System.err.println(e.getMessage());
                            usage(1);
                            return;
                        }
                        if (offset < 0) {
                            // a negative position would make FileChannel.position() throw
                            System.err.println("Offset must not be negative: " + arg);
                            usage(1);
                            return;
                        }
                    } else {
                        System.err.println("Unknown argument: " + arg);
                        usage(1);
                        return;
                    }
            }
        }
        if (warcFile == null || offset < 0) {
            usage(1);
            return;
        }
        try (FileChannel channel = FileChannel.open(warcFile);
             WarcReader reader = new WarcReader(channel.position(offset))) {
            Optional<WarcRecord> record = reader.next();
            if (!record.isPresent()) {
                System.err.println("No record found at position " + offset);
                System.exit(1);
            }
            WritableByteChannel out = Channels.newChannel(System.out);
            switch (action) {
                case RECORD:
                    writeWarcHeaders(out, record.get());
                    writeBody(out, record.get().body());
                    break;
                case HEADERS:
                    writeWarcHeaders(out, record.get());
                    writeHttpHeaders(out, record.get());
                    break;
                case PAYLOAD:
                    writePayload(out, record.get());
                    break;
            }
            // make sure nothing stays in the stream's buffer when the JVM exits
            System.out.flush();
        }
    }
}
4 changes: 4 additions & 0 deletions src/org/netpreserve/jwarc/tools/WarcTool.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ public static void main(String[] args) throws Exception {
case "cdx":
CdxTool.main(rest);
break;
case "extract":
ExtractTool.main(rest);
break;
case "fetch":
FetchTool.main(rest);
break;
Expand Down Expand Up @@ -59,6 +62,7 @@ private static void usage() {
System.out.println("Commands:");
System.out.println("");
System.out.println(" cdx List records in CDX format");
System.out.println(" extract Extract record by offset");
System.out.println(" fetch Download a URL recording the request and response");
System.out.println(" filter Copy records that match a given filter expression");
System.out.println(" ls List records in WARC file(s)");
Expand Down

0 comments on commit bcaad8b

Please sign in to comment.