This repository has been archived by the owner. It is now read-only.
Permalink
Browse files

better io (#261)

OpenJDK's implementation of I/O utilities is not robust: it raises a `java.lang.OutOfMemoryError: Requested array size exceeds VM limit` exception when copying an array larger than 2 GB. This affects large WARC files. org.apache.commons.io provides more robust I/O utilities.
  • Loading branch information...
zackwang authored and lintool committed Feb 11, 2017
1 parent b59e3f2 commit bc2acd896bdb42b1015ca8d9c6ee89a33a24a0fb
@@ -23,6 +23,8 @@
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.io.input.BoundedInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import org.archive.io.arc.ARCReader;
import org.archive.io.arc.ARCReaderFactory;
@@ -61,8 +63,12 @@ public static ARCRecord fromBytes(byte[] bytes) throws IOException {
DataOutputStream dout = new DataOutputStream(baos);
dout.write(metaline.getBytes());
dout.write("\n".getBytes());
copyStream(record, (int) meta.getLength(), true, dout);
long recordLength = meta.getLength();
long len = IOUtils.copyLarge(new BoundedInputStream(record, recordLength), dout);
if (len != recordLength) {
LOG.error("Read " + len + " bytes but expected " + recordLength + " bytes. Continuing...");
}
return baos.toByteArray();
}
@@ -76,11 +82,7 @@ public static ARCRecord fromBytes(byte[] bytes) throws IOException {
// Reads the given ARC record into a byte array, using the length reported by the
// record's metadata (cast to int) as the number of bytes to copy.
// NOTE(review): this listing shows two consecutive return statements; presumably it
// interleaves the pre-change and post-change lines of a diff without +/- markers.
// Confirm against the actual file before relying on either path.
public static byte[] getContent(ARCRecord record) throws IOException {
ARCRecordMetaData meta = record.getMetaData();
ByteArrayOutputStream baos = new ByteArrayOutputStream();
DataOutputStream dout = new DataOutputStream(baos);
// Variant 1: copy the record into the in-memory buffer via copyStream, with length enforcement on.
copyStream(record, (int) meta.getLength(), true, dout);
return baos.toByteArray();
// Variant 2: delegate to copyToByteArray, again with length enforcement on.
return copyToByteArray(record, (int) meta.getLength(), true);
}
/**
@@ -108,22 +110,14 @@ public static ARCRecord fromBytes(byte[] bytes) throws IOException {
return content;
}
// NOTE(review): the lines below contain two helper signatures (copyStream and
// copyToByteArray) inside one pair of braces; presumably this is a diff rendered
// without +/- markers, with copyStream being replaced by copyToByteArray — confirm.
//
// copyStream: copies at most recordLength bytes from `is` to `out` and returns the
// total number of bytes read from `is`.
private static long copyStream(final InputStream is, final int recordLength,
boolean enforceLength, final DataOutputStream out) throws IOException {
// The scratch buffer is sized to the whole record, so one array of recordLength bytes is allocated.
byte [] scratchbuffer = new byte[recordLength];
int read = 0;
long tot = 0;
// Stop once recordLength bytes have been consumed or the stream reports end-of-data (-1).
while ((tot < recordLength) && (read = is.read(scratchbuffer)) != -1) {
int write = read;
// never write more than enforced length
write = (int) Math.min(write, recordLength - tot);
// tot counts bytes read, which may exceed the bytes actually written on the last pass.
tot += read;
out.write(scratchbuffer, 0, write);
}
// A length mismatch is only logged; processing continues.
if (enforceLength && tot != recordLength) {
LOG.error("Read " + tot + " bytes but expected " + recordLength + " bytes. Continuing...");
}
// copyToByteArray: reads `is` through a BoundedInputStream constructed with recordLength
// and returns the bytes obtained from IOUtils.toByteArray.
private static byte[] copyToByteArray(InputStream is, final int recordLength,
boolean enforceLength) throws IOException {
return tot;
BoundedInputStream bis = new BoundedInputStream(is, recordLength);
byte[] rawContents = IOUtils.toByteArray(bis);
// A length mismatch is only logged; the bytes that were read are still returned.
if (enforceLength && rawContents.length != recordLength) {
LOG.error("Read " + rawContents.length + " bytes but expected " + recordLength + " bytes. Continuing...");
}
return rawContents;
}
}
@@ -17,6 +17,8 @@
package org.warcbase.data;
import org.apache.commons.httpclient.HttpParser;
import org.apache.commons.io.input.BoundedInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import org.archive.io.warc.WARCConstants;
import org.archive.io.warc.WARCReader;
@@ -100,11 +102,7 @@ public static String getWarcResponseMimeType(byte[] contents) {
}
try {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
DataOutputStream dout = new DataOutputStream(baos);
copyStream(record, len, true, dout);
return baos.toByteArray();
return copyToByteArray(record, len, true);
} catch (Exception e) {
// Catch exceptions related to any corrupt archive files.
return new byte[0];
@@ -131,22 +129,14 @@ public static String getWarcResponseMimeType(byte[] contents) {
return baos.toByteArray();
}
// NOTE(review): the lines below contain two helper signatures (copyStream and
// copyToByteArray) inside one pair of braces; presumably this is a diff rendered
// without +/- markers, with copyStream being replaced by copyToByteArray — confirm.
//
// copyStream: copies at most recordLength bytes from `is` to `out` and returns the
// total number of bytes read from `is`.
private static long copyStream(final InputStream is, final int recordLength,
boolean enforceLength, final DataOutputStream out) throws IOException {
// The scratch buffer is sized to the whole record, so one array of recordLength bytes is allocated.
byte [] scratchbuffer = new byte[recordLength];
int read = 0;
long tot = 0;
// Stop once recordLength bytes have been consumed or the stream reports end-of-data (-1).
while ((tot < recordLength) && (read = is.read(scratchbuffer)) != -1) {
int write = read;
// never write more than enforced length
write = (int) Math.min(write, recordLength - tot);
// tot counts bytes read, which may exceed the bytes actually written on the last pass.
tot += read;
out.write(scratchbuffer, 0, write);
}
// A length mismatch is only logged; processing continues.
if (enforceLength && tot != recordLength) {
LOG.error("Read " + tot + " bytes but expected " + recordLength + " bytes. Continuing...");
}
// copyToByteArray: reads `is` through a BoundedInputStream constructed with recordLength
// and returns the bytes obtained from IOUtils.toByteArray.
private static byte[] copyToByteArray(InputStream is, final int recordLength,
boolean enforceLength) throws IOException {
return tot;
BoundedInputStream bis = new BoundedInputStream(is, recordLength);
byte[] rawContents = IOUtils.toByteArray(bis);
// A length mismatch is only logged; the bytes that were read are still returned.
if (enforceLength && rawContents.length != recordLength) {
LOG.error("Read " + rawContents.length + " bytes but expected " + recordLength + " bytes. Continuing...");
}
return rawContents;
}
}

0 comments on commit bc2acd8

Please sign in to comment.