Permalink
Browse files

better io (#261)

OpenJDK's implementation of I/O utilities sucks. It raises a `java.lang.OutOfMemoryError: Requested array size exceeds VM limit` error when copying an array larger than 2 GB. This affects large WARC files. `org.apache.commons.io` provides more robust I/O utilities.
  • Loading branch information...
1 parent b59e3f2 commit bc2acd896bdb42b1015ca8d9c6ee89a33a24a0fb @zackwang zackwang committed with Feb 11, 2017
@@ -23,6 +23,8 @@
import java.io.IOException;
import java.io.InputStream;
+import org.apache.commons.io.input.BoundedInputStream;
+import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import org.archive.io.arc.ARCReader;
import org.archive.io.arc.ARCReaderFactory;
@@ -61,8 +63,12 @@ public static ARCRecord fromBytes(byte[] bytes) throws IOException {
DataOutputStream dout = new DataOutputStream(baos);
dout.write(metaline.getBytes());
dout.write("\n".getBytes());
- copyStream(record, (int) meta.getLength(), true, dout);
+ long recordLength = meta.getLength();
+ long len = IOUtils.copyLarge(new BoundedInputStream(record, recordLength), dout);
+ if (len != recordLength) {
+ LOG.error("Read " + len + " bytes but expected " + recordLength + " bytes. Continuing...");
+ }
return baos.toByteArray();
}
@@ -76,11 +82,7 @@ public static ARCRecord fromBytes(byte[] bytes) throws IOException {
public static byte[] getContent(ARCRecord record) throws IOException {
ARCRecordMetaData meta = record.getMetaData();
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- DataOutputStream dout = new DataOutputStream(baos);
- copyStream(record, (int) meta.getLength(), true, dout);
-
- return baos.toByteArray();
+ return copyToByteArray(record, (int) meta.getLength(), true);
}
/**
@@ -108,22 +110,14 @@ public static ARCRecord fromBytes(byte[] bytes) throws IOException {
return content;
}
- private static long copyStream(final InputStream is, final int recordLength,
- boolean enforceLength, final DataOutputStream out) throws IOException {
- byte [] scratchbuffer = new byte[recordLength];
- int read = 0;
- long tot = 0;
- while ((tot < recordLength) && (read = is.read(scratchbuffer)) != -1) {
- int write = read;
- // never write more than enforced length
- write = (int) Math.min(write, recordLength - tot);
- tot += read;
- out.write(scratchbuffer, 0, write);
- }
- if (enforceLength && tot != recordLength) {
- LOG.error("Read " + tot + " bytes but expected " + recordLength + " bytes. Continuing...");
- }
+ private static byte[] copyToByteArray(InputStream is, final int recordLength,
+ boolean enforceLength) throws IOException {
- return tot;
+ BoundedInputStream bis = new BoundedInputStream(is, recordLength);
+ byte[] rawContents = IOUtils.toByteArray(bis);
+ if (enforceLength && rawContents.length != recordLength) {
+ LOG.error("Read " + rawContents.length + " bytes but expected " + recordLength + " bytes. Continuing...");
+ }
+ return rawContents;
}
}
@@ -17,6 +17,8 @@
package org.warcbase.data;
import org.apache.commons.httpclient.HttpParser;
+import org.apache.commons.io.input.BoundedInputStream;
+import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import org.archive.io.warc.WARCConstants;
import org.archive.io.warc.WARCReader;
@@ -100,11 +102,7 @@ public static String getWarcResponseMimeType(byte[] contents) {
}
try {
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- DataOutputStream dout = new DataOutputStream(baos);
- copyStream(record, len, true, dout);
-
- return baos.toByteArray();
+ return copyToByteArray(record, len, true);
} catch (Exception e) {
// Catch exceptions related to any corrupt archive files.
return new byte[0];
@@ -131,22 +129,14 @@ public static String getWarcResponseMimeType(byte[] contents) {
return baos.toByteArray();
}
- private static long copyStream(final InputStream is, final int recordLength,
- boolean enforceLength, final DataOutputStream out) throws IOException {
- byte [] scratchbuffer = new byte[recordLength];
- int read = 0;
- long tot = 0;
- while ((tot < recordLength) && (read = is.read(scratchbuffer)) != -1) {
- int write = read;
- // never write more than enforced length
- write = (int) Math.min(write, recordLength - tot);
- tot += read;
- out.write(scratchbuffer, 0, write);
- }
- if (enforceLength && tot != recordLength) {
- LOG.error("Read " + tot + " bytes but expected " + recordLength + " bytes. Continuing...");
- }
+ private static byte[] copyToByteArray(InputStream is, final int recordLength,
+ boolean enforceLength) throws IOException {
- return tot;
+ BoundedInputStream bis = new BoundedInputStream(is, recordLength);
+ byte[] rawContents = IOUtils.toByteArray(bis);
+ if (enforceLength && rawContents.length != recordLength) {
+ LOG.error("Read " + rawContents.length + " bytes but expected " + recordLength + " bytes. Continuing...");
+ }
+ return rawContents;
}
}

0 comments on commit bc2acd8

Please sign in to comment.