Skip to content

Commit

Permalink
Use the simpler "read 500k chars and split" method suggested by @nlevitt
Browse files Browse the repository at this point in the history
While we temporarily use a little more memory, this version is a lot
less code. It also allows us to do away with the BufferedReader.
  • Loading branch information
ato committed Nov 1, 2017
1 parent 1fae77c commit 68cecee
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 101 deletions.
Expand Up @@ -174,10 +174,9 @@ public synchronized void updateRobots(CrawlURI curi) {

InputStream contentBodyStream = null;
try {
BufferedReader reader;
contentBodyStream = curi.getRecorder().getContentReplayInputStream();

reader = new BufferedReader(new InputStreamReader(contentBodyStream));
InputStreamReader reader = new InputStreamReader(contentBodyStream);
robotstxt = new Robotstxt(reader);
validRobots = true;
} catch (IOException e) {
Expand Down
108 changes: 27 additions & 81 deletions modules/src/main/java/org/archive/modules/net/Robotstxt.java
Expand Up @@ -18,16 +18,17 @@
*/
package org.archive.modules.net;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.io.Serializable;
import java.nio.CharBuffer;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.archive.bdb.AutoKryo;
Expand All @@ -43,7 +44,8 @@ public class Robotstxt implements Serializable {
private static final Logger logger =
Logger.getLogger(Robotstxt.class.getName());

protected static final long MAX_SIZE = 500*1024;
protected static final int MAX_SIZE = 500*1024;
private static final Pattern LINE_SEPARATOR = Pattern.compile("\r\n|\r|\n");

// all user agents contained in this robots.txt
// in order of declaration
Expand All @@ -63,12 +65,16 @@ public class Robotstxt implements Serializable {
/**
 * Creates an instance without reading any robots.txt content; the
 * constructor body is intentionally empty.
 *
 * NOTE(review): presumably kept so serialization frameworks can
 * instantiate the class through a no-argument constructor -- confirm.
 */
public Robotstxt() {
}

public Robotstxt(BufferedReader reader) throws IOException {
initializeFromReader(reader);
public Robotstxt(Reader reader) throws IOException {
try {
initializeFromReader(reader);
} finally {
IOUtils.closeQuietly(reader);
}
}

public Robotstxt(ReadSource customRobots) {
BufferedReader reader = new BufferedReader(customRobots.obtainReader());
Reader reader = customRobots.obtainReader();
try {
initializeFromReader(reader);
} catch (IOException e) {
Expand All @@ -80,25 +86,24 @@ public Robotstxt(ReadSource customRobots) {
}
}

protected void initializeFromReader(BufferedReader reader) throws IOException {
BoundedLineReader lineReader = new BoundedLineReader(reader, MAX_SIZE);
String read;
protected void initializeFromReader(Reader reader) throws IOException {
CharBuffer buffer = CharBuffer.allocate(MAX_SIZE);
while (buffer.hasRemaining() && reader.read(buffer) >= 0) ;
buffer.flip();

String[] lines = LINE_SEPARATOR.split(buffer);
if (buffer.limit() == buffer.capacity()) {
int processed = buffer.capacity() - lines[lines.length - 1].length();
logger.warning("processed " + processed + " characters, ignoring the rest (see HER-1990)");
// discard the partial line at the end so we don't process a truncated path
lines[lines.length - 1] = "";
}

// current is the disallowed paths for the preceding User-Agent(s)
RobotsDirectives current = null;
while (reader != null) {
do {
read = lineReader.readLine();
// Skip comments & blanks
} while (read != null && ((read = read.trim()).startsWith("#") || read.length() == 0));
if (read == null) {
if (lineReader.reachedLimit()) {
// we count characters instead of bytes because the byte count isn't easily available
logger.warning("processed " + lineReader.getCharsProcessed() +
" characters, ignoring the rest (see HER-1990)");
}
reader.close();
reader = null;
} else {
for (String read: lines) {
read = read.trim();
if (!read.isEmpty() && !read.startsWith("#")) {
// remove any html markup
read = read.replaceAll("<[^>]+>","");
int commentIndex = read.indexOf("#");
Expand Down Expand Up @@ -188,65 +193,6 @@ protected void initializeFromReader(BufferedReader reader) throws IOException {
}
}

/**
 * Hands out lines from an underlying reader while a character budget lasts.
 *
 * Only complete lines are ever returned: when the budget runs out in the
 * middle of a line, the characters gathered so far for that line are dropped
 * and {@code null} is returned instead.
 *
 * Each {@code '\r'} and each {@code '\n'} terminates a line on its own, so a
 * {@code "\r\n"} pair produces one extra empty line.
 */
private static class BoundedLineReader {
    private final Reader reader;
    private long remaining;
    private long charsProcessed = 0;

    /**
     * @param reader source of characters; read one character at a time
     * @param limit  maximum number of characters that may be consumed
     */
    BoundedLineReader(Reader reader, long limit) {
        this.reader = reader;
        this.remaining = limit;
    }

    /**
     * Reads the next line, without its terminator.
     *
     * @return the line text, or {@code null} when the input is exhausted
     *         or the character limit was hit before the line completed
     * @throws IOException if the underlying reader fails
     */
    String readLine() throws IOException {
        StringBuilder line = new StringBuilder();

        while (true) {
            if (reachedLimit()) {
                // budget used up before a terminator was seen: the partial
                // line is thrown away and not counted as processed
                return null;
            }

            int ch = reader.read();

            if (ch < 0) {
                // end of input; a non-empty remainder is a final line that
                // simply lacked a trailing line break
                if (line.length() == 0) {
                    return null;
                }
                charsProcessed += line.length();
                return line.toString();
            }

            remaining--;

            boolean terminator = (ch == '\n' || ch == '\r');
            if (terminator) {
                // the +1 accounts for the terminator character itself
                charsProcessed += line.length() + 1;
                return line.toString();
            }

            line.append((char) ch);
        }
    }

    /**
     * @return {@code true} once the character budget has been used up
     */
    boolean reachedLimit() {
        return remaining <= 0;
    }

    /**
     * Reports how many characters belong to the lines returned so far.
     *
     * Line terminators are counted; characters of a partial line that was
     * discarded because the limit was reached are not.
     */
    long getCharsProcessed() {
        return charsProcessed;
    }
}

/**
* Does this policy effectively allow everything? (No
* disallows or timing (crawl-delay) directives?)
Expand Down
38 changes: 20 additions & 18 deletions modules/src/test/java/org/archive/modules/net/RobotstxtTest.java
Expand Up @@ -20,6 +20,7 @@

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.nio.ByteBuffer;

Expand All @@ -29,7 +30,7 @@

public class RobotstxtTest extends TestCase {
public void testParseRobots() throws IOException {
BufferedReader reader = new BufferedReader(new StringReader("BLAH"));
Reader reader = new StringReader("BLAH");
Robotstxt r = new Robotstxt(reader);
assertFalse(r.hasErrors);
assertEquals(0,r.getNamedUserAgents().size());
Expand Down Expand Up @@ -57,8 +58,7 @@ public void testParseRobots() throws IOException {
}

static Robotstxt sampleRobots1() throws IOException {
BufferedReader reader = new BufferedReader(
new StringReader(
Reader reader = new StringReader(
"User-agent: *\n" +
"Disallow: /cgi-bin/\n" +
"Disallow: /details/software\n" +
Expand All @@ -78,13 +78,12 @@ static Robotstxt sampleRobots1() throws IOException {
"Disallow: /\n" +
"Crawl-Delay: 20\n"+
"Allow: /images/\n"
));
);
return new Robotstxt(reader);
}

Robotstxt whitespaceFlawedRobots() throws IOException {
BufferedReader reader = new BufferedReader(
new StringReader(
Reader reader = new StringReader(
" User-agent: *\n" +
" Disallow: /cgi-bin/\n" +
" Disallow: /details/software\n" +
Expand All @@ -100,7 +99,7 @@ Robotstxt whitespaceFlawedRobots() throws IOException {
" Disallow: /\n" +
" Crawl-Delay: 20\n"+
" Allow: /images/\n"
));
);
return new Robotstxt(reader);
}

Expand Down Expand Up @@ -144,8 +143,7 @@ public void evalRobots(Robotstxt r) throws IOException {
}

Robotstxt htmlMarkupRobots() throws IOException {
BufferedReader reader = new BufferedReader(
new StringReader(
Reader reader = new StringReader(
"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 3.2 Final//EN\"><HTML>\n"
+"<HEAD>\n"
+"<TITLE>/robots.txt</TITLE>\n"
Expand All @@ -157,7 +155,7 @@ Robotstxt htmlMarkupRobots() throws IOException {
+"\n"
+"</BODY>\n"
+"</HTML>\n"
));
);
return new Robotstxt(reader);
}

Expand Down Expand Up @@ -188,7 +186,7 @@ public void testCompactSerialization() throws IOException {
"Disallow:/service\n";

StringReader sr = new StringReader(TEST_ROBOTS_TXT);
Robotstxt rt = new Robotstxt(new BufferedReader(sr));
Robotstxt rt = new Robotstxt(sr);
{
RobotsDirectives da = rt.getDirectivesFor("a", false);
RobotsDirectives db = rt.getDirectivesFor("b", false);
Expand Down Expand Up @@ -216,7 +214,7 @@ public void testSeparatedSections() throws IOException {
+ "User-agent: a\n"
+ "Crawl-delay: 99\n";
StringReader sr = new StringReader(TEST_ROBOTS_TXT);
Robotstxt rt = new Robotstxt(new BufferedReader(sr));
Robotstxt rt = new Robotstxt(sr);

assertFalse(rt.getDirectivesFor("a").allows("/foo"));

Expand All @@ -231,14 +229,18 @@ public void testSeparatedSections() throws IOException {
public void testSizeLimit() throws IOException {
StringBuilder builder = new StringBuilder(
"User-agent: a\n" +
" Disallow: /\n" +
"User-Agent: b\n");
" Disallow: /\n" +
"User-Agent: b\nDisallow: /");
for (int i = 0; i < Robotstxt.MAX_SIZE; i++) {
builder.append(' ');
}
builder.append("Disallow: /\n");
Robotstxt rt = new Robotstxt(new BufferedReader(new StringReader(builder.toString())));
assertFalse("we should parse the first part", rt.getDirectivesFor("a").allows("/foo"));
assertTrue("but ignore anything after the size limit", rt.getDirectivesFor("b").allows("/foo"));
builder.append("\nUser-Agent: c\nDisallow: /\n");
Robotstxt rt = new Robotstxt(new StringReader(builder.toString()));
assertFalse("we should parse the first few lines",
rt.getDirectivesFor("a").allows("/foo"));
assertTrue("ignore the line that breaks the size limit",
rt.getDirectivesFor("b").allows("/foo"));
assertTrue("and also ignore any lines after the size limit",
rt.getDirectivesFor("c").allows("/foo"));
}
}

0 comments on commit 68cecee

Please sign in to comment.