From 998edf3479569f6da08e7a3cc154fe95849b7ddc Mon Sep 17 00:00:00 2001
From: Barbara Miller
Date: Mon, 13 Nov 2017 21:20:28 -0800
Subject: [PATCH 1/3] max size for extracted form elements

---
Review note (not part of the commit message): each attribute-value capture
group changes from the unbounded `+` quantifier to `{1,50000}`, so a captured
value is between 1 and 50000 characters long.

 .../archive/modules/forms/ExtractorHTMLForms.java | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/modules/src/main/java/org/archive/modules/forms/ExtractorHTMLForms.java b/modules/src/main/java/org/archive/modules/forms/ExtractorHTMLForms.java
index 8326da558..aeab21c54 100644
--- a/modules/src/main/java/org/archive/modules/forms/ExtractorHTMLForms.java
+++ b/modules/src/main/java/org/archive/modules/forms/ExtractorHTMLForms.java
@@ -143,17 +143,17 @@ protected void analyze(CrawlURI curi, CharSequence cs) {
         for (Object offset : curi.getDataList(ExtractorHTML.A_FORM_OFFSETS)) {
             int offsetInt = (Integer) offset;
             CharSequence relevantSequence = cs.subSequence(offsetInt, cs.length());
-            String method = findAttributeValueGroup("(?i)^[^>]*\\smethod\\s*=\\s*([^>\\s]+)[^>]*>",1,relevantSequence);
-            String action = findAttributeValueGroup("(?i)^[^>]*\\saction\\s*=\\s*([^>\\s]+)[^>]*>",1,relevantSequence);
-            String enctype = findAttributeValueGroup("(?i)^[^>]*\\senctype\\s*=\\s*([^>\\s]+)[^>]*>",1,relevantSequence);
+            String method = findAttributeValueGroup("(?i)^[^>]*\\smethod\\s*=\\s*([^>\\s]{1,50000})[^>]*>",1,relevantSequence);
+            String action = findAttributeValueGroup("(?i)^[^>]*\\saction\\s*=\\s*([^>\\s]{1,50000})[^>]*>",1,relevantSequence);
+            String enctype = findAttributeValueGroup("(?i)^[^>]*\\senctype\\s*=\\s*([^>\\s]{1,50000})[^>]*>",1,relevantSequence);
             HTMLForm form = new HTMLForm();
             form.setMethod(method);
             form.setAction(action);
             form.setEnctype(enctype);
             for(CharSequence input : findGroups("(?i)(<input\\s[^>]*>)|(</?form>)",1,relevantSequence)) {
-                String type = findAttributeValueGroup("(?i)^[^>]*\\stype\\s*=\\s*([^>\\s]+)[^>]*>",1,input);
-                String name = findAttributeValueGroup("(?i)^[^>]*\\sname\\s*=\\s*([^>\\s]+)[^>]*>",1,input);
-                String value = findAttributeValueGroup("(?i)^[^>]*\\svalue\\s*=\\s*([^>\\s]+)[^>]*>",1,input);
+                String type = findAttributeValueGroup("(?i)^[^>]*\\stype\\s*=\\s*([^>\\s]{1,50000})[^>]*>",1,input);
+                String name = findAttributeValueGroup("(?i)^[^>]*\\sname\\s*=\\s*([^>\\s]{1,50000})[^>]*>",1,input);
+                String value = findAttributeValueGroup("(?i)^[^>]*\\svalue\\s*=\\s*([^>\\s]{1,50000})[^>]*>",1,input);
                 Matcher m = TextUtils.getMatcher("(?i)^[^>]*\\schecked\\s*[^>]*>", input);
                 boolean checked = false;
                 try {

From a7b7c6cf5a0d567dad91fd6c499279cde2004508 Mon Sep 17 00:00:00 2001
From: Kenji Nagahashi
Date: Fri, 8 Dec 2017 17:15:03 -0800
Subject: [PATCH 2/3] fix for test failures in a workspace on NFS-mounted filesystem

ContentDigestHistoryTest does not close BdbModule. It results in
failure to delete bdb directory in following tests. Added tearDown()
method that closes BdbModule.
---
 .../archive/modules/recrawl/ContentDigestHistoryTest.java | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/modules/src/test/java/org/archive/modules/recrawl/ContentDigestHistoryTest.java b/modules/src/test/java/org/archive/modules/recrawl/ContentDigestHistoryTest.java
index cd007b744..ea2980e06 100644
--- a/modules/src/test/java/org/archive/modules/recrawl/ContentDigestHistoryTest.java
+++ b/modules/src/test/java/org/archive/modules/recrawl/ContentDigestHistoryTest.java
@@ -128,6 +128,14 @@ protected BdbModule bdb() throws IOException {
         return bdb;
     }
 
+    @Override
+    protected void tearDown() throws Exception {
+        if (bdb != null) {
+            bdb.close();
+        }
+        super.tearDown();
+    }
+
     public void testBasics() throws InterruptedException, IOException {
         historyStore().store.clear();
         assertTrue(historyStore().store.isEmpty());

From 95759149ae162155fa0adbe4f500445305ad2963 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Tue, 16 Jan 2018 17:12:24 -0800
Subject: [PATCH 3/3] catch exceptions scoping outlinks to stop them from
 derailing processing of the parent url

---
Review note (not part of the commit message): when isInScope(curi) throws,
the URI now gets fetch status S_RUNTIME_EXCEPTION, the failure is logged at
SEVERE, and the method returns ProcessResult.FINISH.
NOTE(review): `catch (Exception e)` would also catch an InterruptedException
if isInScope can throw one; confirm against Scoper before merging.

 .../crawler/prefetch/CandidateScoper.java | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/engine/src/main/java/org/archive/crawler/prefetch/CandidateScoper.java b/engine/src/main/java/org/archive/crawler/prefetch/CandidateScoper.java
index abfc7b7bd..18fdeca94 100644
--- a/engine/src/main/java/org/archive/crawler/prefetch/CandidateScoper.java
+++ b/engine/src/main/java/org/archive/crawler/prefetch/CandidateScoper.java
@@ -19,11 +19,13 @@
 
 package org.archive.crawler.prefetch;
 
-import static org.archive.modules.fetcher.FetchStatusCodes.S_OUT_OF_SCOPE;
+import java.util.logging.Level;
+import java.util.logging.Logger;
 
 import org.archive.crawler.framework.Scoper;
 import org.archive.modules.CrawlURI;
 import org.archive.modules.ProcessResult;
+import org.archive.modules.fetcher.FetchStatusCodes;
 
 /**
  * Simple single-URI scoper, considers passed-in URI as candidate; sets
@@ -35,11 +37,19 @@ public class CandidateScoper extends Scoper {
     @SuppressWarnings("unused")
     private static final long serialVersionUID = 1L;
 
+    private static final Logger logger = Logger.getLogger(CandidateScoper.class.getName());
+
     @Override
     protected ProcessResult innerProcessResult(CrawlURI curi) throws InterruptedException {
-        if (!isInScope(curi)) {
-            // Scope rejected
-            curi.setFetchStatus(S_OUT_OF_SCOPE);
+        try {
+            if (!isInScope(curi)) {
+                // Scope rejected
+                curi.setFetchStatus(FetchStatusCodes.S_OUT_OF_SCOPE);
+                return ProcessResult.FINISH;
+            }
+        } catch (Exception e) {
+            curi.setFetchStatus(FetchStatusCodes.S_RUNTIME_EXCEPTION);
+            logger.log(Level.SEVERE, "problem scoping " + curi, e);
             return ProcessResult.FINISH;
         }
         return ProcessResult.PROCEED;