diff --git a/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java b/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java index 9122cf18c..195392795 100644 --- a/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java +++ b/modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java @@ -76,9 +76,29 @@ public class ExtractorHTML extends ContentExtractor implements InitializingBean public final static String A_META_ROBOTS = "meta-robots"; + { + setMaxElementLength(64); + } + public int getMaxElementLength() { + return (Integer) kp.get("maxElementLength"); + } + public void setMaxElementLength(int max) { + kp.put("maxElementLength",max); + } + /** * Compiled relevant tag extractor. + * + * HER-1998 - Modified part 8 to allow conditional html comments. + * Conditional HTML comment example: + * "<!--[if expression]> HTML <![endif]-->" + * + * This technique is commonly used to reference CSS & JavaScript that are designed to deal with the quirks of a specific version of Internet Explorer. + * There is another syntax for conditional comments which already gets parsed by the regex since it doesn't start with "" + + ""; + + ExtractorHTML extractor = new ExtractorHTML(); + UriErrorLoggerModule ulm = new UnitTestUriLoggerModule(); + extractor.setLoggerModule(ulm); + CrawlMetadata metadata = new CrawlMetadata(); + metadata.afterPropertiesSet(); + extractor.setMetadata(metadata); + extractor.afterPropertiesSet(); + + extractor.extract(curi, cs); + + Link[] links = curi.getOutLinks().toArray(new Link[0]); + Arrays.sort(links); + + String dest1 = "http://www.example.com/foo.gif"; + String dest2 = "http://www.example.com/foo.js"; + + assertEquals("outlink1 from conditional comment img src",dest1, + links[0].getDestination().toString()); + assertEquals("outlink2 from conditional comment script src",dest2, + links[1].getDestination().toString()); + + } }