Skip to content

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks.
...
  • 4 commits
  • 2 files changed
  • 0 commit comments
  • 2 contributors
Commits on Mar 16, 2012
Adam Miller HER-1998 - ExtractorHTML modified to parse html inside conditional comments. Still ignores normal comments
1a333fb
Commits on Mar 20, 2012
Adam Miller HER-1998 - Adding comments, adjusting test case to prevent subclasses from failing new tests
a108795
@nlevitt nlevitt Merge remote-tracking branch 'adam-miller/master' 4eab646
@nlevitt nlevitt * ExtractorHTML.java
    some fixup on RELEVANT_TAG_EXTRACTOR javadoc comment
335b32a
View
73 modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java
@@ -76,30 +76,6 @@
public final static String A_META_ROBOTS = "meta-robots";
-
- /**
- * Compiled relevant tag extractor.
- *
- * <p>
- * This pattern extracts either:
- * <li> (1) whole &lt;script&gt;...&lt;/script&gt; or
- * <li> (2) &lt;style&gt;...&lt;/style&gt; or
- * <li> (3) &lt;meta ...&gt; or
- * <li> (4) any other open-tag with at least one attribute
- * (eg matches "&lt;a href='boo'&gt;" but not "&lt;/a&gt;" or "&lt;br&gt;")
- * <p>
- * groups:
- * <li> 1: SCRIPT SRC=foo&gt;boo&lt;/SCRIPT
- * <li> 2: just script open tag
- * <li> 3: STYLE TYPE=moo&gt;zoo&lt;/STYLE
- * <li> 4: just style open tag
- * <li> 5: entire other tag, without '<' '>'
- * <li> 6: element
- * <li> 7: META
- * <li> 8: !-- comment --
- */
-// version w/ less unnecessary backtracking
-
{
setMaxElementLength(64);
}
@@ -110,11 +86,58 @@ public void setMaxElementLength(int max) {
kp.put("maxElementLength",max);
}
+
+ /**
+ * Relevant tag extractor.
+ *
+ * <p>
+ * This pattern extracts either:
+ * </p>
+ * <ul>
+ * <li>(1) whole &lt;script&gt;...&lt;/script&gt; or
+ * <li>(2) &lt;style&gt;...&lt;/style&gt; or
+ * <li>(3) &lt;meta ...&gt; or
+ * <li>(4) any other open-tag with at least one attribute (eg matches
+ * "&lt;a href='boo'&gt;" but not "&lt;/a&gt;" or "&lt;br&gt;")
+ * </ul>
+ * <p>
+ * groups:
+ * </p>
+ * <ul>
+ * <li>1: SCRIPT SRC=foo&gt;boo&lt;/SCRIPT
+ * <li>2: just script open tag
+ * <li>3: STYLE TYPE=moo&gt;zoo&lt;/STYLE
+ * <li>4: just style open tag
+ * <li>5: entire other tag, without '&lt;' '&gt;'
+ * <li>6: element
+ * <li>7: META
+ * <li>8: !-- comment --
+ * </ul>
+ *
+ * <p>
+ * HER-1998 - Modified group 8 so it no longer matches conditional HTML comments; the markup inside them is now parsed.
+ * Conditional HTML comment example:
+ * "&lt;!--[if expression]> HTML &lt;![endif]-->"
+ * </p>
+ *
+ * <p>
+ * This technique is commonly used to reference CSS &amp; JavaScript that
+ * are designed to deal with the quirks of a specific version of Internet
+ * Explorer. There is another syntax for conditional comments which already
+ * gets parsed by the regex since it doesn't start with "&lt;!--". Example:
+ * &lt;![if expression]> HTML &lt;![endif]>
+ * </p>
+ *
+ * <p>
+ * https://en.wikipedia.org/wiki/Conditional_Comments
+ * </p>
+ */
+ // version w/ less unnecessary backtracking
static final String RELEVANT_TAG_EXTRACTOR =
"(?is)<(?:((script[^>]*+)>.*?</script)" + // 1, 2
"|((style[^>]*+)>.*?</style)" + // 3, 4
"|(((meta)|(?:\\w{1,"+MAX_ELEMENT_REPLACE+"}))\\s+[^>]*+)" + // 5, 6, 7
- "|(!--.*?--))>"; // 8
+ "|(!--(?!\\[if).*?--))>"; // 8
// version w/ problems with unclosed script tags
// static final String RELEVANT_TAG_EXTRACTOR =
View
35 modules/src/test/java/org/archive/modules/extractor/ExtractorHTMLTest.java
@@ -66,6 +66,7 @@
"<img src=\"foo.gif\"> IMG",
"http://www.archive.org/start/foo.gif",
+
};
@@ -379,5 +380,39 @@ public void testFlashvarsEmbedAttribute() throws URIException {
assertTrue("outlinks should contain: "+expected,
CollectionUtils.exists(curi.getOutLinks(),destinationsIsPredicate(expected)));
}
+
+ /**
+ * HER-1998: img and script links wrapped in conditional comments must be extracted as outlinks.
+ * @throws URIException presumably from UURIFactory.getInstance when building the test URI
+ */
+ public void testConditionalComment1() throws URIException {
+ CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://www.example.com/"));
+
+ CharSequence cs =
+ "<!--[if IE 6]><img src=\"foo.gif\"><![endif]-->" +
+ "<!--[if IE 6]><script src=\"foo.js\"><![endif]-->";
+
+ ExtractorHTML extractor = new ExtractorHTML();
+ UriErrorLoggerModule ulm = new UnitTestUriLoggerModule();
+ extractor.setLoggerModule(ulm);
+ CrawlMetadata metadata = new CrawlMetadata();
+ metadata.afterPropertiesSet();
+ extractor.setMetadata(metadata);
+ extractor.afterPropertiesSet();
+
+ extractor.extract(curi, cs);
+ // Sorted before the index-based assertions below (assumes Link's ordering puts foo.gif before foo.js - TODO confirm).
+ Link[] links = curi.getOutLinks().toArray(new Link[0]);
+ Arrays.sort(links);
+
+ String dest1 = "http://www.example.com/foo.gif";
+ String dest2 = "http://www.example.com/foo.js";
+
+ assertEquals("outlink1 from conditional comment img src",dest1,
+ links[0].getDestination().toString());
+ assertEquals("outlink2 from conditional comment script src",dest2,
+ links[1].getDestination().toString());
+
+ }
}

No commit comments for this range

Something went wrong with that request. Please try again.