Skip to content

Commit

Permalink
NUTCH-2404 Fix for failed Jenkins build #1588 after merging pull request
Browse files Browse the repository at this point in the history
apache#192 (NUTCH-2389).
  • Loading branch information
kaidul committed Jul 31, 2017
1 parent 5f6c383 commit a6870de
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 11 deletions.
5 changes: 0 additions & 5 deletions src/plugin/jsoup-extractor/build.xml
Expand Up @@ -20,9 +20,4 @@

<import file="../build-plugin.xml"/>

<!--
  Deploy unit test dependencies.
  Runs the "deploy" target of the build located in ../nutch-extensionpoints.
  inheritall="false" keeps this project's properties from being passed on
  to that nested build.
-->
<target name="deploy-test">
<ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
</target>

</project>
Expand Up @@ -27,11 +27,11 @@
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.Map.Entry;

import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.nutch.core.jsoup.extractor.JsoupExtractorConstants;
import org.apache.nutch.parse.ParseException;
import org.apache.nutch.parse.ParseUtil;
Expand All @@ -43,8 +43,8 @@ public class TestJsoupHtmlParser {

private static final String SAMPLE_CONF_FILE = "jsoup-extractor-example.xml";
private static final String SAMPLE_URL = "https://www.youtube.com/watch?v=pzMpwW4ppRM";
private static final String TITLE = "Large scale crawling with Apache Nutch\t";
private static final String PUBLISHER = "LuceneSolrRevolution\t";
private static final String TITLE = "Large scale crawling with Apache Nutch";
private static final String PUBLISHER = "LuceneSolrRevolution";

@Test
public void testJsoupHtmlParser() {
Expand All @@ -70,11 +70,11 @@ public void testJsoupHtmlParser() {
parser.parse(SAMPLE_URL, page);

for(Entry<CharSequence, ByteBuffer> entry: page.getMetadata().entrySet()) {
System.out.println(entry.getKey().toString() + " => " + Bytes.toString(entry.getValue().array()));
System.out.println(entry.getKey().toString() + " => " + new String(entry.getValue().array(), StandardCharsets.UTF_8));
}

assertEquals(Bytes.toString(page.getMetadata().get(new Utf8("title")).array()), TITLE);
assertEquals(Bytes.toString(page.getMetadata().get(new Utf8("publisherName")).array()), PUBLISHER);
assertEquals(new String(page.getMetadata().get(new Utf8("title")).array(), StandardCharsets.UTF_8).replaceAll("\\t", ""), TITLE);
assertEquals(new String(page.getMetadata().get(new Utf8("publisherName")).array(), StandardCharsets.UTF_8).replaceAll("\\t", ""), PUBLISHER);

} catch (MalformedURLException ex) {
ex.printStackTrace();
Expand Down

0 comments on commit a6870de

Please sign in to comment.