diff --git a/src/plugin/jsoup-extractor/build.xml b/src/plugin/jsoup-extractor/build.xml index 963e1365fd..a0ca008ec8 100644 --- a/src/plugin/jsoup-extractor/build.xml +++ b/src/plugin/jsoup-extractor/build.xml @@ -20,9 +20,4 @@ - - - - - diff --git a/src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/TestJsoupHtmlParser.java b/src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/TestJsoupHtmlParser.java index e12270836f..649cf57a8c 100644 --- a/src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/TestJsoupHtmlParser.java +++ b/src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/TestJsoupHtmlParser.java @@ -27,11 +27,11 @@ import java.net.MalformedURLException; import java.net.URL; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.util.Map.Entry; import org.apache.avro.util.Utf8; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hbase.util.Bytes; import org.apache.nutch.core.jsoup.extractor.JsoupExtractorConstants; import org.apache.nutch.parse.ParseException; import org.apache.nutch.parse.ParseUtil; @@ -43,8 +43,8 @@ public class TestJsoupHtmlParser { private static final String SAMPLE_CONF_FILE = "jsoup-extractor-example.xml"; private static final String SAMPLE_URL = "https://www.youtube.com/watch?v=pzMpwW4ppRM"; - private static final String TITLE = "Large scale crawling with Apache Nutch\t"; - private static final String PUBLISHER = "LuceneSolrRevolution\t"; + private static final String TITLE = "Large scale crawling with Apache Nutch"; + private static final String PUBLISHER = "LuceneSolrRevolution"; @Test public void testJsoupHtmlParser() { @@ -70,11 +70,11 @@ public void testJsoupHtmlParser() { parser.parse(SAMPLE_URL, page); for(Entry entry: page.getMetadata().entrySet()) { - System.out.println(entry.getKey().toString() + " => " + Bytes.toString(entry.getValue().array())); + System.out.println(entry.getKey().toString() + " => " + new String(entry.getValue().array(), StandardCharsets.UTF_8)); } - assertEquals(Bytes.toString(page.getMetadata().get(new Utf8("title")).array()), TITLE); - assertEquals(Bytes.toString(page.getMetadata().get(new Utf8("publisherName")).array()), PUBLISHER); + assertEquals(new String(page.getMetadata().get(new Utf8("title")).array(), StandardCharsets.UTF_8).replaceAll("\\t", ""), TITLE); + assertEquals(new String(page.getMetadata().get(new Utf8("publisherName")).array(), StandardCharsets.UTF_8).replaceAll("\\t", ""), PUBLISHER); } catch (MalformedURLException ex) { ex.printStackTrace();