diff --git a/src/plugin/jsoup-extractor/build.xml b/src/plugin/jsoup-extractor/build.xml
index 963e1365fd..a0ca008ec8 100644
--- a/src/plugin/jsoup-extractor/build.xml
+++ b/src/plugin/jsoup-extractor/build.xml
@@ -20,9 +20,4 @@
-
-
-
-
-
diff --git a/src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/TestJsoupHtmlParser.java b/src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/TestJsoupHtmlParser.java
index e12270836f..649cf57a8c 100644
--- a/src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/TestJsoupHtmlParser.java
+++ b/src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/TestJsoupHtmlParser.java
@@ -27,11 +27,11 @@
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
import java.util.Map.Entry;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.hbase.util.Bytes;
import org.apache.nutch.core.jsoup.extractor.JsoupExtractorConstants;
import org.apache.nutch.parse.ParseException;
import org.apache.nutch.parse.ParseUtil;
@@ -43,8 +43,8 @@ public class TestJsoupHtmlParser {
private static final String SAMPLE_CONF_FILE = "jsoup-extractor-example.xml";
private static final String SAMPLE_URL = "https://www.youtube.com/watch?v=pzMpwW4ppRM";
- private static final String TITLE = "Large scale crawling with Apache Nutch\t";
- private static final String PUBLISHER = "LuceneSolrRevolution\t";
+ private static final String TITLE = "Large scale crawling with Apache Nutch";
+ private static final String PUBLISHER = "LuceneSolrRevolution";
@Test
public void testJsoupHtmlParser() {
@@ -70,11 +70,11 @@ public void testJsoupHtmlParser() {
parser.parse(SAMPLE_URL, page);
for(Entry entry: page.getMetadata().entrySet()) {
- System.out.println(entry.getKey().toString() + " => " + Bytes.toString(entry.getValue().array()));
+ System.out.println(entry.getKey().toString() + " => " + new String(entry.getValue().array(), StandardCharsets.UTF_8));
}
- assertEquals(Bytes.toString(page.getMetadata().get(new Utf8("title")).array()), TITLE);
- assertEquals(Bytes.toString(page.getMetadata().get(new Utf8("publisherName")).array()), PUBLISHER);
+ assertEquals(new String(page.getMetadata().get(new Utf8("title")).array(), StandardCharsets.UTF_8).replaceAll("\\t", ""), TITLE);
+ assertEquals(new String(page.getMetadata().get(new Utf8("publisherName")).array(), StandardCharsets.UTF_8).replaceAll("\\t", ""), PUBLISHER);
} catch (MalformedURLException ex) {
ex.printStackTrace();