From 0326a5f84b4ea4b0a7728c269253e4ddb0d9f095 Mon Sep 17 00:00:00 2001 From: Brad Heap Date: Wed, 11 May 2016 12:27:58 +1000 Subject: [PATCH] Change to weightChildNodes method of ArticleTextExtractor - weight for text in p tags will include all text in subtags, e.g. strong, em tags --- .../snacktory/ArticleTextExtractor.java | 4 + .../snacktory/ArticleTextExtractorTest.java | 6 + test_data/6.html | 1132 +++++++++++++++++ 3 files changed, 1142 insertions(+) create mode 100644 test_data/6.html diff --git a/src/main/java/de/jetwick/snacktory/ArticleTextExtractor.java b/src/main/java/de/jetwick/snacktory/ArticleTextExtractor.java index 82e0fff3..86e238e4 100644 --- a/src/main/java/de/jetwick/snacktory/ArticleTextExtractor.java +++ b/src/main/java/de/jetwick/snacktory/ArticleTextExtractor.java @@ -307,6 +307,10 @@ protected int weightChildNodes(Element rootEl) { List pEls = new ArrayList(5); for (Element child : rootEl.children()) { String ownText = child.ownText(); + + // if you are on a paragraph, grab all the text including that surrounded by additional formatting. + if (child.tagName().equals("p")) ownText = child.text(); + int ownTextLength = ownText.length(); if (ownTextLength < 20) continue; diff --git a/src/test/java/de/jetwick/snacktory/ArticleTextExtractorTest.java b/src/test/java/de/jetwick/snacktory/ArticleTextExtractorTest.java index 33f8a2c8..8f13a3a7 100644 --- a/src/test/java/de/jetwick/snacktory/ArticleTextExtractorTest.java +++ b/src/test/java/de/jetwick/snacktory/ArticleTextExtractorTest.java @@ -60,6 +60,12 @@ public void testData5() throws Exception { assertTrue(res.getKeywords().isEmpty()); } + @Test + public void testData6() throws Exception { + JResult res = extractor.extractContent(readFileAsString("test_data/6.html")); + assertEquals(res.getText(), "Acting Governor of Balkh province, Atta Mohammad Noor, said that differences between leaders of the National Unity Government (NUG) – namely President Ashraf Ghani and CEO Abdullah Abdullah— have paved the ground for mounting insecurity. Hundreds of worried relatives gathered outside Kabul hospitals on Tuesday desperate for news of loved ones following the deadly suicide bombing earlier in the day."); + } + @Test public void testCNN() throws Exception { // http://edition.cnn.com/2011/WORLD/africa/04/06/libya.war/index.html?on.cnn=1 diff --git a/test_data/6.html b/test_data/6.html new file mode 100644 index 00000000..60c08125 --- /dev/null +++ b/test_data/6.html @@ -0,0 +1,1132 @@ + + + + + + + + + + + + + + + + + + + + + + + +TOLOnews 6pm News 19 April 2016 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+
+
+
+
+
+ +
+
+ +
+
+
+ +
+
+
+
+
+
+
+
+
+ + + + + + + + + + +
+ +
+ + + +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Instagram
+
+
+
+
+
+
+
+
+
+
+
+Tuesday 10 May 2016 +
+ +
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+ + + + +
+
+
+ + +
+ + + +
+
+
+ +
+ +Print + + + +
+
+
+
+
+ +
+ +Print + + +
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+

+ +Videos - + +6pm News Bulletin +

+ + +

shuja-19-april-16Top news in this Bulletin:

+

Acting Governor of Balkh province, Atta Mohammad Noor, said that differences between leaders of the National Unity Government (NUG) – namely President Ashraf Ghani and CEO Abdullah Abdullah— have paved the ground for mounting insecurity.

+

To watch the whole news bulletin, click here:

+

+ + + + + + + + + + + + + + + + +

+

Hundreds of worried relatives gathered outside Kabul hospitals on Tuesday desperate for news of loved ones following the deadly suicide bombing earlier in the day.

+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

Nightly News

+
+
+ + +
+
+
+
+
+
+
+
+
+ +
+
+
+ +
+
+
+
+
+
TOLOnews Live
+
+
+
+
+
+
+
+ +
+
+
+
+ +
+
+
+
+
+

+
+
+
+
+
+
+
+
+ + + +
+
+
+ +
+
+
+
+
+

TOLOnews Poll

+
+
+ +

+ What is your view on the future of the peace talks with Taliban?

+
+
+
+ + +
+
+ + +
+
+ + +
+
+
+
+ +
+
+ +
+
+ + + + +
+
+
+
+
+
+
+
+
+ +
+
+
+ +
+
+
+
+
+ +
+ +
+
+
+
+
+ + +
+ + +
+ + +
+
+
+
+ + + + + + + +
+
+
+
+
+
+
+
+
+
+ +
+
+
+ +
+
+
+
+
+

RSS

+
+
+
+
+
+
TOLOnews.com RSS Feed
+
+
+
+
+
+
+
+
+ + +
+
+ +
+
+
+ +
+ + +
+
+
+
+ + + + + + + +