Skip to content
This repository has been archived by the owner on Mar 9, 2021. It is now read-only.

Commit

Permalink
Merge pull request #47 from nzv8fan/issue42
Browse files Browse the repository at this point in the history
Fix Issue #42 to improve content identification
  • Loading branch information
karussell committed May 11, 2016
2 parents a5fe61e + 0326a5f commit cb24ab4
Show file tree
Hide file tree
Showing 3 changed files with 1,142 additions and 0 deletions.
4 changes: 4 additions & 0 deletions src/main/java/de/jetwick/snacktory/ArticleTextExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,10 @@ protected int weightChildNodes(Element rootEl) {
List<Element> pEls = new ArrayList<Element>(5);
for (Element child : rootEl.children()) {
String ownText = child.ownText();

// if you are on a paragraph, grab all the text including that surrounded by additional formatting.
if (child.tagName().equals("p")) ownText = child.text();

int ownTextLength = ownText.length();
if (ownTextLength < 20)
continue;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,12 @@ public void testData5() throws Exception {
assertTrue(res.getKeywords().isEmpty());
}

/**
 * Checks that extracting content from the {@code test_data/6.html} fixture
 * yields exactly the expected article text.
 *
 * @throws Exception if the fixture cannot be read or extraction fails
 */
@Test
public void testData6() throws Exception {
    JResult res = extractor.extractContent(readFileAsString("test_data/6.html"));
    String expectedText = "Acting Governor of Balkh province, Atta Mohammad Noor, said that differences between leaders of the National Unity Government (NUG) – namely President Ashraf Ghani and CEO Abdullah Abdullah— have paved the ground for mounting insecurity. Hundreds of worried relatives gathered outside Kabul hospitals on Tuesday desperate for news of loved ones following the deadly suicide bombing earlier in the day.";
    // JUnit's contract is assertEquals(expected, actual); passing them in this
    // order keeps the "expected"/"but was" parts of a failure message truthful.
    assertEquals(expectedText, res.getText());
}

@Test
public void testCNN() throws Exception {
// http://edition.cnn.com/2011/WORLD/africa/04/06/libya.war/index.html?on.cnn=1
Expand Down
Loading

0 comments on commit cb24ab4

Please sign in to comment.