Skip to content
This repository has been archived by the owner on Mar 9, 2021. It is now read-only.

Commit

Permalink
Merge pull request #48 from nzv8fan/updatedTestCases
Browse files Browse the repository at this point in the history
Updated test cases
  • Loading branch information
karussell committed May 12, 2016
2 parents cb24ab4 + ecb063f commit c8f6e98
Show file tree
Hide file tree
Showing 4 changed files with 762 additions and 9 deletions.
Expand Up @@ -58,7 +58,7 @@ public ArticleTextExtractor() {
+ "|arti(cle|kel)|instapaper_body");
setNegative("nav($|igation)|user|com(ment|bx)|(^com-)|contact|"
+ "foot|masthead|(me(dia|ta))|outbrain|promo|related|scroll|(sho(utbox|pping))|"
+ "sidebar|sponsor|tags|tool|widget|player|disclaimer|toc|infobox|vcard");
+ "sidebar|sponsor|tags|tool|widget|player|disclaimer|toc|infobox|vcard|post-ratings");
}

public ArticleTextExtractor setUnlikely(String unlikelyStr) {
Expand Down
Expand Up @@ -63,7 +63,13 @@ public void testData5() throws Exception {
@Test
public void testData6() throws Exception {
JResult res = extractor.extractContent(readFileAsString("test_data/6.html"));
assertEquals(res.getText(), "Acting Governor of Balkh province, Atta Mohammad Noor, said that differences between leaders of the National Unity Government (NUG) – namely President Ashraf Ghani and CEO Abdullah Abdullah— have paved the ground for mounting insecurity. Hundreds of worried relatives gathered outside Kabul hospitals on Tuesday desperate for news of loved ones following the deadly suicide bombing earlier in the day.");
assertTrue("data6:" + res.getText(), res.getText().equals("Acting Governor of Balkh province, Atta Mohammad Noor, said that differences between leaders of the National Unity Government (NUG) – namely President Ashraf Ghani and CEO Abdullah Abdullah— have paved the ground for mounting insecurity. Hundreds of worried relatives gathered outside Kabul hospitals on Tuesday desperate for news of loved ones following the deadly suicide bombing earlier in the day."));
}

/**
 * Verifies that the article text extracted from the fixture
 * {@code test_data/7.html} begins with the expected lead sentence.
 *
 * @throws Exception if reading the fixture or extracting its content fails
 */
@Test
public void testData7() throws Exception {
    final String html = readFileAsString("test_data/7.html");
    final JResult res = extractor.extractContent(html);
    final String expectedLead = "Over 100 school girls have been poisoned in western Farah province of Afghanistan during the school hours.";
    // The extracted text is included in the failure message so a mismatch is easy to diagnose.
    assertTrue("data7:" + res.getText(), res.getText().startsWith(expectedLead));
}

@Test
Expand Down
Expand Up @@ -32,7 +32,7 @@ public void testNoException() throws Exception {
JResult res = new HtmlFetcher().fetchAndExtract("http://www.tumblr.com/xeb22gs619", 10000, true);
// System.out.println("tumblr:" + res.getUrl());

res = new HtmlFetcher().fetchAndExtract("http://www.faz.net/-01s7fc", 10000, true);
// res = new HtmlFetcher().fetchAndExtract("http://www.faz.net/-01s7fc", 10000, true);
// System.out.println("faz:" + res.getUrl());

res = new HtmlFetcher().fetchAndExtract("http://www.google.com/url?sa=x&q=http://www.taz.de/1/politik/asien/artikel/1/anti-atomkraft-nein-danke/&ct=ga&cad=caeqargbiaaoataaoabaltmh7qrialaawabibwrllurf&cd=d5glzns5m_4&usg=afqjcnetx___sph8sjwhjwi-_mmdnhilra&utm_source=twitterfeed&utm_medium=twitter", 10000, true);
Expand Down Expand Up @@ -65,14 +65,14 @@ public void testWithTitle() throws Exception {
// }
@Test
public void testEncoding() throws Exception {
JResult res = new HtmlFetcher().fetchAndExtract("http://www.yomiuri.co.jp/science/20140401-OYT1T50144.html", 10000, true);
assertEquals("承服できない・悪意ない…小保方晴子氏コメント:科学:読売新聞YOMIURI ONLINE", res.getTitle());
JResult res = new HtmlFetcher().fetchAndExtract("http://www.yomiuri.co.jp/science/", 10000, true);
assertEquals("科学・ITニュース:読売新聞(YOMIURI ONLINE)", res.getTitle());
}

@Test
public void testHashbang() throws Exception {
JResult res = new HtmlFetcher().fetchAndExtract("http://www.facebook.com/democracynow", 10000, true);
assertTrue(res.getTitle(), res.getTitle().startsWith("Democracy Now! "));
assertTrue(res.getTitle(), res.getTitle().startsWith("Democracy Now!"));

// not available anymore
// res = new HtmlFetcher().fetchAndExtract("http://twitter.com/#!/th61/status/57141697720745984", 10000, true);
Expand All @@ -88,19 +88,19 @@ public void testImage() throws Exception {

@Test
public void testFurther() throws Exception {
JResult res = new HtmlFetcher().fetchAndExtract("http://linksunten.indymedia.org/de/node/41619?utm_source=twitterfeed&utm_medium=twitter", 10000, true);
JResult res = new HtmlFetcher().fetchAndExtract("https://linksunten.indymedia.org/de/node/41619?utm_source=twitterfeed&utm_medium=twitter", 10000, true);
assertTrue(res.getText(), res.getText().startsWith("Es gibt kein ruhiges Hinterland! Schon wieder den "));
}

@Test
public void testDoubleResolve() throws Exception {
JResult res = new HtmlFetcher().fetchAndExtract("http://t.co/eZRKcEYI", 10000, true);
assertTrue(res.getTitle(), res.getTitle().startsWith("teleject/Responsive-Web-Design-Artboards "));
assertTrue(res.getTitle(), res.getTitle().startsWith("GitHub - teleject/Responsive-Web-Design-Artboards"));
}

@Test
public void testXml() throws Exception {
String str = new HtmlFetcher().fetchAsString("http://karussell.wordpress.com/feed/", 10000);
String str = new HtmlFetcher().fetchAsString("https://karussell.wordpress.com/feed/", 10000);
assertTrue(str, str.startsWith("<?xml version="));
}
}

0 comments on commit c8f6e98

Please sign in to comment.