Merge pull request #14 from intenthq/remove-smart-title-extraction

Stop being too smart with titles
intenthq · Sep 15, 2015 · 9944ca4
2 parents 6c731f1 + 00ee5eb
commit 9944ca4
Show file tree

Hide file tree

Showing 4 changed files with 43 additions and 106 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,7 +1,7 @@
 language: scala
 scala:
    - 2.11.7
-script: "sbt clean coverage test"
+script: "sbt clean coverage test it:test"
 after_success: "sbt coveralls"
 cache:
   directories:

diff --git a/src/it/scala/com/intenthq/gander/GanderIT.scala b/src/it/scala/com/intenthq/gander/GanderIT.scala
@@ -33,7 +33,7 @@ class GanderIT extends Specification {
     check(extract(url),
       url = url,
       content = "Here at Intent HQ we believe how important it is to write good code. Why? First, because writing good code is much cheaper and more fun than writing bad code. Second, because if you write good code chances are that the product you are building will be much better. Third, and more important, because writing good code is what we are supposed to do: after all, we are getting paid for doing our job well",
-      title = "What is good code? A scientific definition.",
+      title = "What is good code? A scientific definition. - Intent HQ Engineering blog",
       metaDescription = "How would you define good code? This article gives a pseudo-scientific answer to that question after asking a sample of 65 developers that same question.",
       metaKeywords = "",
       lang = Some("en-GB"),
@@ -47,7 +47,7 @@ class GanderIT extends Specification {
     check(extract(url),
       url = url,
       content = "Disneyland Paris is facing a pricing probe following accusations that UK and German customers are being frozen out of certain price promotions.",
-      title = "Disneyland Paris faces pricing probe",
+      title = "Disneyland Paris faces pricing probe - BBC News",
       metaDescription = "Disneyland Paris is facing a pricing probe following accusations that UK and German customers are being frozen out of promotions available in other European member states.",
       metaKeywords = "",
       lang = Some("en"),
@@ -62,7 +62,7 @@ class GanderIT extends Specification {
     check(extract(url),
       url = url,
       content = "Manchester City striker Sergio Aguero will miss Tuesday's Champions League opener against Juventus at Etihad Stadium because of a knee injury",
-      title = "BBC Sport",
+      title = "BBC Sport - Sergio Aguero: Manchester City striker to miss Juventus visit",
       metaDescription = "Manchester City striker Sergio Aguero will miss Tuesday's Champions League opener against Juventus with a knee injury.",
       metaKeywords = "BBC, Sport, BBC Sport, bbc.co.uk, world, uk, international, foreign, british, online, service",
       lang = Some("en-GB"),
@@ -76,7 +76,7 @@ class GanderIT extends Specification {
     check(extract(url),
       url = url,
       content = "From Goldman on the FOMC operation twist announcement: ------------- 1. As we had expected, the Federal Open Market Committee decided to \"do the twist\" and increase the duration of its securities holdings by selling shorter-maturity securities ($400bn of Treasuries with maturity of 3 years or less)",
-      title = "GOLDMAN: 4 Key Points On The FOMC Announcement",
+      title = "GOLDMAN: 4 Key Points On The FOMC Announcement - Business Insider",
       metaDescription = "Here it is.",
       metaKeywords = "",
       lang = Some("en"),
@@ -89,7 +89,7 @@ class GanderIT extends Specification {
     check(extract(url),
       url = url,
       content = "Los aliados de la OTAN ofrecieron este martes respaldo político a Turquía en su ofensiva contra el Estado Islámico tras una reunión convocada de urgencia por el Gobierno de Ankara.",
-      title = "La OTAN apoya con cautela la ofensiva turca contra el yihadismo"                                                                                                                        ,
+      title = "La OTAN apoya con cautela la ofensiva turca contra el yihadismo | Internacional | EL PAÍS"                                                                                                                        ,
       metaDescription = "La Alianza se ha reunido este martes con carácter de urgencia a pedición de Ankara para tratar el avance del Estado Islámico",
       metaKeywords = "otan, apoyar, cautela, ofensiva, turca, turco, yihadismo, alianza, haber, reunir, martes, urgencia, pedición, ankara, secretario, general, jens stoltenberg, resaltar, unidad, aliado",
       lang = Some("es"),
@@ -107,7 +107,7 @@ class GanderIT extends Specification {
     check(extract(url, Charsets.ISO_8859_1),
       url = url,
       content = "ROMA La strada è tracciata, la relazione potrebbe arrivare a Palazzo Chigi prima della pausa estiva. Il ministro dell’Interno Angelino Alfano non proporrà lo scioglimento per mafia del comune di Roma, ma nella relazione al governo",
-      title = "La relazione di Alfano sulla mafia: fatti gravi, il sindaco ha sottovalutato",
+      title = "La relazione di Alfano sulla mafia: fatti gravi, il sindaco ha sottovalutato - Corriere.it",
       metaDescription = "Non si propone lo scioglimento ma si lascia aperta la possibilità di una «diversa valutazione»",
       metaKeywords = "Ignazio Marino, Angelino Alfano",
       lang = Some("it"),
@@ -134,7 +134,7 @@ class GanderIT extends Specification {
     check(extract(url),
       url = url,
       content = "No próximo sábado, o São Paulo jogará, como mandante, na Arena Barueri diante do Mogi Mirim",
-      title = "Para Leão, Arena Barueri não é casa do Tricolor - São Paulo",
+      title = "Para Leão, Arena Barueri não é casa do Tricolor - São Paulo | Lancenet.com.br",
       metaDescription = "No próximo sábado, o São Paulo jogará, como mandante, na Arena Barueri diante do Mogi Mirim. Isso porque no estádio do Morumbi haverá, nesta ...",
       metaKeywords = "Leao,Arena,Barueri,casa,Tricolor",
       lang = Some("pt"),
@@ -147,7 +147,7 @@ class GanderIT extends Specification {
     check(extract(url),
       url = url,
       content     = "Emerson Leão não foi ao campo na manhã desta terça-feira no centro de treinamento do São Paulo",
-      title       = "'Filho do gramado', Leão administra o São Paulo na base da conversa",
+      title       = "'Filho do gramado', Leão administra o São Paulo na base da conversa | globoesporte.com",
       metaDescription = "Emerson Le&atilde;o cobra lideran&ccedil;a ao S&atilde;o Paulo (Foto: M&aacute;rio &Acirc;ngelo / Ag. Estado) Emerson Le&atilde;o n&atilde;o foi ao campo na manh&atilde; desta ter&ccedil;a-feira no centro de treinamento do S&atilde;o Paulo. Bem humorado e com roupa casual, preferiu acompanhar de longe ...",
       metaKeywords = "notícias, notícia, são paulo",
       lang = None,

diff --git a/src/main/scala/com/intenthq/gander/extractors/ContentExtractor.scala b/src/main/scala/com/intenthq/gander/extractors/ContentExtractor.scala
@@ -19,16 +19,8 @@ object ContentExtractor {
 
   val logger: Logger = LoggerFactory.getLogger(getClass)
 
-  def extractTitle(doc: Document): String = {
-    val titleElem = byTag("title")(doc)
-    titleElem.headOption.map { x =>
-      val titleText = x.text
-      List(" | ", " - ", " » ", " · ").collectFirst {
-        case separator if titleText.contains(separator) => titleText.split(Pattern.quote(separator)).head
-      }.getOrElse(titleText)
-    }.getOrElse("")
-     .replace("&#65533;", "").trim
-  }
+  def extractTitle(doc: Document): String =
+    byTag("title")(doc).headOption.map(_.text).getOrElse("").replace("&#65533;", "").trim
 
   def extractLang(doc: Document): Option[String] =
     byTag("html")(doc).headOption.map(_.attr("lang")).filter(_.nonEmpty).orElse(

diff --git a/src/test/scala/com/intenthq/gander/ContentExtractorSpec.scala b/src/test/scala/com/intenthq/gander/ContentExtractorSpec.scala
@@ -5,105 +5,50 @@ import org.jsoup.Jsoup
 import org.specs2.mutable.Specification
 
 class ContentExtractorSpec extends Specification {
-
   "extractTitle" >> {
     def docFromTitle(title: String) = Jsoup.parse(s"<html><head><title>$title</title></head><body></body></html>")
     "should extract a title" >> {
       val title = "the title"
-
       extractTitle(docFromTitle(title)) must_== title
     }
     "should extract an empty title" >> {
       val title = ""
-
       extractTitle(docFromTitle(title)) must_== title
     }
-    "should keep the first segment if the title contains a separator" >> {
-      "| (2 segments)" >> {
-        val title = "The first segment | Wikipedia, the free encyclopaedia"
-
-        extractTitle(docFromTitle(title)) must_== "The first segment"
-      }
-      "| (3 segments)" >> {
-        val title = "The first segment | other 1 | other 2"
-
-        extractTitle(docFromTitle(title)) must_== "The first segment"
-      }
-      "- (2 segments)" >> {
-        val title = "The first segment - Wikipedia, the free encyclopaedia"
-
-        extractTitle(docFromTitle(title)) must_== "The first segment"
-      }
-      "- (3 segments)" >> {
-        val title = "The first segment - other 1 - other 2"
-
-        extractTitle(docFromTitle(title)) must_== "The first segment"
-      }
-      "- not used as a sparator" >> {
-        val title = "this-is-a-title"
-
-        extractTitle(docFromTitle(title)) must_== title
-      }
-      "» (2 segments)" >> {
-        val title = "The first segment » Wikipedia, the free encyclopaedia"
-
-        extractTitle(docFromTitle(title)) must_== "The first segment"
-      }
-      "» (3 segments)" >> {
-        val title = "The first segment » other 1 » other 2"
-
-        extractTitle(docFromTitle(title)) must_== "The first segment"
-      }
-      "» not used as a separator" >> {
-        val title = "«this is a title»"
-
-        extractTitle(docFromTitle(title)) must_== title
-      }
-      "· (2 segments)" >> {
-        val title = "The first segment · Wikipedia, the free encyclopaedia"
-
-        extractTitle(docFromTitle(title)) must_== "The first segment"
-      }
-      "· (3 segments)" >> {
-        val title = "The first segment · other 1 · other 2"
+  }
 
-        extractTitle(docFromTitle(title)) must_== "The first segment"
-      }
+  "extractLang" >> {
+    "should extract lang from html tag and give priority to it" >> {
+      val html =
+        """<html lang="ca">
+          |  <head>
+          |    <meta http-equiv="Content-Language" content="en">
+          |    <meta property="og:locale" content="en_GB" />
+          |  </head>
+          |<body></body></html>""".stripMargin
+
+      extractLang(Jsoup.parse(html)) must beSome("ca")
     }
-
-    "extractLang" >> {
-      "should extract lang from html tag and give priority to it" >> {
-        val html =
-          """<html lang="ca">
-            |  <head>
-            |    <meta http-equiv="Content-Language" content="en">
-            |    <meta property="og:locale" content="en_GB" />
-            |  </head>
-            |<body></body></html>""".stripMargin
-
-        extractLang(Jsoup.parse(html)) must beSome("ca")
-      }
-      "should extract language from meta tag with more priority than og:locale" >> {
-        val html =
-          """<html>
-            |  <head>
-            |    <meta http-equiv="Content-Language" content="ca">
-            |    <meta property="og:locale" content="en_GB" />
-            |  </head>
-            |<body></body></html>""".stripMargin
-
-        extractLang(Jsoup.parse(html)) must beSome("ca")
-      }
-      "should extract language from og:locale" >> {
-        val html =
-          """<html>
-            |  <head>
-            |    <meta property="og:locale" content="ca" />
-            |  </head>
-            |<body></body></html>""".stripMargin
-
-        extractLang(Jsoup.parse(html)) must beSome("ca")
-      }
+    "should extract language from meta tag with more priority than og:locale" >> {
+      val html =
+        """<html>
+          |  <head>
+          |    <meta http-equiv="Content-Language" content="ca">
+          |    <meta property="og:locale" content="en_GB" />
+          |  </head>
+          |<body></body></html>""".stripMargin
+
+      extractLang(Jsoup.parse(html)) must beSome("ca")
+    }
+    "should extract language from og:locale" >> {
+      val html =
+        """<html>
+          |  <head>
+          |    <meta property="og:locale" content="ca" />
+          |  </head>
+          |<body></body></html>""".stripMargin
+
+      extractLang(Jsoup.parse(html)) must beSome("ca")
     }
   }
 }