Skip to content
This repository has been archived by the owner on Sep 26, 2023. It is now read-only.

Commit

Permalink
Merge pull request #14 from intenthq/remove-smart-title-extraction
Browse files Browse the repository at this point in the history
Stop being too smart with titles
  • Loading branch information
albertpastrana committed Sep 15, 2015
2 parents 6c731f1 + 00ee5eb commit 9944ca4
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 106 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -1,7 +1,7 @@
language: scala
scala:
- 2.11.7
script: "sbt clean coverage test"
script: "sbt clean coverage test it:test"
after_success: "sbt coveralls"
cache:
directories:
Expand Down
16 changes: 8 additions & 8 deletions src/it/scala/com/intenthq/gander/GanderIT.scala
Expand Up @@ -33,7 +33,7 @@ class GanderIT extends Specification {
check(extract(url),
url = url,
content = "Here at Intent HQ we believe how important it is to write good code. Why? First, because writing good code is much cheaper and more fun than writing bad code. Second, because if you write good code chances are that the product you are building will be much better. Third, and more important, because writing good code is what we are supposed to do: after all, we are getting paid for doing our job well",
title = "What is good code? A scientific definition.",
title = "What is good code? A scientific definition. - Intent HQ Engineering blog",
metaDescription = "How would you define good code? This article gives a pseudo-scientific answer to that question after asking a sample of 65 developers that same question.",
metaKeywords = "",
lang = Some("en-GB"),
Expand All @@ -47,7 +47,7 @@ class GanderIT extends Specification {
check(extract(url),
url = url,
content = "Disneyland Paris is facing a pricing probe following accusations that UK and German customers are being frozen out of certain price promotions.",
title = "Disneyland Paris faces pricing probe",
title = "Disneyland Paris faces pricing probe - BBC News",
metaDescription = "Disneyland Paris is facing a pricing probe following accusations that UK and German customers are being frozen out of promotions available in other European member states.",
metaKeywords = "",
lang = Some("en"),
Expand All @@ -62,7 +62,7 @@ class GanderIT extends Specification {
check(extract(url),
url = url,
content = "Manchester City striker Sergio Aguero will miss Tuesday's Champions League opener against Juventus at Etihad Stadium because of a knee injury",
title = "BBC Sport",
title = "BBC Sport - Sergio Aguero: Manchester City striker to miss Juventus visit",
metaDescription = "Manchester City striker Sergio Aguero will miss Tuesday's Champions League opener against Juventus with a knee injury.",
metaKeywords = "BBC, Sport, BBC Sport, bbc.co.uk, world, uk, international, foreign, british, online, service",
lang = Some("en-GB"),
Expand All @@ -76,7 +76,7 @@ class GanderIT extends Specification {
check(extract(url),
url = url,
content = "From Goldman on the FOMC operation twist announcement: ------------- 1. As we had expected, the Federal Open Market Committee decided to \"do the twist\" and increase the duration of its securities holdings by selling shorter-maturity securities ($400bn of Treasuries with maturity of 3 years or less)",
title = "GOLDMAN: 4 Key Points On The FOMC Announcement",
title = "GOLDMAN: 4 Key Points On The FOMC Announcement - Business Insider",
metaDescription = "Here it is.",
metaKeywords = "",
lang = Some("en"),
Expand All @@ -89,7 +89,7 @@ class GanderIT extends Specification {
check(extract(url),
url = url,
content = "Los aliados de la OTAN ofrecieron este martes respaldo político a Turquía en su ofensiva contra el Estado Islámico tras una reunión convocada de urgencia por el Gobierno de Ankara.",
title = "La OTAN apoya con cautela la ofensiva turca contra el yihadismo" ,
title = "La OTAN apoya con cautela la ofensiva turca contra el yihadismo | Internacional | EL PAÍS" ,
metaDescription = "La Alianza se ha reunido este martes con carácter de urgencia a pedición de Ankara para tratar el avance del Estado Islámico",
metaKeywords = "otan, apoyar, cautela, ofensiva, turca, turco, yihadismo, alianza, haber, reunir, martes, urgencia, pedición, ankara, secretario, general, jens stoltenberg, resaltar, unidad, aliado",
lang = Some("es"),
Expand All @@ -107,7 +107,7 @@ class GanderIT extends Specification {
check(extract(url, Charsets.ISO_8859_1),
url = url,
content = "ROMA La strada è tracciata, la relazione potrebbe arrivare a Palazzo Chigi prima della pausa estiva. Il ministro dell’Interno Angelino Alfano non proporrà lo scioglimento per mafia del comune di Roma, ma nella relazione al governo",
title = "La relazione di Alfano sulla mafia: fatti gravi, il sindaco ha sottovalutato",
title = "La relazione di Alfano sulla mafia: fatti gravi, il sindaco ha sottovalutato - Corriere.it",
metaDescription = "Non si propone lo scioglimento ma si lascia aperta la possibilità di una «diversa valutazione»",
metaKeywords = "Ignazio Marino, Angelino Alfano",
lang = Some("it"),
Expand All @@ -134,7 +134,7 @@ class GanderIT extends Specification {
check(extract(url),
url = url,
content = "No próximo sábado, o São Paulo jogará, como mandante, na Arena Barueri diante do Mogi Mirim",
title = "Para Leão, Arena Barueri não é casa do Tricolor - São Paulo",
title = "Para Leão, Arena Barueri não é casa do Tricolor - São Paulo | Lancenet.com.br",
metaDescription = "No próximo sábado, o São Paulo jogará, como mandante, na Arena Barueri diante do Mogi Mirim. Isso porque no estádio do Morumbi haverá, nesta ...",
metaKeywords = "Leao,Arena,Barueri,casa,Tricolor",
lang = Some("pt"),
Expand All @@ -147,7 +147,7 @@ class GanderIT extends Specification {
check(extract(url),
url = url,
content = "Emerson Leão não foi ao campo na manhã desta terça-feira no centro de treinamento do São Paulo",
title = "'Filho do gramado', Leão administra o São Paulo na base da conversa",
title = "'Filho do gramado', Leão administra o São Paulo na base da conversa | globoesporte.com",
metaDescription = "Emerson Leão cobra liderança ao São Paulo (Foto: Mário Ângelo / Ag. Estado) Emerson Leão não foi ao campo na manhã desta terça-feira no centro de treinamento do São Paulo. Bem humorado e com roupa casual, preferiu acompanhar de longe ...",
metaKeywords = "notícias, notícia, são paulo",
lang = None,
Expand Down
Expand Up @@ -19,16 +19,8 @@ object ContentExtractor {

val logger: Logger = LoggerFactory.getLogger(getClass)

/**
 * Derives a short title from the document's `title` element.
 *
 * The text of the first element found by `byTag("title")` is inspected for the
 * separators " | ", " - ", " » " and " · ", tried in that priority order (not by
 * position in the text). For the first separator that occurs in the text, only
 * the segment preceding its first occurrence is kept; when none occurs the whole
 * text is kept. "�" characters are then removed and the result is trimmed.
 *
 * @param doc the parsed document to read the title from
 * @return the shortened title, or the empty string when no `title` element is found
 */
def extractTitle(doc: Document): String = {
  val separators = List(" | ", " - ", " » ", " · ")
  val shortened = byTag("title")(doc).headOption match {
    case Some(titleElem) =>
      val fullText = titleElem.text
      separators.find(sep => fullText.contains(sep)) match {
        // Pattern.quote: the separator is matched literally, never as a regex ("|" is special).
        case Some(sep) => fullText.split(Pattern.quote(sep)).head
        case None      => fullText
      }
    case None => ""
  }
  shortened.replace("�", "").trim
}
/**
 * Returns the text of the first element found by `byTag("title")`, with every
 * "�" character removed and surrounding whitespace trimmed. No attempt is made
 * to split off site names or other suffixes.
 *
 * @param doc the parsed document to read the title from
 * @return the cleaned title text, or the empty string when no `title` element is found
 */
def extractTitle(doc: Document): String = {
  val rawTitle = byTag("title")(doc).headOption.fold("")(_.text)
  rawTitle.replace("�", "").trim
}

def extractLang(doc: Document): Option[String] =
byTag("html")(doc).headOption.map(_.attr("lang")).filter(_.nonEmpty).orElse(
Expand Down
119 changes: 32 additions & 87 deletions src/test/scala/com/intenthq/gander/ContentExtractorSpec.scala
Expand Up @@ -5,105 +5,50 @@ import org.jsoup.Jsoup
import org.specs2.mutable.Specification

class ContentExtractorSpec extends Specification {

"extractTitle" >> {
def docFromTitle(title: String) = Jsoup.parse(s"<html><head><title>$title</title></head><body></body></html>")
"should extract a title" >> {
val title = "the title"

extractTitle(docFromTitle(title)) must_== title
}
"should extract an empty title" >> {
val title = ""

extractTitle(docFromTitle(title)) must_== title
}
"should keep the first segment if the title contains a separator" >> {
"| (2 segments)" >> {
val title = "The first segment | Wikipedia, the free encyclopaedia"

extractTitle(docFromTitle(title)) must_== "The first segment"
}
"| (3 segments)" >> {
val title = "The first segment | other 1 | other 2"

extractTitle(docFromTitle(title)) must_== "The first segment"
}
"- (2 segments)" >> {
val title = "The first segment - Wikipedia, the free encyclopaedia"

extractTitle(docFromTitle(title)) must_== "The first segment"
}
"- (3 segments)" >> {
val title = "The first segment - other 1 - other 2"

extractTitle(docFromTitle(title)) must_== "The first segment"
}
"- not used as a sparator" >> {
val title = "this-is-a-title"

extractTitle(docFromTitle(title)) must_== title
}
"» (2 segments)" >> {
val title = "The first segment » Wikipedia, the free encyclopaedia"

extractTitle(docFromTitle(title)) must_== "The first segment"
}
"» (3 segments)" >> {
val title = "The first segment » other 1 » other 2"

extractTitle(docFromTitle(title)) must_== "The first segment"
}
"» not used as a separator" >> {
val title = "«this is a title»"

extractTitle(docFromTitle(title)) must_== title
}
"· (2 segments)" >> {
val title = "The first segment · Wikipedia, the free encyclopaedia"

extractTitle(docFromTitle(title)) must_== "The first segment"
}
"· (3 segments)" >> {
val title = "The first segment · other 1 · other 2"
}

extractTitle(docFromTitle(title)) must_== "The first segment"
}
"extractLang" >> {
"should extract lang from html tag and give priority to it" >> {
val html =
"""<html lang="ca">
| <head>
| <meta http-equiv="Content-Language" content="en">
| <meta property="og:locale" content="en_GB" />
| </head>
|<body></body></html>""".stripMargin

extractLang(Jsoup.parse(html)) must beSome("ca")
}

"extractLang" >> {
"should extract lang from html tag and give priority to it" >> {
val html =
"""<html lang="ca">
| <head>
| <meta http-equiv="Content-Language" content="en">
| <meta property="og:locale" content="en_GB" />
| </head>
|<body></body></html>""".stripMargin

extractLang(Jsoup.parse(html)) must beSome("ca")
}
"should extract language from meta tag with more priority than og:locale" >> {
val html =
"""<html>
| <head>
| <meta http-equiv="Content-Language" content="ca">
| <meta property="og:locale" content="en_GB" />
| </head>
|<body></body></html>""".stripMargin

extractLang(Jsoup.parse(html)) must beSome("ca")
}
"should extract language from og:locale" >> {
val html =
"""<html>
| <head>
| <meta property="og:locale" content="ca" />
| </head>
|<body></body></html>""".stripMargin

extractLang(Jsoup.parse(html)) must beSome("ca")
}
"should extract language from meta tag with more priority than og:locale" >> {
val html =
"""<html>
| <head>
| <meta http-equiv="Content-Language" content="ca">
| <meta property="og:locale" content="en_GB" />
| </head>
|<body></body></html>""".stripMargin

extractLang(Jsoup.parse(html)) must beSome("ca")
}
"should extract language from og:locale" >> {
val html =
"""<html>
| <head>
| <meta property="og:locale" content="ca" />
| </head>
|<body></body></html>""".stripMargin

extractLang(Jsoup.parse(html)) must beSome("ca")
}
}
}

0 comments on commit 9944ca4

Please sign in to comment.