In [1]:
import de.l3s.archivespark._
import de.l3s.archivespark.implicits._
import de.l3s.archivespark.enrich._
import de.l3s.archivespark.enrich.functions._
import de.l3s.archivespark.specific.warc.implicits._
import de.l3s.archivespark.specific.warc.tempas._
import de.l3s.archivespark.specific.warc.enrichfunctions._
import scala.util._

In [89]:
// Minimum number of extracted amounts a page must have to be kept by the amount filters.
val minAmountsPerPage: Int = 3
// Substring (compared against lower-cased page text) that a page must contain to count as a menu.
val verifyTerm: String = "speise"

In [3]:
// Records loaded for the query "speisekarte", years 2000 to 2003, 10 pages of 100 results each.
val speisekarte = ArchiveSpark.load(
    sc,
    TempasWaybackSpec("speisekarte", from = 2000, to = 2003, pages = 10, resultsPerPage = 100)
).cache

In [4]:
// Records loaded for the query "restaurant", using the same year range and paging as "speisekarte".
val restaurant = ArchiveSpark.load(
    sc,
    TempasWaybackSpec("restaurant", from = 2000, to = 2003, pages = 10, resultsPerPage = 100)
).cache

In [5]:
// Combined seed set: results of both queries in one dataset.
val records = speisekarte.union(restaurant)

In [6]:
// Number of seed records from both queries together.
records.count

3567

In [7]:
// Seed records enriched with StringContent; cached because several later cells build on it.
val recordsWithContents = records.enrich(StringContent).cache

In [9]:
// Keep only records whose string content is present and contains verifyTerm (case-insensitive).
// NOTE(review): declared as var although no visible cell reassigns it.
var menus = recordsWithContents.filterValue(StringContent) { content =>
    content.exists(_.toLowerCase.contains(verifyTerm))
}

In [88]:
// Number of seed records that contain verifyTerm.
menus.count

391

In [102]:
// Both patterns are applied to lower-cased page text (see `lc` below), so every currency
// alternative must itself be lower-case; in particular the hexadecimal euro entity has to be
// written `&#x20ac;`, otherwise it can never match.
// regexSuffix: an amount followed by a currency marker, e.g. "12,50 dm".
val regexSuffix = """([\d\.\, ]+)(dm|€|eur|euro|\&euro\;|\&\#8364\;|\&\#x20ac\;)""".r
// regexPrefix: a currency marker followed by an amount, e.g. "eur 9.80".
val regexPrefix = """(dm|€|eur|euro|\&euro\;|\&\#8364\;|\&\#x20ac\;)([\d\.\, ]+)""".r

// Derived enrichment "amounts": finds every number adjacent to a currency marker and returns
// each as a string "<currency><value>", where <currency> is "dm" or "eur" and <value> is the
// parsed Double (e.g. "dm12.5"). Candidates that cannot be parsed as a number are dropped.
val amounts = StringContent.map("amounts") {text: String =>
    // Carriage returns are turned into blanks, which the amount group accepts.
    val lc = text.toLowerCase.replace("\r", " ")
    // Both iterators yield (amount, currency) pairs; suffix-style matches come first.
    val suffixMatches = regexSuffix.findAllMatchIn(lc).map(m => (m.group(1), m.group(2)))
    val prefixMatches = regexPrefix.findAllMatchIn(lc).map(m => (m.group(2), m.group(1)))
    (suffixMatches ++ prefixMatches).toList.flatMap{case (a, c) =>
        val amount = a.trim
        // Every marker other than "dm" is one of the euro spellings.
        val currency = if (c == "dm") "dm" else "eur"
        // NOTE(review): captures with interior blanks or leading punctuation (e.g. ", 12,50")
        // fail toDouble below and are silently discarded.
        if (amount.isEmpty) None
        else scala.util.Try {
            val comma = amount.lastIndexOf(",")
            val point = amount.lastIndexOf(".")
            // Whichever separator occurs last is taken to be the decimal separator.
            if (comma > point) amount.replace(".", "").replace(",", ".").toDouble
            else if (point > comma) amount.replace(",", "").toDouble
            // Neither separator present (both indices are -1): plain integer amount.
            else amount.toDouble
        }.toOption.map(currency + _)
    }
}

In [103]:
// Menu candidates enriched with amounts, keeping only pages with at least minAmountsPerPage amounts.
val menusWithAmounts = menus.enrich(amounts).filterValue(amounts) { found =>
    found.exists(_.size >= minAmountsPerPage)
}

In [104]:
// Number of seed menu pages with enough extracted amounts.
menusWithAmounts.count

49

In [70]:
// Print the JSON form of a sample record to inspect the enrichment result.
println(menusWithAmounts.peekJson)

{
  "record":{
    "url":"http://ganz-muenchen.de/01essen_resto_sitar2.html",
    "year":2001
  },
  "payload":{
    "string":{
      "_":"<html>\n\n\t<head>\n\t\t<meta http-equiv=\"content-type\" content=\"text/html;charset=ISO-8859-1\">\n\t\t<title>Essen + Trinken: Sitar - die Karte</title>\n\t\t<meta name=\"keywords\" content=\"Alles &uuml;ber M&uuml;nchen, Restaurant, Sitar, indisch, indisch essen, Inder, Tandoor, essen, trinken, ausgehen, Leute treffen, Stadtf&uuml;hrer, munich, m&uuml;nchen, Muenchen\">\n\t\t<meta name=\"description\" content=\"Das Sitar erwartet Sie mit nordindischer K&uuml;che, die weniger Hot, weniger scharf gew&uuml;rzt dem bayerischen Gaumen sehr mundet. Ein Blick in die K&uuml;che verr&auml;t, hier wird typisch indisch zubereitet. Da steht der Tandoor, ein Holzkohlenlehmofen. So eine Art gro&szlig;er Topf, in dem die Glut rund 400 bis 500 Grad Hitze entwickelt, mit der die Gerichte zubereitet werden. Da zieht dann schon einmal der Ober die Blicke auf sich, 

In [35]:
// Enrichment yielding the "href" attribute of each "a" element in a page.
val links = HtmlAttribute("href").ofEach(Html.all("a"))

In [36]:
// Seed records enriched with their link targets, keeping only records that actually have at
// least one link. The emptiness check must look inside the Option: `nonEmpty` on the Option
// itself only repeats the `isDefined` test.
val recordsWithLinks = recordsWithContents.enrich(links).filterValue(links) {l => l.exists(_.nonEmpty)}

In [37]:
// Print the JSON form of a sample record with links.
println(recordsWithLinks.peekJson)

{
  "record":{
    "url":"http://home.t-online.de/home/lotus-gg/lotus.htm",
    "year":2002
  },
  "payload":{
    "string":{
      "_":"<HTML>\n<HEAD>\n   <META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=iso-8859-1\">\n   <TITLE>Private Homepages in T-Online - Error 404</TITLE>\n</HEAD>\n<BODY BGCOLOR=\"#FFFFFF\">\n<IMG SRC=\"/err_img/logo.gif\" HEIGHT=30 WIDTH=164>\n<BR><B><FONT SIZE=+2 FACE=\"Arial,Helvetica\">Private Homepage</FONT></B><BR>\n<BR><HR WIDTH=\"100%\">\n<B><FONT FACE=\"Arial,Helvetica\">Das Dokument konnte nicht gefunden\nwerden.</FONT>\n<BR>\n<FONT Size=-1 FACE=\"Arial,Helvetica\">Bitte �berpr�fen Sie Ihren URL\n<HR Size=1>\nHinweis : <BR>\nBei Dateinamen wird auf Gro�- und Kleinschreibung unterschieden</FONT></B>\n<BR>\n<HR WIDTH=\"100%\">\n<FONT Size=-1 FACE=\"Arial,Helvetica\">We are sorry, but the private homepage server could not find the file you asked for.\n<BR>\nPlease check the URL to ensure that the path is correct.\n<BR>\n<HR WIDTH=\"100%\">\n

In [38]:
// (source URL, year, target URL) triples for every distinct link that resolves to the same
// host as the page it appears on.
val outLinks = recordsWithLinks.flatMap { record =>
    val year = record.year
    // Archived URLs are not guaranteed to be valid URIs. A record whose own URL cannot be
    // parsed contributes no links instead of failing the whole job, in the same way that
    // unresolvable hrefs are skipped below.
    Try(new java.net.URI(record.url)).toOption.toList.flatMap { url =>
        record.value(links).getOrElse(Seq.empty).flatMap{l =>
            // Resolve relative hrefs against the page URL and keep same-host targets only.
            Try{url.resolve(l)}.toOption.filter(_.getHost == url.getHost).map(_.toString)
        }.distinct.map((record.url, year, _))
    }
}

In [39]:
// Show a sample (source URL, year, target URL) triple.
outLinks.peek

(http://home.t-online.de/home/sebgast/speisek.htm,2000,http://home.t-online.de/home/sebgast)

In [40]:
// Pages one link away from the seed set: each distinct (target URL, year) pair becomes a
// Tempas wayback record, which is enriched with its string content and cached.
val level1 = {
    val targets = outLinks.map { case (_, year, to) => TempasYearResult(to, year) }.distinct
    targets.map(result => new TempasWaybackRecord(result)).enrich(StringContent).cache
}

In [85]:
// Number of distinct linked pages (level 1).
level1.count

6082

In [86]:
// Level-1 pages whose string content is present and contains verifyTerm (case-insensitive).
val level1Menus = level1.filterValue(StringContent) { content =>
    content.exists(_.toLowerCase.contains(verifyTerm))
}

In [87]:
// Number of level-1 pages that contain verifyTerm.
level1Menus.count

192

In [105]:
// Level-1 menu candidates enriched with amounts, keeping pages with at least minAmountsPerPage amounts.
val level1WithAmounts = level1Menus.enrich(amounts).filterValue(amounts) { found =>
    found.exists(_.size >= minAmountsPerPage)
}

In [106]:
// Number of level-1 menu pages with enough extracted amounts.
level1WithAmounts.count

22

In [44]:
// Print the JSON form of a sample level-1 record with amounts.
println(level1WithAmounts.peekJson)

{
  "record":{
    "url":"http://hotel-ponyhof-knotte.de/abendkarte.htm",
    "year":2000
  },
  "payload":{
    "string":{
      "_":"<html>\n\n<head>\n<title>Abendkarte</title>\n<meta name=\"GENERATOR\" content=\"Microsoft FrontPage 4.0\">\n<meta name=\"ProgId\" content=\"FrontPage.Editor.Document\">\n\n\n<meta name=\"Microsoft Theme\" content=\"ponyhof 111, default\"><meta name=\"Microsoft Border\" content=\"l, default\"></head>\n\n<body background=\"_themes/ponyhof/hinterg.jpg\" bgcolor=\"#FFFFCC\" text=\"#000000\" link=\"#0033CC\" vlink=\"#993399\" alink=\"#FF9900\"><!--msnavigation--><table border=\"0\" cellpadding=\"0\" cellspacing=\"0\" width=\"100%\"><tr><td valign=\"top\" width=\"1%\"><!--mstheme--><font face=\"verdana, arial, helvetica\">\n\n<p align=\"center\"><img border=\"0\" src=\"_borders/logo.gif\" width=\"126\" height=\"124\"></p>\n<p><script language=\"JavaScript\"><!--\nMSFPhover = \n  (((navigator.appName == \"Netscape\") && \n  (parseInt(navigator.appVersion) >= 3

In [107]:
// Deutsche Mark per euro; DM amounts are divided by this rate to express them in euros.
val exchangeRate = 1.95583
// (page URL, (amount in euros, original currency tag)) for every amount on every menu page,
// from both the level-1 pages and the seed pages. Per page, DM-tagged amounts come first.
val euros = level1WithAmounts.union(menusWithAmounts).flatMap { record =>
    val tagged = record.value(amounts).getOrElse(Seq.empty)
    // Entries look like "dm<value>": strip the 2-character tag, parse, convert to euros.
    val fromMarks = tagged.collect {
        case v if v.startsWith("dm") => (v.drop(2).toDouble / exchangeRate, "dm")
    }
    // Entries look like "eur<value>": strip the 3-character tag and parse.
    val alreadyEuro = tagged.collect {
        case v if v.startsWith("eur") => (v.drop(3).toDouble, "eur")
    }

    (fromMarks ++ alreadyEuro).map(pair => (record.url, pair))
}

In [108]:
// Show a sample (URL, (amount in euros, currency tag)) element.
euros.peek

(http://hotel-ponyhof-knotte.de/abendkarte.htm,(7.925024158541387,dm))

In [109]:
// Lookup from a page URL to the seed URL it is attributed to: every seed page maps to itself,
// and every link target maps to the page that linked to it. Duplicate pairs are removed.
val originUrls = {
    val seedPairs = recordsWithContents.map(r => (r.url, r.url))
    val linkedPairs = outLinks.map { case (from, _, to) => (to, from) }
    seedPairs.union(linkedPairs).distinct
}

In [110]:
// Re-key every amount by (origin URL, currency) and pair it with a count of 1, so that sums
// and counts can be accumulated together in the next step.
val urlAmounts = originUrls.join(euros).map { case (_, (origin, (amount, currency))) =>
    (origin, currency) -> (amount, 1)
}

In [118]:
// Show a sample ((origin URL, currency), (amount, 1)) element.
urlAmounts.peek

((http://home.t-online.de/home/gasthaus.sonne/speisekarte.html,dm),(4.70388530700521,1))

In [119]:
// Average amount per (origin URL, currency), then re-keyed by currency alone. Each origin
// contributes one average with weight 1, whatever the number of amounts found on it.
val currencyAvg = urlAmounts.reduceByKey { (left, right) =>
    (left._1 + right._1, left._2 + right._2)
}.map { case ((_, currency), (sum, n)) => (currency, (sum / n, 1)) }

In [120]:
// Number of origin URLs that contributed a DM average.
currencyAvg.filter{case (currency, _) => currency == "dm"}.count

37

In [121]:
// Number of origin URLs that contributed a euro average.
currencyAvg.filter{case (currency, _) => currency == "eur"}.count

12

In [122]:
// Mean of the per-origin averages for each currency, collected as a currency -> mean map.
val avgMap = currencyAvg.reduceByKey { (left, right) =>
    (left._1 + right._1, left._2 + right._2)
}.map { case (currency, (sum, n)) => (currency, sum / n) }.collectAsMap

In [123]:
// Display the mean amount (in euros) per original currency.
avgMap

Map(eur -> 9.584057992679567, dm -> 7.787867861750461)

In [124]:
// Ratio of the mean euro-denominated amount to the mean DM-denominated amount (both in euros).
// Throws NoSuchElementException if either currency is missing from avgMap.
avgMap("eur") / avgMap("dm")

1.230639523270671