In [1]:
import de.l3s.archivespark._
import de.l3s.archivespark.implicits._
import de.l3s.archivespark.nativescala.implicits._
import de.l3s.archivespark.enrich._
import de.l3s.archivespark.enrich.functions._
import de.l3s.archivespark.specific.warc.implicits._
import de.l3s.archivespark.specific.warc._
import de.l3s.archivespark.specific.warc.specs._
import de.l3s.archivespark.specific.books._

## Web Archive Analysis Through CDX Server / Wayback Machine

In [2]:
// Load records for "l3s.de" through WaybackSpec. matchPrefix = true is passed so the
// query is presumably not limited to the exact URL but covers everything under that
// prefix — confirm against the WaybackSpec documentation.
val rdd = ArchiveSpark.load(sc, WaybackSpec("l3s.de", matchPrefix = true))

In [3]:
// Inspect the first loaded record as a JSON string; the notebook displays the
// value of this expression. `.head` throws if `take(1)` returns nothing.
rdd.take(1).head.toJsonString

{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20020729002934",
    "digest":"S6D4JRHXN6U5QHEUDI5OXXYYIBJ7CHWF",
    "originalUrl":"http://www.l3s.de:80/",
    "surtUrl":"de,l3s)/",
    "mime":"text/html",
    "compressedSize":966,
    "meta":"-",
    "status":200
  }
}

In [4]:
// Number of records loaded for the query above.
rdd.count

26460

In [5]:
// Keep only records whose metadata reports status 200 and mime type "text/html".
val htmlOnline = rdd.filter { capture =>
    capture.status == 200 && capture.mime == "text/html"
}

In [6]:
// Enrich the filtered records with HtmlText and print the first one as JSON.
// In the output shown below, the extracted text appears under
// payload.string.html.body.text next to the original record metadata.
println(htmlOnline.enrich(HtmlText).take(1).head.toJsonString)

{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20020729002934",
    "digest":"S6D4JRHXN6U5QHEUDI5OXXYYIBJ7CHWF",
    "originalUrl":"http://www.l3s.de:80/",
    "surtUrl":"de,l3s)/",
    "mime":"text/html",
    "compressedSize":966,
    "meta":"-",
    "status":200
  },
  "payload":{
    "string":{
      "html":{
        "body":{
          "text":"_home     [ English Version ] [ Version français ] [ Hrvatska verzija ] [ Wersja Polska ] _content     _communications     _contact    "
        }
      }
    }
  }
}


In [7]:
// Reduce to one record per distinct surtUrl; when two records share a URL the
// merge function keeps its first argument and discards the second.
val uniquePages = htmlOnline.distinctValue(record => record.surtUrl) { (kept, _) => kept }
// Enrich function: HtmlText applied to the first "title" element selected by Html.
val title = HtmlText.of(Html.first("title"))
// Print the title value of ten of the unique pages.
uniquePages.mapValues(title).take(10).foreach(value => println(value))

                                                                                ServerRequestInterceptorOperations (Edutella (Service Extensions) API)
TENCompetence -
Home
ServerSocketProxy (Project JXTA (TM) Technology 0.1 API Documentation)
Jbed API: Package com.jbed.tina.event
Ana L.  Zapater Alemany -
L3S Research Center | News Archive
AccessibleTableModelChange (Edutella (Service Extensions) API)
net.jxta.codat Class Hierarchy (Project JXTA (TM) Technology 0.1 API Documentation)
VIFACHEM -


In [8]:
// Attach the title enrichment to every unique page so it can be filtered on.
val withTitle = uniquePages.enrich(title)
// Keep pages whose title has more than five space-separated tokens.
// Option.exists yields false for a missing title, which replaces the
// `isDefined && get` pair without ever calling `.get`.
// The split deliberately stays on a single space: consecutive spaces produce
// empty tokens that are counted, exactly as before.
val interestingTitlePages = withTitle.filterValue(title) { t =>
    t.exists(_.split(" ").length > 5)
}
// Print the titles of ten of the remaining pages.
interestingTitlePages.mapValues(title).take(10).foreach(println)

ServerSocketProxy (Project JXTA (TM) Technology 0.1 API Documentation)
Ana L.  Zapater Alemany -
L3S Research Center | News Archive
net.jxta.codat Class Hierarchy (Project JXTA (TM) Technology 0.1 API Documentation)
Forschungszentrum L3S - Lageplan |  Map
Messenger (Project JXTA (TM) Technology 0.1 API Documentation)
Uses of Package javax.sound.midi (Edutella (Service Extensions) API)
Learning Lab Lower Saxony - Thanh-Thu Phan Tan
L3S Research Center - PROLEARN Workshop "Technology Enhanced Learning for Learning Organisations"
L3S Research Center | Contact Form


In [9]:
// Enrich function: Entities applied on top of the title enrichment.
val titleEntities = Entities.of(title)
// Enrich the pages with title entities and bring ten of them into a local Seq.
val records = interestingTitlePages.enrich(titleEntities).take(10).toSeq
// Print as JSON only those local records that pass filterNonEmpty for titleEntities.
records.filterNonEmpty(titleEntities).foreach(record => println(record.toJsonString))

{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20110102211216",
    "digest":"5VUPUZBFR6ULC7LZQWZDBD2KD75P4UFH",
    "originalUrl":"http://www.l3s.de/web/page15g.do?sp=page15g&alt1g=page29g&kcond1g.userOID=59&changedAlts=alt1g&rcond41g.userOID=59&kcond2g.userOID=59",
    "surtUrl":"de,l3s)/web/page15g.do?alt1g=page29g&changedalts=alt1g&kcond1g.useroid=59&kcond2g.useroid=59&rcond41g.useroid=59&sp=page15g",
    "mime":"text/html",
    "compressedSize":2405,
    "meta":"-",
    "status":200
  },
  "payload":{
    "string":{
      "html":{
        "title":{
          "text":{
            "_":"Ana L.  Zapater Alemany -",
            "entities":{
              "persons":[
                "Ana",
                "L.",
                "Zapater",
                "Alemany"
              ],
              "organizations":[
                
              ],
              "locations":[
                
              ],
              "dates":[
                
              ]
            }
   

## Internet Archive Books Analysis with Local Metadata on HDFS

In [10]:
// Load book records through IATxtBooksHdfsSpec from "booksmeta"; the section
// heading describes this as local metadata on HDFS.
val books = ArchiveSpark.load(sc, IATxtBooksHdfsSpec("booksmeta"))

In [11]:
// Inspect the first book record as a JSON string; the notebook displays the
// value of this expression. `.head` throws if `take(1)` returns nothing.
books.take(1).head.toJsonString

{
  "record":{
    "subjects":"adventure, fiction",
    "collections":"opensource",
    "creator":"Original : Defoe, Daniel (1660\u20131731) ; Translator : Borel, Pétrus (1809-1859)",
    "language":"french",
    "date":"1836",
    "publisher":"Borel et Varenne",
    "title":"Robinson Crusoé, tome 1",
    "publicdate":"2015-12-18 15:07:08"
  }
}

In [12]:
// Enrich the books with FirstSentence and show the first record as JSON.
// In the output shown below, the value appears under text.first-sentence.
books.enrich(FirstSentence).take(1).head.toJsonString

{
  "record":{
    "subjects":"adventure, fiction",
    "collections":"opensource",
    "creator":"Original : Defoe, Daniel (1660\u20131731) ; Translator : Borel, Pétrus (1809-1859)",
    "language":"french",
    "date":"1836",
    "publisher":"Borel et Varenne",
    "title":"Robinson Crusoé, tome 1",
    "publicdate":"2015-12-18 15:07:08"
  },
  "text":{
    "first-sentence":". Us» "
  }
}

In [13]:
// Enrich the books with Prefix(1000) and show the first record as JSON.
// In the output shown below, the value appears under text.prefix-1000
// (presumably the first 1000 characters of the book text — confirm against Prefix).
books.enrich(Prefix(1000)).take(1).head.toJsonString

{
  "record":{
    "subjects":"adventure, fiction",
    "collections":"opensource",
    "creator":"Original : Defoe, Daniel (1660\u20131731) ; Translator : Borel, Pétrus (1809-1859)",
    "language":"french",
    "date":"1836",
    "publisher":"Borel et Varenne",
    "title":"Robinson Crusoé, tome 1",
    "publicdate":"2015-12-18 15:07:08"
  },
  "text":{
    "prefix-1000":". Us» \n\nADVENTURES \n\n\n\n\\ \n\n\n\nOP \n\n«OBIMSON CRÜ30E, \n\nwritÎen BY II im self. \n\n\n\n\nDigitized by L.OOQ le \n\n\n\nIMPRIMERIE DB M.ÀSSÀN 9 RÜB DE Y ADGIBARD 9 H* 1 I . \n\u2014 PAK LES SOlKfe DK TIRZUOLO, SON S9CCESSEUK DltSlCK*. \n\n\n\n\nDigitized by L.OOQ le \n\n\n\n\n\nDANIEL DE F OË. \n\n\n\nS \n\n\n\nDigitized by LooQle \n\n\n\nROBINSON CRUSOE , \n\n* PAR \n\nDANIEL DE FOË. \n\nT...

In [14]:
// Run Entities on top of the Prefix(1000) enrichment for every book and print
// each enriched record as JSON.
// NOTE(review): `collect` presumably materializes all records on the driver
// (Spark RDD semantics) — fine for a small metadata set, confirm before running
// this on a larger collection.
books.enrich(Entities.on(Prefix(1000))).map(_.toJsonString).collect.foreach(println)

                                                                                {
  "record":{
    "subjects":"adventure, fiction",
    "collections":"opensource",
    "creator":"Original : Defoe, Daniel (1660\u20131731) ; Translator : Borel, Pétrus (1809-1859)",
    "language":"french",
    "date":"1836",
    "publisher":"Borel et Varenne",
    "title":"Robinson Crusoé, tome 1",
    "publicdate":"2015-12-18 15:07:08"
  },
  "text":{
    "prefix-1000":{
      "entities":{
        "persons":[
          "VARENNE",
          "FERDINAND",
          "ALEXANDRE",
          "FRANCISQUE",
          "LooQle",
          "CRUSOE",
          "DENIS",
          "ROBINSON",
          "BOREL"
        ],
        "organizations":[
          "CHASLES",
          "BU"
        ],
        "locations":[
          "LA",
          "BOUDERIE",
          "PARIS"
        ],
        "dates":[
          "1836"
        ]
      }
    }
  }
}
{
  "record":{
    "subjects":"holmes, anp, scarlet, fop, anc, sherlock, 