In [1]:
import de.l3s.archivespark._
import de.l3s.archivespark.implicits._
import de.l3s.archivespark.nativescala.implicits._
import de.l3s.archivespark.enrich._
import de.l3s.archivespark.enrich.functions._
import de.l3s.archivespark.specific.warc.implicits._
import de.l3s.archivespark.specific.warc._
import de.l3s.archivespark.specific.warc.specs._
import de.l3s.archivespark.specific.books._

## Web Archive Analysis Through CDX Server / Wayback Machine

In [2]:
// Load the records for "l3s.de" via WaybackSpec. `sc` is not defined in this notebook,
// so it is expected to be provided by the notebook environment.
// NOTE(review): matchPrefix = true presumably selects every URL under the given prefix
// rather than the exact URL only - confirm against the WaybackSpec documentation.
val rdd = ArchiveSpark.load(sc, WaybackSpec("l3s.de", matchPrefix = true))

In [3]:
// Show the first loaded record rendered as a JSON string.
// Note: `.head` is a partial operation; it fails if `take(1)` yields nothing.
rdd.take(1).head.toJsonString

{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20020729002934",
    "digest":"S6D4JRHXN6U5QHEUDI5OXXYYIBJ7CHWF",
    "originalUrl":"http://www.l3s.de:80/",
    "surtUrl":"de,l3s)/",
    "mime":"text/html",
    "compressedSize":966,
    "meta":"-",
    "status":200
  }
}

In [4]:
// Total number of records in the loaded dataset.
rdd.count

26460

In [5]:
// Enrich the records with HtmlText and print the first one as JSON, so the
// enrichment result can be inspected next to the original record fields.
// Note: `.head` fails if `take(1)` yields nothing.
println(rdd.enrich(HtmlText).take(1).head.toJsonString)

{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20020729002934",
    "digest":"S6D4JRHXN6U5QHEUDI5OXXYYIBJ7CHWF",
    "originalUrl":"http://www.l3s.de:80/",
    "surtUrl":"de,l3s)/",
    "mime":"text/html",
    "compressedSize":966,
    "meta":"-",
    "status":200
  },
  "payload":{
    "string":{
      "html":{
        "body":{
          "text":"Languages English Deutsch Search form Search About L3S L3S Overview Vision L3S Web Observatory Mentoring Guidelines PhD Program Graduations Memberships Facts and Figures News 2015 2014 2013 2012 2011 L3S-Team and Boards Scientific Advisory Board Directors Members Group Leaders Staff Guest Researchers Alumni Projects Research Area Intelligent Access to Information Next Generation Internet E-Science Web Governance Virtual Communities Project Archive Startup@L3S Jobs Jobs Dates + Events Other Event Archive Publications Downloads L3S Annual Report Flyers L3S @t work Posters Contact Directions L3S downloads You are here Home Alexandria at J

In [6]:
// Keep a single record per SURT URL; when two records share a URL the first argument wins.
val uniquePages = rdd.distinctValue(record => record.surtUrl) { (kept, _) => kept }
// Enrich function addressing the text of the first <title> element of a page.
val title = HtmlText.of(Html.first("title"))
// Print the title values of ten of the unique pages.
for (pageTitle <- uniquePages.mapValues(title).take(10)) println(pageTitle)

                                                                                404 Not Found
403 Forbidden
L3S Research Center
403 Forbidden
404 Not Found
403 Forbidden
L3S Research Center
L3S Research Center
403 Forbidden
L3S Research Center


In [7]:
// Enrich the unique pages with the `title` function so the filter below can read its value.
val withTitle = uniquePages.enrich(title)
// Keep only pages that have a non-empty title which neither starts with "40"
// (e.g. "404 Not Found", "403 Forbidden") nor with "L3S".
// `exists` is false for an absent title, so no `.get` on the Option is needed.
val interestingPages = withTitle.filterValue(title) {t =>
    t.exists(s => s.nonEmpty && !s.startsWith("40") && !s.startsWith("L3S"))
}
// Print the titles of ten of the remaining pages.
interestingPages.mapValues(title).take(10).foreach(println)

Policy Bibliography
ELAN e.V. | Startseite
Christian  Kohlsch�tter -
ELAN e.V. | Startseite
ELAN e.V. | Startseite
Besnik's Home Page
ELAN e.V. | Startseite
ELAN e.V. | Startseite
ELAN e.V. | Startseite
ELAN e.V. | Startseite


In [8]:
// Run Entities on the title of the interesting pages and pull ten records into a local Seq.
val records = interestingPages.enrich(Entities.of(title)).take(10).toSeq
// Drop records with a duplicate title (first one wins) and print each remaining record as JSON.
records.distinctValue(title) { (kept, _) => kept }.map(record => record.toJsonString).foreach(json => println(json))

{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20060504084130",
    "digest":"RKKPOSHBGDXPB4UDZELOU32NBHHZP4UF",
    "originalUrl":"http://www.l3s.de:80/~kohlschuetter/links/",
    "surtUrl":"de,l3s)/~kohlschuetter/links",
    "mime":"text/html",
    "compressedSize":1104,
    "meta":"-",
    "status":200
  },
  "payload":{
    "string":{
      "html":{
        "title":{
          "text":{
            "_":"Christian  Kohlsch�tter -",
            "entities":{
              "persons":[
                "Christian",
                "Kohlsch"
              ],
              "organizations":[
                
              ],
              "locations":[
                
              ],
              "dates":[
                
              ]
            }
          }
        }
      }
    }
  }
}
{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20060720192539",
    "digest":"HNGUQOYQWGJ77XAF3HY5KB5TI3G4YSOM",
    "originalUrl":"http://www.l3s.de:80/~olmedilla/policy/policyPaper

## Internet Archive Books Analysis with Local Metadata on HDFS

In [9]:
// Load the book records described by IATextBooksHdfsSpec.
// NOTE(review): "booksmeta" is presumably the HDFS location of the local book
// metadata (per the section heading) - confirm against the spec's documentation.
val books = ArchiveSpark.load(sc, IATextBooksHdfsSpec("booksmeta"))

In [10]:
// Show the first book record rendered as a JSON string.
// Note: `.head` fails if `take(1)` yields nothing.
books.take(1).head.toJsonString

{
  "record":{
    "subjects":"adventure, fiction",
    "collections":"opensource",
    "creator":"Original : Defoe, Daniel (1660\u20131731) ; Translator : Borel, Pétrus (1809-1859)",
    "language":"french",
    "date":"1836",
    "publisher":"Borel et Varenne",
    "title":"Robinson Crusoé, tome 1",
    "publicdate":"2015-12-18 15:07:08"
  }
}

In [11]:
// Enrich the books with FirstSentence and show the first enriched record as JSON.
// Note: `.head` fails if `take(1)` yields nothing.
books.enrich(FirstSentence).take(1).head.toJsonString

{
  "record":{
    "subjects":"adventure, fiction",
    "collections":"opensource",
    "creator":"Original : Defoe, Daniel (1660\u20131731) ; Translator : Borel, Pétrus (1809-1859)",
    "language":"french",
    "date":"1836",
    "publisher":"Borel et Varenne",
    "title":"Robinson Crusoé, tome 1",
    "publicdate":"2015-12-18 15:07:08"
  },
  "text":{
    "first-sentence":". Us» "
  }
}

In [12]:
// Enrich the books with Prefix(1000) and show the first enriched record as JSON.
// NOTE(review): 1000 is presumably the length of the text prefix in characters - confirm.
// Note: `.head` fails if `take(1)` yields nothing.
books.enrich(Prefix(1000)).take(1).head.toJsonString

{
  "record":{
    "subjects":"adventure, fiction",
    "collections":"opensource",
    "creator":"Original : Defoe, Daniel (1660\u20131731) ; Translator : Borel, Pétrus (1809-1859)",
    "language":"french",
    "date":"1836",
    "publisher":"Borel et Varenne",
    "title":"Robinson Crusoé, tome 1",
    "publicdate":"2015-12-18 15:07:08"
  },
  "text":{
    "prefix-1000":". Us» \n\nADVENTURES \n\n\n\n\\ \n\n\n\nOP \n\n«OBIMSON CRÜ30E, \n\nwritÎen BY II im self. \n\n\n\n\nDigitized by L.OOQ le \n\n\n\nIMPRIMERIE DB M.ÀSSÀN 9 RÜB DE Y ADGIBARD 9 H* 1 I . \n\u2014 PAK LES SOlKfe DK TIRZUOLO, SON S9CCESSEUK DltSlCK*. \n\n\n\n\nDigitized by L.OOQ le \n\n\n\n\n\nDANIEL DE F OË. \n\n\n\nS \n\n\n\nDigitized by LooQle \n\n\n\nROBINSON CRUSOE , \n\n* PAR \n\nDANIEL DE FOË. \n\nT...

In [13]:
// Run Entities on the Prefix(1000) enrichment of every book and print each record as JSON.
// NOTE(review): `collect` gathers all results before printing; presumably acceptable
// because this book dataset is small - confirm before running on a larger collection.
books.enrich(Entities.on(Prefix(1000))).map(_.toJsonString).collect.foreach(println)

                                                                                {
  "record":{
    "subjects":"adventure, fiction",
    "collections":"opensource",
    "creator":"Original : Defoe, Daniel (1660\u20131731) ; Translator : Borel, Pétrus (1809-1859)",
    "language":"french",
    "date":"1836",
    "publisher":"Borel et Varenne",
    "title":"Robinson Crusoé, tome 1",
    "publicdate":"2015-12-18 15:07:08"
  },
  "text":{
    "prefix-1000":{
      "entities":{
        "persons":[
          "VARENNE",
          "FERDINAND",
          "ALEXANDRE",
          "FRANCISQUE",
          "LooQle",
          "CRUSOE",
          "DENIS",
          "ROBINSON",
          "BOREL"
        ],
        "organizations":[
          "CHASLES",
          "BU"
        ],
        "locations":[
          "LA",
          "BOUDERIE",
          "PARIS"
        ],
        "dates":[
          "1836"
        ]
      }
    }
  }
}
{
  "record":{
    "subjects":"holmes, anp, scarlet, fop, anc, sherlock, 