### Example *BigData Web archive* created with Web2Warc (https://github.com/helgeho/Web2Warc)

In [1]:
import org.apache.spark.rdd._
import org.archive.webservices.archivespark._
import org.archive.webservices.archivespark.implicits._
import org.archive.webservices.archivespark.enrich.functions._
import org.archive.webservices.archivespark.specific.warc.implicits._
import org.archive.webservices.archivespark.specific.warc.specs._
import org.archive.webservices.archivespark.specific.warc.enrichfunctions._
import org.archive.webservices.archivespark.specific.warc.tempas._
import org.archive.webservices.archivespark.specific.books.enrichfuncs._
import edu.harvard.countway.mhl.archivespark._
import edu.harvard.countway.mhl.archivespark.search._

## The old way to load Web archive data from CDX / (W)ARC files stored on HDFS

In [2]:
// Location of the crawl on HDFS: the (W)ARC directory, and a glob over the
// gzipped index files that live in that same directory.
val warcPath = "/data/BigData-20171212025659"
val cdxPath = "/data/BigData-20171212025659/*.gz"
// Legacy loader: takes the CDX glob and the (W)ARC directory, plus `sc`
// (presumably the notebook's SparkContext) in a second parameter list.
val rdd = ArchiveSpark.hdfs(cdxPath, warcPath)(sc)

In [3]:
// Peek at one record rendered as JSON to inspect the available metadata fields.
rdd.peekJson

{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20171212025659",
    "digest":"af048dfce896122ef79342779cffa00b813d4b7f",
    "originalUrl":"http://cci.drexel.edu/bigdata/bigdata2017/",
    "surtUrl":"edu,drexel,cci)/bigdata/bigdata2017",
    "mime":"text/html",
    "compressedSize":1942,
    "meta":"-",
    "status":200
  }
}

## The new way to load Web archive data from CDX / (W)ARC files stored on HDFS

In [4]:
// Load the same CDX / (W)ARC paths through the generic ArchiveSpark.load entry
// point, described by a data specification instead of a dedicated loader method.
// Re-declares `rdd`, shadowing the definition from the earlier cell.
val rdd = ArchiveSpark.load(WarcCdxHdfsSpec(cdxPath, warcPath))

In [5]:
// Peek at one record as JSON; the record metadata matches what the old loader produced.
rdd.peekJson

{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20171212025659",
    "digest":"af048dfce896122ef79342779cffa00b813d4b7f",
    "originalUrl":"http://cci.drexel.edu/bigdata/bigdata2017/",
    "surtUrl":"edu,drexel,cci)/bigdata/bigdata2017",
    "mime":"text/html",
    "compressedSize":1942,
    "meta":"-",
    "status":200
  }
}

In [6]:
// Total number of records in the loaded dataset.
rdd.count

49

In [7]:
// Keep only successfully fetched HTML pages: MIME type text/html and HTTP status 200.
val pages = rdd.filter { record =>
  val isHtml = record.mime == "text/html"
  val isOk = record.status == 200
  isHtml && isOk
}

In [8]:
// Number of records that passed the HTML / status-200 filter.
pages.count

14

In [9]:
// Enrich each page with its first <title> element and peek at one result.
// The enriched value is the raw element, markup included.
pages.enrich(Html.first("title")).peekJson

{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20171212025659",
    "digest":"af048dfce896122ef79342779cffa00b813d4b7f",
    "originalUrl":"http://cci.drexel.edu/bigdata/bigdata2017/",
    "surtUrl":"edu,drexel,cci)/bigdata/bigdata2017",
    "mime":"text/html",
    "compressedSize":1942,
    "meta":"-",
    "status":200
  },
  "payload":{
    "string":{
      "html":{
        "title":"<title>2017 IEEE International Conference on Big Data </title>"
      }
    }
  }
}

In [10]:
// Reusable enrich function: HtmlText applied on top of the first <title> element,
// i.e. the title's text rather than the raw tag.
val Title = HtmlText.of(Html.first("title"))

In [11]:
// Apply the Title enrich function and peek at one record; the result is nested
// under payload.string.html.title.text.
pages.enrich(Title).peekJson

{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20171212025659",
    "digest":"af048dfce896122ef79342779cffa00b813d4b7f",
    "originalUrl":"http://cci.drexel.edu/bigdata/bigdata2017/",
    "surtUrl":"edu,drexel,cci)/bigdata/bigdata2017",
    "mime":"text/html",
    "compressedSize":1942,
    "meta":"-",
    "status":200
  },
  "payload":{
    "string":{
      "html":{
        "title":{
          "text":"2017 IEEE International Conference on Big Data"
        }
      }
    }
  }
}

In [12]:
// Enrich pages with the Entities function (persons / organizations / locations
// extracted from the body text) and print one record as JSON.
print(pages.enrich(Entities).peekJson)

{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20171212025659",
    "digest":"af048dfce896122ef79342779cffa00b813d4b7f",
    "originalUrl":"http://cci.drexel.edu/bigdata/bigdata2017/",
    "surtUrl":"edu,drexel,cci)/bigdata/bigdata2017",
    "mime":"text/html",
    "compressedSize":1942,
    "meta":"-",
    "status":200
  },
  "payload":{
    "string":{
      "html":{
        "body":{
          "text":{
            "entities":{
              "persons":[
                
              ],
              "organizations":[
                "USA",
                "Accepted",
                "Online",
                "Program",
                "Workshops",
                "Discussion",
                "Forum",
                "Papers",
                "Organization",
                "Big",
                "Schedule",
                "Committee",
                "Data",
                "Homepage"
              ],
              "locations":[
                "Boston",
                "MA"

## Load Web archive data from Internet Archive CDX server / Wayback Machine

In [13]:
// Load captures of bigdata.ieee.org via the Wayback data specification,
// restricted by the from/to years and to a single page of results.
val wayback = ArchiveSpark.load(
  WaybackSpec(
    "https://bigdata.ieee.org",
    from = 2015,
    to = 2017,
    pages = 1
  )
)

In [14]:
// The enrich functions defined for the HDFS data apply unchanged to the Wayback
// records: chain Title and Entities, then print one record as JSON.
print(wayback.enrich(Title).enrich(Entities).peekJson)

                                                                                {
  "record":{
    "redirectUrl":"-",
    "timestamp":"20150110055735",
    "digest":"RYDMJYJBQQXAYWKLCSYNSX6QXEQ3VENJ",
    "originalUrl":"http://bigdata.ieee.org:80/",
    "surtUrl":"org,ieee,bigdata)/",
    "mime":"text/html",
    "compressedSize":6214,
    "meta":"-",
    "status":200
  },
  "payload":{
    "string":{
      "html":{
        "title":{
          "text":"IEEE Big Data"
        },
        "body":{
          "text":{
            "entities":{
              "persons":[
                
              ],
              "organizations":[
                "BDIW",
                "Newsweek",
                "Company",
                "Publications",
                "InformationWeek",
                "Manufacturing",
                "&",
                "Revolutionizing",
                "Devices",
                "The",
                "Presentations",
                "Smart",
                "Mach

## Load Web archive data from Tempas' search / Wayback Machine

**Tempas (Temporal Archive Search)**: http://tempas.L3S.de/v2

**Tempas2ArchiveSpark Data Specification**: https://github.com/helgeho/Tempas2ArchiveSpark

In [15]:
// Load the results of a Tempas search for "big data" (from/to years given below),
// limited to one page of ten results.
val tempas = ArchiveSpark.load(
  TempasWaybackSpec(
    "big data",
    from = 2010,
    to = 2013,
    pages = 1,
    resultsPerPage = 10
  )
)

In [16]:
// Same Title + Entities enrichment, now on the Tempas search results; the record
// metadata here consists of url and year only.
print(tempas.enrich(Title).enrich(Entities).peekJson)

{
  "record":{
    "url":"http://de.wikipedia.org/wiki/big_data",
    "year":2013
  },
  "payload":{
    "string":{
      "html":{
        "title":{
          "text":"Big Data â\u0080\u0093 Wikipedia"
        },
        "body":{
          "text":{
            "entities":{
              "persons":[
                "Wettbewerbsvorteilen",
                "aktuellen",
                "Seite",
                "Durch",
                "Erkennen",
                "Webstatistiken",
                "Erlangung",
                "Kommunikationsbereitschaft",
                "Daten",
                "in",
                "Datenmengen",
                "Volumen",
                "Tausenden",
                "Erzeugung",
                "Bei",
                "Schneller",
                "Schaffung",
                "Sozialforscherin",
                "Grundprinzipien",
                "Buch",
                "Entwicklung",
                "Thilo",
                "Lage",
                "der",
   

## Load medical journals from the Medical Heritage Library

**MHLonArchiveSpark Data Specification**: https://github.com/helgeho/MHLonArchiveSpark

In [17]:
// Search options: full-text query "polio", restricted to the
// state medical society journals collection.
val query = MhlSearchOptions(
  query = "polio",
  collections = MhlCollections.Statemedicalsocietyjournals
)
// Load the matching items through the MHL search data specification.
val mhl = ArchiveSpark.load(
  MhlSearchSpec(query)
)

In [18]:
// Peek at one search result as JSON to inspect the bibliographic metadata.
mhl.peekJson

{
  "record":{
    "id":"journal4190miss",
    "title":"Journal of the Missouri State Medical Association",
    "author":[
      "Missouri State Medical Association"
    ],
    "date":"1908",
    "subject":[
      "Medicine;Periodicals;Missouri"
    ],
    "language":[
      "eng"
    ],
    "mediatype":[
      "texts"
    ],
    "collection":[
      "francisacountwaylibrary",
      "statemedicalsocietyjournals",
      "medicalheritagelibrary",
      "americana"
    ],
    "description":[
      "Title from cover",
      "Published by: Missouri State Medical Association, <1910>-1952",
      "Only 6 nos. issued in v. 11"
    ],
    "score":0.0047816974
  }
}

In [19]:
// Enrich function producing the "first-100-lines" field seen in the outputs below
// (presumably the first 100 lines of the text it is applied to).
val Snippet = FirstLines(100)

In [20]:
// Apply Snippet directly to the MHL records; the result appears under
// text.first-100-lines.
print(mhl.enrich(Snippet).peekJson)

{
  "record":{
    "id":"journal4190miss",
    "title":"Journal of the Missouri State Medical Association",
    "author":[
      "Missouri State Medical Association"
    ],
    "date":"1908",
    "subject":[
      "Medicine;Periodicals;Missouri"
    ],
    "language":[
      "eng"
    ],
    "mediatype":[
      "texts"
    ],
    "collection":[
      "francisacountwaylibrary",
      "statemedicalsocietyjournals",
      "medicalheritagelibrary",
      "americana"
    ],
    "description":[
      "Title from cover",
      "Published by: Missouri State Medical Association, <1910>-1952",
      "Only 6 nos. issued in v. 11"
    ],
    "score":0.0047816974
  },
  "text":{
    "first-100-lines":"\n\nBoston \n\nMedical Library \n8 The Fenway. \n\n\n\n\n\nJOURNAL \n\n\nMissouri State \n\n\nMedical Association \n\n\nVOLUME FOUR \nJULY. 1907 -JUNE. 1908 \n\n\nPUBLISHERS; \n\nMEDICAL PRESS COMPAI^Y \n\n\nST. LOUIS, MO. \n\n\nOCT 19 1909 \n\n\n\n\n\nj \n\n\"l \n\n\n! \n\nI \n\ni \n\n\nPCTi 9 1909 \

In [21]:
// Re-target Snippet at the raw string content of the Wayback records, so the
// snippet contains the page's HTML source (result under payload.string).
print(wayback.enrich(Snippet.of(StringContent)).peekJson)

{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20150110055735",
    "digest":"RYDMJYJBQQXAYWKLCSYNSX6QXEQ3VENJ",
    "originalUrl":"http://bigdata.ieee.org:80/",
    "surtUrl":"org,ieee,bigdata)/",
    "mime":"text/html",
    "compressedSize":6214,
    "meta":"-",
    "status":200
  },
  "payload":{
    "string":{
      "first-100-lines":"<?xml version=\"1.0\" encoding=\"utf-8\"?><!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\r\n<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en-gb\" lang=\"en-gb\" dir=\"ltr\" >\r\n<head>\r\n  <meta http-equiv=\"X-UA-Compatible\" content=\"IE=8\" />\n  <link rel=\"stylesheet\" href=\"/templates/ieeetechnicalactivitiestemplate/css/template_css.php\" type=\"text/css\" media=\"all\" />\r\n  <link rel=\"stylesheet\" href=\"/templates/ieeetechnicalactivitiestemplate/css/print.css\" type=\"text/css\" media=\"print\" />\r\n  <base href=\"http://bigdata.ieee.org/\" /

In [22]:
// Re-target Snippet at the extracted HTML text instead of the raw source
// (result under payload.string.html.body.text).
print(wayback.enrich(Snippet.of(HtmlText)).peekJson)

{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20150110055735",
    "digest":"RYDMJYJBQQXAYWKLCSYNSX6QXEQ3VENJ",
    "originalUrl":"http://bigdata.ieee.org:80/",
    "surtUrl":"org,ieee,bigdata)/",
    "mime":"text/html",
    "compressedSize":6214,
    "meta":"-",
    "status":200
  },
  "payload":{
    "string":{
      "html":{
        "body":{
          "text":{
            "first-100-lines":"IEEE.org | IEEE Xplore Digital Library | IEEE Standards | IEEE Spectrum | More Sites Sign in IEEE Big Data Search IEEE Big Data Follow: Home About What's New Conferences Publications Education /*.slideshowControlsTop_28{ border: 0px solid #BDBDBD; border-bottom: 1px solid #BDBDBD; padding: 1px 0px; background-color: #6b1f73; background-image: url('./modules/mod_IEEEImageRotator/images/control_bkgd.png'); height: 20px; margin: 0px auto; width: 100%; line-height: 20px; } .slidePager_28 a{ color: #CCCCCC; padding-top: 1px; display: block; background-image: url('./modules/mod_IEEEImageRotato

In [23]:
// The same text snippet enrichment applied to the Tempas search results.
print(tempas.enrich(Snippet.of(HtmlText)).peekJson)

{
  "record":{
    "url":"http://de.wikipedia.org/wiki/big_data",
    "year":2013
  },
  "payload":{
    "string":{
      "html":{
        "body":{
          "text":{
            "first-100-lines":"Big Data aus Wikipedia, der freien EnzyklopÃ¤die Wechseln zu: Navigation, Suche Dieser Artikel wurde wegen inhaltlicher MÃ¤ngel auf der QualitÃ¤tssicherungsseite der Redaktion Informatik eingetragen. Dies geschieht, um die QualitÃ¤t der Artikel aus dem Themengebiet Informatik auf ein akzeptables Niveau zu bringen. Hilf mit, die inhaltlichen MÃ¤ngel dieses Artikels zu beseitigen, und beteilige dich an der Diskussion! (+) BegrÃ¼ndung: --Crazy1880 08:16, 21. Feb. 2011 (CET) Als Big Data werden besonders groÃ\u009fe Datenmengen bezeichnet, die mit Hilfe von Standard-Datenbanken und Datenmanagement-Tools nicht oder nur unzureichend verarbeitet werden kÃ¶nnen. Problematisch sind dabei vor allem die Erfassung, die Speicherung, die Suche, Verteilung, Analyse und Visualisierung von groÃ\u009fen Daten

In [24]:
// Compose enrich functions: run Entities on the output of Snippet rather than
// on its default input.
val SnippetEntities = Entities.of(Snippet)

In [25]:
// Entities extracted from the snippet of each MHL record; the result is nested
// under text.first-100-lines.entities.
print(mhl.enrich(SnippetEntities).peekJson)

{
  "record":{
    "id":"journal4190miss",
    "title":"Journal of the Missouri State Medical Association",
    "author":[
      "Missouri State Medical Association"
    ],
    "date":"1908",
    "subject":[
      "Medicine;Periodicals;Missouri"
    ],
    "language":[
      "eng"
    ],
    "mediatype":[
      "texts"
    ],
    "collection":[
      "francisacountwaylibrary",
      "statemedicalsocietyjournals",
      "medicalheritagelibrary",
      "americana"
    ],
    "description":[
      "Title from cover",
      "Published by: Missouri State Medical Association, <1910>-1952",
      "Only 6 nos. issued in v. 11"
    ],
    "score":0.0047816974
  },
  "text":{
    "first-100-lines":{
      "entities":{
        "persons":[
          "Louis",
          "D.",
          "M.",
          "Leannec",
          "ALLEN",
          "J.",
          "Phthisis"
        ],
        "organizations":[
          "Boston",
          "MEDICA",
          "Medical",
          "Library",
          "JOUR

In [26]:
// Flatten the "locations" values of the SnippetEntities enrichment across all
// records into a plain RDD of strings.
val locations: RDD[String] = mhl.flatMapValues(SnippetEntities, "locations")

In [27]:
// Fetch ten location strings to the driver and print them one per line.
for (location <- locations.take(10)) println(location)

MO.
Paris
MINNESOTA
WANGENSTEEN
Minnesota
SOUTH
Cioloii
NORTH
United
DAKOTA


## Analyze Polio Symptoms

In [28]:
// Lower-case keywords searched for in the journal text as polio symptoms.
val symptomSet = Seq(
  "extremity",
  "neck",
  "vomiting",
  "fever",
  "headache",
  "irritability",
  "abdominal",
  "lethargy"
)

In [29]:
// Derived enrich function named "symptoms", built on top of LowerCase: for each
// text it keeps the keywords from symptomSet that occur in it.
// Matching is plain substring containment, so a keyword also matches inside a longer word.
val symptoms = LowerCase.map("symptoms") { (text: String) =>
  symptomSet.filter(keyword => text.contains(keyword))
}

In [30]:
// Attach the list of matched symptom keywords to every MHL record.
val enriched = mhl.enrich(symptoms)

In [31]:
// Print one enriched record; the matches appear under text.lowercase.symptoms.
print(enriched.peekJson)

{
  "record":{
    "id":"journal4190miss",
    "title":"Journal of the Missouri State Medical Association",
    "author":[
      "Missouri State Medical Association"
    ],
    "date":"1908",
    "subject":[
      "Medicine;Periodicals;Missouri"
    ],
    "language":[
      "eng"
    ],
    "mediatype":[
      "texts"
    ],
    "collection":[
      "francisacountwaylibrary",
      "statemedicalsocietyjournals",
      "medicalheritagelibrary",
      "americana"
    ],
    "description":[
      "Title from cover",
      "Published by: Missouri State Medical Association, <1910>-1952",
      "Only 6 nos. issued in v. 11"
    ],
    "score":0.0047816974
  },
  "text":{
    "lowercase":{
      "symptoms":[
        "extremity",
        "neck",
        "vomiting",
        "fever",
        "headache",
        "irritability",
        "abdominal"
      ]
    }
  }
}

In [32]:
// Flatten the per-record symptom lists and count how often each keyword occurs.
// A keyword appears at most once per record's list, so each count is the number
// of records mentioning that symptom.
val symptomCounts = {
  val mentions = enriched.flatMapValues(symptoms)
  mentions.countByValue
}

In [33]:
// Print every (symptom, count) pair on its own line.
for (countEntry <- symptomCounts) println(countEntry)

(extremity,1942)
(neck,2039)
(vomiting,2037)
(fever,2047)
(headache,2042)
(irritability,1939)
(abdominal,2042)
(lethargy,1529)
