In [1]:
import de.l3s.archivespark.ArchiveSpark
import de.l3s.archivespark.enrich.functions._
import de.l3s.archivespark.nativescala.implicits._
import de.l3s.archivespark.implicits._

In [2]:
// Expose the notebook-provided `sc` as an implicit value so that calls taking
// an implicit Spark context (presumably ArchiveSpark.hdfs further down) can
// resolve it. The type is written out explicitly: implicit definitions without
// a declared type are fragile under type inference, and Scala 3 requires the
// annotation for non-local implicits.
implicit val sparkContext: org.apache.spark.SparkContext = sc

In [3]:
// Local (Windows) directory with the sample archive files; backslashes are escaped.
val warcPath = "C:\\Users\\holzmann\\L3S\\Workspace\\archives_unleashed\\sample"
// Glob matching every gzipped CDX index file inside that same directory.
val cdxPath = s"$warcPath\\*.cdx.gz"

In [4]:
// Build the record collection from the CDX index glob and the WARC location.
// Despite the `hdfs` name, both paths point to a local Windows directory here.
val rdd = ArchiveSpark.hdfs(cdxPath, warcPath)

In [5]:
// Total number of records in the loaded collection.
rdd.count()

1780

In [46]:
// Peek at the first record as JSON; the notebook echoes the resulting string.
rdd.take(1).head.toJsonString

{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20111222011410",
    "digest":"BW3EWQMXEQTVHHQNXFKUAYXQOKZKTJUL",
    "originalUrl":"http://blog.alexanderhiggins.com/2011/04/07/radioactive-rainwater-saint-louis..",
    "surtUrl":"com,alexanderhiggins,blog)/2011/04/07/radioactive-rainwater-saint-louis..",
    "mime":"text/html",
    "meta":"-",
    "status":404
  }
}

In [47]:
// First record whose status is 200, rendered as JSON.
rdd.filter(_.status == 200).take(1).head.toJsonString

{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20111222104911",
    "digest":"GQMVYL2OZ35ENISWKLG4TPIDIBZ4Y2TA",
    "originalUrl":"http://1.bp.blogspot.com/-sdlV-ifHuI8/TbxUJ1SHiOI/AAAAAAAAAAk/FrF8SWSqua0/s400/%25E3%2583%258F%25E3%2583%25AF%25E3%2582%25A4U238%25E8%25A1%25A8.jpg",
    "surtUrl":"com,blogspot,bp,1)/-sdlv-ifhui8/tbxuj1shioi/aaaaaaaaaak/frf8swsqua0/s400/%e3%83%8f%e3%83%af%e3%82%a4u238%e8%a1%a8.jpg",
    "mime":"image/jpeg",
    "meta":"-",
    "status":200
  }
}

In [48]:
// First record that has status 200 and MIME type text/html, rendered as JSON.
rdd.filter(_.status == 200).filter(_.mime == "text/html").take(1).head.toJsonString

{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20111222161517",
    "digest":"LOD6KYDGJO2KR3W4ONCSAD5ZHP27RKLJ",
    "originalUrl":"http://blog.alexanderhiggins.com/2011/06/09/cnn-48-great-depression-coming-year-24481/",
    "surtUrl":"com,alexanderhiggins,blog)/2011/06/09/cnn-48-great-depression-coming-year-24481",
    "mime":"text/html",
    "meta":"-",
    "status":200
  }
}

In [49]:
// Keep only records with status 200 and MIME type text/html.
val onlineHtml = rdd.filter { record =>
  record.status == 200 && record.mime == "text/html"
}

In [50]:
// Number of records that passed the status/MIME filter.
onlineHtml.count()

823

In [55]:
// Enrich each record with Payload; in the JSON printed by the following cell
// this shows up as additional "recordHeader" and "httpHeader" sections.
val responses = onlineHtml.enrich(Payload)

In [56]:
// Print the first Payload-enriched record as JSON.
println(responses.take(1).head.toJsonString)

{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20111222161517",
    "digest":"LOD6KYDGJO2KR3W4ONCSAD5ZHP27RKLJ",
    "originalUrl":"http://blog.alexanderhiggins.com/2011/06/09/cnn-48-great-depression-coming-year-24481/",
    "surtUrl":"com,alexanderhiggins,blog)/2011/06/09/cnn-48-great-depression-coming-year-24481",
    "mime":"text/html",
    "meta":"-",
    "status":200
  },
  "recordHeader":{
    "WARC-Target-URI":"http://blog.alexanderhiggins.com/2011/06/09/cnn-48-great-depression-coming-year-24481/",
    "WARC-Date":"2011-12-22T16:15:17Z",
    "WARC-IP-Address":"199.27.134.227",
    "WARC-Type":"response",
    "Content-Length":"109136",
    "WARC-Payload-Digest":"sha1:LOD6KYDGJO2KR3W4ONCSAD5ZHP27RKLJ",
    "Content-Type":"application/http; msgtype=response",
    "absolute-offset":"0",
    "WARC-Record-ID":"<urn:uuid:0e01324e-7eb7-4681-a933-133de2521502>",
    "reader-identifier":"sample.warc.gz"
  },
  "httpHeader":{
    "X-Pingback":"http://blog.alexanderhiggins.com/xmlrp

In [57]:
// Add the StringContent enrichment on top of the Payload-enriched records
// (presumably the payload decoded as text; the recorded output is truncated
// before that section, so confirm against the ArchiveSpark documentation).
val responseStrings = responses.enrich(StringContent)

In [58]:
// Print the first StringContent-enriched record as JSON.
println(responseStrings.take(1).head.toJsonString)

{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20111222161517",
    "digest":"LOD6KYDGJO2KR3W4ONCSAD5ZHP27RKLJ",
    "originalUrl":"http://blog.alexanderhiggins.com/2011/06/09/cnn-48-great-depression-coming-year-24481/",
    "surtUrl":"com,alexanderhiggins,blog)/2011/06/09/cnn-48-great-depression-coming-year-24481",
    "mime":"text/html",
    "meta":"-",
    "status":200
  },
  "recordHeader":{
    "WARC-Target-URI":"http://blog.alexanderhiggins.com/2011/06/09/cnn-48-great-depression-coming-year-24481/",
    "WARC-Date":"2011-12-22T16:15:17Z",
    "WARC-IP-Address":"199.27.134.227",
    "WARC-Type":"response",
    "Content-Length":"109136",
    "WARC-Payload-Digest":"sha1:LOD6KYDGJO2KR3W4ONCSAD5ZHP27RKLJ",
    "Content-Type":"application/http; msgtype=response",
    "absolute-offset":"0",
    "WARC-Record-ID":"<urn:uuid:0e01324e-7eb7-4681-a933-133de2521502>",
    "reader-identifier":"sample.warc.gz"
  },
  "httpHeader":{
    "X-Pingback":"http://blog.alexanderhiggins.com/xmlrp

In [59]:
// Additionally enrich with the first element matching the "title" selector.
val responseStringsTitles = responseStrings.enrich(Html.first("title"))

In [81]:
// Print the first record after the Payload, StringContent and title enrichments.
println(responseStringsTitles.take(1).head.toJsonString)

{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20111222161517",
    "digest":"LOD6KYDGJO2KR3W4ONCSAD5ZHP27RKLJ",
    "originalUrl":"http://blog.alexanderhiggins.com/2011/06/09/cnn-48-great-depression-coming-year-24481/",
    "surtUrl":"com,alexanderhiggins,blog)/2011/06/09/cnn-48-great-depression-coming-year-24481",
    "mime":"text/html",
    "meta":"-",
    "status":200
  },
  "recordHeader":{
    "WARC-Target-URI":"http://blog.alexanderhiggins.com/2011/06/09/cnn-48-great-depression-coming-year-24481/",
    "WARC-Date":"2011-12-22T16:15:17Z",
    "WARC-IP-Address":"199.27.134.227",
    "WARC-Type":"response",
    "Content-Length":"109136",
    "WARC-Payload-Digest":"sha1:LOD6KYDGJO2KR3W4ONCSAD5ZHP27RKLJ",
    "Content-Type":"application/http; msgtype=response",
    "absolute-offset":"0",
    "WARC-Record-ID":"<urn:uuid:0e01324e-7eb7-4681-a933-133de2521502>",
    "reader-identifier":"sample.warc.gz"
  },
  "httpHeader":{
    "X-Pingback":"http://blog.alexanderhiggins.com/xmlrp

In [61]:
// Re-inspect the first un-enriched HTML record for comparison: only the
// "record" section is present.
onlineHtml.take(1).head.toJsonString

{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20111222161517",
    "digest":"LOD6KYDGJO2KR3W4ONCSAD5ZHP27RKLJ",
    "originalUrl":"http://blog.alexanderhiggins.com/2011/06/09/cnn-48-great-depression-coming-year-24481/",
    "surtUrl":"com,alexanderhiggins,blog)/2011/06/09/cnn-48-great-depression-coming-year-24481",
    "mime":"text/html",
    "meta":"-",
    "status":200
  }
}

In [62]:
// Enrich the filtered records directly with the first <title> element, without
// separate Payload/StringContent steps; the printed JSON then carries only
// payload.string.html.title next to "record".
val titles = onlineHtml.enrich(Html.first("title"))

In [63]:
// Print the first title-enriched record as JSON.
println(titles.take(1).head.toJsonString)

{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20111222161517",
    "digest":"LOD6KYDGJO2KR3W4ONCSAD5ZHP27RKLJ",
    "originalUrl":"http://blog.alexanderhiggins.com/2011/06/09/cnn-48-great-depression-coming-year-24481/",
    "surtUrl":"com,alexanderhiggins,blog)/2011/06/09/cnn-48-great-depression-coming-year-24481",
    "mime":"text/html",
    "meta":"-",
    "status":200
  },
  "payload":{
    "string":{
      "html":{
        "title":"<title>CNN: 48% believe a Great Depression is coming within a year.</title>"
      }
    }
  }
}


In [64]:
// Add HtmlText to the title-enriched records; per the recorded output this
// adds a body.text entry alongside the existing title.
val htmlTexts = titles.enrich(HtmlText)

In [65]:
// Show the first record with both title and body text; the notebook echoes
// the JSON string (no explicit println here).
htmlTexts.take(1).head.toJsonString

{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20111222161517",
    "digest":"LOD6KYDGJO2KR3W4ONCSAD5ZHP27RKLJ",
    "originalUrl":"http://blog.alexanderhiggins.com/2011/06/09/cnn-48-great-depression-coming-year-24481/",
    "surtUrl":"com,alexanderhiggins,blog)/2011/06/09/cnn-48-great-depression-coming-year-24481",
    "mime":"text/html",
    "meta":"-",
    "status":200
  },
  "payload":{
    "string":{
      "html":{
        "title":"<title>CNN: 48% believe a Great Depression is coming within a year.</title>",
        "body":{
          "text":"Alexander Higgins Blog The Latest Buzz, Analysis, and News Without the Snooze! Home Headlines Authors About Subscribe, Friend or Follow Advertise Economy Environment Headlines Health Member Submitted Projects Society T...

In [66]:
// Composite enrich function: HtmlText applied to the first <title> element.
// In the recorded output this yields the title's text without the surrounding
// <title> markup, stored under payload.string.html.title.text.
val TitleText = HtmlText.of(Html.first("title"))

In [67]:
// Apply the composite TitleText enrichment to the filtered HTML records.
val titleTexts = onlineHtml.enrich(TitleText)

In [68]:
// Print the first record carrying the plain title text.
println(titleTexts.take(1).head.toJsonString)

{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20111222161517",
    "digest":"LOD6KYDGJO2KR3W4ONCSAD5ZHP27RKLJ",
    "originalUrl":"http://blog.alexanderhiggins.com/2011/06/09/cnn-48-great-depression-coming-year-24481/",
    "surtUrl":"com,alexanderhiggins,blog)/2011/06/09/cnn-48-great-depression-coming-year-24481",
    "mime":"text/html",
    "meta":"-",
    "status":200
  },
  "payload":{
    "string":{
      "html":{
        "title":{
          "text":"CNN: 48% believe a Great Depression is coming within a year."
        }
      }
    }
  }
}


In [78]:
// Text of the first "a.lfloat" element, which appears under the "linkLFloat"
// key in the output; take(7).drop(6).head selects the seventh enriched record.
println(
  onlineHtml
    .enrich(HtmlText.on(Html.first("a.lfloat", "linkLFloat")))
    .take(7)
    .drop(6)
    .head
    .toJsonString
)

{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20120113023305",
    "digest":"JOX2BN6U524RONLFLVEIEJV4TRFOUELJ",
    "originalUrl":"http://www.facebook.com/directory/people/A-45138721-45682560",
    "surtUrl":"com,facebook)/directory/people/a-45138721-45682560",
    "mime":"text/html",
    "meta":"-",
    "status":200
  },
  "payload":{
    "string":{
      "html":{
        "linkLFloat":{
          "text":"Facebook logo"
        }
      }
    }
  }
}


In [70]:
// Map every record straight to its TitleText value and print the first ten.
onlineHtml.mapValues(TitleText).take(10).foreach(println)

CNN: 48% believe a Great Depression is coming within a year.
"Maximum Alert": Japan Nuclear Reactor Core Breach Leaking Plutonium "3 Raging Meltdowns In Progress"
Forum Post: My two posts were erased in a super shady way!!!!! | OccupyWallSt.org
Forum Post: Dylan Radio - Bill Black and Two Passionate OWS Protesters | OccupyWallSt.org
Dec. 1, 2011 Forum Archive | OccupyWallSt.org
Forum Post: #OccupyCNN 10/14 | OccupyWallSt.org
Anja Daut |  Ankur Reddy | People Directory | Facebook
Truthdig Retreat in Santa Fe
Ana Paula Souza |  Anajis Fortanelli | People Directory | Facebook
Twitter


In [71]:
// Derive a "tokens" field from each title text by splitting on single spaces.
// NOTE(review): consecutive or leading spaces produce empty-string tokens; they
// show up as the `(,51)` entry in the recorded token counts. Consider splitting
// on "\\s+" and dropping empty tokens if that is unwanted.
val titleTokens = titleTexts.mapEnrich(TitleText, "tokens")(_.split(" "))

In [72]:
// Print the first record including its derived "tokens" array.
println(titleTokens.take(1).head.toJsonString)

{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20111222161517",
    "digest":"LOD6KYDGJO2KR3W4ONCSAD5ZHP27RKLJ",
    "originalUrl":"http://blog.alexanderhiggins.com/2011/06/09/cnn-48-great-depression-coming-year-24481/",
    "surtUrl":"com,alexanderhiggins,blog)/2011/06/09/cnn-48-great-depression-coming-year-24481",
    "mime":"text/html",
    "meta":"-",
    "status":200
  },
  "payload":{
    "string":{
      "html":{
        "title":{
          "text":{
            "_":"CNN: 48% believe a Great Depression is coming within a year.",
            "tokens":[
              "CNN:",
              "48%",
              "believe",
              "a",
              "Great",
              "Depression",
              "is",
              "coming",
              "within",
              "a",
              "year."
            ]
          }
        }
      }
    }
  }
}


In [80]:
// Pull the token arrays back out of the enriched records.
val tokens = titleTokens.mapValues[Array[String]](TitleText, ".tokens")
// Count how often each token occurs across all titles and collect the
// (token, count) pairs into a local array.
val tokenCounts = {
  val tokenOnes = tokens.flatMap(titleWords => titleWords.map(word => (word, 1)))
  tokenOnes.reduceByKey(_ + _).collect()
}

In [74]:
// Show the 20 most frequent title tokens, highest count first
// (sorting on the negated count gives descending order).
tokenCounts.sortBy { case (_, occurrences) => -occurrences }.take(20).foreach(println)

(|,249)
(-,122)
(OccupyWallSt.org,87)
(Facebook,76)
(Occupy,76)
(Forum,64)
(Post:,56)
(,51)
(the,40)
(Profile,28)
(and,27)
(Google,24)
(Archive,24)
(Tumblr,23)
(Twitter,22)
(of,22)
(User,20)
(Photos,19)
(to,19)
(Google+,19)


In [75]:
// Persist the token-enriched records as JSON to the relative path "tokens.gz"
// (presumably gzip-compressed given the extension; confirm).
titleTokens.saveAsJson("tokens.gz")