This notebook requires FEL4ArchiveSpark: https://github.com/helgeho/FEL4ArchiveSpark

In [1]:
import de.l3s.archivespark._
import de.l3s.archivespark.implicits._
import de.l3s.archivespark.specific.warc._
import de.l3s.archivespark.specific.warc.specs._
import de.l3s.archivespark.specific.warc.implicits._
import de.l3s.archivespark.enrich._
import de.l3s.archivespark.enrich.functions._
import de.l3s.archivespark.enrich.dataloads._
import de.l3s.archivespark.enrichfunctions.fel._
import de.l3s.archivespark2triples._
import org.apache.hadoop.io.compress.GzipCodec

# Load the FEL model file

In [2]:
val modelFile = "english-nov15.hash"
sc.addFile("hdfs:///user/holzmann/" + modelFile)

# Initialize the web archive collection

In [3]:
val collection = "ArchiveIt-Collection-2950"
val cdxPath = s"/data/archiveit/$collection/cdx/*.cdx.gz"
val warcPath = s"/data/archiveit/$collection/warc"

In [4]:
val records = ArchiveSpark.load(sc, WarcCdxHdfsSpec(cdxPath, warcPath))

In [5]:
records.peekJson

{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20111222021216",
    "digest":"ZPP7YDXYWOVKO3RILANIHUPL3REXJDZE",
    "originalUrl":"http://184.107.185.46/newsbeast/index.php?w=500&h=300&b=http://www.newsbeast.gr/files/1/2011/06/15/sintagma_1_5.jpg",
    "surtUrl":"46,185,107,184)/newsbeast/index.php?b=http://www.newsbeast.gr/files/1/2011/06/15/sintagma_1_5.jpg&h=300&w=500",
    "mime":"text/html",
    "compressedSize":464,
    "meta":"-",
    "status":404
  }
}

# Select successful responses of type HTML and detect duplicates

In [6]:
val responses = records.filter(r => r.status == 200 && r.mime == "text/html")

In [7]:
val earliestResponses = responses.map(r => ((r.surtUrl, r.digest), r)).reduceByKey{(r1, r2) =>
    if (r1.time < r2.time) r1 else r2
}

In [8]:
val duplicates = records.map(r => ((r.surtUrl, r.digest), r)).join(earliestResponses).map{case (_, records) => records}.filter{case (r1, r2) => r1.time != r2.time}

# Generate *ArchivedDocument* triples representing its versions

In [9]:
val versions = earliestResponses.map{case (_, r) => r}.union(duplicates.map{case (r1, r2) => r1})

In [10]:
versions.count

10503433

In [11]:
val documentTriples = ArchiveSpark2Triples.generateDocs(versions)

In [12]:
println(documentTriples.take(1).head)


<org,occupyalabama)/forum/showthread.php?goto=nextnewest&t=161> rdf:type owa:ArchivedDocument ;
    owa:numOfCaptures    "6"^^xsd:integer ;
    owa:firstCapture     "2011-12-03T05:56:19"^^xsd:date ;
    owa:lastCapture      "2012-01-03T03:36:09"^^xsd:date ;
    dc:hasVersion        <https://web.archive.org/web/20111203055619/http://occupyalabama.org/forum/showthread.php?t=161&goto=nextnewest>,
                         <https://web.archive.org/web/20111210061229/http://occupyalabama.org/forum/showthread.php?t=161&goto=nextnewest>,
                         <https://web.archive.org/web/20111217055735/http://occupyalabama.org/forum/showthread.php?t=161&goto=nextnewest>,
                         <https://web.archive.org/web/20111220030152/http://occupyalabama.org/forum/showthread.php?t=161&goto=nextnewest>,
                         <https://web.archive.org/web/20111227031921/http://occupyalabama.org/forum/showthread.php?t=161&goto=nextnewest>,
                         <https://web.archive.

# Create "*sameAs* triples" from duplicates

In [13]:
val sameAsTriples = ArchiveSpark2Triples.generateSameAsVersions(duplicates)

In [14]:
println(sameAsTriples.take(1).head)


<https://web.archive.org/web/20120105062548/http://www.livestream.com/forum/showpost.php?p=18958&postcount=9> rdf:type owa:VersionedDocument ;
    dc:date              "2012-01-05T06:25:48"^^xsd:date ;
    owl:sameAs           <https://web.archive.org/web/20111229064632/http://www.livestream.com/forum/showpost.php?p=18958&postcount=9> .



# Generate *VersionedDocument* triples with title and entities

In [15]:
val responsesRepartitioned = earliestResponses.map{case (_, r) => r}.repartition(1000)

In [16]:
val title = HtmlText.of(Html.first("title"))
val responsesWithTitles = responsesRepartitioned.enrich(title)

In [17]:
val fel = FEL(scoreThreshold = -5, modelFile = modelFile).on(HtmlText)
val responsesWithEntities = responsesWithTitles.enrich(fel)

In [18]:
responsesWithEntities.peekJson

{
  "record":{
    "redirectUrl":"-",
    "timestamp":"20120116222507",
    "digest":"OTM4ILGHNX4HTAEEN7U7XZTMKO2EVJ3X",
    "originalUrl":"https://www.facebook.com/help/mobile/iphone",
    "surtUrl":"com,facebook)/help/mobile/iphone",
    "mime":"text/html",
    "compressedSize":9206,
    "meta":"-",
    "status":200
  },
  "payload":{
    "string":{
      "html":{
        "title":{
          "text":"Facebook for iPhone App - Facebook Help Center | Facebook"
        },
        "body":{
          "text":{
            "entities":[
              {
                "span":"fb",
                "endOffset":10444,
                "score":-0.18161437557306173,
                "annotation":"Facebook",
                "startOffset":10442
              },
              {
          ...

In [19]:
val versionTriples = ArchiveSpark2Triples.generateVersionsMapped(responsesWithEntities) {(record, uid, doc) =>
    val recordTitle = record.value(title).getOrElse("")
    val recordEntities = record.value(fel).getOrElse(Seq.empty)
    doc.appendTriples("dc:title", s"""\"recordTitle\"""").appendChildren("schema:mentions", {
        recordEntities.zipWithIndex.map{case (entity, i) => TripleDoc(
            s"_:e$uid-$i",
            "oae:Entity",
            Seq(
                "oae:confidence" -> Seq(s""""${entity.score}"^^xsd:double"""),
                "oae:detectedAs" -> Seq(s"""\"${entity.span}\""""),
                "oae:position" -> Seq(s""""${entity.startOffset}"^^xsd:integer"""),
                "oae:hasMatchedURI" -> Seq(s"<http://dbpedia.org/resource/${entity.annotation}>")
            )
        )}
    })
}

In [20]:
println(versionTriples.take(1).head)


<https://web.archive.org/web/20120116222507/https://www.facebook.com/help/mobile/iphone> rdf:type owa:VersionedDocument ;
    dc:date              "2012-01-16T22:25:07"^^xsd:date ;
    dc:format            "text/html" ;
    dc:title             "recordTitle" ;
    schema:mentions      _:e0-0,
                         _:e0-1,
                         _:e0-2,
                         _:e0-3,
                         _:e0-4,
                         _:e0-5,
                         _:e0-6,
                         _:e0-7,
                         _:e0-8,
                         _:e0-9,
                         _:e0-10,
                         _:e0-11,
                         _:e0-12,
                         _:e0-13,
                         _:e0-14,
                         _:e0-15,
                         _:e0-16,
                         _:e0-17,
                         _:e0-18,
                         _:e0-19,
                         _:e0-20,
                         _:e0-21,


# Sort and store with headers

In [21]:
val headers = TripleHeader.append("oae" -> "http://www.ics.forth.gr/isl/oae/core#")

In [22]:
val triples = ArchiveSpark2Triples.toStringsSorted(headers, documentTriples, sameAsTriples, versionTriples)

In [23]:
triples.saveAsTextFile(s"$collection-Triples.gz", classOf[GzipCodec])