In [83]:
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat

val spark = SparkSession.builder.master("local[*]").appName("SparkSQL").getOrCreate()

import spark.implicits._
import org.apache.spark.sql.functions._

spark = org.apache.spark.sql.SparkSession@515f6200


In [47]:
val conf = new Configuration
conf.set("textinputformat.record.delimiter", "WARC/1.0")
val dataset = sc.newAPIHadoopFile(
    "data/WAT/CC-MAIN-20180918130631-20180918150631-00000.warc.wat",
    classOf[TextInputFormat],
    classOf[LongWritable],
    classOf[Text], conf
)

val data = dataset
    .map(x => x._2.toString)
    .filter(_.length > 0)
    .map(_.split("\r\n\r\n"))
    .mapPartitionsWithIndex {  
        (idx, iter) => if (idx == 0) iter.drop(2) else iter 
    }
    .map(arr => (arr(0), arr(1)))

data.take(1).foreach(println)

(
WARC-Type: metadata
WARC-Target-URI: http://0-50.ru/news/tag/%F4%F3%F2%E1%EE%EB+%D0%EE%F1%F1%E8%E8?page=80
WARC-Date: 2018-09-26T17:58:38Z
WARC-Record-ID: <urn:uuid:fa8c5f55-cf06-4127-aab5-605b5de2cb45>
WARC-Refers-To: <urn:uuid:4793a122-b938-4294-bb05-79a97c0f3caa>
Content-Type: application/json
Content-Length: 1454,{"Container":{"Filename":"CC-MAIN-20180918130631-20180918150631-00000.warc.gz","Compressed":true,"Offset":"485","Gzip-Metadata":{"Inflated-Length":"703","Footer-Length":"8","Inflated-CRC":"-129749966","Deflate-Length":"473","Header-Length":"10"}},"Envelope":{"Format":"WARC","WARC-Header-Length":"388","Actual-Content-Length":"311","WARC-Header-Metadata":{"WARC-IP-Address":"193.107.238.44","WARC-Target-URI":"http://0-50.ru/news/tag/%F4%F3%F2%E1%EE%EB+%D0%EE%F1%F1%E8%E8?page=80","WARC-Warcinfo-ID":"<urn:uuid:8d75a1d7-86c5-4788-9e02-7e28190391ba>","WARC-Date":"2018-09-18T13:25:39Z","Content-Length":"311","WARC-Record-ID":"<urn:uuid:4793a122-b938-4294-bb05-79a97c0f3caa

conf = Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml
dataset = data/WAT/CC-MAIN-20180918130631-20180918150631-00000.warc.wat NewHadoopRDD[148] at newAPIHadoopFile at <console>:36
data = MapPartitionsRDD[153] at map at <console>:50


MapPartitionsRDD[153] at map at <console>:50

In [163]:
val uriMetaDataPairs = data
    .map(pair => {
        val (header, metadata) = pair
    
        val keyValuePairs = header
            .split("\r\n")
            .filter(_ != "")
            .map(line => {
                val splitList = line.split(": ")
                ((splitList(0), splitList(1)))
            })

        val targetURL = keyValuePairs
            .filter(_._1 == "WARC-Target-URI")(0)
            ._2
            .split("\\?")(0)
            .split("#")(0)

        (targetURL, metadata)
    })

uriMetaDataPairs.take(10).foreach(println)

(http://0-50.ru/news/tag/%F4%F3%F2%E1%EE%EB+%D0%EE%F1%F1%E8%E8,{"Container":{"Filename":"CC-MAIN-20180918130631-20180918150631-00000.warc.gz","Compressed":true,"Offset":"485","Gzip-Metadata":{"Inflated-Length":"703","Footer-Length":"8","Inflated-CRC":"-129749966","Deflate-Length":"473","Header-Length":"10"}},"Envelope":{"Format":"WARC","WARC-Header-Length":"388","Actual-Content-Length":"311","WARC-Header-Metadata":{"WARC-IP-Address":"193.107.238.44","WARC-Target-URI":"http://0-50.ru/news/tag/%F4%F3%F2%E1%EE%EB+%D0%EE%F1%F1%E8%E8?page=80","WARC-Warcinfo-ID":"<urn:uuid:8d75a1d7-86c5-4788-9e02-7e28190391ba>","WARC-Date":"2018-09-18T13:25:39Z","Content-Length":"311","WARC-Record-ID":"<urn:uuid:4793a122-b938-4294-bb05-79a97c0f3caa>","WARC-Type":"request","Content-Type":"application/http; msgtype=request"},"Block-Digest":"sha1:4TCDGVD4W5RLE5AJINFVJPPX56UQVP6N","Payload-Metadata":{"Actual-Content-Type":"application/http; msgtype=request","HTTP-Request-Metadata":{"Headers-Length":"309","Entity

uriMetaDataPairs = MapPartitionsRDD[260] at map at <console>:56


lastException: Throwable = null


MapPartitionsRDD[260] at map at <console>:56

In [166]:
val uriLinkPairs = uriMetaDataPairs
    .map(pair => {
        val (uri, unparsedMetaData) = pair

        val linkArrayStartIndex = unparsedMetaData.indexOf("\"Links\":") match {
            case x if (x > 0) => (x + ("\"Links\":[").length)
            case _ => 0
        }

        if (linkArrayStartIndex == 0) {
            (uri, Array[String]())
        } else {
             val unparsedLinkArray = unparsedMetaData
                .substring(linkArrayStartIndex)
                .split("]")(0)

            val links = unparsedLinkArray
                .split(",")
                .filter(_.contains("url"))
                .map(jsonObject => {
                    val startIndex = jsonObject.indexOf("\"url\":") + ("\"url\":").length + 1
                    val stopIndex = jsonObject.length - 2
                    
                    jsonObject.substring(startIndex, stopIndex)
                })
                .map(_.split("\\?")(0))
                .map(_.split("#")(0))
                .filter(_ != "")

            (uri, links)
        }
    })
    .filter(_._2.nonEmpty)

uriLinkPairs.take(10).foreach(x => {
    println("[URL: " + x._1 + "]")
    x._2.foreach(println)
})

lastException = null


Name: org.apache.spark.SparkException
Message: Job aborted due to stage failure: Task 0 in stage 120.0 failed 1 times, most recent failure: Lost task 0.0 in stage 120.0 (TID 588, localhost, executor driver): java.lang.ArrayIndexOutOfBoundsException: 0
	at $line687.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$anonfun$1$$anonfun$5.apply(<console>:83)
	at $line687.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$anonfun$1$$anonfun$5.apply(<console>:83)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
	at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
	at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
	at $line687.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$i

In [161]:
val uriStaticLinkPairs = uriLinkPairs
    .map(pair => {
        val (uri, links) = pair
        
        val staticLinks = links
            .filter(!_.startsWith("/"))
            .filter(!_.startsWith("mailto"))
            .filter(!_.startsWith("javascript"))
        
        (uri, staticLinks)
    })

uriStaticLinkPairs.take(10).foreach(x => {
    println("[URL: " + x._1 + "]")
    x._2.foreach(println)
})

Name: org.apache.spark.SparkException
Message: Job aborted due to stage failure: Task 0 in stage 115.0 failed 1 times, most recent failure: Lost task 0.0 in stage 115.0 (TID 583, localhost, executor driver): java.lang.ArrayIndexOutOfBoundsException: 0
	at $line648.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$anonfun$1$$anonfun$5.apply(<console>:81)
	at $line648.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$anonfun$1$$anonfun$5.apply(<console>:81)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
	at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
	at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
	at $line648.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$i

In [150]:
println("?sda".split("\\?")(1))

sda
