In [1]:
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat

val spark = SparkSession.builder.master("local[*]").appName("SparkSQL").getOrCreate()

import spark.implicits._
import org.apache.spark.sql.functions._

spark = org.apache.spark.sql.SparkSession@10d60859


In [2]:
val conf = new Configuration
conf.set("textinputformat.record.delimiter", "WARC/1.0")
val dataset = sc.newAPIHadoopFile(
    "data/WAT/CC-MAIN-20180918130631-20180918150631-00000.warc.wat",
    classOf[TextInputFormat],
    classOf[LongWritable],
    classOf[Text], conf
)

val data = dataset
    .map(x => x._2.toString)
    .filter(_.length > 0)
    .map(_.split("\r\n\r\n"))
    .mapPartitionsWithIndex {  
        (idx, iter) => if (idx == 0) iter.drop(2) else iter 
    }
    .map(arr => (arr(0), arr(1)))

data.take(1).foreach(println)

(
WARC-Type: metadata
WARC-Target-URI: http://0-50.ru/news/tag/%F4%F3%F2%E1%EE%EB+%D0%EE%F1%F1%E8%E8?page=80
WARC-Date: 2018-09-26T17:58:38Z
WARC-Record-ID: <urn:uuid:fa8c5f55-cf06-4127-aab5-605b5de2cb45>
WARC-Refers-To: <urn:uuid:4793a122-b938-4294-bb05-79a97c0f3caa>
Content-Type: application/json
Content-Length: 1454,{"Container":{"Filename":"CC-MAIN-20180918130631-20180918150631-00000.warc.gz","Compressed":true,"Offset":"485","Gzip-Metadata":{"Inflated-Length":"703","Footer-Length":"8","Inflated-CRC":"-129749966","Deflate-Length":"473","Header-Length":"10"}},"Envelope":{"Format":"WARC","WARC-Header-Length":"388","Actual-Content-Length":"311","WARC-Header-Metadata":{"WARC-IP-Address":"193.107.238.44","WARC-Target-URI":"http://0-50.ru/news/tag/%F4%F3%F2%E1%EE%EB+%D0%EE%F1%F1%E8%E8?page=80","WARC-Warcinfo-ID":"<urn:uuid:8d75a1d7-86c5-4788-9e02-7e28190391ba>","WARC-Date":"2018-09-18T13:25:39Z","Content-Length":"311","WARC-Record-ID":"<urn:uuid:4793a122-b938-4294-bb05-79a97c0f3caa

conf = Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml
dataset = data/WAT/CC-MAIN-20180918130631-20180918150631-00000.warc.wat NewHadoopRDD[0] at newAPIHadoopFile at <console>:40
data = MapPartitionsRDD[5] at map at <console>:54


MapPartitionsRDD[5] at map at <console>:54

In [3]:
val uriMetaDataPairs = data
    .map(pair => {
        val (header, metadata) = pair
    
        val keyValuePairs = header
            .split("\r\n")
            .filter(_ != "")
            .map(line => {
                val splitList = line.split(": ")
                ((splitList(0), splitList(1)))
            })

        val targetURL = keyValuePairs
            .filter(_._1 == "WARC-Target-URI")(0)
            ._2
            .split("\\?")(0)
            .split("#")(0)

        (targetURL, metadata)
    })

uriMetaDataPairs.take(1).foreach(println)

(http://0-50.ru/news/tag/%F4%F3%F2%E1%EE%EB+%D0%EE%F1%F1%E8%E8,{"Container":{"Filename":"CC-MAIN-20180918130631-20180918150631-00000.warc.gz","Compressed":true,"Offset":"485","Gzip-Metadata":{"Inflated-Length":"703","Footer-Length":"8","Inflated-CRC":"-129749966","Deflate-Length":"473","Header-Length":"10"}},"Envelope":{"Format":"WARC","WARC-Header-Length":"388","Actual-Content-Length":"311","WARC-Header-Metadata":{"WARC-IP-Address":"193.107.238.44","WARC-Target-URI":"http://0-50.ru/news/tag/%F4%F3%F2%E1%EE%EB+%D0%EE%F1%F1%E8%E8?page=80","WARC-Warcinfo-ID":"<urn:uuid:8d75a1d7-86c5-4788-9e02-7e28190391ba>","WARC-Date":"2018-09-18T13:25:39Z","Content-Length":"311","WARC-Record-ID":"<urn:uuid:4793a122-b938-4294-bb05-79a97c0f3caa>","WARC-Type":"request","Content-Type":"application/http; msgtype=request"},"Block-Digest":"sha1:4TCDGVD4W5RLE5AJINFVJPPX56UQVP6N","Payload-Metadata":{"Actual-Content-Type":"application/http; msgtype=request","HTTP-Request-Metadata":{"Headers-Length":"309","Entity

uriMetaDataPairs = MapPartitionsRDD[6] at map at <console>:43


MapPartitionsRDD[6] at map at <console>:43

In [4]:
val uriLinkPairs = uriMetaDataPairs
    .map(pair => {
        val (uri, unparsedMetaData) = pair

        val linkArrayStartIndex = unparsedMetaData.indexOf("\"Links\":") match {
            case x if (x > 0) => (x + ("\"Links\":[").length)
            case _ => 0
        }

        if (linkArrayStartIndex == 0) {
            (uri, Array[String]())
        } else {
             val unparsedLinkArray = unparsedMetaData
                .substring(linkArrayStartIndex)
                .split("]")(0)

            val links = unparsedLinkArray
                .split(",")
                .filter(_.contains("url"))
                .map(jsonObject => {
                    val startIndex = jsonObject.indexOf("\"url\":") + ("\"url\":").length + 1
                    val stopIndex = jsonObject.length - 2
                    
                    if (startIndex < stopIndex) {
                        jsonObject.substring(startIndex, stopIndex)
                    } else {
                        ""
                    }                    
                })

            (uri, links)
        }
    })
    .filter(_._2.nonEmpty)

uriLinkPairs.take(1).foreach(x => {
    println("[URL: " + x._1 + "]")
    x._2.foreach(println)
})

[URL: http://0-50.ru/news/tag/%F4%F3%F2%E1%EE%EB+%D0%EE%F1%F1%E8%E8]
//mc.yandex.ru/watch/21009928
/news/label/%CF%EE%E3%EE%E4%E0+%E2+%C5%EA%E0%F2%E5%F0%E8%ED%E1%F3%F0%E3%E5+%E8+%D1%E2%E5%F0%E4%EB%EE%E2%F1%EA%EE%E9+%EE%E1%EB%E0%F1%F2%E8
/news/label/%D7%F2%EE+%EF%F0%E8%E3%EE%F2%EE%E2%E8%F2%FC+%ED%E0+%F3%E6%E8%ED+%F0%E5%F6%E5%EF%F2+%F1+%F4%EE%F2%EE
/news/label/%CD%EE%E2%EE%F1%F2%E8+%C5%EA%E0%F2%E5%F0%E8%ED%E1%F3%F0%E3%E0
/news/label/96women.ru+-+%C6%E8%E2%EE%E9+%C6%E5%ED%F1%EA%E8%E9+%C6%F3%F0%ED%E0%EB
/
/editorial.html
/advertising.html
/aboutus.html
/rss/
/sendmail/
/news/
/images/logo.png
http://0-50.ru/
/images/banners/51234_original.jpg
/banners/26
/news/eburg/
/news/russia/
/news/education/
/news/polit/
/news/health/
/news/sport/
/news/incident/
/news/auto/
/news/company/
/news/socium/
/news/article/
/search/
/news/incident/2018-09-16/id_64718.html
/news/incident/2018-09-16/id_64718.html#comments
/news/incident/2018-09-16/id_64717.html
/news/incident/2018-09-16/id_64717.html#comment

uriLinkPairs = MapPartitionsRDD[8] at filter at <console>:77


MapPartitionsRDD[8] at filter at <console>:77

In [30]:
val uriStaticLinkPairs = uriLinkPairs
    .map(pair => {
        val (uri, links) = pair
        
        val staticLinks = links
            .map(_.split("\\?")(0))
//             .filter(!_.startsWith("#"))
//             .map(_.split("#")(0))
//             .filter(!_.startsWith("/"))
//             .filter(!_.startsWith("mailto"))
//             .filter(!_.startsWith("javascript"))
//             .filter(_ != "")
        
        (uri, staticLinks)
    })

uriStaticLinkPairs.take(534).foreach(x => {
    println("[URL: " + x._1 + "]")
    x._2.foreach(println)
})

Name: org.apache.spark.SparkException
Message: Job aborted due to stage failure: Task 0 in stage 24.0 failed 1 times, most recent failure: Lost task 0.0 in stage 24.0 (TID 28, localhost, executor driver): java.lang.ArrayIndexOutOfBoundsException: 0
	at $line144.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$anonfun$1$$anonfun$2.apply(<console>:53)
	at $line144.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$anonfun$1$$anonfun$2.apply(<console>:53)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
	at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
	at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
	at $line144.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$anonfun$1.apply(<con

In [12]:
// uriStaticLinkPairs.collect()

Name: org.apache.spark.SparkException
Message: Job aborted due to stage failure: Task 1 in stage 6.0 failed 1 times, most recent failure: Lost task 1.0 in stage 6.0 (TID 7, localhost, executor driver): java.lang.ArrayIndexOutOfBoundsException: 0
	at $line44.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$anonfun$1$$anonfun$2.apply(<console>:53)
	at $line44.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$anonfun$1$$anonfun$2.apply(<console>:53)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
	at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
	at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
	at $line44.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$anonfun$1.apply(<console>: