In [1]:
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat

// Obtain (or reuse) the notebook-wide SparkSession, running locally on all available cores.
val spark = SparkSession
    .builder
    .master("local[*]")
    .appName("SparkSQL")
    .getOrCreate()

import spark.implicits._
import org.apache.spark.sql.functions._

import org.apache.spark._
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD


spark = org.apache.spark.sql.SparkSession@5bfee253


In [2]:
// Read the WAT files as text, using the WARC version line as the record delimiter so that
// every Hadoop "line" is one complete WARC record (header block + payload).
val conf = new Configuration
conf.set("textinputformat.record.delimiter", "WARC/1.0")
val dataset = sc.newAPIHadoopFile(
    "data/WAT/",
    classOf[TextInputFormat],
    classOf[LongWritable],
    classOf[Text], conf
)

// Split every record into (header, payload) at the first blank line and keep only the
// records whose payload looks like a JSON object.
val data = dataset
    // Copy the Text contents into a String right away.
    .map { case (_, record) => record.toString }
    .filter(_.nonEmpty)
    .map(_.split("\r\n\r\n"))
    // RDD.collect(PartialFunction) is a transformation (filter + map), not the action.
    // Records without a payload part (fewer than two chunks, e.g. a header followed only by
    // blank lines) are skipped instead of failing the job with ArrayIndexOutOfBoundsException.
    .collect {
        case Array(header, payload, _*) if payload.startsWith("{") => (header, payload)
    }

println(data.count())

// data.take(10).foreach(println)

359232


conf = Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml
dataset = data/WAT/ NewHadoopRDD[0] at newAPIHadoopFile at <console>:47
data = MapPartitionsRDD[5] at filter at <console>:62


MapPartitionsRDD[5] at filter at <console>:62

In [3]:
// Pair every record's JSON metadata with the value of its "WARC-Target-URI" header field.
// Records that have no such header field are dropped.
val uriMetaDataPairs = data
    .flatMap { case (header, metadata) =>
        val targetURL = header
            .split("\r\n")
            .iterator
            // Limit the split to 2 parts: a value that itself contains ": " stays intact,
            // and a line without ": " yields a 1-element array that simply does not match below.
            .map(_.split(": ", 2))
            .collectFirst { case Array("WARC-Target-URI", value) => value }

        // Option is flattened away: None (no target URI) produces no output row.
        targetURL.map(uri => (uri, metadata))
    }

uriMetaDataPairs.take(1).foreach(println)

(CC-MAIN-20180918130631-20180918150631-00002.warc.gz,{"Container":{"Filename":"CC-MAIN-20180918130631-20180918150631-00002.warc.gz","Compressed":true,"Offset":"0","Gzip-Metadata":{"Inflated-Length":"769","Footer-Length":"8","Inflated-CRC":"1350986398","Deflate-Length":"484","Header-Length":"10"}},"Envelope":{"Format":"WARC","WARC-Header-Length":"259","Actual-Content-Length":"506","WARC-Header-Metadata":{"WARC-Filename":"CC-MAIN-20180918130631-20180918150631-00002.warc.gz","WARC-Date":"2018-09-18T13:06:31Z","Content-Length":"506","WARC-Record-ID":"<urn:uuid:dec40478-8e6b-4036-a8e3-06a2211ed681>","WARC-Type":"warcinfo","Content-Type":"application/warc-fields"},"Block-Digest":"sha1:RTUJIRE47WHSO4ME4JSEZV3WH5HOVY5Q","Payload-Metadata":{"Actual-Content-Type":"application/warc-fields","Actual-Content-Length":"506","Trailing-Slop-Length":"0","WARC-Info-Metadata":{"hostname":"ip-10-65-123-119.ec2.internal","software":"Apache Nutch 1.15 (modified, https://github.com/commoncrawl/nutch/)","format

uriMetaDataPairs = MapPartitionsRDD[6] at map at <console>:50


MapPartitionsRDD[6] at map at <console>:50

In [4]:
// Extract, for every (uri, metadata) pair, the "url" values found inside the metadata's
// "Links":[...] JSON array. Pairs without any extracted link are dropped.
//
// The metadata is scanned as text rather than parsed as JSON. The scan is quote-aware, so
// characters such as ']' or ',' inside string values (URLs, anchor text) cannot truncate or
// split a link, and the position of the "url" key inside its object does not matter.
// NOTE: extracted URLs are returned exactly as they appear in the JSON text; JSON escape
// sequences inside them are not decoded.
val uriLinkPairs = uriMetaDataPairs
    .mapPartitions(records => {
        val linksKey = "\"Links\":["
        // Matches "url":"<value>" and captures <value>, allowing backslash-escaped characters.
        // Written as an unrolled loop so that very long values (e.g. data: URIs) cannot blow
        // the regex engine's stack.
        val urlPattern = """"url":"([^"\\]*(?:\\.[^"\\]*)*)"""".r

        // Index of the ']' closing the array whose content starts at `from`, ignoring
        // brackets inside string literals; `json.length` if the array is never closed.
        def closingBracket(json: String, from: Int): Int = {
            var depth = 1
            var inString = false
            var escaped = false
            var end = -1
            var i = from
            while (end < 0 && i < json.length) {
                val c = json.charAt(i)
                if (inString) {
                    if (escaped) escaped = false
                    else if (c == '\\') escaped = true
                    else if (c == '"') inString = false
                } else if (c == '"') {
                    inString = true
                } else if (c == '[') {
                    depth += 1
                } else if (c == ']') {
                    depth -= 1
                    if (depth == 0) end = i
                }
                i += 1
            }
            if (end < 0) json.length else end
        }

        records.map { case (uri, unparsedMetaData) =>
            val keyIndex = unparsedMetaData.indexOf(linksKey)

            val links =
                if (keyIndex < 0) {
                    Array.empty[String]
                } else {
                    val arrayStart = keyIndex + linksKey.length
                    val arrayEnd = closingBracket(unparsedMetaData, arrayStart)
                    val linkSection = unparsedMetaData.substring(arrayStart, arrayEnd)

                    urlPattern.findAllMatchIn(linkSection).map(_.group(1)).toArray
                }

            (uri, links)
        }
    })
    .filter(_._2.nonEmpty)

uriLinkPairs.take(1).foreach { case (uri, links) =>
    println(s"[URL: $uri]")
    links.foreach(println)
}

[URL: http://022.holidayblog.jp/?m=20060610]
http://005.holidayblog.jp/wp-content/uploads/2008/02/home.gif
http://022.holidayblog.jp/
http://022.holidayblog.jp/?p=6828
http://022.holidayblog.jp/?p=6826
http://022.holidayblog.jp/?p=6825
http://022.holidayblog.jp/?p=6818
http://022.holidayblog.jp/?p=6817
http://022.holidayblog.jp/?cat=1
http://022.holidayblog.jp/?cat=3
http://022.holidayblog.jp/?m=200605
http://022.holidayblog.jp/?m=200607
http://022.holidayblog.jp/?m=20060601
http://022.holidayblog.jp/?m=20060602
http://022.holidayblog.jp/?m=20060603
http://022.holidayblog.jp/?m=20060604
http://022.holidayblog.jp/?m=20060605
http://022.holidayblog.jp/?m=20060606
http://022.holidayblog.jp/?m=20060607
http://022.holidayblog.jp/?m=20060608
http://022.holidayblog.jp/?m=20060609
http://022.holidayblog.jp/?m=20060610
http://022.holidayblog.jp/?m=20060611
http://022.holidayblog.jp/?m=20060612
http://022.holidayblog.jp/?m=20060613
http://022.holidayblog.jp/?m=20060614
http://022.holidayblog.jp/

uriLinkPairs = MapPartitionsRDD[8] at filter at <console>:84


MapPartitionsRDD[8] at filter at <console>:84

In [5]:
// Keep only the links that start with an explicit http:// or https:// scheme.
// The result is cached.
val uriStaticLinkPairs = uriLinkPairs
    .map { case (uri, links) =>
        val protocolRegex = """^https?:\/\/.*"""
        val staticLinks = links.filter(link => link.matches(protocolRegex))
        (uri, staticLinks)
    }
    .cache()

// Print the first page together with its retained links.
uriStaticLinkPairs.take(1).foreach { case (uri, staticLinks) =>
    println(s"[URL: $uri]")
    staticLinks.foreach(println)
}

[URL: http://022.holidayblog.jp/?m=20060610]
http://005.holidayblog.jp/wp-content/uploads/2008/02/home.gif
http://022.holidayblog.jp/
http://022.holidayblog.jp/?p=6828
http://022.holidayblog.jp/?p=6826
http://022.holidayblog.jp/?p=6825
http://022.holidayblog.jp/?p=6818
http://022.holidayblog.jp/?p=6817
http://022.holidayblog.jp/?cat=1
http://022.holidayblog.jp/?cat=3
http://022.holidayblog.jp/?m=200605
http://022.holidayblog.jp/?m=200607
http://022.holidayblog.jp/?m=20060601
http://022.holidayblog.jp/?m=20060602
http://022.holidayblog.jp/?m=20060603
http://022.holidayblog.jp/?m=20060604
http://022.holidayblog.jp/?m=20060605
http://022.holidayblog.jp/?m=20060606
http://022.holidayblog.jp/?m=20060607
http://022.holidayblog.jp/?m=20060608
http://022.holidayblog.jp/?m=20060609
http://022.holidayblog.jp/?m=20060610
http://022.holidayblog.jp/?m=20060611
http://022.holidayblog.jp/?m=20060612
http://022.holidayblog.jp/?m=20060613
http://022.holidayblog.jp/?m=20060614
http://022.holidayblog.jp/

uriStaticLinkPairs = MapPartitionsRDD[9] at map at <console>:54


MapPartitionsRDD[9] at map at <console>:54

In [6]:
// Attach a unique Long index to every (uri, links) pair, producing (index, uri, links).
val uriLinkPairsWithIndex = uriStaticLinkPairs
    .zipWithIndex
    .map { case ((uri, links), index) => (index, uri, links) }

uriLinkPairsWithIndex.take(1).foreach(println)

(0,http://022.holidayblog.jp/?m=20060610,[Ljava.lang.String;@62c5c55f)


uriLinkPairsWithIndex = MapPartitionsRDD[11] at map at <console>:55


MapPartitionsRDD[11] at map at <console>:55

In [7]:
// One (sourceIndex, linkUrl) row per outgoing link of every indexed page.
val linksList = uriLinkPairsWithIndex.flatMap {
    case (index, _, links) => links.map(link => (index, link))
}

// One (index, uri) row per indexed page.
val uriList = uriLinkPairsWithIndex.map { case (index, uri, _) => (index, uri) }

// Print the first three rows of each RDD: links first, then pages.
Seq(linksList, uriList).foreach(rdd => rdd.take(3).foreach(println))

(0,http://005.holidayblog.jp/wp-content/uploads/2008/02/home.gif)
(0,http://022.holidayblog.jp/)
(0,http://022.holidayblog.jp/?p=6828)
(0,http://022.holidayblog.jp/?m=20060610)
(1,http://022.holidayblog.jp/?m=20090310)
(2,http://043nobs.com/2018/06/kelvin-ikeduba-predicts-super-eagles-match-against-argentina.html)


linksList = MapPartitionsRDD[12] at flatMap at <console>:57
uriList = MapPartitionsRDD[13] at map at <console>:63


MapPartitionsRDD[13] at map at <console>:63

In [8]:
// Convert both RDDs to cached DataFrames with named columns.
val linksDF = linksList
    .toDF("indexFrom", "to")
    .cache()
val uriDF = uriList
    .toDF("index", "uri")
    .cache()

linksDF.rdd.take(4).foreach(println)

// Print schema and a sample of rows for each DataFrame: links first, then URIs.
Seq(linksDF, uriDF).foreach { df =>
    df.printSchema()
    df.show()
}

[0,http://005.holidayblog.jp/wp-content/uploads/2008/02/home.gif]
[0,http://022.holidayblog.jp/]
[0,http://022.holidayblog.jp/?p=6828]
[0,http://022.holidayblog.jp/?p=6826]
root
 |-- indexFrom: long (nullable = false)
 |-- to: string (nullable = true)

+---------+--------------------+
|indexFrom|                  to|
+---------+--------------------+
|        0|http://005.holida...|
|        0|http://022.holida...|
|        0|http://022.holida...|
|        0|http://022.holida...|
|        0|http://022.holida...|
|        0|http://022.holida...|
|        0|http://022.holida...|
|        0|http://022.holida...|
|        0|http://022.holida...|
|        0|http://022.holida...|
|        0|http://022.holida...|
|        0|http://022.holida...|
|        0|http://022.holida...|
|        0|http://022.holida...|
|        0|http://022.holida...|
|        0|http://022.holida...|
|        0|http://022.holida...|
|        0|http://022.holida...|
|        0|http://022.holida...|
|        0|http://022

linksDF = [indexFrom: bigint, to: string]
uriDF = [index: bigint, uri: string]


[index: bigint, uri: string]

In [9]:
// Inner-join every link with the indexed page whose uri equals the link target.
// The result keeps all four columns (indexFrom, to, index, uri) and is cached.
// Not applied here (left disabled in the original): renaming "index" to "indexTo" and
// projecting down to ("indexFrom", "indexTo").
val linkTargetsKnownPage = linksDF("to") === uriDF("uri")
val edgeDF = linksDF
    .join(uriDF, linkTargetsKnownPage)
    .cache()

edgeDF.show()


// Row counts, printed in this order: links, pages, joined rows where to != uri, joined rows.
Seq(
    linksDF.rdd.count,
    uriDF.rdd.count,
    edgeDF.filter("to != uri").rdd.count,
    edgeDF.rdd.count
).foreach(println)

edgeDF.rdd.take(10).foreach(println)

+---------+--------------------+-----+--------------------+
|indexFrom|                  to|index|                 uri|
+---------+--------------------+-----+--------------------+
|        0|http://022.holida...|    0|http://022.holida...|
|        1|http://022.holida...|    1|http://022.holida...|
|        2|http://043nobs.co...|    2|http://043nobs.co...|
|        6|http://0800hardwa...|    6|http://0800hardwa...|
|        7|http://09-news.ru...|    7|http://09-news.ru...|
|       32|http://11599.ru/?...|77786|http://11599.ru/?...|
|       34|http://119.245.17...|   34|http://119.245.17...|
|       38|http://123domainm...|   38|http://123domainm...|
|       39|http://123hacks.x...|   39|http://123hacks.x...|
|       47|http://12tomatoes...|   47|http://12tomatoes...|
|       47|http://12tomatoes...|   47|http://12tomatoes...|
|       48|http://1340thefan...|   48|http://1340thefan...|
|       49|http://1340thefan...|   49|http://1340thefan...|
|       68|http://1et2et3dou...|   68|ht

edgeDF = [indexFrom: bigint, to: string ... 2 more fields]


[indexFrom: bigint, to: string ... 2 more fields]

In [10]:
// val unconnectedGraph: Graph[String, String] = Graph(uriList, sc.emptyRDD)

Name: Syntax Error.
Message: 
StackTrace: 

In [11]:
// val linkGraph = unconnectedGraph

Name: Syntax Error.
Message: 
StackTrace: 