In [1]:
import org.apache.spark._
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions._ 
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.split

Intitializing Scala interpreter ...

Spark Web UI available at http://mbpdejebaptiste:4040
SparkContext available as 'sc' (version = 3.0.1, master = local[*], app id = local-1617955159152)
SparkSession available as 'spark'


import org.apache.spark._
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.split


## Chargement des données

In [25]:
// Read the karate-club MatrixMarket file as an RDD with one element per text line.
val karate_graph = sc.textFile(path = "data/soc-karate/soc-karate.mtx")


karate_graph: org.apache.spark.rdd.RDD[String] = data/soc-karate/soc-karate.mtx MapPartitionsRDD[66] at textFile at <console>:38


## Preprocess

In [26]:
// Wrap each raw line in a single-column DataFrame ("value") and tag it with an
// increasing 64-bit id so the original line order can be recovered later.
// monotonically_increasing_id() replaces the alias monotonicallyIncreasingId,
// which has been deprecated since Spark 2.0.
// Declared as `var` because the following preprocessing cells reassign it.
var df_karate = karate_graph.toDF.withColumn("id", monotonically_increasing_id())


df_karate: org.apache.spark.sql.DataFrame = [value: string, id: bigint]


In [27]:
// Number the lines 1..n following the id order, so `rank` is the 1-based line position.
df_karate = {
  val lineOrder = Window.orderBy(col("id"))
  df_karate.withColumn("rank", row_number().over(lineOrder))
}

df_karate: org.apache.spark.sql.DataFrame = [value: string, id: bigint ... 1 more field]


In [28]:
// Print the first 30 lines without truncating long values, to inspect the file header.
df_karate.show(numRows = 30, truncate = false)

+--------------------------------------------------------------------------------+----------+----+
|value                                                                           |id        |rank|
+--------------------------------------------------------------------------------+----------+----+
|%%MatrixMarket matrix coordinate pattern symmetric                              |0         |1   |
|%-------------------------------------------------------------------------------|1         |2   |
|% UF Sparse Matrix Collection, Tim Davis                                        |2         |3   |
|% http://www.cise.ufl.edu/research/sparse/matrices/Newman/karate                |3         |4   |
|% name: Newman/karate                                                           |4         |5   |
|% [Karate club, from Wayne Zachary, 1977]                                       |5         |6   |
|% id: 2399                                                                      |6         |7   |
|% date: 1

In [29]:
// Drop the MatrixMarket header: the leading '%' comment lines plus the single
// size line that follows them. The header length is derived from the data
// instead of being a hard-coded line count, so files with a different number
// of comment lines are handled as well.
// Relies on `rank` being the 1-based line number and on all comment lines
// preceding the data, as the MatrixMarket coordinate format prescribes.
df_karate = {
  val commentLineCount = df_karate.filter(col("value").startsWith("%")).count()
  val headerLineCount = commentLineCount + 1 // +1 for the "rows cols entries" size line
  df_karate.filter(col("rank") > headerLineCount)
}

df_karate: org.apache.spark.sql.DataFrame = [value: string, id: bigint ... 1 more field]


In [30]:
// The line-numbering helper columns are no longer needed; keep only the raw text.
df_karate = df_karate.drop(Seq("id", "rank"): _*)

df_karate: org.apache.spark.sql.DataFrame = [value: string]


In [31]:
// Split every line on a single space and expose the first two tokens as the
// source and destination vertex ids (still strings at this point).
val df_karate_splitted = {
  val fields = split(col("value"), "\\ ")
  df_karate.select(
    fields.getItem(0).as("srcId"),
    fields.getItem(1).as("dstID")
  )
}

df_karate_splitted: org.apache.spark.sql.DataFrame = [srcId: string, dstID: string]


In [32]:
// View the (srcId, dstID) string pairs as an RDD of generic Rows.
val rows: RDD[Row] = df_karate_splitted.rdd

rows: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[84] at rdd at <console>:39


## Création du graphe

In [85]:
// Turn every (src, dst) string pair into a GraphX edge carrying the constant attribute 1.
// A row that is not made of exactly two strings fails with a MatchError.
var a = rows.map { row =>
  val Row(src: String, dst: String) = row
  Edge(src.toLong, dst.toLong, 1)
}
// Attribute given to every vertex that Graph.fromEdges creates from the edge endpoints.
var default_user = (0, 0)
var graph = Graph.fromEdges(edges = a, defaultValue = default_user)

a: org.apache.spark.rdd.RDD[org.apache.spark.graphx.Edge[Int]] = MapPartitionsRDD[222] at map at <console>:42
default_user: (Int, Int) = (0,0)
graph: org.apache.spark.graphx.Graph[(Int, Int),Int] = org.apache.spark.graphx.impl.GraphImpl@60c8bc14


In [86]:
// Bring all edges back to the driver to inspect them (fine for a graph this small).
graph.edges.collect()

res37: Array[org.apache.spark.graphx.Edge[Int]] = Array(Edge(2,1,1), Edge(3,1,1), Edge(3,2,1), Edge(4,1,1), Edge(4,2,1), Edge(4,3,1), Edge(5,1,1), Edge(6,1,1), Edge(7,1,1), Edge(7,5,1), Edge(7,6,1), Edge(8,1,1), Edge(8,2,1), Edge(8,3,1), Edge(8,4,1), Edge(9,1,1), Edge(9,3,1), Edge(10,3,1), Edge(11,1,1), Edge(11,5,1), Edge(11,6,1), Edge(12,1,1), Edge(13,1,1), Edge(13,4,1), Edge(14,1,1), Edge(14,2,1), Edge(14,3,1), Edge(14,4,1), Edge(17,6,1), Edge(17,7,1), Edge(18,1,1), Edge(18,2,1), Edge(20,1,1), Edge(20,2,1), Edge(22,1,1), Edge(22,2,1), Edge(26,24,1), Edge(26,25,1), Edge(28,3,1), Edge(28,24,1), Edge(28,25,1), Edge(29,3,1), Edge(30,24,1), Edge(30,27,1), Edge(31,2,1), Edge(31,9,1), Edge(32,1,1), Edge(32,25,1), Edge(32,26,1), Edge(32,29,1), Edge(33,3,1), Edge(33,9,1), Edge(33,15,1), Edge(3...


In [87]:
// Bring all vertices back to the driver; each one carries the default (0, 0) attribute.
graph.vertices.collect()

res38: Array[(org.apache.spark.graphx.VertexId, (Int, Int))] = Array((20,(0,0)), (13,(0,0)), (19,(0,0)), (34,(0,0)), (15,(0,0)), (4,(0,0)), (21,(0,0)), (16,(0,0)), (22,(0,0)), (25,(0,0)), (28,(0,0)), (29,(0,0)), (11,(0,0)), (14,(0,0)), (32,(0,0)), (30,(0,0)), (24,(0,0)), (27,(0,0)), (33,(0,0)), (23,(0,0)), (1,(0,0)), (6,(0,0)), (17,(0,0)), (3,(0,0)), (7,(0,0)), (9,(0,0)), (8,(0,0)), (12,(0,0)), (18,(0,0)), (31,(0,0)), (26,(0,0)), (10,(0,0)), (5,(0,0)), (2,(0,0)))


In [97]:
// Replace each vertex attribute with the pair (vertex id, degree).
// outerJoinVertices hands the joined value over as an Option; a vertex that is
// absent from graph.degrees gets degree 0 instead of throwing
// NoSuchElementException through Option.get.
val newgraph = graph.outerJoinVertices(graph.degrees) { (vertexId, _, degree) =>
  (vertexId, degree.getOrElse(0))
}

newgraph: org.apache.spark.graphx.Graph[(org.apache.spark.graphx.VertexId, Int),Int] = org.apache.spark.graphx.impl.GraphImpl@407eb64c


In [98]:
// Bring the (vertex id, (vertex id, degree)) pairs back to the driver for inspection.
newgraph.vertices.collect()

res45: Array[(org.apache.spark.graphx.VertexId, (org.apache.spark.graphx.VertexId, Int))] = Array((20,(20,3)), (13,(13,2)), (19,(19,2)), (34,(34,17)), (15,(15,2)), (4,(4,6)), (21,(21,2)), (16,(16,2)), (22,(22,2)), (25,(25,3)), (28,(28,4)), (29,(29,3)), (11,(11,3)), (14,(14,5)), (32,(32,6)), (30,(30,4)), (24,(24,5)), (27,(27,2)), (33,(33,12)), (23,(23,2)), (1,(1,16)), (6,(6,4)), (17,(17,2)), (3,(3,10)), (7,(7,4)), (9,(9,5)), (8,(8,4)), (12,(12,1)), (18,(18,2)), (31,(31,4)), (26,(26,3)), (10,(10,2)), (5,(5,3)), (2,(2,9)))


In [None]:
val sigma_in[Long:i] = 