In [1]:
import org.apache.spark._
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions._ 
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.split

Intitializing Scala interpreter ...

Spark Web UI available at http://mbp-de-gregoire:4044
SparkContext available as 'sc' (version = 3.0.1, master = local[*], app id = local-1618492427417)
SparkSession available as 'spark'


import org.apache.spark._
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.split


## Load des données

In [2]:
val karate_graph = sc.textFile("data/soc-karate/soc-karate.mtx")


karate_graph: org.apache.spark.rdd.RDD[String] = data/soc-karate/soc-karate.mtx MapPartitionsRDD[1] at textFile at <console>:38


## Preprocess

In [3]:
var df_karate = karate_graph.toDF.withColumn("id",monotonicallyIncreasingId)


df_karate: org.apache.spark.sql.DataFrame = [value: string, id: bigint]


In [4]:
df_karate = df_karate.withColumn("rank", row_number().over(Window.orderBy("id")))

df_karate: org.apache.spark.sql.DataFrame = [value: string, id: bigint ... 1 more field]


In [5]:
df_karate.show(30,false)

+--------------------------------------------------------------------------------+----------+----+
|value                                                                           |id        |rank|
+--------------------------------------------------------------------------------+----------+----+
|%%MatrixMarket matrix coordinate pattern symmetric                              |0         |1   |
|%-------------------------------------------------------------------------------|1         |2   |
|% UF Sparse Matrix Collection, Tim Davis                                        |2         |3   |
|% http://www.cise.ufl.edu/research/sparse/matrices/Newman/karate                |3         |4   |
|% name: Newman/karate                                                           |4         |5   |
|% [Karate club, from Wayne Zachary, 1977]                                       |5         |6   |
|% id: 2399                                                                      |6         |7   |
|% date: 1

In [6]:
df_karate = df_karate.filter(df_karate("rank")>24)

df_karate: org.apache.spark.sql.DataFrame = [value: string, id: bigint ... 1 more field]


In [7]:
df_karate = df_karate.drop("id","rank")

df_karate: org.apache.spark.sql.DataFrame = [value: string]


In [8]:
val df_karate_splitted = df_karate.withColumn("_tmp", split($"value", "\\ ")).select(
  $"_tmp".getItem(0).as("srcId"),
  $"_tmp".getItem(1).as("dstID"),
)

df_karate_splitted: org.apache.spark.sql.DataFrame = [srcId: string, dstID: string]


In [9]:
val rows: RDD[Row] = df_karate_splitted.rdd

rows: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[19] at rdd at <console>:39


## Creation du graphe

In [10]:
var a= rows.map{ case Row(src:String, dist : String) => Edge(src.toLong, dist.toLong,1)}
var default_user=(0, 0)
var graph = Graph.fromEdges(a, default_user)

a: org.apache.spark.rdd.RDD[org.apache.spark.graphx.Edge[Int]] = MapPartitionsRDD[20] at map at <console>:39
default_user: (Int, Int) = (0,0)
graph: org.apache.spark.graphx.Graph[(Int, Int),Int] = org.apache.spark.graphx.impl.GraphImpl@41911ee7


In [11]:
graph.edges.collect()

res1: Array[org.apache.spark.graphx.Edge[Int]] = Array(Edge(2,1,1), Edge(3,1,1), Edge(3,2,1), Edge(4,1,1), Edge(4,2,1), Edge(4,3,1), Edge(5,1,1), Edge(6,1,1), Edge(7,1,1), Edge(7,5,1), Edge(7,6,1), Edge(8,1,1), Edge(8,2,1), Edge(8,3,1), Edge(8,4,1), Edge(9,1,1), Edge(9,3,1), Edge(10,3,1), Edge(11,1,1), Edge(11,5,1), Edge(11,6,1), Edge(12,1,1), Edge(13,1,1), Edge(13,4,1), Edge(14,1,1), Edge(14,2,1), Edge(14,3,1), Edge(14,4,1), Edge(17,6,1), Edge(17,7,1), Edge(18,1,1), Edge(18,2,1), Edge(20,1,1), Edge(20,2,1), Edge(22,1,1), Edge(22,2,1), Edge(26,24,1), Edge(26,25,1), Edge(28,3,1), Edge(28,24,1), Edge(28,25,1), Edge(29,3,1), Edge(30,24,1), Edge(30,27,1), Edge(31,2,1), Edge(31,9,1), Edge(32,1,1), Edge(32,25,1), Edge(32,26,1), Edge(32,29,1), Edge(33,3,1), Edge(33,9,1), Edge(33,15,1), Edge(33...


In [12]:
graph.vertices.collect()

res2: Array[(org.apache.spark.graphx.VertexId, (Int, Int))] = Array((20,(0,0)), (13,(0,0)), (19,(0,0)), (34,(0,0)), (15,(0,0)), (4,(0,0)), (21,(0,0)), (16,(0,0)), (22,(0,0)), (25,(0,0)), (28,(0,0)), (29,(0,0)), (11,(0,0)), (14,(0,0)), (32,(0,0)), (30,(0,0)), (24,(0,0)), (27,(0,0)), (33,(0,0)), (23,(0,0)), (1,(0,0)), (6,(0,0)), (17,(0,0)), (3,(0,0)), (7,(0,0)), (9,(0,0)), (8,(0,0)), (12,(0,0)), (18,(0,0)), (31,(0,0)), (26,(0,0)), (10,(0,0)), (5,(0,0)), (2,(0,0)))


In [13]:
val newgraph = graph.outerJoinVertices(graph.degrees)((index,_,deg) => (index,deg.get))

newgraph: org.apache.spark.graphx.Graph[(org.apache.spark.graphx.VertexId, Int),Int] = org.apache.spark.graphx.impl.GraphImpl@4f6d1cc6


In [14]:
newgraph.triplets.collect()

res3: Array[org.apache.spark.graphx.EdgeTriplet[(org.apache.spark.graphx.VertexId, Int),Int]] = Array(((2,(2,9)),(1,(1,16)),1), ((3,(3,10)),(1,(1,16)),1), ((3,(3,10)),(2,(2,9)),1), ((4,(4,6)),(1,(1,16)),1), ((4,(4,6)),(2,(2,9)),1), ((4,(4,6)),(3,(3,10)),1), ((5,(5,3)),(1,(1,16)),1), ((6,(6,4)),(1,(1,16)),1), ((7,(7,4)),(1,(1,16)),1), ((7,(7,4)),(5,(5,3)),1), ((7,(7,4)),(6,(6,4)),1), ((8,(8,4)),(1,(1,16)),1), ((8,(8,4)),(2,(2,9)),1), ((8,(8,4)),(3,(3,10)),1), ((8,(8,4)),(4,(4,6)),1), ((9,(9,5)),(1,(1,16)),1), ((9,(9,5)),(3,(3,10)),1), ((10,(10,2)),(3,(3,10)),1), ((11,(11,3)),(1,(1,16)),1), ((11,(11,3)),(5,(5,3)),1), ((11,(11,3)),(6,(6,4)),1), ((12,(12,1)),(1,(1,16)),1), ((13,(13,2)),(1,(1,16)),1), ((13,(13,2)),(4,(4,6)),1), ((14,(14,5)),(1,(1,16)),1), ((14,(14,5)),(2,(2,9)),1), ((14,(14,...


In [15]:
newgraph.vertices.collect()

res4: Array[(org.apache.spark.graphx.VertexId, (org.apache.spark.graphx.VertexId, Int))] = Array((20,(20,3)), (13,(13,2)), (19,(19,2)), (34,(34,17)), (15,(15,2)), (4,(4,6)), (21,(21,2)), (16,(16,2)), (22,(22,2)), (25,(25,3)), (28,(28,4)), (29,(29,3)), (11,(11,3)), (14,(14,5)), (32,(32,6)), (30,(30,4)), (24,(24,5)), (27,(27,2)), (33,(33,12)), (23,(23,2)), (1,(1,16)), (6,(6,4)), (17,(17,2)), (3,(3,10)), (7,(7,4)), (9,(9,5)), (8,(8,4)), (12,(12,1)), (18,(18,2)), (31,(31,4)), (26,(26,3)), (10,(10,2)), (5,(5,3)), (2,(2,9)))


In [16]:
def sigmaIn2(comId: Long, graph : Graph[(Long, Int),Int]): Double = {
    var graphVar = graph.triplets.filter((triplet => triplet.srcAttr._1==comId && triplet.dstAttr._1==comId))
    return graphVar.count.toDouble
}

def kiin2(i:Long, comId : Long, graph : Graph[(Long, Int),Int]) : Double = {
    var graphVar = graph.triplets.filter((triplet => (triplet.srcAttr._1==i && triplet.dstAttr._1==comId) ||(triplet.srcAttr._1==comId && triplet.dstAttr._1==i)))
    return graphVar.count.toDouble
}

def sigmaTot2(comId: Long, graph : Graph[(Long, Int),Int]): Double = {
    var graphVar = graph.triplets.filter((triplet => (triplet.srcAttr._1==comId && triplet.dstAttr._1!=comId)||(triplet.srcAttr._1!=comId && triplet.dstAttr._1==comId)))
    return graphVar.count.toDouble
}

sigmaIn2: (comId: Long, graph: org.apache.spark.graphx.Graph[(Long, Int),Int])Double
kiin2: (i: Long, comId: Long, graph: org.apache.spark.graphx.Graph[(Long, Int),Int])Double
sigmaTot2: (comId: Long, graph: org.apache.spark.graphx.Graph[(Long, Int),Int])Double


In [17]:
def sigmaIn1(i:Long, comId: Long, graph : Graph[(Long, Int),Int]): Double = {
    var graphVar = graph.triplets.filter((triplet => (triplet.srcAttr._1==comId && triplet.dstAttr._1==comId) &&
                                                    !(triplet.srcId==i || triplet.dstId==i)))
    return graphVar.count.toDouble

}


def sigmaTot1(i:Long, comId: Long, graph : Graph[(Long, Int),Int]): Double = {
    var graphVar = graph.triplets.filter((triplet => (triplet.srcAttr._1==comId && triplet.dstAttr._1!=comId)||(triplet.srcAttr._1!=comId && triplet.dstAttr._1==comId) && !(triplet.srcId==i || triplet.dstId==i)))
    return graphVar.count.toDouble
}

sigmaIn1: (i: Long, comId: Long, graph: org.apache.spark.graphx.Graph[(Long, Int),Int])Double
sigmaTot1: (i: Long, comId: Long, graph: org.apache.spark.graphx.Graph[(Long, Int),Int])Double


In [18]:
def ki(i: Long, graph : Graph[(Long, Int),Int]): Double = {
    var graphVar = graph.triplets.filter((triplet => triplet.srcId==i || triplet.dstId==i))
    return graphVar.count.toDouble
}

def kiin(i:Long, comId : Long, graph : Graph[(Long, Int),Int]) : Double = {
    var graphVar = graph.triplets.filter((triplet => (triplet.srcAttr._1==i && triplet.dstAttr._1==comId) ||(triplet.srcAttr._1==comId && triplet.dstAttr._1==i)))
    return graphVar.count.toDouble
}

ki: (i: Long, graph: org.apache.spark.graphx.Graph[(Long, Int),Int])Double
kiin: (i: Long, comId: Long, graph: org.apache.spark.graphx.Graph[(Long, Int),Int])Double


In [32]:
def dq(sigmain:Double, sigmatot:Double, kivar:Double, kiinvar:Double, m:Long):Double = {
    return (((sigmain+kiinvar)/(2*m) - scala.math.pow((sigmatot+kivar)/(2*m),2)) - (sigmain/(2*m) - scala.math.pow(sigmatot/(2*m),2) - scala.math.pow(kivar/(2*m),2)))
}

dq: (sigmain: Double, sigmatot: Double, kivar: Double, kiinvar: Double, m: Long)Double


In [23]:
def maxCom(setDq : Set[(Int, Double)]) : (Int, Double) = {
    var max = -1000.0
    var ind = -1
    setDq.foreach(x => {
        if (x._2>max)
        {
            ind=x._1
            max=x._2
        }
    })
    return (ind,max)
}

maxCom: (setDq: Set[(Int, Double)])(Int, Double)


In [24]:
maxCom(setDq)

res6: (Int, Double) = (12,0.007560815253122938)


In [25]:
def mooveItoC(i:Int, comId : Int, graph:Graph[(VertexId, Int),Int]) : Graph[(VertexId, Int),Int] = {
    val newVertices = graph.mapVertices { case (id, attr) => if (id==i) (comId.toLong, attr._2) else attr }
    return newVertices
}

mooveItoC: (i: Int, comId: Int, graph: org.apache.spark.graphx.Graph[(org.apache.spark.graphx.VertexId, Int),Int])org.apache.spark.graphx.Graph[(org.apache.spark.graphx.VertexId, Int),Int]


In [42]:
def communities(graph:Graph[(Long,Int),Int]) : Graph[(Long,Int),Int] = {
    var currentGraph = graph
    var nbVertices = graph.vertices.count
    var m = graph.edges.count

    var i = 0
    var keepGoing = true
    var j=0
    for (j <- 1 to 10){
        println(j)
        keepGoing=false
        println(((currentGraph.vertices.collect { case t => t._2._1 }).collect.distinct).size)
        for (i<- 1 to nbVertices.toInt){
            println("Noeud considéré : "+i+ '\n')
            /*get the community and ki */
            var vertice = (currentGraph.vertices.collect{ case t if t._1 == i => t._2}).collect
            var comId = vertice(0)._1.toInt
            println("Communauté de "+i+" : "+comId)
            var kivar = vertice(0)._2

            /* calculate dq1 */
            var kiin1 = kiin(i, comId, currentGraph)
            var simgain1 = sigmaIn1(i, comId, currentGraph)
            var simgatot1 = sigmaTot1(i, comId, currentGraph)
            var dq1 = dq(sigmain1, sigmatot1, kivar, kiin1, m)

            /*look all combis : we need to know how many communities are left and which one*/
            var communities = (currentGraph.vertices.collect { case t => t._2._1 }).collect.distinct

            var setDq =Set(): Set[(Int,Double)]
            communities.foreach( a => {
                if (a!=comId){
                    var sigmain2 = sigmaIn2(a, newgraph)
                    var sigmatot2 = sigmaTot2(a, newgraph)
                    var kiin2 = kiin(i,a, newgraph)
                    var dq2 = dq(sigmain2, sigmatot2, kivar, kiin2, m)
                    setDq+=((a.toInt,dq2))
                }
            })

            /* get best combi and moove if larger than 0 */
            var bestCombi = maxCom(setDq)
            if (bestCombi._2 > 0 ) {
                
                var communities = (currentGraph.vertices.collect { case t => t._2._1 }).collect.distinct
                println("On met "+ i +" dans "+bestCombi._1+ '\n')
                var updatedGraph = mooveItoC(i, bestCombi._1, currentGraph)
                currentGraph=updatedGraph

                /*Un changement est fait*/
                keepGoing=true

            }
            
    }
    println(((currentGraph.vertices.collect { case t => t._2._1 }).collect.distinct).size)
    }
    return currentGraph
}

communities: (graph: org.apache.spark.graphx.Graph[(Long, Int),Int])org.apache.spark.graphx.Graph[(Long, Int),Int]


In [43]:
var a =communities(newgraph)

1
34
Noeud considéré : 1

Communauté de 1 : 1
On met 1 dans 12

Noeud considéré : 2

Communauté de 2 : 2
On met 2 dans 22

Noeud considéré : 3

Communauté de 3 : 3
On met 3 dans 10

Noeud considéré : 4

Communauté de 4 : 4
On met 4 dans 13

Noeud considéré : 5

Communauté de 5 : 5
On met 5 dans 11

Noeud considéré : 6

Communauté de 6 : 6
On met 6 dans 17

Noeud considéré : 7

Communauté de 7 : 7
On met 7 dans 17

Noeud considéré : 8

Communauté de 8 : 8
Noeud considéré : 9

Communauté de 9 : 9
On met 9 dans 31

Noeud considéré : 10

Communauté de 10 : 10
On met 10 dans 34

Noeud considéré : 11

Communauté de 11 : 11
Noeud considéré : 12

Communauté de 12 : 12
Noeud considéré : 13

Communauté de 13 : 13
Noeud considéré : 14

Communauté de 14 : 14
Noeud considéré : 15

Communauté de 15 : 15
On met 15 dans 33

Noeud considéré : 16

Communauté de 16 : 16
On met 16 dans 33

Noeud considéré : 17

Communauté de 17 : 17
Noeud considéré : 18

Communauté de 18 : 18
Noeud considéré : 19

Communa

In [41]:
(a.vertices.collect{ case t => t._2._1 }).collect.distinct

res7: Array[Long] = Array(34, 13, 33, 30, 22, 32, 11, 14, 12, 17, 8, 18)


In [205]:
true

res58: Boolean = true
