In [0]:
# CS6350 - Assignment 3
# Analyzing Social Networks using GraphX/GraphFrame

In [0]:
# Karan Sandeep Risbud - KSR190005
# Ankit Upadhyay - AXU200010

In [0]:
ipFile = '/FileStore/tables/graphFrameip.txt'
opFile = '/FileStore/slashDotAnalysis'
raw = sc.textFile(ipFile)
raw.take(20)

Out[42]: ['# Directed graph (each unordered pair of nodes is saved once): Slashdot0902.txt ',
 '# Slashdot Zoo social network from February 0 2009',
 '# Nodes: 82168 Edges: 948464',
 '# FromNodeId\tToNodeId',
 '0\t0',
 '0\t1',
 '0\t2',
 '0\t3',
 '0\t4',
 '0\t5',
 '0\t6',
 '0\t7',
 '0\t8',
 '0\t9',
 '0\t10',
 '0\t11',
 '0\t12',
 '0\t13',
 '0\t14',
 '0\t15']

In [0]:
# Filtering the first 4 lines to use src and dst
data = raw.filter(lambda x: x[0] != '#').map(lambda x: x.split('\t')).map(lambda x: (x[0], x[1]))
data.take(10)

Out[43]: [('0', '0'),
 ('0', '1'),
 ('0', '2'),
 ('0', '3'),
 ('0', '4'),
 ('0', '5'),
 ('0', '6'),
 ('0', '7'),
 ('0', '8'),
 ('0', '9')]

In [0]:
# Edges
edges = data.toDF(["src", "dst"])
display(edges)

src,dst
0,0
0,1
0,2
0,3
0,4
0,5
0,6
0,7
0,8
0,9


In [0]:
# Vertices
nodes = sc.parallelize([i for i in range(82168)]) # Number of vertices is mentioned in raw data
nodes = nodes.map(lambda x: [x])
vertices = nodes.toDF(["id"])
display(vertices)

id
0
1
2
3
4
5
6
7
8
9


In [0]:
from graphframes import GraphFrame
from pyspark.sql.functions import desc

g = GraphFrame(vertices, edges)
g.cache()

Out[46]: GraphFrame(v:[id: bigint], e:[src: string, dst: string])

In [0]:
# Outdegree
outDeg = g.outDegrees
outDeg.orderBy(desc("outDegree")).show(5, False)
outDeg.coalesce(1).write.option("header", "true").csv(opFile + '0')

+----+---------+
|id  |outDegree|
+----+---------+
|2494|2511     |
|4805|2248     |
|398 |2209     |
|381 |1851     |
|226 |1701     |
+----+---------+
only showing top 5 rows



In [0]:
# Indegree
inDeg = g.inDegrees
inDeg.orderBy(desc("inDegree")).show(5, False)
inDeg.coalesce(1).write.option("header", "true").csv(opFile + '1')

+----+--------+
|id  |inDegree|
+----+--------+
|2494|2553    |
|398 |2355    |
|4805|2292    |
|381 |1862    |
|226 |1729    |
+----+--------+
only showing top 5 rows



In [0]:
# Insight
# The vertices 2494, 4805, 398, 381, 226 are having high inDegree as well as high outDegree
# This makes sense as the nodes, where there is incoming edges, must also have around equal number of outgoing edges and vice versa
# Or else the nodes would be only source nodes or sink nodes

In [0]:
# PageRank
ranks = g.pageRank(resetProbability=0.15, maxIter=50)
ranks.vertices.orderBy(desc("pagerank")).select("id", "pagerank").show(5)
ranks.vertices.orderBy(desc("pagerank")).select("id", "pagerank").coalesce(1).write.option("header", "true").csv(opFile + '2')

+----+------------------+
|  id|          pagerank|
+----+------------------+
|2494| 174.1905459272265|
| 398|160.96347805483958|
| 381| 149.3488675548866|
|4805|117.59322547619814|
|  37|117.17350525884429|
+----+------------------+
only showing top 5 rows



In [0]:
# Insight
# The vertices 2494, 398, 381, 4805 which are having high Indegree as well as high Outdegree are also in top of the PageRanking
# This makes sense as nodes having high incoming edges and high outgoing edges from important ndoes must have a high PageRanking

In [0]:
# Connected Components
sc.setCheckpointDir('/Users/ankitUp/graphframes_cps')

cc = g.connectedComponents()
cc.groupBy("component").count().orderBy(desc("count")).show(5)
cc.groupBy("component").count().orderBy(desc("count")).coalesce(1).write.option("header", "true").csv(opFile + '3')

+---------+-----+
|component|count|
+---------+-----+
|        0|82168|
+---------+-----+



In [0]:
# Insight
# The whole network is connected and no node is left out

In [0]:
# Triangle Count
tc = g.triangleCount()
tc.orderBy(desc("count")).show(5, False);
tc.orderBy(desc("count")).coalesce(1).write.option("header", "true").csv(opFile + '4')

+-----+----+
|count|id  |
+-----+----+
|15666|49  |
|12456|195 |
|11590|398 |
|11452|1723|
|11191|342 |
+-----+----+
only showing top 5 rows



In [0]:
# Insight
# Only 398 from Indeg and Outdeg node is present in Triangle Count
# This suggests that though other nodes have higher indeg and outdeg, only 398 if having edges from A and B
# A and B are also connected to form a triangle