In [None]:
# Sanity check: dump the active Spark configuration so it can be eyeballed.
# NOTE(review): `_conf` is a private attribute of the SparkContext; confirm whether
# the PySpark version in use offers a public accessor before relying on it.
active_conf = sc._conf
print(active_conf.getAll())

executor_memory = active_conf.get('spark.executor.memory')
print("executor memory: %s" % executor_memory)

# Spark SQL shuffles into 200 partitions by default, which is a whopping amount
# for this workload. Use a more sensible number, e.g., twice the number of cores.
shuffle_partitions = '4'
sqlContext.setConf('spark.sql.shuffle.partitions', shuffle_partitions)

In [23]:
import time
from graphframes import GraphFrame

# Silence the noisy WARN messages about block locks not yet released by tasks.
sc.setLogLevel('ERROR')

# The default number of partitions for the shuffle RDD is a whopping 200.
# Set it to a more sensible number, e.g., twice the number of cores.
sqlContext.setConf('spark.sql.shuffle.partitions', '4')


def _now_ms():
    """Return the current wall-clock time as an integer number of milliseconds."""
    return int(round(time.time() * 1000))


# Number of edge rows in the sampled input file; it is also part of the file name.
num_rows = 10000
csv_path = "snap_temporal_graph_dataset/email-Eu-core-Dept3-%s.txt" % num_rows

# PageRank settings live in variables so that the timing message printed below
# always reports the iteration count that was actually used.
pagerank_reset_probability = 0.01
pagerank_max_iter = 10

t0 = _now_ms()
# Create a graphframe out of SNAP's temporal dataset: https://snap.stanford.edu/data/email-Eu-core.html
# Dept3 is the one with the smallest number of edges: 12216
edges_df = (
    spark.read.format("com.databricks.spark.csv")
    .option("header", "true")
    .option("delimiter", " ")
    .load(csv_path)
)

# Construct the required vertices dataframe, which requires a special "id" column:
# every distinct endpoint (source or destination) of an edge becomes a vertex.
src_vertices_df = edges_df.selectExpr("src as id")
dst_vertices_df = edges_df.selectExpr("dst as id")
# `union` replaces `unionAll`, which is deprecated since Spark 2.0.
vertices_df = src_vertices_df.union(dst_vertices_df).distinct()

# Create a GraphFrame
g = GraphFrame(vertices_df, edges_df)
t1 = _now_ms()

print('Time to load %s rows into a GraphFrame: %s ms' % (num_rows, t1 - t0))

# Query: Get in-degree of each vertex
t0 = _now_ms()
g.inDegrees.show()
t1 = _now_ms()
print('Time to calculate inDegrees: %s ms\n' % (t1 - t0))

# Run PageRank algorithm, and show results.
t0 = _now_ms()
results = g.pageRank(resetProbability=pagerank_reset_probability, maxIter=pagerank_max_iter)
results.vertices.select("id", "pagerank").show()
t1 = _now_ms()
print('Time to calculate PageRank with %s iterations: %s ms' % (pagerank_max_iter, t1 - t0))

# Search for 3-hop motifs: chains of three directed edges through v1..v4.
t0 = _now_ms()
motifs = g.find("(v1)-[e1]->(v2); (v2)-[e2]->(v3); (v3)-[e3]->(v4)")
motifs.show()
t1 = _now_ms()
print('Time to find 3-hop motifs: %s ms\n' % (t1 - t0))

Time to load 10000 rows into a GraphFrame: 105 ms
+---+--------+
| id|inDegree|
+---+--------+
| 65|      95|
| 71|     166|
| 56|     184|
| 57|     227|
| 80|     160|
| 44|     129|
|  1|     198|
| 85|     129|
| 43|     191|
| 87|      70|
| 86|      17|
| 37|      47|
| 72|      10|
| 76|      67|
| 73|     134|
| 74|       3|
| 45|      32|
| 48|     266|
| 22|      32|
| 16|     310|
+---+--------+
only showing top 20 rows

Time to calculate inDegrees: 143 ms

+---+--------------------+
| id|            pagerank|
+---+--------------------+
| 69|0.027194786093570116|
| 65|  0.0683803164698065|
| 22|0.046185325149075196|
| 85| 0.14305979828332557|
| 87| 0.07204545141814425|
| 83| 0.08985114240233474|
| 24| 0.22661191032684153|
| 39|  0.0938801975463385|
| 19|  0.1447934596060821|
| 66| 0.11553720936932972|
| 56| 0.11249016108527567|
| 77|0.041955648745304046|
| 49|  0.2343406336730979|
|  0| 0.12780568268762965|
| 55|0.046118336810134976|
| 12|0.043738508164479654|
|  8|  0.07573

In [None]:
# Find and time 4-hop motifs: chains of four directed edges through v1..v5.
# Author's warning: the JVM crashes unless the driver memory is set to at least 8g.
four_hop_pattern = "(v1)-[e1]->(v2); (v2)-[e2]->(v3); (v3)-[e3]->(v4); (v4)-[e4]->(v5)"

t0 = int(round(time.time() * 1000))
motifs = g.find(four_hop_pattern)
motifs.show()
t1 = int(round(time.time() * 1000))

elapsed_ms = t1 - t0
print('Time to find 4-hop motifs: %s ms\n' % elapsed_ms)


In [None]:
# Find and time 5-hop motifs: chains of five directed edges through v1..v6.
# Author's warning: the JVM crashes unless the driver memory is set to at least 8g.
# Even then, this run used up all memory and had not finished after 30 minutes,
# so it had to be killed.
five_hop_pattern = "(v1)-[e1]->(v2); (v2)-[e2]->(v3); (v3)-[e3]->(v4); (v4)-[e4]->(v5); (v5)-[e5]->(v6)"

t0 = int(round(time.time() * 1000))
motifs = g.find(five_hop_pattern)
motifs.show()
t1 = int(round(time.time() * 1000))

elapsed_ms = t1 - t0
print('Time to find 5-hop motifs: %s ms\n' % elapsed_ms)