In [1]:
# Is this a sensible config, you ask? Dump every setting to find out.
spark_conf = sc._conf
print(spark_conf.getAll())

executor_memory = spark_conf.get('spark.executor.memory')
print("executor memory: %s" % executor_memory)

# The default number of partitions for the shuffle RDD is a whopping 200.
# Set it to a more sensible number, e.g., twice the number of cores.
sqlContext.setConf('spark.sql.shuffle.partitions', '4')

[('hive.metastore.warehouse.dir', 'file:/Users/jfon/src/github.com/file_access_monitor/spark-warehouse'), ('spark.submit.pyFiles', '/Users/jfon/.ivy2/jars/graphframes_graphframes-0.4.0-spark2.0-s_2.11.jar,/Users/jfon/.ivy2/jars/com.typesafe.scala-logging_scala-logging-api_2.11-2.1.2.jar,/Users/jfon/.ivy2/jars/com.typesafe.scala-logging_scala-logging-slf4j_2.11-2.1.2.jar,/Users/jfon/.ivy2/jars/org.scala-lang_scala-reflect-2.11.0.jar,/Users/jfon/.ivy2/jars/org.slf4j_slf4j-api-1.7.7.jar'), ('spark.driver.port', '60584'), ('spark.executor.id', 'driver'), ('spark.app.name', 'PySparkShell'), ('spark.app.id', 'local-1493911474737'), ('spark.driver.host', '128.30.10.236'), ('spark.sql.catalogImplementation', 'hive'), ('spark.rdd.compress', 'True'), ('spark.files', 'file:/Users/jfon/.ivy2/jars/graphframes_graphframes-0.4.0-spark2.0-s_2.11.jar,file:/Users/jfon/.ivy2/jars/com.typesafe.scala-logging_scala-logging-api_2.11-2.1.2.jar,file:/Users/jfon/.ivy2/jars/com.typesafe.scala-logging_scala-loggi

In [2]:
import time
from graphframes import GraphFrame

# Silence the noisy WARN messages about block locks not yet released by tasks.
sc.setLogLevel('ERROR')

# The default number of partitions for the shuffle RDD is a whopping 200.
# Set it to a more sensible number, e.g., twice the number of cores.
sqlContext.setConf('spark.sql.shuffle.partitions', '4')

# PageRank parameters, named once so the timing message printed below cannot
# drift away from the values actually passed to the algorithm.
PAGERANK_RESET_PROBABILITY = 0.01
PAGERANK_MAX_ITER = 10


def now_ms():
    """Return the current wall-clock time in whole milliseconds."""
    return int(round(time.time() * 1000))


# Hello world example (disabled; kept for reference).
# vertices_df = sqlContext.createDataFrame([
#   ("a", "Alice", 34),
#   ("b", "Bob", 36),
#   ("c", "Charlie", 30),
# ], ["id", "name", "age"])
# edges_df = sqlContext.createDataFrame([
#   ("a", "b", "friend"),
#   ("b", "c", "follow"),
#   ("c", "b", "follow"),
# ], ["src", "dst", "relationship"])

t0 = now_ms()
# Create a graphframe out of SNAP's temporal dataset: https://snap.stanford.edu/data/email-Eu-core.html
# Dept3 is the one with the smallest number of edges: 12216
csv_path = "snap_temporal_graph_dataset/email-Eu-core-Dept3.txt"
edges_df = (
    spark.read.format("com.databricks.spark.csv")
    .option("header", "true")
    .option("delimiter", " ")
    .load(csv_path)
)

# Construct the required vertices dataframe, which requires a special "id" column:
# every distinct endpoint (source or destination) of an edge is a vertex.
# `union` replaces `unionAll`, which is deprecated since Spark 2.0; duplicates
# are dropped by the explicit `distinct()` either way.
src_vertices_df = edges_df.selectExpr("src as id")
dst_vertices_df = edges_df.selectExpr("dst as id")
vertices_df = src_vertices_df.union(dst_vertices_df).distinct()

# Create a GraphFrame
g = GraphFrame(vertices_df, edges_df)
t1 = now_ms()

# NOTE(review): Spark DataFrames are evaluated lazily, so this figure probably
# reflects session/plan setup rather than a full scan of the file -- confirm
# before quoting it as a load time.
print('Time to load 12k rows into a GraphFrame: %s ms' % (t1 - t0))

# Query: Get in-degree of each vertex.
t0 = now_ms()
g.inDegrees.show()
t1 = now_ms()
print('Time to calculate inDegrees: %s ms\n' % (t1 - t0))

# Run PageRank algorithm, and show results.
t0 = now_ms()
results = g.pageRank(resetProbability=PAGERANK_RESET_PROBABILITY,
                     maxIter=PAGERANK_MAX_ITER)
results.vertices.select("id", "pagerank").show()
t1 = now_ms()
# The iteration count is taken from PAGERANK_MAX_ITER; the message used to
# hard-code "20" while the call above ran 10 iterations.
print('Time to calculate PageRank with %d iterations: %s ms'
      % (PAGERANK_MAX_ITER, t1 - t0))

# Search for 3-hop motifs.
t0 = now_ms()
motifs = g.find("(v1)-[e1]->(v2); (v2)-[e2]->(v3); (v3)-[e3]->(v4)")
motifs.show()
t1 = now_ms()
print('Time to find 3-hop motifs: %s ms\n' % (t1 - t0))

Time to load 12k rows into a GraphFrame: 11475 ms
+---+--------+
| id|inDegree|
+---+--------+
| 65|      95|
| 71|     208|
| 56|     209|
| 57|     237|
| 80|     196|
| 44|     144|
|  1|     250|
| 85|     158|
| 43|     223|
| 87|     111|
| 86|      26|
| 37|      58|
| 72|      13|
| 76|      86|
| 73|     225|
| 74|      33|
| 45|      52|
| 34|       9|
| 48|     313|
| 22|      43|
+---+--------+
only showing top 20 rows

Time to calculate inDegrees: 2422 ms

+---+--------------------+
| id|            pagerank|
+---+--------------------+
| 69|0.024883249635716774|
| 65| 0.05898142349628339|
| 22|0.046510701000258316|
| 85| 0.14353307284285072|
| 87| 0.09258392565502635|
| 83| 0.09200146441630479|
| 24| 0.22600703759798135|
| 39| 0.09597251148215888|
| 19| 0.16091622409896456|
| 66| 0.12988731511507848|
| 56| 0.10501267824313702|
| 77|0.042032555185090956|
| 49|  0.2206331570647032|
|  0| 0.15358480061920282|
| 55| 0.04139891881312825|
| 12| 0.04548237933360287|
|  8| 0.09122

In [3]:
# Search for 4-hop motifs.
# Heads-up: the JVM crashes unless the driver memory is set to at least 8g.
# Chain four consecutive edges: (v1)-[e1]->(v2); ... ; (v4)-[e4]->(v5)
four_hop_pattern = "; ".join(
    "(v%d)-[e%d]->(v%d)" % (hop, hop, hop + 1) for hop in range(1, 5)
)

start_ms = int(round(time.time() * 1000))
motifs = g.find(four_hop_pattern)
motifs.show()
elapsed_ms = int(round(time.time() * 1000)) - start_ms
print('Time to find 4-hop motifs: %s ms\n' % elapsed_ms)


+----+---------+----+----------------+----+---------------+---+---------------+----+
|  v1|       e1|  v2|              e2|  v3|             e3| v4|             e4|  v5|
+----+---------+----+----------------+----+---------------+---+---------------+----+
|[11]|[11,39,0]|[39]|[39,83,45279817]|[83]|[83,1,45282280]|[1]|[1,87,45049628]|[87]|
|[11]|[11,39,0]|[39]|[39,83,45279817]|[83]|[83,1,45282280]|[1]|[1,89,45029289]|[89]|
|[11]|[11,39,0]|[39]|[39,83,45279817]|[83]|[83,1,45282280]|[1]|[1,16,44942750]|[16]|
|[11]|[11,39,0]|[39]|[39,83,45279817]|[83]|[83,1,45282280]|[1]|[1,19,44938863]|[19]|
|[11]|[11,39,0]|[39]|[39,83,45279817]|[83]|[83,1,45282280]|[1]|[1,60,44364627]|[60]|
|[11]|[11,39,0]|[39]|[39,83,45279817]|[83]|[83,1,45282280]|[1]|[1,60,44355467]|[60]|
|[11]|[11,39,0]|[39]|[39,83,45279817]|[83]|[83,1,45282280]|[1]|[1,16,42610218]|[16]|
|[11]|[11,39,0]|[39]|[39,83,45279817]|[83]|[83,1,45282280]|[1]|[1,47,42441779]|[47]|
|[11]|[11,39,0]|[39]|[39,83,45279817]|[83]|[83,1,45282280]|[1]|[1

In [None]:
# Search for 5-hop motifs.
# Heads-up: the JVM crashes unless the driver memory is set to at least 8g.
# WARNING: when this was tried it used up all memory and still was not done
# after 30 minutes, so it had to be killed.
five_hop_edges = [
    "(v1)-[e1]->(v2)",
    "(v2)-[e2]->(v3)",
    "(v3)-[e3]->(v4)",
    "(v4)-[e4]->(v5)",
    "(v5)-[e5]->(v6)",
]
five_hop_pattern = "; ".join(five_hop_edges)

started_at_ms = int(round(time.time() * 1000))
motifs = g.find(five_hop_pattern)
motifs.show()
finished_at_ms = int(round(time.time() * 1000))
print('Time to find 5-hop motifs: %s ms\n' % str(finished_at_ms - started_at_ms))