In [1]:
# Dump the active Spark configuration to sanity-check it.
spark_conf = sc._conf
print(spark_conf.getAll())

executor_memory = spark_conf.get('spark.executor.memory')
print("executor memory: %s" % executor_memory)

# Spark SQL shuffles into 200 partitions by default, which is excessive here.
# Lower it to something more reasonable, e.g., twice the number of cores.
sqlContext.setConf('spark.sql.shuffle.partitions', '4')

[('hive.metastore.warehouse.dir', 'file:/Users/jfon/src/github.com/file_access_monitor/spark-warehouse'), ('spark.submit.pyFiles', '/Users/jfon/.ivy2/jars/graphframes_graphframes-0.4.0-spark2.0-s_2.11.jar,/Users/jfon/.ivy2/jars/com.typesafe.scala-logging_scala-logging-api_2.11-2.1.2.jar,/Users/jfon/.ivy2/jars/com.typesafe.scala-logging_scala-logging-slf4j_2.11-2.1.2.jar,/Users/jfon/.ivy2/jars/org.scala-lang_scala-reflect-2.11.0.jar,/Users/jfon/.ivy2/jars/org.slf4j_slf4j-api-1.7.7.jar'), ('spark.app.id', 'local-1494358469470'), ('spark.executor.id', 'driver'), ('spark.app.name', 'PySparkShell'), ('spark.driver.host', '128.30.10.236'), ('spark.sql.catalogImplementation', 'hive'), ('spark.rdd.compress', 'True'), ('spark.driver.port', '55022'), ('spark.files', 'file:/Users/jfon/.ivy2/jars/graphframes_graphframes-0.4.0-spark2.0-s_2.11.jar,file:/Users/jfon/.ivy2/jars/com.typesafe.scala-logging_scala-logging-api_2.11-2.1.2.jar,file:/Users/jfon/.ivy2/jars/com.typesafe.scala-logging_scala-loggi

In [5]:
import time
from graphframes import GraphFrame

# Raise the log level to silence the WARN messages about block locks not yet
# released by tasks.
sc.setLogLevel('ERROR')

# The default number of partitions for the shuffle RDD is a whopping 200.
# Set it to a more sensible number, e.g., twice the number of cores.
sqlContext.setConf('spark.sql.shuffle.partitions', '4')

# --- Load the edge list and build the GraphFrame -----------------------------
# NOTE(review): Spark transformations are lazy, so this timing presumably covers
# mostly plan construction rather than a full scan of the file; the first
# action below (inDegrees.show()) is likely where the data is actually read.
t0 = int(round(time.time() * 1000))
# Create a graphframe out of SNAP's temporal dataset: https://snap.stanford.edu/data/email-Eu-core.html
# Dept3 is the one with the smallest number of edges: 12216
num_rows = 100
csv_path = "snap_temporal_graph_dataset/email-Eu-core-Dept3-%s.txt" % num_rows
edges_df = (
    spark.read.format("com.databricks.spark.csv")
    .option("header", "true")
    .option("delimiter", " ")
    .load(csv_path)
)

# Construct the required vertices dataframe, which requires a special "id" column:
# every distinct endpoint (source or destination) of an edge is a vertex.
src_vertices_df = edges_df.selectExpr("src as id")
dst_vertices_df = edges_df.selectExpr("dst as id")
# union() replaces unionAll(), which is deprecated since Spark 2.0; both keep
# duplicates, hence the explicit distinct().
vertices_df = src_vertices_df.union(dst_vertices_df).distinct()

# Create a GraphFrame
g = GraphFrame(vertices_df, edges_df)
t1 = int(round(time.time() * 1000))

print('Time to load %s rows into a GraphFrame: %s ms' % (num_rows, t1 - t0))

# --- Query: get the in-degree of each vertex ---------------------------------
t0 = int(round(time.time() * 1000))
g.inDegrees.show()
t1 = int(round(time.time() * 1000))
print('Time to calculate inDegrees: %s ms\n' % (t1 - t0))

# --- Run the PageRank algorithm and show the results -------------------------
# Keep the iteration count in one place so the timing message cannot drift from
# the value actually passed to pageRank().
pagerank_max_iter = 10
t0 = int(round(time.time() * 1000))
results = g.pageRank(resetProbability=0.01, maxIter=pagerank_max_iter)
results.vertices.select("id", "pagerank").show()
t1 = int(round(time.time() * 1000))
print('Time to calculate PageRank with %s iterations: %s ms' % (pagerank_max_iter, t1 - t0))

# --- Search for 3-hop motifs --------------------------------------------------
t0 = int(round(time.time() * 1000))
motifs = g.find("(v1)-[e1]->(v2); (v2)-[e2]->(v3); (v3)-[e3]->(v4)")
motifs.show()
t1 = int(round(time.time() * 1000))
print('Time to find 3-hop motifs: %s ms\n' % (t1 - t0))

Time to load 100 rows into a GraphFrame: 219 ms
+---+--------+
| id|inDegree|
+---+--------+
| 65|       7|
| 71|       1|
| 56|       4|
| 57|       1|
| 80|       1|
| 44|       1|
| 48|       3|
| 22|       3|
| 16|       3|
| 21|       3|
| 64|       2|
| 77|       2|
| 89|       1|
| 35|       3|
| 62|       2|
|  5|       1|
| 14|       1|
| 25|       2|
| 58|       1|
| 60|       9|
+---+--------+
only showing top 20 rows

Time to calculate inDegrees: 180 ms

+---+--------------------+
| id|            pagerank|
+---+--------------------+
| 69|0.011164705882352941|
| 65| 0.06396615154950808|
| 22| 0.05508933919807962|
| 85|                0.01|
| 87|                0.01|
| 83|                0.01|
| 24| 0.06664616106495666|
| 39|              0.0199|
| 64|           0.0297505|
| 66|0.012392823529411765|
| 44|0.010582352941176472|
| 77| 0.02204379374525956|
| 49|0.018263129167330337|
|  0|                0.01|
| 89|              0.0199|
| 67|0.026390565409436816|
| 57|0.025863796

In [None]:
# Four-hop motif search.
# Warning: the JVM crashes unless the driver memory is set to at least 8g.
four_hop_pattern = "(v1)-[e1]->(v2); (v2)-[e2]->(v3); (v3)-[e3]->(v4); (v4)-[e4]->(v5)"

t0 = int(round(time.time() * 1000))
motifs = g.find(four_hop_pattern)
motifs.show()
t1 = int(round(time.time() * 1000))

elapsed_ms = t1 - t0
print('Time to find 4-hop motifs: %s ms\n' % elapsed_ms)


In [None]:
# Five-hop motif search.
# Warning: the JVM crashes unless the driver memory is set to at least 8g.
# Even then, this run used up all memory and had not finished after 30 minutes,
# so it had to be killed.
five_hop_pattern = "(v1)-[e1]->(v2); (v2)-[e2]->(v3); (v3)-[e3]->(v4); (v4)-[e4]->(v5); (v5)-[e5]->(v6)"

t0 = int(round(time.time() * 1000))
motifs = g.find(five_hop_pattern)
motifs.show()
t1 = int(round(time.time() * 1000))

elapsed_ms = t1 - t0
print('Time to find 5-hop motifs: %s ms\n' % elapsed_ms)