In [1]:
import time
from graphframes import GraphFrame

# Raise the log threshold to ERROR to silence the noisy WARN messages about
# block locks not yet released by tasks.
# NOTE(review): `sc` is not defined in this notebook — presumably the
# SparkContext injected by the PySpark kernel; confirm.
sc.setLogLevel('ERROR')

# The default number of partitions for the shuffle RDD is a whopping 200.
# Set it to a more sensible number, e.g., twice the number of cores.
# NOTE(review): '4' presumably assumes a 2-core machine, and `sqlContext` is
# presumably also provided by the kernel — confirm for your environment.
sqlContext.setConf('spark.sql.shuffle.partitions', '4')

"""
# Hello world example.
vertices_df = sqlContext.createDataFrame([
  ("a", "Alice", 34),
  ("b", "Bob", 36),
  ("c", "Charlie", 30),
], ["id", "name", "age"])
edges_df = sqlContext.createDataFrame([
  ("a", "b", "friend"),
  ("b", "c", "follow"),
  ("c", "b", "follow"),
], ["src", "dst", "relationship"])
"""

t0 = int(round(time.time() * 1000))
# Create a GraphFrame out of SNAP's temporal email dataset:
# https://snap.stanford.edu/data/email-Eu-core-temporal.html
# Dept3 is the one with the smallest number of edges: 12216
csv_path = "snap_temporal_graph_dataset/email-Eu-core-Dept3.txt"

# The file is read as a space-delimited edge list with a header row; the code
# below relies on that header providing "src" and "dst" columns.
# Use the built-in "csv" source (available since Spark 2.0, which `spark.read`
# already implies) instead of the legacy "com.databricks.spark.csv" package name.
edges_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("delimiter", " ")
    .load(csv_path)
)

# Construct the required vertices dataframe, which requires a special "id" column.
# Every endpoint of every edge is a vertex, so collect both columns.
src_vertices_df = edges_df.selectExpr("src as id")
dst_vertices_df = edges_df.selectExpr("dst as id")
# union() replaces unionAll(), which is deprecated since Spark 2.0. Both keep
# duplicate rows, hence the explicit distinct().
vertices_df = src_vertices_df.union(dst_vertices_df).distinct()

# Create a GraphFrame
g = GraphFrame(vertices_df, edges_df)
t1 = int(round(time.time() * 1000))

# NOTE(review): Spark DataFrames are evaluated lazily, so this interval probably
# covers session warm-up and header parsing rather than a full scan of the
# 12k rows — confirm before quoting it as a load time.
print('Time to load 12k rows into a GraphFrame: %s ms' % str(t1 - t0))

# Note: spark.sql.shuffle.partitions was lowered from its default of 200 to 4 near the top of this cell.

# Query: in-degree of every vertex.
# show() prints only the first rows of the result (20 in the recorded output).
t0 = int(round(1000 * time.time()))
in_degrees_df = g.inDegrees
in_degrees_df.show()
t1 = int(round(1000 * time.time()))
# Elapsed wall-clock time in milliseconds; %s renders the int directly.
print('Time to calculate inDegrees: %s ms\n' % (t1 - t0))

# Run PageRank algorithm, and show results.
# The iteration count lives in one variable so that the timing message below
# always reports the value actually passed to pageRank(). Previously the
# message claimed 20 iterations while the call used maxIter=10.
pagerank_max_iter = 10
t0 = int(round(time.time() * 1000))
# NOTE(review): resetProbability=0.01 is far below the 0.15 that GraphFrames
# documents as its default — confirm this is intentional.
results = g.pageRank(resetProbability=0.01, maxIter=pagerank_max_iter)
# Only the vertex id and its computed rank are displayed.
results.vertices.select("id", "pagerank").show()
t1 = int(round(time.time() * 1000))
print('Time to calculate PageRank with %d iterations: %s ms' % (pagerank_max_iter, str(t1 - t0)))

# Search for 3-hop motifs: directed chains v1 -> v2 -> v3 -> v4 built from
# three edges (e1, e2, e3). The pattern is split per hop for readability; the
# adjacent literals concatenate to the exact same motif string.
t0 = int(round(1000 * time.time()))
motifs = g.find(
    "(v1)-[e1]->(v2); "
    "(v2)-[e2]->(v3); "
    "(v3)-[e3]->(v4)"
)
motifs.show()
t1 = int(round(1000 * time.time()))
# Elapsed wall-clock time in milliseconds; %s renders the int directly.
print('Time to find 3-hop motifs: %s ms\n' % (t1 - t0))

Time to load 12k rows into a GraphFrame: 9003 ms
+---+--------+
| id|inDegree|
+---+--------+
| 65|      95|
| 71|     208|
| 56|     209|
| 57|     237|
| 80|     196|
| 44|     144|
|  1|     250|
| 85|     158|
| 43|     223|
| 87|     111|
| 86|      26|
| 37|      58|
| 72|      13|
| 76|      86|
| 73|     225|
| 74|      33|
| 45|      52|
| 34|       9|
| 48|     313|
| 22|      43|
+---+--------+
only showing top 20 rows

Time to calculate inDegrees: 2034 ms

+---+--------------------+
| id|            pagerank|
+---+--------------------+
| 69|0.024883249635716774|
| 65| 0.05898142349628339|
| 22|0.046510701000258316|
| 85| 0.14353307284285072|
| 87| 0.09258392565502635|
| 83| 0.09200146441630479|
| 24| 0.22600703759798135|
| 39| 0.09597251148215888|
| 19| 0.16091622409896456|
| 66| 0.12988731511507848|
| 56| 0.10501267824313702|
| 77|0.042032555185090956|
| 49|  0.2206331570647032|
|  0| 0.15358480061920282|
| 55| 0.04139891881312825|
| 12| 0.04548237933360287|
|  8| 0.091222

In [None]:
# Search for 4-hop motifs: directed chains v1 -> v2 -> v3 -> v4 -> v5 built
# from four edges (e1..e4).
# WARNING: in the recorded run this cell never finished. motifs.show() raised a
# Py4JJavaError and py4j could no longer connect to the Java server, i.e. the
# JVM went down.
# NOTE(review): presumably the extra hop blows up the intermediate join size and
# exhausts JVM memory — confirm before re-running on this dataset.
t0 = int(round(1000 * time.time()))
motifs = g.find(
    "(v1)-[e1]->(v2); "
    "(v2)-[e2]->(v3); "
    "(v3)-[e3]->(v4); "
    "(v4)-[e4]->(v5)"
)
motifs.show()
t1 = int(round(1000 * time.time()))
# Elapsed wall-clock time in milliseconds; %s renders the int directly.
print('Time to find 4-hop motifs: %s ms\n' % (t1 - t0))


--- Logging error ---
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:59979)
Traceback (most recent call last):
  File "//anaconda/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2885, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-2-9ec2fe71027a>", line 4, in <module>
    motifs.show()
  File "/usr/local/spark-2.0.2/python/pyspark/sql/dataframe.py", line 287, in show
    print(self._jdf.showString(n, truncate))
  File "/usr/local/spark-2.0.2/python/lib/py4j-0.10.3-src.zip/py4j/java_gateway.py", line 1133, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/spark-2.0.2/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/usr/local/spark-2.0.2/python/lib/py4j-0.10.3-src.zip/py4j/protocol.py", line 319, in get_return_value
    format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: <unprintable Py4JJavaErr