In [2]:
from graphframes import GraphFrame

# Create a graphframe out of SNAP's temporal dataset: https://snap.stanford.edu/data/email-Eu-core.html
# Dept3 is the one with the smallest number of edges: 12216
csv_path = "snap_temporal_graph_dataset/email-Eu-core-Dept3.txt"

# Read the space-delimited edge list with Spark's built-in CSV source
# (available since Spark 2.0; replaces the legacy external
# "com.databricks.spark.csv" package name).
# The first line of the file is treated as a header; the code below relies on
# that header naming the columns "src" and "dst".
# No schema is supplied or inferred, so every column is read as a string.
edges_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("delimiter", " ")
    .load(csv_path)
)

# Construct the required vertices dataframe, which requires a special "id" column.
# Every endpoint of every edge becomes a vertex; distinct() collapses the
# duplicates produced by stacking the two endpoint columns.
src_vertices_df = edges_df.selectExpr("src as id")
dst_vertices_df = edges_df.selectExpr("dst as id")
# union() is the non-deprecated spelling of unionAll(): it concatenates rows by
# column position and keeps duplicates.
vertices_df = src_vertices_df.union(dst_vertices_df).distinct()

# Disabled alternative input: a three-vertex toy graph. It is kept inside a
# bare string literal, so it is never executed. To use it, remove the
# surrounding triple quotes; it rebinds vertices_df and edges_df.
# NOTE(review): the recorded cell output (ids a, b, c) appears to come from
# this toy graph rather than from the CSV dataset -- re-run the cell to refresh.
"""
# Hello world example.
vertices_df = sqlContext.createDataFrame([
  ("a", "Alice", 34),
  ("b", "Bob", 36),
  ("c", "Charlie", 30),
], ["id", "name", "age"])
edges_df = sqlContext.createDataFrame([
  ("a", "b", "friend"),
  ("b", "c", "follow"),
  ("c", "b", "follow"),
], ["src", "dst", "relationship"])
"""

# Assemble the graph from the vertex ("id") and edge ("src"/"dst") dataframes.
g = GraphFrame(vertices_df, edges_df)

# Query: print each vertex's in-degree.
in_degrees_df = g.inDegrees
in_degrees_df.show()

# Run PageRank for a fixed 20 iterations, then print the per-vertex scores.
results = g.pageRank(resetProbability=0.01, maxIter=20)
ranked_vertices_df = results.vertices.select("id", "pagerank")
ranked_vertices_df.show()

+---+--------+
| id|inDegree|
+---+--------+
|  c|       1|
|  b|       2|
+---+--------+

+---+-------------------+
| id|           pagerank|
+---+-------------------+
|  a|               0.01|
|  c|0.27995525261339177|
|  b| 0.2808611427228327|
+---+-------------------+

