In [None]:
# Create a Vertex DataFrame with unique ID column "id"
#v = sqlContext.createDataFrame([
#  ("a", "Alice", 34),
#  ("b", "Bob", 36),
#  ("c", "Charlie", 30),
#], ["id", "name", "age"])

# Create an Edge DataFrame with "src" and "dst" columns
#e = sqlContext.createDataFrame([
#  ("a", "b", "friend"),
#  ("b", "c", "follow"),
#  ("c", "b", "follow"),
#], ["src", "dst", "relationship"])
#g = GraphFrame(v, e)

# Create a GraphFrame out of SNAP's temporal dataset: https://snap.stanford.edu/data/email-Eu-core.html
# Dept3 is the one with the smallest number of edges: 12216

csv_path = "snap_temporal_graph_dataset/email-Eu-core-Dept3.txt"

# Load the space-delimited edge list with Spark's built-in CSV reader (the
# legacy "com.databricks.spark.csv" source name is not needed when a `spark`
# session is available). The first line of the file is consumed as the header
# and supplies the column names; no schema inference is requested, so every
# column is read as a string.
edges_df = spark.read.csv(csv_path, header=True, sep=" ")

print(edges_df.head())

# Construct the required vertices dataframe, which requires a special "id" column.
# Every endpoint that appears as either a source or a destination of an edge
# becomes a vertex; distinct() collapses the repeats.
vertices_df = (
    edges_df.selectExpr("src as id")
    .union(edges_df.selectExpr("dst as id"))
    .distinct()
)

print(vertices_df.head())

# Create a GraphFrame
g = GraphFrame(vertices_df, edges_df)

# Query: Get in-degree of each vertex.
g.inDegrees.show()

# Run PageRank algorithm for a fixed number of iterations, and show results.
results = g.pageRank(resetProbability=0.01, maxIter=20)
results.vertices.select("id", "pagerank").show()

Row(src='11', dst='39', time='0')
Row(id='51')
+---+--------+
| id|inDegree|
+---+--------+
|  7|      87|
| 51|      11|
| 54|     615|
| 15|     219|
| 11|     110|
| 69|      29|
| 29|      35|
| 42|      48|
| 87|     111|
| 73|     225|
| 64|     181|
|  3|      19|
| 30|     226|
| 34|       9|
| 59|      51|
|  8|     296|
| 22|      43|
| 28|      10|
| 85|     158|
| 16|     378|
+---+--------+
only showing top 20 rows

