In [None]:
# pyspark --packages graphframes:graphframes:0.8.2-spark3.2-s_2.12

In [None]:
# WITH "https://github.com/neo4j-graph-analytics/book/raw/master/data/" AS base
# WITH base + "transport-nodes.csv" AS uri
# LOAD CSV WITH HEADERS FROM uri AS row
# MERGE (place:Place {id:row.id})
# SET place.latitude = toFloat(row.latitude),
#         place.longitude = toFloat(row.longitude),
#         place.population = toInteger(row.population)

In [9]:
from pyspark.sql.types import *
from graphframes import *

In [10]:
def create_transport_graph(data_dir="/Users/hyunseokjung/Github/graph_algorithm/data"):
    """Build a transport GraphFrame in which every relationship runs both ways.

    Reads ``transport-nodes.csv`` (with an explicit schema) and
    ``transport-relationships.csv`` (header only, no schema supplied) from
    ``data_dir`` through the ambient ``spark`` session, then adds a reversed
    copy of every relationship so each edge can be traversed in either
    direction.

    Args:
        data_dir: Directory (or URI prefix) that holds the two CSV files.
            Defaults to the originally hard-coded location so existing
            zero-argument calls keep working; pass a different path to run
            the notebook on another machine.

    Returns:
        GraphFrame whose vertices are the rows of the nodes file and whose
        edges are the relationships plus their reversed copies.
    """
    node_schema = StructType([
        StructField("id", StringType(), True),
        StructField("latitude", FloatType(), True),
        StructField("longitude", FloatType(), True),
        StructField("population", IntegerType(), True),
    ])

    nodes = spark.read.csv(f"{data_dir}/transport-nodes.csv",
                           header=True,
                           schema=node_schema)
    rels = spark.read.csv(f"{data_dir}/transport-relationships.csv", header=True)

    # Swap the endpoints in a single projection instead of the
    # add-temp-columns / drop / rename sequence.
    reversed_rels = rels.select(
        rels["dst"].alias("src"),
        rels["src"].alias("dst"),
        "relationship",
        "cost",
    )

    # Align by column name rather than position, so the result does not depend
    # on the order in which the CSV happens to list its columns.
    relationships = rels.unionByName(reversed_rels)

    return GraphFrame(nodes, relationships)

In [11]:
# Build the transport graph once; the cells below query it through `g`.
g = create_transport_graph()

In [12]:
# Display the places whose population lies strictly between 100,000 and
# 300,000, ordered by ascending population.
g.vertices.where("population > 100000 and population < 300000").orderBy("population").show()

+----------+--------+---------+----------+
|        id|latitude|longitude|population|
+----------+--------+---------+----------+
|Colchester|51.88921|  0.90421|    104390|
|   Ipswich|52.05917|  1.15545|    133384|
+----------+--------+---------+----------+



In [13]:
# Breadth-first search: shortest path(s) from Den Haag to any other place whose
# population falls inside the target range (Den Haag itself is excluded).
# NOTE(review): the upper bound here is 3,000,000, while the vertex filter in
# the previous cell uses 300,000 — confirm the extra zero is intentional.
from_expr = "id='Den Haag'"
to_expr = "population > 100000 and population < 3000000 and id <> 'Den Haag'"
result = g.bfs(from_expr, to_expr)

In [14]:
# Inspect the shape of the BFS result; the saved run shows a single hop
# (from, e0, to).
print(result.columns)

['from', 'e0', 'to']


In [15]:
# Drop every column whose name begins with "e" (the edge column e0 in the
# printed column list), then show up to five rows without truncating the
# struct values.
columns = [name for name in result.columns if name[:1] != "e"]
result.select(columns).show(n=5, truncate=False)

+---------------------------------------+---------------------------------------+
|from                                   |to                                     |
+---------------------------------------+---------------------------------------+
|{Den Haag, 52.078663, 4.288788, 514861}|{Rotterdam, 51.9225, 4.47917, 623652}  |
|{Den Haag, 52.078663, 4.288788, 514861}|{Amsterdam, 52.37919, 4.899431, 821752}|
+---------------------------------------+---------------------------------------+

