In [2]:
import pandas
from pyspark.sql.functions import udf, explode
from pyspark.sql.types import ArrayType, LongType
from pyspark.sql import SparkSession
import os
import osmnx as ox

In [3]:
# Runtime switches and directory layout.
# overwrite : when True, check_file_exist() reports existing files as missing,
#             so every artefact is regenerated.
# databricks: selects the hard-coded DBFS mount instead of the local data dir.
overwrite  = False
databricks = False

if databricks:
    data_dir = "/dbfs/mnt/group01"
else:
    # Local runs take the data directory from the project helper and create
    # their own Spark session.
    from util import folder
    data_dir = folder.DATA_DIR
    spark = SparkSession.builder.getOrCreate()

# map_dir is the path used with os/pandas/osmnx calls.
map_dir = os.path.join(data_dir, "nyc/map")
# map_dbfs is the path handed to Spark; on Databricks the "/dbfs" part is stripped.
map_dbfs = map_dir.replace("/dbfs", "") if databricks else map_dir

dirs = [data_dir, map_dir]

In [4]:
# Create every working directory that is still missing.
# exist_ok=True makes the call safe to re-run and avoids the race between a
# separate existence check and the creation itself.
for d in dirs:
    os.makedirs(d, exist_ok=True)

In [5]:
# Placeholder for the street graph; reassigned from get_nyc_map() in a later cell.
nyc_map = None

In [6]:
def check_file_exist(_path):
    """Tell the caller whether `_path` can be reused instead of regenerated.

    Returns True (after printing a notice) only when `_path` exists on disk and
    the module-level `overwrite` switch is off; returns False otherwise.
    """
    # Missing path: nothing to reuse. `overwrite` is not consulted in this case.
    if not os.path.exists(_path):
        return False
    # Path exists but the user asked for regeneration.
    if overwrite:
        return False
    print("[SYSTEM]: File exists: {}".format(_path))
    return True

In [7]:
def get_nyc_map():
    """Return the NYC drivable street graph, reusing the on-disk copy if present.

    The graph is kept as GraphML at ``<map_dir>/NYC.mph``. When
    ``check_file_exist`` reports that file as reusable it is loaded with
    ``ox.load_graphml``; otherwise the graph is fetched with
    ``ox.graph_from_place`` and saved to that path.

    Returns:
        The graph returned by osmnx, or ``None`` when loading, downloading or
        saving raised an exception. The failure is printed, not propagated.
    """
    file_name = "NYC.mph"
    print("____________________________MAP_DOWNLOAD_{}____________________________".format("NYC"))
    dest = os.path.join(map_dir, file_name)
    try:
        print("[SYSTEM]: Start  {}".format(file_name))
        if check_file_exist(dest):
            graph = ox.load_graphml(dest)
        else:
            graph = ox.graph_from_place('New York City, New York, USA', network_type='drive')
            ox.save_graphml(graph, dest)
        print("[SYSTEM]: Finish {}".format(file_name))
        return graph
    except Exception as err:
        print("Exception : NYC Map's data is not downloaded")
        # Surface the real cause: this handler also covers failures while
        # loading a cached file or saving, not only download errors.
        print("[SYSTEM]: Cause: {}: {}".format(type(err).__name__, err))
        return None

In [8]:
# Load the cached NYC graph or download it; get_nyc_map() yields None if it hit an exception.
nyc_map = get_nyc_map()

____________________________MAP_DOWNLOAD_NYC____________________________
[SYSTEM]: Start  NYC.mph
[SYSTEM]: File exists: /Users/kzmain/LSDE/data/nyc/map/NYC.mph
[SYSTEM]: Finish NYC.mph


In [9]:
def get_raw_edge_node():
    """Export the graph's nodes and edges as raw CSV files under ``map_dir``.

    Writes ``node_raw.csv`` (columns osmid, y, x) and ``edge_raw.csv`` (columns
    osmid, oneway, length, u, v). A file is skipped when ``check_file_exist``
    reports it as reusable.

    The module-level ``nyc_map`` graph is converted to GeoDataFrames only when
    at least one file has to be written, and at most once per call.
    """
    frames = {}

    def _frame(kind):
        # Convert the graph lazily: the conversion is only needed when a CSV
        # is actually written.
        if not frames:
            frames["NODE"], frames["EDGE"] = ox.graph_to_gdfs(nyc_map, nodes=True, edges=True)
        return frames[kind]

    # NOTE(review): assumes the GeoDataFrames expose 'osmid', 'u' and 'v' as
    # columns; some osmnx releases may keep them in the index instead - confirm
    # against the installed version.
    exports = (
        ("NODE", "node_raw.csv", ['osmid', 'y', 'x']),
        ("EDGE", "edge_raw.csv", ['osmid', 'oneway', 'length', 'u', 'v']),
    )

    for kind, file_name, columns in exports:
        print("____________________________RAW_{}____________________________".format(kind))
        dest = os.path.join(map_dir, file_name)
        if check_file_exist(dest):
            continue
        print("[SYSTEM]: Start  {}".format(kind))
        # Plain pandas frame with just the selected columns, written without the index.
        pandas.DataFrame(_frame(kind))[columns].to_csv(dest, index=False)
        print("[SYSTEM]: Finish {}".format(kind))

In [10]:
# Write node_raw.csv / edge_raw.csv under map_dir (each is skipped when check_file_exist reports it present).
get_raw_edge_node()


____________________________RAW_NODE____________________________
[SYSTEM]: File exists: /Users/kzmain/LSDE/data/nyc/map/node_raw.csv
____________________________RAW_EDGE____________________________
[SYSTEM]: File exists: /Users/kzmain/LSDE/data/nyc/map/edge_raw.csv


In [11]:
def string_to_list(osmids):
    """Parse an osmid cell into a list of ints.

    Accepts either a single id (``"42"``) or a bracketed, comma-separated list
    (``"[1, 2, 3]"``). Brackets and spaces are stripped before splitting.

    Args:
        osmids: The cell value. Usually a string; a non-string value (for
            example an int when the column was inferred as numeric) is
            converted with ``str`` first. ``None`` is allowed.

    Returns:
        list[int]: The parsed ids; empty for ``None``, ``""`` or ``"[]"``.

    Raises:
        ValueError: If a non-empty token is not a valid integer literal.
    """
    if osmids is None:
        # Null cell: return an empty list rather than failing on .replace().
        return []
    cleaned = str(osmids).replace("[", "").replace("]", "").replace(" ", "")
    # Skip empty tokens so "" and "[]" give [] rather than int("") blowing up.
    return [int(token) for token in cleaned.split(",") if token]

# Spark UDF wrapper around string_to_list; declared to return an array of longs.
to_list_udf = udf(f=string_to_list, returnType=ArrayType(LongType()))

In [12]:
def process_edge():
    """Convert ``edge_raw.csv`` into ``edge.parquet`` with one row per osmid.

    The raw CSV is read with Spark (header row, inferred schema). The ``osmid``
    column is parsed into an array via ``to_list_udf`` and exploded, so an edge
    carrying several osmids becomes several rows. Nothing is done when
    ``check_file_exist`` reports the parquet output as reusable.
    """
    print("____________________________FINAL_{}____________________________".format("EDGE"))
    # map_dir is the path form used for the os-level existence check.
    dest = os.path.join(map_dir, "edge.parquet")
    if check_file_exist(dest):
        return
    print("[SYSTEM]: Start  {}".format("EDGE"))

    edges_df = (
        spark.read
        .option("inferschema", True)
        .option("header", True)
        .csv(os.path.join(map_dbfs, "edge_raw.csv"))
    )

    # Write through the same Spark-facing path family as the read (map_dbfs),
    # matching process_node. mode("overwrite") is safe here: this point is only
    # reached when the output is missing or `overwrite` was requested, and
    # Spark's default save mode would otherwise fail on an existing path.
    (
        edges_df
        .withColumn('osmid', to_list_udf('osmid'))
        .withColumn('osmid', explode('osmid'))
        .write.mode("overwrite")
        .parquet(os.path.join(map_dbfs, "edge.parquet"))
    )
    print("[SYSTEM]: Finish {}".format("EDGE"))

In [13]:
# Build edge.parquet from edge_raw.csv with Spark (returns early when check_file_exist reports the parquet present).
process_edge()

____________________________FINAL_EDGE____________________________
[SYSTEM]: File exists: /Users/kzmain/LSDE/data/nyc/map/edge.parquet


In [14]:
def process_node():
    """Convert ``node_raw.csv`` into ``node.parquet`` using Spark.

    The raw CSV is read with a header row and inferred schema and written back
    unchanged as parquet. Nothing is done when ``check_file_exist`` reports the
    parquet output as reusable.
    """
    print("____________________________FINAL_{}____________________________".format("NODE"))
    # map_dir is the path form used for the os-level existence check;
    # map_dbfs is the form handed to Spark.
    dest = os.path.join(map_dir, "node.parquet")
    if check_file_exist(dest):
        return
    print("[SYSTEM]: Start  {}".format("NODE"))

    nodes_df = (
        spark.read
        .option("inferschema", True)
        .option("header", True)
        .csv(os.path.join(map_dbfs, "node_raw.csv"))
    )
    # mode("overwrite") is safe here: this point is only reached when the
    # output is missing or `overwrite` was requested, and Spark's default save
    # mode would otherwise fail on an existing path.
    nodes_df.write.mode("overwrite").parquet(os.path.join(map_dbfs, "node.parquet"))
    print("[SYSTEM]: Finish {}".format("NODE"))

In [15]:
# Build node.parquet from node_raw.csv with Spark (returns early when check_file_exist reports the parquet present).
process_node()

____________________________FINAL_NODE____________________________
[SYSTEM]: File exists: /Users/kzmain/LSDE/data/nyc/map/node.parquet
