# requirements
```
osmium version 1.13.2
libosmium version 2.17.1
Supported PBF compression types: none zlib
```

In [None]:
# In this paper, we use New York pbf data ver.2022-02-24T21:21:27Z.
# !wget https://download.geofabrik.de/north-america/us/new-york-latest.osm.pbf -O ../data/new-york-latest.osm.pbf
# !osmium extract --overwrite --bbox -74.02,40.70,-73.9,40.84 -o ../data/newyork.osm.pbf ../data/new-york-latest.osm.pbf

In [None]:
from datetime import datetime, timedelta
import os
from pathlib import Path
# Current working directory of the notebook; used below to extend the import path.
pwd = Path(os.getcwd())

import pytz
import random
import sys
# Make the parent directory and its "config" subdirectory importable.
# NOTE(review): presumably this is where the `config` and `util` modules
# imported below live — confirm against the repository layout.
sys.path.append(str(pwd.parent))
sys.path.append(str(pwd.parent / "config"))

import warnings
# Installs a global "ignore" filter: every warning raised after this point is
# silenced for the rest of the session.
warnings.simplefilter("ignore")

import networkx as nx
import numpy as np
from osmread import parse_file, Node, Way
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

from config import Config
from util import haversine

# Project configuration object; later cells read `config.LOCAL_REGION_BOUND` from it.
config = Config.load()

# Create whole node dataframe.

In [None]:
# Parse the OSM extract once and keep only the elements used to build the road graph.
file_path = "../data/newyork.osm.pbf"
file = parse_file(file_path)

nodes = []
ways = []
for data in tqdm(file):
    if isinstance(data, Node):
        # Only nodes without any tags are kept.
        if data.tags == {}:
            nodes.append(data)
    elif isinstance(data, Way):
        # Closed ways (first node id equals last node id) are skipped.
        if data.nodes[0] != data.nodes[-1]:
            ways.append(data)

# One (NodeID, WayID) row per node reference, in way order.
# The per-way frames are collected first and concatenated a single time:
# calling pd.concat inside the loop re-copies the accumulated frame on every
# iteration, which is quadratic in the number of ways.
# The empty seed frame keeps the column layout even when no way was collected.
way_frames = [pd.DataFrame(columns=["NodeID", "WayID"])]
way_frames.extend(
    pd.DataFrame({"NodeID": way.nodes, "WayID": way.id}) for way in tqdm(ways)
)
node_df = pd.concat(way_frames)

# Trim node dataframe with LOCAL_REGION_BOUND.

In [None]:
# Coordinate lookups keyed by OSM node id, built from the parsed nodes.
node_map_lat = {osm_node.id: osm_node.lat for osm_node in nodes}
node_map_lon = {osm_node.id: osm_node.lon for osm_node in nodes}

# Attach coordinates to every (NodeID, WayID) row; ids missing from the
# lookups map to NaN and therefore fail the strict comparisons below.
node_df["Longitude"] = node_df["NodeID"].map(node_map_lon)
node_df["Latitude"] = node_df["NodeID"].map(node_map_lat)

# Keep only rows strictly inside the configured bounding box.
region = config.LOCAL_REGION_BOUND
inside_lon = (node_df["Longitude"] > region.west_bound) & (node_df["Longitude"] < region.east_bound)
inside_lat = (node_df["Latitude"] > region.south_bound) & (node_df["Latitude"] < region.north_bound)
node_df = node_df[inside_lon & inside_lat]

# Create node connection information (link each node to the next node on the same way).

In [None]:
# For every way, pair each node with its successor along that way.
# NextNodeID == -1 marks the last node of a way (no successor).
connected_node_df_list = []

# groupby(sort=False) yields the ways in first-appearance order (the same order
# as iterating over drop_duplicates()) but splits the frame once instead of
# re-masking the whole frame for every way.
for way_id, way_nodes in tqdm(node_df.groupby("WayID", sort=False)):
    # Work on an explicit copy so the new column is not written into a slice of node_df.
    way_nodes = way_nodes.copy()
    node_ids = way_nodes["NodeID"].values.tolist()
    way_nodes["NextNodeID"] = node_ids[1:] + [-1]
    way_nodes["NextNodeID"] = way_nodes["NextNodeID"].astype(int)
    connected_node_df_list.append(way_nodes)

# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when no node survived the region trim.
if connected_node_df_list:
    connected_node_df = pd.concat(connected_node_df_list)
else:
    connected_node_df = pd.DataFrame(columns=["NodeID", "NextNodeID", "WayID", "Longitude", "Latitude"])

# Extract the largest connected component of the graph.

In [None]:
# Undirected graph with one vertex per distinct node id.
graph = nx.Graph()
graph.add_nodes_from(connected_node_df["NodeID"].drop_duplicates().values)

# Add an edge for every (node, next node) pair; -1 is the end-of-way marker
# and contributes no edge.
node_pairs = zip(connected_node_df["NodeID"].values, connected_node_df["NextNodeID"].values)
graph.add_edges_from((src, dst) for src, dst in node_pairs if dst != -1)

# Node set of the connected component with the most nodes.
max_nodes = max(nx.connected_components(graph), key=len)

In [None]:
# Rebuild the per-way successor table using only nodes of the largest
# connected component (`max_nodes`). Ways left with fewer than two such nodes
# are dropped.
# Frames are collected and concatenated once: pd.concat inside the loop
# re-copies the accumulated frame every iteration (quadratic). The empty seed
# frame preserves the column layout when no way qualifies.
filtered_way_frames = [pd.DataFrame(columns=["NodeID", "NextNodeID", "WayID"])]

# groupby(sort=False) visits ways in first-appearance order without
# re-masking the whole frame for each way.
for way_id, way_nodes in tqdm(connected_node_df.groupby("WayID", sort=False)):
    filtered_node_list = [
        node_id for node_id in way_nodes["NodeID"].values if node_id in max_nodes
    ]
    if len(filtered_node_list) <= 1:
        continue
    # -1 marks the last node of the way (no successor).
    next_node_list = filtered_node_list[1:] + [-1]

    tmp_df = pd.DataFrame({"NodeID": filtered_node_list, "NextNodeID": next_node_list, "WayID": way_id})
    tmp_df["NextNodeID"] = tmp_df["NextNodeID"].astype(int)
    filtered_way_frames.append(tmp_df)

filtered_node_df = pd.concat(filtered_way_frames)

filtered_node_df["Longitude"] = filtered_node_df["NodeID"].map(node_map_lon)
filtered_node_df["Latitude"] = filtered_node_df["NodeID"].map(node_map_lat)

In [None]:
# Distinct nodes with their coordinates.
node_info_df = filtered_node_df[["NodeID", "Longitude", "Latitude"]].drop_duplicates()

# node id -> {"node_index": row position in node_info_df, "longitude", "latitude"}.
node_info = {}
coordinate_rows = zip(
    node_info_df["NodeID"].values,
    node_info_df["Longitude"].values,
    node_info_df["Latitude"].values,
)
for position, (osm_id, lon, lat) in enumerate(coordinate_rows):
    node_info[osm_id] = {
        "node_index": position,
        "longitude": lon,
        "latitude": lat,
    }

In [None]:
# Preview the node table. (This cell previously evaluated `from_lat`, which is
# only assigned in a later cell and therefore raised NameError on a fresh
# Restart & Run All.)
node_info_df.head()

# check map.

In [None]:
import matplotlib.pyplot as plt

# Scatter every node, then draw one segment per (node, next node) pair.
fig, ax = plt.subplots(figsize=[15, 15])
ax.scatter(node_info_df["Longitude"], node_info_df["Latitude"], s=8)

edge_pairs = zip(filtered_node_df["NodeID"].values, filtered_node_df["NextNodeID"].values)
for from_node_id, to_node_id in edge_pairs:
    # -1 marks the end of a way: nothing to draw.
    if to_node_id != -1:
        from_node, to_node = node_info[from_node_id], node_info[to_node_id]
        from_lng, from_lat = from_node["longitude"], from_node["latitude"]
        to_lng, to_lat = to_node["longitude"], to_node["latitude"]
        ax.plot([from_lng, to_lng], [from_lat, to_lat])

ax.set_title("New York Node Connection (2022-02-24)", fontsize=20)
ax.set_xlabel("longitude", fontsize=15)
ax.set_ylabel("latitude", fontsize=15)
fig.savefig(f"../data/map_{config.LOCAL_REGION_BOUND}.png")

In [None]:
# Inspection aid: list the attribute names available on the loaded config object.
dir(config)

# Create the cost CSV (shortest travel time in minutes between every pair of nodes).

In [None]:
# Direct travel cost (minutes) between nodes that are adjacent on a way.
# Unconnected pairs start at +inf; a node reaches itself at cost 0.
CAR_SPEED_KMH = 15  # Cars drive at 15 kilometers per hour.
MINUTES_PER_HOUR = 60

num_nodes = len(node_info_df)
# np.full builds the +inf matrix directly instead of dividing ones by zeros,
# which relied on a divide-by-zero to produce infinity.
adj_matrix = np.full((num_nodes, num_nodes), np.inf)
np.fill_diagonal(adj_matrix, 0)

for from_node_id, to_node_id in zip(filtered_node_df["NodeID"].values, filtered_node_df["NextNodeID"].values):
    # -1 marks the end of a way: no edge to record.
    if to_node_id == -1:
        continue
    from_node_idx = node_info[from_node_id]["node_index"]
    to_node_idx = node_info[to_node_id]["node_index"]

    # NOTE(review): assumes `haversine` takes (lon, lat, lon, lat) and returns
    # kilometers — confirm against util.haversine.
    cost = haversine(
        node_info[from_node_id]["longitude"],
        node_info[from_node_id]["latitude"],
        node_info[to_node_id]["longitude"],
        node_info[to_node_id]["latitude"],
    ) / CAR_SPEED_KMH * MINUTES_PER_HOUR  # Minutes to arrival.

    # Roads are treated as two-way: store the cost in both directions.
    adj_matrix[from_node_idx][to_node_idx] = cost
    adj_matrix[to_node_idx][from_node_idx] = cost

In [None]:
# All-pairs shortest travel times (Floyd-Warshall), relaxing through one
# intermediate node k per iteration.
# The two inner Python loops are replaced by a single broadcast per k:
# column k (cost i -> k) plus row k (cost k -> j) gives the cost of every
# i -> j route through k. The sum is a fresh array, so the in-place minimum
# does not read values it is overwriting. Because the diagonal is 0 and costs
# are non-negative, row k and column k do not change during iteration k, so
# the result matches the element-by-element version. inf + x stays inf, so
# unreachable pairs need no special-casing.
# Note: this allocates one extra num_nodes x num_nodes array per iteration.
for k in tqdm(range(num_nodes)):
    via_k = adj_matrix[:, k, np.newaxis] + adj_matrix[np.newaxis, k, :]
    np.minimum(adj_matrix, via_k, out=adj_matrix)

adj_matrix_df = pd.DataFrame(adj_matrix)
adj_matrix_df.to_csv("../data/AccurateMap.csv", index=False, header=False)

# create node csv.

In [None]:
# Distinct nodes with coordinates; the remaining output columns are written as NaN.
node_data_df = (
    filtered_node_df.reset_index(drop=True)[["NodeID", "Longitude", "Latitude"]]
    .drop_duplicates()
)
for placeholder_column in ["RoadName", "Gid", "Distance", "WayID"]:
    node_data_df[placeholder_column] = np.nan

node_csv_columns = ["NodeID", "WayID", "Longitude", "Latitude", "RoadName", "Gid", "Distance"]
node_data_df[node_csv_columns].to_csv("../data/Node.csv")

# create node id list.

In [None]:
# Write one node id per line, in node_data_df order.
with open("../data/NodeIDList.txt", mode="w") as id_file:
    id_file.writelines(f"{node_id}\n" for node_id in node_data_df["NodeID"])

# create driver.

In [None]:
# Assign every driver a start node sampled uniformly (with replacement) from the node list.
NUM_DRIVERS = 15000
DRIVER_SEED = 0  # fixed seed so the generated driver file is reproducible across runs

# A dedicated generator keeps the sampling deterministic without touching the
# global `random` state.
driver_rng = random.Random(DRIVER_SEED)
driver_start_points = driver_rng.choices(node_data_df["NodeID"].values, k=NUM_DRIVERS)
df = pd.DataFrame({"DriverID": range(NUM_DRIVERS), "NodeS": driver_start_points})
df.to_csv("../data/Drivers0601.csv", index=False)

# create order.

In [None]:
# Load the raw trip records and keep only the pickup/dropoff time and position columns.
order_df = pd.read_csv("../data/Order/original/yellow_tripdata_2016-06.csv")

USE_COLUMNS = [
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
]
RENAME_DICT = {
    "tpep_pickup_datetime": "Start_time",
    "tpep_dropoff_datetime": "End_time",
    "pickup_longitude": "PointS_Longitude",
    "pickup_latitude": "PointS_Latitude",
    "dropoff_longitude": "PointE_Longitude",
    "dropoff_latitude": "PointE_Latitude",
}

rename_order_df = order_df[USE_COLUMNS].rename(columns=RENAME_DICT).sort_values("Start_time").reset_index(drop=True)

# Remove records whose pickup or dropoff lies outside the New York region.
# The bound is read from `config`: a bare LOCAL_REGION_BOUND is never defined
# in this notebook and raised NameError on a fresh kernel.
region = config.LOCAL_REGION_BOUND
# .copy() makes the filtered frame independent, so later cells can add
# columns to it without writing into a slice of rename_order_df.
rename_order_without_outlier_df = rename_order_df[
    (rename_order_df["PointS_Longitude"] > region.west_bound)
    & (rename_order_df["PointS_Longitude"] < region.east_bound)
    & (rename_order_df["PointE_Longitude"] > region.west_bound)
    & (rename_order_df["PointE_Longitude"] < region.east_bound)
    & (rename_order_df["PointS_Latitude"] > region.south_bound)
    & (rename_order_df["PointS_Latitude"] < region.north_bound)
    & (rename_order_df["PointE_Latitude"] > region.south_bound)
    & (rename_order_df["PointE_Latitude"] < region.north_bound)
].copy()

In [None]:
# Nearest-neighbour index over the node coordinates (longitude, latitude).
nn = NearestNeighbors(algorithm='ball_tree')
nn.fit(node_data_df[["Longitude", "Latitude"]].values)

# Row position in the fitted array -> node id.
node_id_list = node_data_df["NodeID"].values.tolist()

# Snap every pickup point to its single closest node.
pickup_points = rename_order_without_outlier_df[["PointS_Longitude", "PointS_Latitude"]]
_, start_indices = nn.kneighbors(pickup_points, n_neighbors=1)
rename_order_without_outlier_df["NodeS"] = [node_id_list[row] for row in start_indices[:, 0]]

# Same for every dropoff point.
dropoff_points = rename_order_without_outlier_df[["PointE_Longitude", "PointE_Latitude"]]
_, end_indices = nn.kneighbors(dropoff_points, n_neighbors=1)
rename_order_without_outlier_df["NodeE"] = [node_id_list[row] for row in end_indices[:, 0]]

In [None]:
def newyork_datetime_to_utc(datetime_str: str) -> int:
    """Convert a New York wall-clock time string to a Unix timestamp.

    Args:
        datetime_str: Local New York time formatted as "%Y-%m-%d %H:%M:%S".

    Returns:
        Seconds since the Unix epoch (UTC) for that instant, truncated to int.

    Raises:
        ValueError: If ``datetime_str`` does not match the expected format.
    """
    eastern = pytz.timezone('US/Eastern')
    naive = datetime.strptime(datetime_str, "%Y-%m-%d %H:%M:%S")
    # Attach the US/Eastern zone to the parsed value. Previously the result of
    # astimezone() was discarded and timestamp() ran on the naive datetime, so
    # the string was interpreted in the machine's local timezone.
    localized = eastern.localize(naive)
    return int(localized.timestamp())

# Split the June 2016 orders into one CSV per calendar day:
# days 1-23 go to "train", days 24-30 go to "test".
rename_order_without_outlier_df["Start_datetime"] = rename_order_without_outlier_df["Start_time"].apply(
    lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S")
)
rename_order_without_outlier_df["ID"] = range(len(rename_order_without_outlier_df))

start_date = datetime(year=2016, month=6, day=1)
while start_date.month == 6:
    next_date = start_date + timedelta(days=1)
    # Half-open day window [start_date, next_date). The lower bound is
    # inclusive so that orders picked up at exactly 00:00:00 are kept; the
    # previous strict ">" dropped them from every file.
    in_day = (
        (rename_order_without_outlier_df["Start_datetime"] >= start_date)
        & (rename_order_without_outlier_df["Start_datetime"] < next_date)
    )
    # Explicit copy: the time columns are rewritten below and must not be
    # written into a slice of the full order frame.
    tmp_df = rename_order_without_outlier_df[in_day].copy()
    tmp_df["Start_time"] = tmp_df["Start_time"].apply(newyork_datetime_to_utc)
    tmp_df["End_time"] = tmp_df["End_time"].apply(newyork_datetime_to_utc)

    directory = "test" if start_date.day >= 24 else "train"
    tmp_df[["ID"] + list(RENAME_DICT.values()) + ["NodeS", "NodeE"]].to_csv(
        f"../data/Order/modified/{directory}/order_2016{str(start_date.month).zfill(2)}{str(start_date.day).zfill(2)}.csv",
        index=False
    )
    start_date = next_date