In [17]:
import datetime
import json
import random
import osmnx
import numpy as np
import pandas as pd
from pprint import pprint as pretty_print
from dataclasses import dataclass
from typing import Optional, List, Dict, Tuple
from tqdm.notebook import tqdm

# This is a nodebook used to sample and generate the three datasets

## Amazon Dataset

In [2]:
@dataclass
class Dimensions:
    depth: float
    height: float
    width: float

    @property
    def volume(self) -> float:
        return self.depth * self.height * self.width

@dataclass
class TimeWindow:
    start_time: datetime.datetime
    end_time: datetime.datetime

@dataclass
class Package:
    name: str
    time_window: Optional[TimeWindow]
    service_time: float
    dimensions: Dimensions

@dataclass
class Stop:
    name: str
    requested_parcel: List[Package]
    lat: float
    lng: float
    is_depot: bool

@dataclass
class Route:
    name: str
    stops: List[Stop]
    capacity: float
    departure_time: datetime.datetime
    travel_times: Dict[Tuple[str, str], float]

def _parse_dimensions(dimension_data: Dict) -> Dimensions:
    depth_centimeters = dimension_data['depth_cm']
    height = dimension_data['height_cm']
    width = dimension_data['width_cm']
    return Dimensions(depth=depth_centimeters, height=height, width=width)

def _parse_time_window(time_window_data: Dict) -> Optional[TimeWindow]:
    start_time = time_window_data["start_time_utc"]
    end_time = time_window_data["end_time_utc"]
    if isinstance(start_time, float) or isinstance(end_time, float):
        return None

    start_time_date = datetime.datetime.fromisoformat(start_time)
    end_time_date = datetime.datetime.fromisoformat(end_time)

    return TimeWindow(start_time=start_time_date, end_time=end_time_date)

def _parse_package(package_id: str, package_data: Dict) -> Package:
    name = package_id
    time_window = _parse_time_window(package_data['time_window'])
    service_time = package_data['planned_service_time_seconds']
    dimensions = _parse_dimensions(package_data['dimensions'])
    return Package(name=name, time_window=time_window, service_time=service_time, dimensions=dimensions)

def _parse_route_stop(stop_id: str, stop_data: Dict) -> Stop:
    packages = []
    for package_id in stop_data.keys():
        package_data = stop_data[package_id]
        package = _parse_package(package_id, package_data)
        packages.append(package)
    return Stop(name=stop_id, requested_parcel=packages, lat=0, lng=0, is_depot=False)

def _parse_route(route_id: str, route_data: Dict) -> Route:
    stops = []
    for stop_id in route_data.keys():
        stop_data = route_data[stop_id]
        stop = _parse_route_stop(stop_id, stop_data)
        stops.append(stop)
    return Route(name=route_id, stops=stops, capacity=0.0, departure_time=datetime.datetime.now(), travel_times={})

def parse_package_data(eval_package_data_path: str) -> List[Route]:
    with open(eval_package_data_path) as package_data_file:
        package_file_data = json.load(package_data_file)

    routes = []
    for route_id in package_file_data.keys():
        route_data = package_file_data[route_id]
        route = _parse_route(route_id, route_data)
        routes.append(route)

    return routes

def _add_additional_stop_data(stop: Stop, additional_data: Dict):
    lat = additional_data['lat']
    lng = additional_data['lng']
    stop.lat = lat
    stop.lng = lng
    if additional_data['type'] == 'Dropoff':
        stop.is_depot = False
    else:
        stop.is_depot = True

def _add_additional_route_data(route: Route, additonal_data: Dict):
    capacity = additonal_data['executor_capacity_cm3']
    route.capacity = capacity

    departure_time = datetime.datetime.fromisoformat(additonal_data['date_YYYY_MM_DD'] + ' ' + additonal_data['departure_time_utc'])
    route.departure_time = departure_time

    for stop in route.stops:
        stop_id = stop.name
        stop_data = additonal_data['stops'][stop_id]
        _add_additional_stop_data(stop, additional_data=stop_data)

def add_route_data(routes: List[Route], route_data_path: str):
    with open(route_data_path) as route_data_file:
        route_file_data = json.load(route_data_file)

    route_by_id = {}
    for route in routes:
        route_by_id[route.name] = route

    for route_id in route_file_data.keys():
        route_data = route_file_data[route_id]

        route = route_by_id[route_id]

        _add_additional_route_data(route, additonal_data=route_data)

# (AH, AH): distance_between_AH_and_AH
# AH : { AH: distance_between_AH_and_AH, ... }

def _add_travel_time_to_route(route: Route, travel_time_data: Dict):
    travel_times = {}
    for origin_stop in route.stops:
        for target_stop in route.stops:
            travel_times[origin_stop.name, target_stop.name] = travel_time_data[origin_stop.name][target_stop.name]

    route.travel_times = travel_times

def add_travel_times(routes: List[Route], travel_time_data_path: str):
    with open(travel_time_data_path) as travel_time_data_file:
        travel_time_data = json.load(travel_time_data_file)

    for route in routes:
        route_data = travel_time_data[route.name]
        _add_travel_time_to_route(route, route_data)


In [3]:
routes = parse_package_data("./almrrc2021/almrrc2021-data-evaluation/model_apply_inputs/eval_package_data.json")
add_route_data(routes, './almrrc2021/almrrc2021-data-evaluation/model_apply_inputs/eval_route_data.json')
add_travel_times(routes, './almrrc2021/almrrc2021-data-evaluation/model_apply_inputs/eval_travel_times.json')

In [88]:
len(routes)

3052

In [89]:
def sample_stops(stops, nodes, num_sample=50):
    nodes = add_depots(stops, nodes)
    stops = random.choices(stops, k=num_sample)
    nodes = add_nodes(stops, nodes)
    return nodes


def add_depots(stops, nodes):
    for stop in stops:
        # save depot
        if stop.is_depot:
            nodes[f'D_{stop.name}'] = {
                "Lat": stop.lat,
                "Lon": stop.lng,
                "Demand": 0
            }
            return nodes


def add_nodes(stops, nodes):
    for stop in stops:
        # calculate demand size for each node
        d_size = sum([pack.dimensions.depth * pack.dimensions.height * pack.dimensions.width for pack in stop.requested_parcel])
        
        if d_size == 0:
            continue
        
        if f"C_{stop.name}" not in nodes.keys():
            nodes[f"C_{stop.name}"] = {
                "Lat": stop.lat,
                "Lon": stop.lng,
                "Demand": d_size 
            }
        else:
            nodes[f"C_{stop.name}"]["Demand"] += d_size
    return nodes

In [90]:
nodes = {}

In [91]:
nodes = sample_stops(routes[0].stops, nodes, 80)
nodes = sample_stops(routes[1].stops, nodes, 80)
nodes = sample_stops(routes[2].stops, nodes, 80)

In [92]:
len(nodes.keys())

165

In [104]:
df = pd.DataFrame.from_dict(nodes, orient='index').rename_axis("Id").reset_index()

In [105]:
df

Unnamed: 0,Id,Lat,Lon,Demand
0,D_UZ,33.918699,-118.324843,0.000
1,C_YJ,33.889082,-118.354826,35932.530
2,C_YV,33.891009,-118.350679,2228.832
3,C_BB,33.898753,-118.350277,90942.306
4,C_TX,33.898680,-118.344971,10466.192
...,...,...,...,...
160,C_VM,34.438840,-118.544269,27654.480
161,C_AC,34.445781,-118.547663,5741.924
162,C_TJ,34.431924,-118.553759,15726.410
163,C_XG,34.439153,-118.540444,10440.234


In [106]:
df.to_csv("LosAngeles.NODES", sep="\t", index=False)

In [9]:
dict_stops = {}
dict_depot = {}
dict_parcel = {}

for route in tqdm(routes):
    for stop in route.stops:
        if stop.is_depot and stop.name not in dict_depot:
            dict_depot[stop.name] = (stop.lat, stop.lng)
        if not stop.is_depot and stop.name not in dict_stops.keys():            
            dict_stops[stop.name] = (stop.lat, stop.lng)
        for parcel in stop.requested_parcel:
            if parcel.name not in dict_parcel:
                dict_parcel[parcel.name] = [stop.name, parcel.service_time, parcel.dimensions]

  0%|          | 0/3052 [00:00<?, ?it/s]

In [11]:
routes[0]

27556

In [10]:
len(dict_depot)

392

In [11]:
len(dict_stops)

675

In [25]:
len(dict_parcel)

724913

## Shanghai Dataset

In [2]:
data = pd.read_csv("[Data] Parcel Deliveries - Shanghai.csv")

In [15]:
df = data.groupby(by=["CUSTOMER", "X", "Y"])[["DEMAND"]].sum().reset_index()

In [17]:
df

Unnamed: 0,CUSTOMER,X,Y,DEMAND
0,B0001,121.432655,31.147011,2688
1,B0003,121.408859,31.324804,1280
2,B0004,121.313957,31.176537,1152
3,B0008,121.491043,31.279089,5760
4,B0011,121.212265,31.270176,3520
...,...,...,...,...
5443,B9207,121.388962,31.308622,2240
5444,B9208,121.401924,31.199085,1984
5445,B9211,121.551903,31.244450,4032
5446,B9212,121.366760,31.354418,4352


In [16]:
import pandas as pd
import sys
from geopy import distance
from dataclasses import dataclass
from typing import Optional, List, Dict, Tuple
from tqdm.notebook import tqdm

@dataclass
class Node:
    id: str
    coordinate: Tuple[float, float]
    demand: int
    
@dataclass
class Route:
    id_from: str
    id_to: str
    total_distance: float

@dataclass
class Network:
    name: str
    nodes: Dict[str, Node]
    routes: Dict[Tuple[str, str], Route]


def parse_data(network_name: str, data_path: str) -> Network:
    data = pd.read_csv(data_path)
    data = data.groupby(by=["X", "Y", "CUSTOMER"])[["DEMAND"]].sum().reset_index()
    dict_nodes = {}
    for i in tqdm(data.index):
        current_node = Node(data.loc[i, "CUSTOMER"], (data.loc[i, "Y"], data.loc[i, "X"]), data.loc[i, "DEMAND"])
        dict_nodes[current_node.id] = current_node
    dict_routes = {}
    for node_one_id, node_one in tqdm(dict_nodes.items()):
        for node_two_id, node_two in dict_nodes.items():
            if (node_one_id, node_two_id) in dict_routes:
                continue
            if node_one_id == node_two_id:
                current_distance = sys.float_info.max
            else:
                current_distance = distance.distance(node_one.coordinate, node_two.coordinate).km
            dict_routes[(node_one_id, node_two_id)] = Route(node_one_id, node_two_id, current_distance)
    return Network(network_name, dict_nodes, dict_routes)

In [21]:
data_path = "[Data] Parcel Deliveries - Shanghai.csv"

In [68]:
data_network = parse_data("Shanghai", data_path)

  0%|          | 0/5448 [00:00<?, ?it/s]

  0%|          | 0/5448 [00:00<?, ?it/s]

In [74]:
len(data_network.nodes.keys())

5448

In [84]:
all_nodes = data_network.nodes.keys()

dict_keys(['B5253', 'B3442', 'B7138', 'B6747', 'B4395', 'B0864', 'B6694', 'B9182', 'B8205', 'B4455', 'B0104', 'B2291', 'B6328', 'B8865', 'B0626', 'B7024', 'B6315', 'B3073', 'B6374', 'B1145', 'B2600', 'B3433', 'B5713', 'B3961', 'B1036', 'B1652', 'B7582', 'B0839', 'B3805', 'B2290', 'B6203', 'B0143', 'B8851', 'B9021', 'B2884', 'B3054', 'B4121', 'B5784', 'B4056', 'B7278', 'B9036', 'B4514', 'B1439', 'B6473', 'B7797', 'B8375', 'B4614', 'B1118', 'B7762', 'B5886', 'B6335', 'B3972', 'B4114', 'B1292', 'B1503', 'B7375', 'B0585', 'B6968', 'B3062', 'B6217', 'B7442', 'B5214', 'B0374', 'B4461', 'B4094', 'B2647', 'B3543', 'B3199', 'B5899', 'B1018', 'B4567', 'B6774', 'B2860', 'B4434', 'B2899', 'B2420', 'B3532', 'B8683', 'B2106', 'B0011', 'B1172', 'B2096', 'B0088', 'B6296', 'B8495', 'B0587', 'B3669', 'B4621', 'B5559', 'B5193', 'B3379', 'B7085', 'B2299', 'B2703', 'B7437', 'B6350', 'B7862', 'B1137', 'B7352', 'B3161', 'B6888', 'B9145', 'B4927', 'B9048', 'B3827', 'B2815', 'B0595', 'B0434', 'B3397', 'B8147',

In [105]:
node_combi = [(x, y) for x in all_nodes for y in all_nodes]

In [109]:
routes_df = pd.DataFrame(node_combi)

In [112]:
routes_df.rename(columns={0: 'id_from', 1: 'id_to'})

Unnamed: 0,id_from,id_to
0,B5253,B5253
1,B5253,B3442
2,B5253,B7138
3,B5253,B6747
4,B5253,B4395
...,...,...
29680699,B2693,B5126
29680700,B2693,B5340
29680701,B2693,B8437
29680702,B2693,B3251


In [None]:
for i in tqdm(routes_df.index):
    route = data_network.routes[(routes_df.loc[i, "id_from"], routes_df.loc[i, "id_to"])]
    routes_df.loc[i, "distance"] = route.total_distance

In [58]:
data = data.groupby(by=["X", "Y", "CUSTOMER"])[["DEMAND"]].sum().reset_index()

In [75]:
test_dict = {}
for i in tqdm(data.index):
    for j in data.index:
        if i == j:
            current_distance = sys.float_info.max
        else:
            current_distance = distance.distance(
                (data.loc[i, "Y"], data.loc[i, "X"]),
                (data.loc[j, "Y"], data.loc[j, "Y"])
            ).km
        test_dict[(data.loc[i, "CUSTOMER"], data.loc[j, "CUSTOMER"])] = current_distance 

  0%|          | 0/5448 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [65]:
test_dict

{('B5253', 'B5253'): 1.7976931348623157e+308,
 ('B5253', 'B3442'): 8284.733445938171,
 ('B5253', 'B7138'): 8269.595389695975,
 ('B5253', 'B6747'): 8321.270932975425,
 ('B5253', 'B4395'): 8278.053769885124,
 ('B5253', 'B0864'): 8320.44913729842,
 ('B5253', 'B6694'): 8319.849637207748,
 ('B5253', 'B9182'): 8317.859527037537,
 ('B5253', 'B8205'): 8323.217158322615,
 ('B5253', 'B4455'): 8314.087470059074,
 ('B5253', 'B0104'): 8278.749380421197,
 ('B5253', 'B2291'): 8287.215937314311,
 ('B5253', 'B6328'): 8322.199169862793,
 ('B5253', 'B8865'): 8288.22276108619,
 ('B5253', 'B0626'): 8302.886890739843,
 ('B5253', 'B7024'): 8281.470742255662,
 ('B5253', 'B6315'): 8316.672079338872,
 ('B5253', 'B3073'): 8321.430978223298,
 ('B5253', 'B6374'): 8292.38019647776,
 ('B5253', 'B1145'): 8291.148456552008,
 ('B5253', 'B2600'): 8297.492407490496,
 ('B5253', 'B3433'): 8318.028764265675,
 ('B5253', 'B5713'): 8279.452342463317,
 ('B5253', 'B3961'): 8269.700101935341,
 ('B5253', 'B1036'): 8316.59955678092

## Paris Dataset

In [4]:
paris_df = pd.read_csv("Paris.nodes", sep=" ")

In [5]:
paris_df

Unnamed: 0,Id,Lon,Lat,Demand[kg],Demand[m^3*10^-3],Duration
0,D0,2.409223,48.925258,0,0,00:00:00
1,C1,2.332285,48.827915,419,4191,00:15:00
2,C2,2.398701,48.793634,563,5631,00:16:00
3,C3,2.243650,48.843398,325,3251,00:12:00
4,C4,2.387332,48.875706,407,4071,00:18:00
...,...,...,...,...,...,...
108,C108,2.444511,48.939981,569,5691,00:17:00
109,C109,2.386449,48.842822,522,5221,00:15:00
110,C110,2.339154,48.876952,373,3731,00:19:00
111,C111,2.179838,48.879446,597,5971,00:19:00


In [60]:
distance_matrix = pd.DataFrame([(x, y) for x in paris_df["Id"] for y in paris_df["Id"]])

In [61]:
distance_matrix = distance_matrix.rename(columns={0: "from_id", 1: "to_id"})

In [62]:
distance_matrix

Unnamed: 0,from_id,to_id
0,D0,D0
1,D0,C1
2,D0,C2
3,D0,C3
4,D0,C4
...,...,...
12764,C112,C108
12765,C112,C109
12766,C112,C110
12767,C112,C111


In [63]:
distance_matrix = pd.merge(distance_matrix, paris_df[["Id", "Lon", "Lat"]], left_on=["from_id"], right_on=["Id"])
distance_matrix = pd.merge(distance_matrix, paris_df[["Id", "Lon", "Lat"]], left_on=["to_id"], right_on=["Id"])

In [64]:
distance_matrix = distance_matrix.drop(columns=["from_id", "to_id"])

In [65]:
distance_matrix

Unnamed: 0,Id_x,Lon_x,Lat_x,Id_y,Lon_y,Lat_y
0,D0,2.409223,48.925258,D0,2.409223,48.925258
1,C1,2.332285,48.827915,D0,2.409223,48.925258
2,C2,2.398701,48.793634,D0,2.409223,48.925258
3,C3,2.243650,48.843398,D0,2.409223,48.925258
4,C4,2.387332,48.875706,D0,2.409223,48.925258
...,...,...,...,...,...,...
12764,C108,2.444511,48.939981,C112,2.317015,48.814601
12765,C109,2.386449,48.842822,C112,2.317015,48.814601
12766,C110,2.339154,48.876952,C112,2.317015,48.814601
12767,C111,2.179838,48.879446,C112,2.317015,48.814601


In [68]:
for i in tqdm(distance_matrix.index):
    node_from = (distance_matrix.loc[i, "Lat_x"], distance_matrix.loc[i, "Lon_x"])
    node_to = (distance_matrix.loc[i, "Lat_y"], distance_matrix.loc[i, "Lon_y"])
    if node_from == node_to:
        distance_matrix.loc[i, 'distance'] = sys.float_info.max
    else:
        distance_matrix.loc[i, 'distance'] = distance.distance(node_from, node_to).km

  0%|          | 0/12769 [00:00<?, ?it/s]

In [72]:
distance_matrix.to_csv("Paris.routes", sep="\t", index=False)