In [1]:
from datetime import datetime, timedelta
import os
from pathlib import Path
pwd = Path(os.getcwd())

import pytz
import random
import sys
sys.path.append(str(pwd.parent))
sys.path.append(str(pwd.parent / "config"))

import warnings
warnings.simplefilter("ignore")

import networkx as nx
import numpy as np
from osmread import parse_file, Node, Way
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

from config import Config
from util import haversine

# Load the project configuration object. Later cells read LOCAL_REGION_BOUND
# and AREA_MODE from it.
config = Config.load()

In [2]:
# Road-network node table. Later cells use its NodeID, Longitude and Latitude
# columns to snap pickup/drop-off coordinates to the nearest node.
node_df = pd.read_csv("../data/Node.csv")

In [3]:
# Columns of the raw trip file that are used downstream, and the names they
# are given in this notebook.
USE_COLUMNS = [
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
]
RENAME_DICT = {
    "tpep_pickup_datetime": "Start_time",
    "tpep_dropoff_datetime": "End_time",
    "pickup_longitude": "PointS_Longitude",
    "pickup_latitude": "PointS_Latitude",
    "dropoff_longitude": "PointE_Longitude",
    "dropoff_latitude": "PointE_Latitude",
}

# Only parse the columns we actually need; the other columns of the raw file
# are never used, so loading them just costs memory and time.
order_df = pd.read_csv(
    "../data/Order/original/yellow_tripdata_2015-06.csv",
    usecols=USE_COLUMNS,
)

rename_order_df = order_df[USE_COLUMNS].rename(columns=RENAME_DICT).sort_values("Start_time").reset_index(drop=True)

# Remove records outside the New York area: both the pickup and the drop-off
# point must lie strictly inside the configured bounding box.
region_bound = config.LOCAL_REGION_BOUND
rename_order_without_outlier_df = rename_order_df[
    (rename_order_df["PointS_Longitude"] > region_bound.west_bound)
    & (rename_order_df["PointS_Longitude"] < region_bound.east_bound)
    & (rename_order_df["PointE_Longitude"] > region_bound.west_bound)
    & (rename_order_df["PointE_Longitude"] < region_bound.east_bound)
    & (rename_order_df["PointS_Latitude"] > region_bound.south_bound)
    & (rename_order_df["PointS_Latitude"] < region_bound.north_bound)
    & (rename_order_df["PointE_Latitude"] > region_bound.south_bound)
    & (rename_order_df["PointE_Latitude"] < region_bound.north_bound)
# Explicit copy: the following cell overwrites columns of this frame, which
# must not be a chained assignment into a slice of rename_order_df.
].copy()

In [4]:
def newyork_datetime_to_utc(datetime_str: str) -> int:
    """Convert a New York wall-clock string to a UTC epoch timestamp.

    Args:
        datetime_str: Local US/Eastern time formatted as "%Y-%m-%d %H:%M:%S".

    Returns:
        Whole seconds since the Unix epoch (UTC).

    Raises:
        ValueError: If ``datetime_str`` does not match the expected format.
            pandas' default handling also raises for wall-clock times that
            are ambiguous or nonexistent around a DST transition.

    Note:
        A later cell of this notebook defines a function with the same name
        that takes a datetime instead of a string; that definition replaces
        this one once it is executed.
    """
    naive = datetime.strptime(datetime_str, "%Y-%m-%d %H:%M:%S")
    # The naive value must be *interpreted* as US/Eastern. The previous code
    # called astimezone() and discarded the result, so timestamp() used the
    # machine's local time zone instead.
    localized = pd.Timestamp(naive).tz_localize("US/Eastern")
    return int(localized.timestamp())

# Parse the pickup/drop-off strings and move the 2015 trips into 2016.
# The shift is +365 days then -2 days (363 days in total), exactly as before;
# NOTE(review): the reason for the 2-day offset is not documented here - confirm.
DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"
DATE_SHIFT = timedelta(days=365) - timedelta(days=2)

# Vectorized parsing/shift instead of four row-wise .apply passes, and
# .assign() builds a new frame so nothing is written into a filtered slice.
rename_order_without_outlier_df = rename_order_without_outlier_df.assign(
    Start_time=lambda frame: pd.to_datetime(frame["Start_time"], format=DATETIME_FORMAT) + DATE_SHIFT,
    End_time=lambda frame: pd.to_datetime(frame["End_time"], format=DATETIME_FORMAT) + DATE_SHIFT,
)

In [5]:
# Preview the earliest shifted orders to sanity-check the date shift.
rename_order_without_outlier_df.sort_values("Start_time").head()

Unnamed: 0,Start_time,End_time,PointS_Longitude,PointS_Latitude,PointE_Longitude,PointE_Latitude
836,2016-05-29 00:04:04,2016-05-29 00:09:24,-74.011665,40.708351,-74.014984,40.709614
2002,2016-05-29 00:11:01,2016-05-29 00:12:22,-74.011841,40.707851,-74.0158,40.705021
3017,2016-05-29 00:17:01,2016-05-29 00:17:07,-74.015366,40.709419,-74.015358,40.709419
4825,2016-05-29 00:28:00,2016-05-29 00:30:15,-74.013718,40.702316,-74.014709,40.709492
5024,2016-05-29 00:29:16,2016-05-29 00:29:33,-74.017189,40.705338,-74.017189,40.705292


In [6]:
# From the shifted June file keep the orders starting on or after 2016-06-01;
# earlier rows fall into late May after the shift.
june_first = datetime(2016, 6, 1)
starts_in_june_or_later = rename_order_without_outlier_df["Start_time"] >= june_first
june_dummy_df = rename_order_without_outlier_df.loc[starts_in_june_or_later]

In [7]:
# Columns of the raw trip file that are used downstream, and the names they
# are given in this notebook.
USE_COLUMNS = [
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
]
RENAME_DICT = {
    "tpep_pickup_datetime": "Start_time",
    "tpep_dropoff_datetime": "End_time",
    "pickup_longitude": "PointS_Longitude",
    "pickup_latitude": "PointS_Latitude",
    "dropoff_longitude": "PointE_Longitude",
    "dropoff_latitude": "PointE_Latitude",
}

# Only parse the columns we actually need; the other columns of the raw file
# are never used, so loading them just costs memory and time.
order_df = pd.read_csv(
    "../data/Order/original/yellow_tripdata_2015-07.csv",
    usecols=USE_COLUMNS,
)

rename_order_df = order_df[USE_COLUMNS].rename(columns=RENAME_DICT).sort_values("Start_time").reset_index(drop=True)

# Remove records outside the New York area: both the pickup and the drop-off
# point must lie strictly inside the configured bounding box.
region_bound = config.LOCAL_REGION_BOUND
rename_order_without_outlier_df = rename_order_df[
    (rename_order_df["PointS_Longitude"] > region_bound.west_bound)
    & (rename_order_df["PointS_Longitude"] < region_bound.east_bound)
    & (rename_order_df["PointE_Longitude"] > region_bound.west_bound)
    & (rename_order_df["PointE_Longitude"] < region_bound.east_bound)
    & (rename_order_df["PointS_Latitude"] > region_bound.south_bound)
    & (rename_order_df["PointS_Latitude"] < region_bound.north_bound)
    & (rename_order_df["PointE_Latitude"] > region_bound.south_bound)
    & (rename_order_df["PointE_Latitude"] < region_bound.north_bound)
# Explicit copy: the following cell overwrites columns of this frame, which
# must not be a chained assignment into a slice of rename_order_df.
].copy()

In [8]:
# Parse the pickup/drop-off strings and move the 2015 trips into 2016.
# The shift is +365 days then -2 days (363 days in total), exactly as before;
# NOTE(review): the reason for the 2-day offset is not documented here - confirm.
DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"
DATE_SHIFT = timedelta(days=365) - timedelta(days=2)

# Vectorized parsing/shift instead of four row-wise .apply passes, and
# .assign() builds a new frame so nothing is written into a filtered slice.
rename_order_without_outlier_df = rename_order_without_outlier_df.assign(
    Start_time=lambda frame: pd.to_datetime(frame["Start_time"], format=DATETIME_FORMAT) + DATE_SHIFT,
    End_time=lambda frame: pd.to_datetime(frame["End_time"], format=DATETIME_FORMAT) + DATE_SHIFT,
)

In [9]:
# From the shifted July file keep only the orders that start before
# 2016-07-01, i.e. the rows the shift moved back into late June.
july_first = datetime(2016, 7, 1)
starts_before_july = rename_order_without_outlier_df["Start_time"] < july_first
july_dummy_df = rename_order_without_outlier_df.loc[starts_before_july]

In [10]:
# Stack both partial months and order the result chronologically by pickup
# time. The original row labels are kept (no index reset).
df = pd.concat([june_dummy_df, july_dummy_df]).sort_values("Start_time")

In [11]:
# Snap every pickup / drop-off coordinate to its nearest road-network node.
# The index is built on raw (longitude, latitude) pairs from node_df.
node_id_list = node_df["NodeID"].values.tolist()

nn = NearestNeighbors(algorithm='ball_tree')
nn.fit(node_df[["Longitude", "Latitude"]].values)

# Each result row holds a single neighbour index; unpack it and translate the
# positional index into the corresponding NodeID.
start_indices = nn.kneighbors(
    df[["PointS_Longitude", "PointS_Latitude"]],
    n_neighbors=1,
    return_distance=False,
)
df["NodeS"] = [node_id_list[nearest] for (nearest,) in start_indices]

end_indices = nn.kneighbors(
    df[["PointE_Longitude", "PointE_Latitude"]],
    n_neighbors=1,
    return_distance=False,
)
df["NodeE"] = [node_id_list[nearest] for (nearest,) in end_indices]

In [12]:
# Preview the combined orders with their snapped start/end node IDs.
df.head()

Unnamed: 0,Start_time,End_time,PointS_Longitude,PointS_Latitude,PointE_Longitude,PointE_Latitude,NodeS,NodeE
1246999,2016-06-01 00:01:13,2016-06-01 00:02:14,-74.012917,40.70652,-74.013443,40.703972,7707021984,7685651630
1247002,2016-06-01 00:01:13,2016-06-01 00:02:14,-74.012917,40.70652,-74.013443,40.703972,7707021984,7685651630
1252972,2016-06-01 00:21:26,2016-06-01 00:21:44,-74.015549,40.708118,-74.015556,40.708122,4145434703,4145434703
1255849,2016-06-01 00:32:24,2016-06-01 00:32:45,-74.016144,40.704556,-74.016151,40.704594,6128792145,9371204480
1256578,2016-06-01 00:35:22,2016-06-01 00:35:27,-74.01593,40.705021,-74.01593,40.705013,7706983360,7706983360


In [13]:
# Assemble the cluster file name from the region bound and area mode in the
# config, then load it.
# NOTE(review): the literal "9" is presumably a grid/cluster setting - confirm.
cluster_file_name = "".join(
    [
        f"({str(config.LOCAL_REGION_BOUND)})",
        "9",
        config.AREA_MODE.value,
        "Cluster.csv",
    ]
)
cluster_df_path = Path("../data") / cluster_file_name
cluster_df = pd.read_csv(cluster_df_path)

In [14]:
# Lookup table NodeID -> GridID built from the cluster file; if a NodeID
# occurs more than once, the last occurrence wins.
grid_dict = dict(zip(cluster_df["NodeID"].values, cluster_df["GridID"].values))

In [15]:
# Attach the grid cell of the pickup node and of the drop-off node.
df["Start_GridID"] = df["NodeS"].map(grid_dict)
# The end grid must come from the drop-off node (NodeE). It was previously
# mapped from NodeS, which made End_GridID always equal Start_GridID.
df["End_GridID"] = df["NodeE"].map(grid_dict)

# Keep only the columns that are exported. Copy explicitly because a later
# cell adds an "ID" column to this frame.
dummy_df = df[["Start_time", "End_time", "NodeS", "NodeE", "Start_GridID", "End_GridID"]].copy()

In [17]:
def newyork_datetime_to_utc(date: datetime) -> int:
    """Convert a naive New York wall-clock datetime to a UTC epoch timestamp.

    Args:
        date: Naive ``datetime`` / ``pandas.Timestamp`` expressed in
            US/Eastern local time.

    Returns:
        Whole seconds since the Unix epoch (UTC).

    Raises:
        TypeError: If ``date`` is already timezone-aware (it cannot be
            localized a second time).
    """
    # tz_localize returns a NEW timestamp; the previous code discarded it, so
    # the naive value was converted without applying the Eastern offset.
    # pd.Timestamp(...) also lets plain datetime objects through, as the
    # annotation promises.
    localized = pd.Timestamp(date).tz_localize("US/Eastern")
    return int(localized.timestamp())

# Give every order a sequential ID (in the current row order of dummy_df).
dummy_df = dummy_df.assign(ID=range(len(dummy_df)))

# Write one CSV per calendar day of June 2016, with Start_time/End_time
# converted to epoch seconds by newyork_datetime_to_utc.
start_date = datetime(year=2016, month=6, day=1)

while start_date.month == 6:
    next_date = start_date + timedelta(days=1)
    # Half-open window [start_date, next_date): the lower bound is inclusive
    # so that an order starting exactly at midnight is not silently dropped.
    in_day = (dummy_df["Start_time"] >= start_date) & (dummy_df["Start_time"] < next_date)
    # Copy before overwriting columns so we never write into a slice of dummy_df.
    tmp_df = dummy_df[in_day].copy()
    tmp_df["Start_time"] = tmp_df["Start_time"].apply(newyork_datetime_to_utc)
    tmp_df["End_time"] = tmp_df["End_time"].apply(newyork_datetime_to_utc)
    tmp_df.to_csv(
        f"../modules/dummy_data/order_{start_date:%Y%m%d}.csv",
        index=False
    )
    start_date = next_date

In [None]:
# Number of orders per start grid. tmp_df is whatever the export loop left
# behind, i.e. only the last day it processed.
orders_per_grid = tmp_df.groupby("Start_GridID").count().reset_index()
orders_per_grid[["Start_GridID", "ID"]].sort_values("Start_GridID")