In [None]:
# --- Standard library -------------------------------------------------------
from datetime import datetime, timedelta
import os
from pathlib import Path
# Directory the kernel was started in; used below to derive extra import paths.
pwd = Path(os.getcwd())

import pytz
import random
import sys
# Make the parent directory and its "config" sub-directory importable.
# These appends must run before the local imports (`config`, `util`) below.
sys.path.append(str(pwd.parent))
sys.path.append(str(pwd.parent / "config"))

import warnings
# NOTE: this silences every warning for the rest of the session, including
# pandas' SettingWithCopyWarning and scikit-learn's feature-name warnings.
warnings.simplefilter("ignore")

# --- Third-party ------------------------------------------------------------
import networkx as nx
import numpy as np
from osmread import parse_file, Node, Way
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

# --- Project-local (resolved through the sys.path entries added above) -------
from config import Config
from util import haversine

# Project configuration; later cells read LOCAL_REGION_BOUND and AREA_MODE from it.
config = Config.load()

In [None]:
# Node table; later cells read its "NodeID", "Longitude" and "Latitude" columns.
node_csv_path = Path("../data/Node.csv")
node_df = pd.read_csv(node_csv_path)

In [None]:
# Raw trip records from the 2015-06 yellow-taxi file.
order_df = pd.read_csv("../data/Order/original/yellow_tripdata_2015-06.csv")

# Source column name -> column name used by the rest of this notebook.
RENAME_DICT = {
    "tpep_pickup_datetime": "Start_time",
    "tpep_dropoff_datetime": "End_time",
    "pickup_longitude": "PointS_Longitude",
    "pickup_latitude": "PointS_Latitude",
    "dropoff_longitude": "PointE_Longitude",
    "dropoff_latitude": "PointE_Latitude",
}
# Only the columns that get renamed are kept (same order as RENAME_DICT).
USE_COLUMNS = list(RENAME_DICT)

rename_order_df = (
    order_df[USE_COLUMNS]
    .rename(columns=RENAME_DICT)
    .sort_values("Start_time")
    .reset_index(drop=True)
)

# Remove records outside the configured region: both the pickup (PointS) and
# the drop-off (PointE) must lie strictly inside all four bounds.
region_bound = config.LOCAL_REGION_BOUND
inside_region = pd.Series(True, index=rename_order_df.index)
for point in ("PointS", "PointE"):
    longitude = rename_order_df[f"{point}_Longitude"]
    latitude = rename_order_df[f"{point}_Latitude"]
    inside_region &= (
        (longitude > region_bound.west_bound)
        & (longitude < region_bound.east_bound)
        & (latitude > region_bound.south_bound)
        & (latitude < region_bound.north_bound)
    )

# .copy(): the following cells assign new values to columns of this frame, so
# it must be an independent DataFrame rather than a filtered slice of
# rename_order_df (otherwise pandas flags the writes as chained assignment).
rename_order_without_outlier_df = rename_order_df[inside_region].copy()

In [None]:
def newyork_datetime_to_utc(datetime_str: str) -> int:
    """Convert a New York wall-clock time string to a POSIX timestamp.

    Args:
        datetime_str: Local time in "%Y-%m-%d %H:%M:%S" format, interpreted
            in the US/Eastern time zone (daylight saving is honoured).

    Returns:
        Whole seconds since the Unix epoch (UTC) for that instant.

    Raises:
        ValueError: If ``datetime_str`` does not match the expected format.
            pandas also raises for wall-clock times that are ambiguous or do
            not exist because of a daylight-saving transition.
    """
    naive = datetime.strptime(datetime_str, "%Y-%m-%d %H:%M:%S")
    # Attach the zone to the naive value and use the *returned* object.
    # Calling timestamp() on the naive datetime would interpret it in the
    # machine's local time zone instead of US/Eastern.
    localized = pd.Timestamp(naive).tz_localize("US/Eastern")
    return int(localized.timestamp())

# Parse the pickup / drop-off strings and move every trip forward by
# 365 - 2 = 363 days, which places the 2015 records in 2016.
# NOTE(review): the reason for the 2-day correction is not visible here - confirm.
ORDER_TIME_FORMAT = "%Y-%m-%d %H:%M:%S"
ORDER_TIME_SHIFT = timedelta(days=365) - timedelta(days=2)

for time_column in ("Start_time", "End_time"):
    raw_times = rename_order_without_outlier_df[time_column]
    if pd.api.types.is_datetime64_any_dtype(raw_times):
        # Already parsed and shifted by an earlier run of this cell; skipping
        # keeps the cell safe to re-run without shifting the times twice.
        continue
    # Vectorised parse + offset instead of two row-by-row Python apply() passes.
    rename_order_without_outlier_df[time_column] = (
        pd.to_datetime(raw_times, format=ORDER_TIME_FORMAT) + ORDER_TIME_SHIFT
    )

In [None]:
# Show the five earliest pickups to eyeball the shifted timestamps.
rename_order_without_outlier_df.sort_values(by="Start_time").head(n=5)

In [None]:
# Keep the trips whose shifted pickup time is on or after 2016-06-01 00:00.
starts_on_or_after_june = rename_order_without_outlier_df["Start_time"] >= datetime(2016, 6, 1)
june_dummy_df = rename_order_without_outlier_df[starts_on_or_after_june]

In [None]:
# Raw trip records from the 2015-07 yellow-taxi file.
order_df = pd.read_csv("../data/Order/original/yellow_tripdata_2015-07.csv")

# Source column name -> column name used by the rest of this notebook.
RENAME_DICT = {
    "tpep_pickup_datetime": "Start_time",
    "tpep_dropoff_datetime": "End_time",
    "pickup_longitude": "PointS_Longitude",
    "pickup_latitude": "PointS_Latitude",
    "dropoff_longitude": "PointE_Longitude",
    "dropoff_latitude": "PointE_Latitude",
}
# Only the columns that get renamed are kept (same order as RENAME_DICT).
USE_COLUMNS = list(RENAME_DICT)

rename_order_df = (
    order_df[USE_COLUMNS]
    .rename(columns=RENAME_DICT)
    .sort_values("Start_time")
    .reset_index(drop=True)
)

# Remove records outside the configured region: both the pickup (PointS) and
# the drop-off (PointE) must lie strictly inside all four bounds.
region_bound = config.LOCAL_REGION_BOUND
inside_region = pd.Series(True, index=rename_order_df.index)
for point in ("PointS", "PointE"):
    longitude = rename_order_df[f"{point}_Longitude"]
    latitude = rename_order_df[f"{point}_Latitude"]
    inside_region &= (
        (longitude > region_bound.west_bound)
        & (longitude < region_bound.east_bound)
        & (latitude > region_bound.south_bound)
        & (latitude < region_bound.north_bound)
    )

# .copy(): the following cells assign new values to columns of this frame, so
# it must be an independent DataFrame rather than a filtered slice of
# rename_order_df (otherwise pandas flags the writes as chained assignment).
rename_order_without_outlier_df = rename_order_df[inside_region].copy()

In [None]:
# Parse the pickup / drop-off strings and move every trip forward by
# 365 - 2 = 363 days, which places the 2015 records in 2016.
# NOTE(review): the reason for the 2-day correction is not visible here - confirm.
ORDER_TIME_FORMAT = "%Y-%m-%d %H:%M:%S"
ORDER_TIME_SHIFT = timedelta(days=365) - timedelta(days=2)

for time_column in ("Start_time", "End_time"):
    raw_times = rename_order_without_outlier_df[time_column]
    if pd.api.types.is_datetime64_any_dtype(raw_times):
        # Already parsed and shifted by an earlier run of this cell; skipping
        # keeps the cell safe to re-run without shifting the times twice.
        continue
    # Vectorised parse + offset instead of two row-by-row Python apply() passes.
    rename_order_without_outlier_df[time_column] = (
        pd.to_datetime(raw_times, format=ORDER_TIME_FORMAT) + ORDER_TIME_SHIFT
    )

In [None]:
# Keep the trips whose shifted pickup time is strictly before 2016-07-01 00:00.
starts_before_july = rename_order_without_outlier_df["Start_time"] < datetime(2016, 7, 1)
july_dummy_df = rename_order_without_outlier_df[starts_before_july]

In [None]:
# Stack the two filtered frames and order the combined rows by pickup time.
df = pd.concat([june_dummy_df, july_dummy_df]).sort_values("Start_time")

In [None]:
# Index the node coordinates so each order endpoint can be snapped to its
# nearest node (distance is computed on raw longitude/latitude values with
# the estimator's default metric).
nn = NearestNeighbors(algorithm='ball_tree')
nn.fit(node_df[["Longitude", "Latitude"]].values)

# Row position in node_df -> NodeID.
node_id_list = node_df["NodeID"].values.tolist()

# Pickup points: one neighbour per row, so column 0 is the matched node row.
start_indices = nn.kneighbors(
    df[["PointS_Longitude", "PointS_Latitude"]],
    n_neighbors=1,
    return_distance=False,
)
df["NodeS"] = [node_id_list[node_row] for node_row in start_indices[:, 0]]

# Drop-off points, handled the same way.
end_indices = nn.kneighbors(
    df[["PointE_Longitude", "PointE_Latitude"]],
    n_neighbors=1,
    return_distance=False,
)
df["NodeE"] = [node_id_list[node_row] for node_row in end_indices[:, 0]]

In [None]:
# Preview the first five orders, now including the NodeS / NodeE columns.
df.head(n=5)

In [None]:
# File name pattern: "(<LOCAL_REGION_BOUND>)9<AREA_MODE value>Cluster.csv".
# NOTE(review): the meaning of the literal "9" is not visible here - confirm.
cluster_file_name = "".join(
    ["(", str(config.LOCAL_REGION_BOUND), ")", "9", config.AREA_MODE.value, "Cluster.csv"]
)
cluster_df_path = Path("../data") / cluster_file_name
cluster_df = pd.read_csv(cluster_df_path)

In [None]:
# NodeID -> GridID lookup built from the cluster table (a later duplicate
# NodeID overwrites an earlier one).
grid_dict = dict(zip(cluster_df["NodeID"].values, cluster_df["GridID"].values))

In [None]:
# Look up the grid of each endpoint from its snapped node. Nodes missing from
# grid_dict map to NaN.
df["Start_GridID"] = df["NodeS"].map(grid_dict)
# The end grid must come from the drop-off node (NodeE); mapping NodeS here
# would make End_GridID a copy of Start_GridID.
df["End_GridID"] = df["NodeE"].map(grid_dict)

# .copy(): a later cell adds an "ID" column to dummy_df, so it has to be an
# independent frame rather than a selection taken from df.
dummy_df = df[["Start_time", "End_time", "NodeS", "NodeE", "Start_GridID", "End_GridID"]].copy()

In [None]:
def newyork_datetime_to_utc(date: datetime) -> int:
    """Convert a naive New York wall-clock time to a POSIX timestamp.

    Args:
        date: Timezone-naive ``datetime`` or ``pandas.Timestamp`` holding a
            local time in the US/Eastern zone (daylight saving is honoured).

    Returns:
        Whole seconds since the Unix epoch (UTC) for that instant.

    Raises:
        TypeError: If ``date`` already carries a time zone.
        ValueError: pandas raises (a ValueError subclass) for wall-clock times
            that are ambiguous or do not exist because of a daylight-saving
            transition.
    """
    # tz_localize returns a new object, so its result has to be used. Calling
    # timestamp() on the naive pandas Timestamp would treat the value as UTC
    # and yield a result several hours off.
    localized = pd.Timestamp(date).tz_localize("US/Eastern")
    return int(localized.timestamp())

# Give every order a sequential ID (0 .. n-1, in the current row order).
# Work on a copy so the column is added to an independent frame.
dummy_df = dummy_df.copy()
dummy_df["ID"] = range(len(dummy_df))

start_date = datetime(year=2016, month=6, day=1)
directory = "dummy"

# Create the target directory up front so to_csv does not fail on a fresh checkout.
output_dir = Path("../data/Order/modified") / directory
output_dir.mkdir(parents=True, exist_ok=True)

# Write one CSV per calendar day of June 2016.
while start_date.month == 6:
    next_date = start_date + timedelta(days=1)
    # Half-open window [start_date, next_date): an order stamped exactly at
    # midnight belongs to the day that starts at that instant. With a strict
    # ">" on the lower bound such an order would be written to no file at all.
    in_day = (dummy_df["Start_time"] >= start_date) & (dummy_df["Start_time"] < next_date)
    # .copy(): the two time columns are overwritten below.
    tmp_df = dummy_df[in_day].copy()
    tmp_df["Start_time"] = tmp_df["Start_time"].apply(newyork_datetime_to_utc)
    tmp_df["End_time"] = tmp_df["End_time"].apply(newyork_datetime_to_utc)
    # File name: order_YYYYMMDD.csv
    tmp_df.to_csv(output_dir / f"order_{start_date:%Y%m%d}.csv", index=False)
    start_date = next_date

In [None]:
# Count the orders (non-null "ID" values) per start grid in tmp_df, i.e. in
# the last day's frame left behind by the export loop.
orders_per_start_grid = tmp_df.groupby("Start_GridID")["ID"].count().reset_index()
orders_per_start_grid.sort_values(by="Start_GridID")