In [1]:
from dataclasses import dataclass
from datetime import datetime, timedelta
import os
from pathlib import Path
from typing import List
pwd = Path(os.getcwd())

import pytz
import random
import sys
sys.path.append(str(pwd.parent))
sys.path.append(str(pwd.parent / "config"))

import warnings
warnings.simplefilter("ignore")

import networkx as nx
import numpy as np
from osmread import parse_file, Node, Way
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

from config import Config
from util import haversine
import matplotlib.pyplot as plt
import matplotlib as mpl

config = Config.load()

In [2]:
time_step = int(config.TIMESTEP) / 1e9 / 60

In [3]:
import glob
import json
adj_data_path = glob.glob(f"../data/{config.DATA_SIZE}/*Neighbor.csv")[0]
size = len(pd.read_csv(adj_data_path, header=None))
A = np.zeros((size,size))
D = np.zeros((size,size))
with open(adj_data_path) as f:
    for i in range(size):
        tmp = f.readline().replace('""','"').replace('"{','{').replace('}"','}').rstrip()
        for info_str in tmp.split("},"):
            if info_str[-1] != "}":
                info_str += "}"
            info = json.loads(info_str)
            if info["distance"] < time_step:
                D[i][i] += 1
                A[i][info["area_id"]] = 1
                A[info["area_id"]][i] = 1

In [4]:
# https://lib-arts.hatenablog.com/entry/laplacian_matrix
from scipy import linalg
I = np.identity(size)
L = I - np.dot(np.sqrt(linalg.inv(D)), np.dot(A, np.sqrt(linalg.inv(D))))

In [5]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class Net(torch.nn.Module):
    def __init__(self, num_node_features: int, num_nodes):
        super(Net, self).__init__()
        self.conv1 = GCNConv(num_node_features, num_nodes)
        self.conv2 = GCNConv(num_nodes, num_nodes)
        self.fc = torch.nn.Linear(num_nodes*num_nodes, num_nodes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = torch.flatten(x)
        return self.fc(x)
        

In [6]:
src_list = []
dst_list = []
for i in range(size):
    for j in range(size):
        if A[i][j] == 1:
            src_list.append(i)
            dst_list.append(j)
edge_index = torch.tensor([src_list, dst_list], dtype=torch.long)

# create training data

In [7]:
node_data_df = pd.read_csv(f"../data/{config.DATA_SIZE}/NodeConnection.csv")[["NodeID", "Latitude", "Longitude"]].drop_duplicates()

In [8]:
order_df = pd.read_csv(f"../data/yellow_tripdata_2015-06.csv")

USE_COLUMNS = [
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
]
RENAME_DICT = {
    "tpep_pickup_datetime": "Start_time",
    "tpep_dropoff_datetime": "End_time",
    "pickup_longitude": "PointS_Longitude",
    "pickup_latitude": "PointS_Latitude",
    "dropoff_longitude": "PointE_Longitude",
    "dropoff_latitude": "PointE_Latitude",
}

rename_order_df = order_df[USE_COLUMNS].rename(columns=RENAME_DICT).sort_values("Start_time").reset_index(drop=True)

# NewYorkエリア外のレコードを除去する.
rename_order_without_outlier_df = rename_order_df[
    (rename_order_df["PointS_Longitude"] > config.LOCAL_REGION_BOUND.west_bound)
    & (rename_order_df["PointS_Longitude"] < config.LOCAL_REGION_BOUND.east_bound)
    & (rename_order_df["PointE_Longitude"] > config.LOCAL_REGION_BOUND.west_bound)
    & (rename_order_df["PointE_Longitude"] < config.LOCAL_REGION_BOUND.east_bound)
    & (rename_order_df["PointS_Latitude"] > config.LOCAL_REGION_BOUND.south_bound)
    & (rename_order_df["PointS_Latitude"] < config.LOCAL_REGION_BOUND.north_bound)
    & (rename_order_df["PointE_Latitude"] > config.LOCAL_REGION_BOUND.south_bound)
    & (rename_order_df["PointE_Latitude"] < config.LOCAL_REGION_BOUND.north_bound)
]

In [9]:
nn = NearestNeighbors(algorithm='ball_tree')
nn.fit(node_data_df[["Longitude", "Latitude"]].values)

node_id_list = node_data_df["NodeID"].values.tolist()

_, start_indices = nn.kneighbors(
    rename_order_without_outlier_df[["PointS_Longitude", "PointS_Latitude"]], n_neighbors=1)
rename_order_without_outlier_df["NodeS"] = [node_id_list[i[0]] for i in start_indices]

In [10]:
node_to_grid_df = pd.read_csv(f"../data/{config.DATA_SIZE}/(-74.0088, -73.9778, 40.7111, 40.7471)53TransportationClusteringCluster.csv")
node_to_grid_info = {}
for node_id, grid_id in zip(node_to_grid_df["NodeID"], node_to_grid_df["GridID"]):
    node_to_grid_info[node_id] = grid_id

In [11]:
rename_order_without_outlier_df.head()

Unnamed: 0,Start_time,End_time,PointS_Longitude,PointS_Latitude,PointE_Longitude,PointE_Latitude,NodeS
30,2015-06-01 00:00:08,2015-06-01 00:36:25,-73.987793,40.732361,-73.987793,40.732361,8309479665
34,2015-06-01 00:00:09,2015-06-01 00:01:52,-74.005989,40.735191,-73.999718,40.743622,8307463843
35,2015-06-01 00:00:09,2015-06-01 00:02:49,-73.988045,40.731632,-73.980858,40.730545,8309479665
45,2015-06-01 00:00:14,2015-06-01 00:08:42,-73.98233,40.744888,-74.003281,40.7351,8119462495
54,2015-06-01 00:00:16,2015-06-01 00:08:59,-73.991737,40.72982,-73.984352,40.716393,5706569384


In [12]:
rename_order_without_outlier_df["GridID"] = rename_order_without_outlier_df["NodeS"].map(node_to_grid_info)
rename_order_without_outlier_df.head()

Unnamed: 0,Start_time,End_time,PointS_Longitude,PointS_Latitude,PointE_Longitude,PointE_Latitude,NodeS,GridID
30,2015-06-01 00:00:08,2015-06-01 00:36:25,-73.987793,40.732361,-73.987793,40.732361,8309479665,37
34,2015-06-01 00:00:09,2015-06-01 00:01:52,-74.005989,40.735191,-73.999718,40.743622,8307463843,40
35,2015-06-01 00:00:09,2015-06-01 00:02:49,-73.988045,40.731632,-73.980858,40.730545,8309479665,37
45,2015-06-01 00:00:14,2015-06-01 00:08:42,-73.98233,40.744888,-74.003281,40.7351,8119462495,52
54,2015-06-01 00:00:16,2015-06-01 00:08:59,-73.991737,40.72982,-73.984352,40.716393,5706569384,30


In [13]:
def newyork_datetime_to_utc(datetime_str: str) -> int:
    eastern = pytz.timezone('US/Eastern')
    date = datetime.strptime(datetime_str, "%Y-%m-%d %H:%M:%S")
    date.astimezone(eastern)
    return int(date.timestamp())

rename_order_without_outlier_df["Start_datetime"] = rename_order_without_outlier_df["Start_time"].apply(
    lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S")
)
rename_order_without_outlier_df["month"] = rename_order_without_outlier_df["Start_datetime"].apply(lambda x: x.month)
rename_order_without_outlier_df["day"] = rename_order_without_outlier_df["Start_datetime"].apply(lambda x: x.day)
rename_order_without_outlier_df["hour"] = rename_order_without_outlier_df["Start_datetime"].apply(lambda x: x.hour)
rename_order_without_outlier_df["minute"] = rename_order_without_outlier_df["Start_datetime"].apply(lambda x: (((x.minute // 10)) % 6) * 10 )

In [14]:
raw_feature_df = rename_order_without_outlier_df[["GridID", "month", "day", "hour", "minute"]]
raw_feature_df["raw_time"] = raw_feature_df["month"]*1000000 + raw_feature_df["day"]*10000 + raw_feature_df["hour"]*100 + raw_feature_df["minute"]
raw_time = raw_feature_df["raw_time"].drop_duplicates().tolist()

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net = Net(num_node_features=3, num_nodes=size).to(device)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.01, weight_decay=5e-4)

In [17]:
from torch_geometric.data import Data

grid_id_list = raw_feature_df["GridID"].drop_duplicates()

prev_demands = raw_feature_df[raw_feature_df["raw_time"]==raw_time[0]]
tmp_feature_df = raw_feature_df[raw_feature_df["raw_time"]==raw_time[0]]
tmp_feature_df.groupby("GridID")
tmp = tmp_feature_df.groupby("GridID")["GridID"].count().to_dict()
y = []
for grid_id in grid_id_list:
    if grid_id in tmp:
        y.append(tmp[grid_id])
    else:
        y.append(0)
prev_demands = y

loss_list = []
for time_ in raw_time[1:]:
    tmp_feature_df = raw_feature_df[raw_feature_df["raw_time"]==time_]
    tmp_feature_df.groupby("GridID")
    tmp = tmp_feature_df.groupby("GridID")["GridID"].count().to_dict()
    y = []
    for grid_id in grid_id_list:
        if grid_id in tmp:
            y.append(tmp[grid_id])
        else:
            y.append(0)
    feature = []
    for prev_num in prev_demands:
        feature.append([
            (time_ // 100) % 100,  # hour
            time_ % 100,  # minute
            prev_num  # prev demand num
        ])
    feature_tensor = torch.tensor(feature, dtype=torch.float)
    y_tensor = torch.tensor(y, dtype=torch.float)
    data = Data(x=feature_tensor, edge_index=edge_index, y=y_tensor)

    net.train()
    optimizer.zero_grad()
    out = net(data)
    try:
        loss = criterion(out, data.y)
    except:
        print(out.shape)
        print(data.y.shape)
    loss.backward()
    optimizer.step()
    loss_list.append(loss)

    prev_demands = y

In [18]:
loss_list

[tensor(33.8744, grad_fn=<MseLossBackward0>),
 tensor(47.5340, grad_fn=<MseLossBackward0>),
 tensor(82.8358, grad_fn=<MseLossBackward0>),
 tensor(31.8619, grad_fn=<MseLossBackward0>),
 tensor(33.8023, grad_fn=<MseLossBackward0>),
 tensor(11.1892, grad_fn=<MseLossBackward0>),
 tensor(8.8171, grad_fn=<MseLossBackward0>),
 tensor(10.1530, grad_fn=<MseLossBackward0>),
 tensor(3.0535, grad_fn=<MseLossBackward0>),
 tensor(7.0465, grad_fn=<MseLossBackward0>),
 tensor(10.6935, grad_fn=<MseLossBackward0>),
 tensor(5.9659, grad_fn=<MseLossBackward0>),
 tensor(1.9179, grad_fn=<MseLossBackward0>),
 tensor(4.4170, grad_fn=<MseLossBackward0>),
 tensor(4.5445, grad_fn=<MseLossBackward0>),
 tensor(3.5735, grad_fn=<MseLossBackward0>),
 tensor(4.0633, grad_fn=<MseLossBackward0>),
 tensor(11.4587, grad_fn=<MseLossBackward0>),
 tensor(1.4929, grad_fn=<MseLossBackward0>),
 tensor(2.3562, grad_fn=<MseLossBackward0>),
 tensor(5.0806, grad_fn=<MseLossBackward0>),
 tensor(1.7348, grad_fn=<MseLossBackward0>),
 

In [None]:
len(tmp)

In [None]:
feature_df = raw_feature_df.groupby(["GridID", "month", "day", "hour", "minute"])

In [33]:
feature_df.get_group()

TypeError: get_group() missing 1 required positional argument: 'name'