Merge pull request #4 from kuanb/kuanb-join-on-new-points

Identify and generate x-feed transfer edges based on proximity threshold
kuanb · Dec 3, 2017 · 6b4ad38 · 6b4ad38
2 parents 0592d75 + cad14ce
commit 6b4ad38
Show file tree

Hide file tree

Showing 3 changed files with 216 additions and 6 deletions.
diff --git a/peartree/graph.py b/peartree/graph.py
@@ -9,6 +9,7 @@
 from .summarizer import (generate_edge_and_wait_values,
                          generate_summary_edge_costs,
                          generate_summary_wait_times)
+from .toolkit import generate_graph_node_dataframe, get_nearest_node
 
 
 def generate_empty_md_graph(name: str,
@@ -36,19 +37,90 @@ def generate_summary_graph_elements(feed: ptg.gtfs.feed,
     return (summary_edge_costs, wait_times_by_stop)
 
 
+def generate_cross_feed_edges(G,
+                              feed,
+                              wait_times_by_stop,
+                              connection_threshold):
+    # Terminate this process early if the graph is empty
+    if (G.number_of_nodes() == 0):
+        return pd.DataFrame({'stop_id': [],
+                             'to_nodes': [],
+                             'edge_costs': []})
+
+    # First, we need a DataFrame representation of the nodes in the graph
+    node_df = generate_graph_node_dataframe(G)
+
+    stop_ids = []
+    to_nodes = []
+    edge_costs = []
+
+    # TODO: Repeating this in populate_graph as well, there may
+    #       be a way to condense these two steps well
+    for i, row in wait_times_by_stop.iterrows():
+        sid = str(row.stop_id)
+
+        # TODO: Join tables beforehand to make
+        #       this part go faster
+        id_mask = (feed.stops.stop_id == sid)
+        stop_data_head = feed.stops[id_mask].head(1)
+
+        # Handle the possibility that the feed's stops table has no row
+        # for a stop id that appears in the wait times
+        if not len(stop_data_head):
+            continue
+
+        # Once check has cleared, pull out the first row as a pd.Series
+        stop_data = stop_data_head.T.squeeze()
+
+        # Ensure that each value is typed correctly prior to being
+        # fed into the nearest node method
+        lat = float(stop_data.stop_lat)
+        lon = float(stop_data.stop_lon)
+        point = (lat, lon)
+        (nn, nn_dist) = get_nearest_node(node_df, point)
+
+        # Only generate a connector edge if it satisfies the
+        # meter distance threshold
+        if nn_dist < connection_threshold:
+            stop_ids.append(sid)
+            to_nodes.append(nn)
+            edge_costs.append(nn_dist)
+
+    return pd.DataFrame({'stop_id': stop_ids,
+                         'to_node': to_nodes,
+                         'distance': edge_costs})
+
+
 def populate_graph(G: nx.MultiDiGraph,
                    name: str,
                    feed: ptg.gtfs.feed,
                    wait_times_by_stop: pd.DataFrame,
-                   summary_edge_costs: pd.DataFrame):
+                   summary_edge_costs: pd.DataFrame,
+                   cross_feed_edges: pd.DataFrame):
+    # As we convert stop ids to actual nodes, let's keep track of those names
+    # here so that we can reference them when we add connector edges across
+    # the various feeds loaded into the graph
+    sid_lookup = {}
+
     for i, row in wait_times_by_stop.iterrows():
         sid = str(row.stop_id)
         full_sid = nameify_stop_id(name, sid)
 
         # TODO: Join tables before hand to make
         #       this part go faster
         id_mask = (feed.stops.stop_id == sid)
-        stop_data = feed.stops[id_mask].head(1).T.squeeze()
+        stop_data_head = feed.stops[id_mask].head(1)
+
+        # Handle the possibility that the feed's stops table has no row
+        # for a stop id that appears in the wait times
+        if not len(stop_data_head):
+            continue
+
+        # Once check has cleared, pull out the first row as a pd.Series
+        stop_data = stop_data_head.T.squeeze()
+
+        # Add to the lookup crosswalk dictionary
+        sid_lookup[sid] = full_sid
 
         G.add_node(full_sid,
                    boarding_cost=row.avg_cost,
@@ -62,4 +134,23 @@ def populate_graph(G: nx.MultiDiGraph,
                    sid_to,
                    length=row.edge_cost)
 
+    # Now add the cross feed edge connectors to the graph to
+    # capture transfer points
+    for i, row in cross_feed_edges.iterrows():
+        # Extract the row column values as discrete variables
+        sid = row.stop_id
+        to = row.to_node
+        d = row.distance
+
+        # Use the lookup table to get converted stop id name
+        full_sid = sid_lookup[sid]
+
+        # Convert meters to hours of walking at 4.5 km/h
+        kmph = (d / 1000) / 4.5
+
+        # Convert to seconds
+        in_seconds = kmph * 60 * 60
+
+        G.add_edge(full_sid, to, length=in_seconds)
+
     return G
diff --git a/peartree/paths.py b/peartree/paths.py
@@ -4,8 +4,8 @@
 import networkx as nx
 import partridge as ptg
 
-from .graph import (generate_empty_md_graph, generate_summary_graph_elements,
-                    populate_graph)
+from .graph import (generate_cross_feed_edges, generate_empty_md_graph,
+                    generate_summary_graph_elements, populate_graph)
 from .utilities import log
 
 
@@ -58,7 +58,44 @@ def load_feed_as_graph(feed: ptg.gtfs.feed,
                        start_time: int,
                        end_time: int,
                        name: str=None,
-                       existing_graph: nx.MultiDiGraph=None):
+                       existing_graph: nx.MultiDiGraph=None,
+                       connection_threshold: float=50):
+    """
+    Convert a feed object into a NetworkX Graph, connect to an existing
+    NetworkX graph if one is supplied
+
+    Parameters
+    ----------
+    feed : partridge.feed
+        A feed object from Partridge holding a representation of the
+        desired schedule ids and their related schedule data from an
+        operator GTFS
+    start_time : int
+        Represented in seconds after midnight; indicates the start time
+        with which to take the subset of the target feed schedule
+        to be used to measure impedance between stops along
+        the route, as well as cost (wait time) to board at each stop
+    end_time : int
+        Represented in seconds after midnight; indicates the end time
+        with which to take the subset of the target feed schedule
+        to be used to measure impedance between stops along
+        the route, as well as cost (wait time) to board at each stop
+    name : str
+        Name of the operator, which is used to create a unique ID for each
+        of the stops, routes, etc. in the feed being supplied
+    existing_graph : networkx.Graph
+        An existing graph containing other operator or schedule data
+    connection_threshold : float
+        Threshold by which to create a connection with an existing stop
+        in the existing_graph graph, measured in meters
+
+    Returns
+    -------
+    G
+        networkx.Graph, the loaded, combined representation of the schedule
+        data from the feed subset by the time parameters provided
+    """
+
     # Generate a random name for name if it is None
     if not name:
         name = _generate_random_name()
@@ -90,8 +127,14 @@ def load_feed_as_graph(feed: ptg.gtfs.feed,
     else:
         G = generate_empty_md_graph(name)
 
+    cross_feed_edges = generate_cross_feed_edges(G,
+                                                 feed,
+                                                 wait_times_by_stop,
+                                                 connection_threshold)
+
     return populate_graph(G,
                           name,
                           feed,
                           wait_times_by_stop,
-                          summary_edge_costs)
+                          summary_edge_costs,
+                          cross_feed_edges)
diff --git a/peartree/toolkit.py b/peartree/toolkit.py
@@ -0,0 +1,76 @@
+from typing import Tuple
+
+import numpy as np
+import osmnx as ox
+import pandas as pd
+
+
+def great_circle_vec(lat1: float,
+                     lng1: float,
+                     lat2: float,
+                     lng2: float,
+                     earth_radius: int=6371009):
+    # This method wraps the same in OSMnx, source:
+    #   https://github.com/gboeing/osmnx/blob/
+    #   b32f8d333c6965a0d2f27c1f3224a29de2f08d55/osmnx/utils.py#L262
+    return ox.utils.great_circle_vec(lat1, lng1, lat2, lng2, earth_radius)
+
+
+def generate_graph_node_dataframe(G):
+    # This method breaks out a portion of a similar method from
+    # OSMnx's get_nearest_node; source:
+    #   https://github.com/gboeing/osmnx/blob/
+    #   b32f8d333c6965a0d2f27c1f3224a29de2f08d55/osmnx/utils.py#L326
+    if not G or (G.number_of_nodes() == 0):
+        raise ValueError('G argument must be not be empty or '
+                         'should contain at least one node')
+
+    # Dump graph node coordinates array
+    clist = []
+    for node, data in G.nodes(data=True):
+        # Ensure that each item is cast as the correct type
+        x = float(data['x'])
+        y = float(data['y'])
+        clist.append([node, x, y])
+    coords = np.array(clist)
+
+    # Then make into a Pandas DataFrame, with the node as index
+    return pd.DataFrame(coords, columns=['node', 'x', 'y']).set_index('node')
+
+
+def get_nearest_node(df_orig: pd.DataFrame,
+                     point: Tuple[float, float]):
+    # This method breaks out a portion of a similar method from
+    # OSMnx's get_nearest_node; source:
+    #   https://github.com/gboeing/osmnx/blob/
+    #   b32f8d333c6965a0d2f27c1f3224a29de2f08d55/osmnx/utils.py#L326
+
+    # Make a copy of the DataFrame to prevent mutation outside of function
+    df = df_orig.copy()
+
+    # Add second column of reference points
+    df['reference_y'] = point[0]
+    df['reference_x'] = point[1]
+
+    # TODO: OSMnx supports Euclidean as well; for now we have a pared-down
+    #       version of this same function
+
+    # Ensure each vectorized series is typed correctly
+    ref_ys = df['reference_y'].astype(float)
+    ref_xs = df['reference_x'].astype(float)
+    ys = df['y'].astype(float)
+    xs = df['x'].astype(float)
+
+    # Calculate distance vector using great circle distances (ie, for
+    # spherical lat-long geometries)
+    distances = great_circle_vec(lat1=ref_ys,
+                                 lng1=ref_xs,
+                                 lat2=ys,
+                                 lng2=xs)
+
+    # Calculate the final results to be returned
+    nearest_node = str(distances.idxmin())
+    nn_dist = distances.loc[nearest_node]
+
+    # Return as tuple
+    return (nearest_node, nn_dist)