[WIP] Interpolate intermediary stops (#21)
* nan helper

* fill in times logic

* update loc and drop reset index

* update fixture

* update tests
kuanb committed Feb 11, 2018
1 parent d557838 commit 18b42e9
Showing 4 changed files with 90 additions and 3 deletions.
52 changes: 50 additions & 2 deletions peartree/summarizer.py
@@ -4,6 +4,7 @@
import pandas as pd
import partridge as ptg

from .toolkit import nan_helper
from .utilities import log


@@ -253,13 +254,60 @@ def generate_summary_wait_times(df: pd.DataFrame) -> pd.DataFrame:
    return summed_reset


def apply_interpolation(orig_array):
    # Infill NaN values in the array via linear interpolation
    # against their nearest numerically valid neighbors
    nans, x = nan_helper(orig_array)
    orig_array[nans] = np.interp(x(nans), x(~nans), orig_array[~nans])
    return orig_array


def fill_in_times(sub_df):
    # First, make sure that there is a set of stop sequence
    # numbers present in each of the trip_id sub-dataframes
    if 'stop_sequence' not in sub_df.columns:
        sub_df['stop_sequence'] = range(len(sub_df))

    uniq_sequence_ids = sub_df.stop_sequence.unique()
    if not len(uniq_sequence_ids) == len(sub_df):
        raise Exception('Expected each trip_id in stop_times to have '
                        'a unique set of stop sequence values.')

    # Next, make sure that the subset dataframe is sorted by
    # stop sequence, ascending
    sub_df = sub_df.sort_values(by=['stop_sequence'])

    # Interpolate the arrival and departure time columns to fill
    # in any NaN values
    sub_df['arrival_time'] = apply_interpolation(sub_df['arrival_time'])
    sub_df['departure_time'] = apply_interpolation(sub_df['departure_time'])

    return sub_df


def linearly_interpolate_infill_times(stops_orig_df):
    # Prevent any upstream modification of this object
    stops_df = stops_orig_df.copy()
    cleaned = stops_df.groupby('trip_id').apply(fill_in_times)

    # The apply operation creates a large, nested multi-index,
    # which we drop here
    cleaned = cleaned.reset_index(drop=True)

    return cleaned


def generate_edge_and_wait_values(feed: ptg.gtfs.feed,
                                  target_time_start: int,
                                  target_time_end: int) -> Tuple[pd.DataFrame]:
    # Initialize the trips dataframe to be worked with
    ftrips = feed.trips.copy()
    ftrips = ftrips[~ftrips['route_id'].isnull()]
    ftrips = ftrips.set_index('route_id', drop=False)

    # Similarly, prepare the stop times dataframe by infilling
    # all stop times that are NaN with their linearly interpolated
    # values, based on their nearest numerically valid neighbors
    stop_times = linearly_interpolate_infill_times(feed.stop_times)

    all_edge_costs = None
    all_wait_times = None
    for i, route in feed.routes.iterrows():
@@ -274,8 +322,8 @@ def generate_edge_and_wait_values(feed: ptg.gtfs.feed,
            trips = trips.to_frame().T

        # Get just the stop times related to this trip
-       st_trip_id_mask = feed.stop_times.trip_id.isin(trips.trip_id)
-       stimes_init = feed.stop_times[st_trip_id_mask]
+       st_trip_id_mask = stop_times.trip_id.isin(trips.trip_id)
+       stimes_init = stop_times[st_trip_id_mask]

        # Then subset further by just the time period that we care about
        start_time_mask = (stimes_init.arrival_time >= target_time_start)
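
To see what the new infill step does end to end, here is a minimal sketch that runs a toy stop_times frame through linearly_interpolate_infill_times (assuming the helpers are importable from peartree.summarizer as added in this commit; the frame and its values are purely illustrative):

import numpy as np
import pandas as pd

from peartree.summarizer import linearly_interpolate_infill_times

# A toy stop_times frame with NaN arrival/departure values at an
# intermediary stop on each trip
stop_times = pd.DataFrame({
    'trip_id': ['a', 'a', 'a', 'b', 'b', 'b'],
    'stop_id': ['s1', 's2', 's3', 's4', 's5', 's6'],
    'stop_sequence': [1, 2, 3, 1, 2, 3],
    'arrival_time': [100.0, np.nan, 300.0, 50.0, np.nan, 250.0],
    'departure_time': [110.0, np.nan, 310.0, 60.0, np.nan, 260.0]})

filled = linearly_interpolate_infill_times(stop_times)

# Each trip_id group is infilled independently, so the middle stop on
# trip 'a' gets 200.0/210.0 and the one on trip 'b' gets 150.0/160.0
print(filled[['trip_id', 'stop_id', 'arrival_time', 'departure_time']])
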
21 changes: 21 additions & 0 deletions peartree/toolkit.py
@@ -83,3 +83,24 @@ def get_nearest_nodes(df_orig: pd.DataFrame,

    # Return filtered series
    return nearest_nodes


def nan_helper(y):
    """
    Helper to handle indices and logical indices of NaNs.
    From: https://stackoverflow.com/questions/6518811/
          interpolate-nan-values-in-a-numpy-array#6518811
    Input:
        - y, 1d numpy array with possible NaNs
    Output:
        - nans, logical indices of NaNs
        - index, a function, with signature indices = index(logical_indices),
          to convert logical indices of NaNs to 'equivalent' indices
    Example:
        >>> # linear interpolation of NaNs
        >>> nans, x = nan_helper(y)
        >>> y[nans] = np.interp(x(nans), x(~nans), y[~nans])
    """

    return (np.isnan(y), lambda z: z.nonzero()[0])
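
The docstring example runs as-is on a plain 1-D array. A small self-contained sketch (assuming nan_helper is importable from peartree.toolkit once this commit lands; the sample values are illustrative):

import numpy as np

from peartree.toolkit import nan_helper

# An array with missing values at intermediary positions
y = np.array([0.0, np.nan, np.nan, 30.0, np.nan, 50.0])

# nans is a boolean mask over the NaN slots; x converts a boolean
# mask into the positional indices that np.interp expects
nans, x = nan_helper(y)
y[nans] = np.interp(x(nans), x(~nans), y[~nans])

print(y)  # [ 0. 10. 20. 30. 40. 50.]
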
Binary file modified tests/fixtures/samtrans-2017-11-28.zip
20 changes: 19 additions & 1 deletion tests/test_graph.py
@@ -16,7 +16,7 @@ def test_generate_empty_graph():


def test_generate_summary_graph_elements():
-    path_1 = fixture('caltrain-2017-07-24.zip')
+    path_1 = fixture('samtrans-2017-11-28.zip')
    feed_1 = get_representative_feed(path_1)

    start = 7 * 60 * 60
@@ -49,3 +49,21 @@ def test_generate_summary_graph_elements():
    # Make sure that the stop ids are unique
    u = wait_times_by_stop.stop_id.unique()
    assert len(u) == len(wait_times_by_stop)

    # As another sanity check, we should be sure that every stop
    # that was assigned null arrival and departure values in the
    # fixture dataset received a linearly imputed time and was thus
    # preserved as a stop in the resulting edge list

    # First get the null times mask
    null_times = feed_1.stop_times.departure_time.isnull()
    # And identify all unique stops with null times in the original feed
    null_stop_ids = feed_1.stop_times[null_times].stop_id.unique()

    # Now take this list of null stop ids and find which of them
    # appear in the summary edge dataframe
    mask = summary_edge_costs.from_stop_id.isin(null_stop_ids)
    # And extract the unique stop ids that were preserved
    preserved_from_nulls = summary_edge_costs.from_stop_id[mask].unique()
    assert len(preserved_from_nulls) == 205
