[WIP] Interpolate intermediary stops (#21)
* nan helper

* fill in times logic

* update loc and drop reset index

* update fixture

* update tests
kuanb committed Feb 11, 2018
1 parent d557838 commit 18b42e9
Showing 4 changed files with 90 additions and 3 deletions.
52 changes: 50 additions & 2 deletions peartree/summarizer.py
@@ -4,6 +4,7 @@
import pandas as pd
import partridge as ptg

from .toolkit import nan_helper
from .utilities import log


@@ -253,13 +254,60 @@ def generate_summary_wait_times(df: pd.DataFrame) -> pd.DataFrame:
    return summed_reset


def apply_interpolation(orig_array):
    # Infill NaN values in the array via linear interpolation
    # against their nearest numerically valid neighbors
    nans, x = nan_helper(orig_array)
    orig_array[nans] = np.interp(x(nans), x(~nans), orig_array[~nans])
    return orig_array


def fill_in_times(sub_df):
    # First, make sure that there is a set of stop sequence
    # numbers present in each of the trip_id sub-dataframes
    if 'stop_sequence' not in sub_df.columns:
        sub_df['stop_sequence'] = range(len(sub_df))

    uniq_sequence_ids = sub_df.stop_sequence.unique()
    if not len(uniq_sequence_ids) == len(sub_df):
        raise Exception('Expected each trip_id in stop_times to have '
                        'a unique set of stop sequence values.')

    # Next, make sure that the subset dataframe is sorted by
    # stop sequence, ascending
    sub_df = sub_df.sort_values(by=['stop_sequence'])

    # Interpolate the arrival and departure time columns to fill
    # in any NaN values
    sub_df['arrival_time'] = apply_interpolation(sub_df['arrival_time'])
    sub_df['departure_time'] = apply_interpolation(sub_df['departure_time'])

    return sub_df


def linearly_interpolate_infill_times(stops_orig_df):
    # Prevent any upstream modification of this object
    stops_df = stops_orig_df.copy()
    cleaned = stops_df.groupby('trip_id').apply(fill_in_times)

    # The apply operation creates a large, nested multi-index,
    # which we drop here
    cleaned = cleaned.reset_index(drop=True)

    return cleaned


def generate_edge_and_wait_values(feed: ptg.gtfs.feed,
                                  target_time_start: int,
                                  target_time_end: int) -> Tuple[pd.DataFrame]:
    # Initialize the trips dataframe to be worked with
    ftrips = feed.trips.copy()
    ftrips = ftrips[~ftrips['route_id'].isnull()]
    ftrips = ftrips.set_index('route_id', drop=False)

    # Similarly, prepare the stop times dataframe by infilling
    # all stop times that are NaN with their linearly interpolated
    # values, based on their nearest numerically valid neighbors
    stop_times = linearly_interpolate_infill_times(feed.stop_times)

    all_edge_costs = None
    all_wait_times = None
    for i, route in feed.routes.iterrows():
@@ -274,8 +322,8 @@ def generate_edge_and_wait_values(feed: ptg.gtfs.feed,
            trips = trips.to_frame().T

        # Get just the stop times related to this trip
-       st_trip_id_mask = feed.stop_times.trip_id.isin(trips.trip_id)
-       stimes_init = feed.stop_times[st_trip_id_mask]
+       st_trip_id_mask = stop_times.trip_id.isin(trips.trip_id)
+       stimes_init = stop_times[st_trip_id_mask]

        # Then subset further by just the time period that we care about
        start_time_mask = (stimes_init.arrival_time >= target_time_start)
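
To see what the new infill step does end to end, here is a minimal sketch that runs a toy stop_times frame through linearly_interpolate_infill_times (assuming the helpers are importable from peartree.summarizer as added in this commit; the frame and its values are purely illustrative):

import numpy as np
import pandas as pd

from peartree.summarizer import linearly_interpolate_infill_times

# A toy stop_times frame with NaN arrival/departure values at an
# intermediary stop on each trip
stop_times = pd.DataFrame({
    'trip_id': ['a', 'a', 'a', 'b', 'b', 'b'],
    'stop_id': ['s1', 's2', 's3', 's4', 's5', 's6'],
    'stop_sequence': [1, 2, 3, 1, 2, 3],
    'arrival_time': [100.0, np.nan, 300.0, 50.0, np.nan, 250.0],
    'departure_time': [110.0, np.nan, 310.0, 60.0, np.nan, 260.0]})

filled = linearly_interpolate_infill_times(stop_times)

# Each trip_id group is infilled independently, so the middle stop on
# trip 'a' gets 200.0/210.0 and the one on trip 'b' gets 150.0/160.0
print(filled[['trip_id', 'stop_id', 'arrival_time', 'departure_time']])
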
21 changes: 21 additions & 0 deletions peartree/toolkit.py
@@ -83,3 +83,24 @@ def get_nearest_nodes(df_orig: pd.DataFrame,

    # Return filtered series
    return nearest_nodes


def nan_helper(y):
    """
    Helper to handle indices and logical indices of NaNs.
    From: https://stackoverflow.com/questions/6518811/
          interpolate-nan-values-in-a-numpy-array#6518811
    Input:
        - y, 1d numpy array with possible NaNs
    Output:
        - nans, logical indices of NaNs
        - index, a function, with signature indices = index(logical_indices),
          to convert logical indices of NaNs to 'equivalent' indices
    Example:
        >>> # linear interpolation of NaNs
        >>> nans, x = nan_helper(y)
        >>> y[nans] = np.interp(x(nans), x(~nans), y[~nans])
    """

    return (np.isnan(y), lambda z: z.nonzero()[0])
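
The docstring example runs as-is on a plain 1-D array. A small self-contained sketch (assuming nan_helper is importable from peartree.toolkit once this commit lands; the sample values are illustrative):

import numpy as np

from peartree.toolkit import nan_helper

# An array with missing values at intermediary positions
y = np.array([0.0, np.nan, np.nan, 30.0, np.nan, 50.0])

# nans is a boolean mask over the NaN slots; x converts a boolean
# mask into the positional indices that np.interp expects
nans, x = nan_helper(y)
y[nans] = np.interp(x(nans), x(~nans), y[~nans])

print(y)  # [ 0. 10. 20. 30. 40. 50.]
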
Binary file modified tests/fixtures/samtrans-2017-11-28.zip
20 changes: 19 additions & 1 deletion tests/test_graph.py
@@ -16,7 +16,7 @@ def test_generate_empty_graph():


def test_generate_summary_graph_elements():
-    path_1 = fixture('caltrain-2017-07-24.zip')
+    path_1 = fixture('samtrans-2017-11-28.zip')
    feed_1 = get_representative_feed(path_1)

    start = 7 * 60 * 60
@@ -49,3 +49,21 @@ def test_generate_summary_graph_elements():
    # Make sure that the stop ids are unique
    u = wait_times_by_stop.stop_id.unique()
    assert len(u) == len(wait_times_by_stop)

    # As another sanity check, we should be sure that every stop
    # that was assigned null arrival and departure values in the
    # fixture dataset received a linearly imputed time and was thus
    # preserved as a stop in the resulting edge list

    # First get the null times mask
    null_times = feed_1.stop_times.departure_time.isnull()
    # And identify all unique stops with null times in the original feed
    null_stop_ids = feed_1.stop_times[null_times].stop_id.unique()

    # Now take this list of null stop ids and find which of them
    # appear in the summary edge dataframe
    mask = summary_edge_costs.from_stop_id.isin(null_stop_ids)
    # And extract the unique stop ids that were preserved
    preserved_from_nulls = summary_edge_costs.from_stop_id[mask].unique()
    assert len(preserved_from_nulls) == 205
