# Import
For this to work, the data files need to be placed into the `data/raw` folder.

In [1]:
from pathlib import Path
import pandas as pd
import plotly_express as px


# # This is needed to have figures appear in external HTML file.
# from plotly.offline import init_notebook_mode
# init_notebook_mode()

In [2]:
# Folder holding the monthly stop-event extracts, relative to this notebook.
RAW_DATA_DIR = Path('../data/raw')

SEP17_CSV = RAW_DATA_DIR / 'SEP2017.csv'
OCT17_CSV = RAW_DATA_DIR / 'OCT2017.csv'
NOV17_CSV = RAW_DATA_DIR / 'NOV2017.csv'
APR18_CSV = RAW_DATA_DIR / 'APR2018.csv'
MAY18_CSV = RAW_DATA_DIR / 'MAY2018.csv'

raw_sep_df = pd.read_csv(SEP17_CSV)
raw_oct_df = pd.read_csv(OCT17_CSV)
raw_nov_df = pd.read_csv(NOV17_CSV)
raw_apr_df = pd.read_csv(APR18_CSV)
raw_may_df = pd.read_csv(MAY18_CSV)

# Bring 'em all together into one dataframe.
# A single pd.concat replaces the chained DataFrame.append calls: append was
# deprecated in pandas 1.4 and removed in 2.0, and every chained call copied
# the whole accumulated frame. Like append, concat keeps each file's original
# row index by default, so the combined index contains repeated labels.
raw_df = pd.concat([raw_sep_df, raw_oct_df, raw_nov_df, raw_apr_df, raw_may_df])

# Data

These CSV files are joins of the `init_veh_stoph` and `trimet_stop_event` tables. The full query used to make them is too unwieldy to show here, so here are the query parameters instead:

* Routes 4, 10 and 14
* All stop types (service stops are 0 or 5)
* Stops 3637, 3641, 3633, 2642, 7856
    * 3637 - SE 11th and SE Madison [Just before the start of bus lane - actually begins at 10th]
    * 3641 - SE 7th and SE Madison 
    * 3633 - SE Grand and SE Madison [End of bus lane]
    * 2642 - Hawthorne Bridge, Westbound [Likely outside of this analysis]
    * 7856 - SE 7th and SE Clay [4 (or 2) bus only]
* Weekday service (code 'W')
* Between 5:00 am (18000) and 12:00 pm (43200)

# Goal

So the goal here is to see how long the buses take to traverse the bus lane from 9:00 am (32400) until 10:00 am (36000), where times are given in seconds after midnight, and to see if there's an obvious change over time.

As we learned before, let's start with route 14, from arrival at stop 3637 to arrival at 3633. We have ridership data too, but we'll look at that later.

## Columns
For now, let's limit the columns to the following:
* SERVICE_DATE   
* VEHICLE_NUMBER
* TRAIN
* ROUTE_NUMBER
* LEAVE_TIME
* STOP_TIME
* ARRIVE_TIME
* LOCATION_ID
* ESTIMATED_LOAD
* STOP_TYPE

In [3]:
cols = ['SERVICE_DATE', 'VEHICLE_NUMBER', 'TRAIN', 'ROUTE_NUMBER', 'LEAVE_TIME', 'STOP_TIME', 'ARRIVE_TIME', 'LOCATION_ID', 'ESTIMATED_LOAD', 'STOP_TYPE']

# Row mask for the trips of interest:
#   * route 14 only
#   * STOP_TIME within 32400..36000 (between() is inclusive on both ends)
#   * the two stops that bracket the measured segment (3637 and 3633)
#   * stop types 0 and 5
in_scope = (
    (raw_df['ROUTE_NUMBER'] == 14)
    & raw_df['STOP_TIME'].between(32400, 36000)
    & raw_df['LOCATION_ID'].isin([3637, 3633])
    & raw_df['STOP_TYPE'].isin([0, 5])
)

# Select rows and columns in one .loc call and take an explicit copy, so the
# column assignment below writes to an independent frame instead of a slice of
# raw_df (the chained raw_df[cols][mask] form triggers SettingWithCopyWarning).
filt_df = raw_df.loc[in_scope, cols].copy()

# Convert to datetime so they can be sorted
filt_df['SERVICE_DATE'] = pd.to_datetime(filt_df['SERVICE_DATE'], format='%d%b%Y:%H:%M:%S')
# Rebind instead of sorting in place so re-running the cell is idempotent.
filt_df = filt_df.sort_values('SERVICE_DATE')
print(filt_df.shape)

(992, 10)


Sanity check here. `filt_df.shape` gives us `(992, 10)`, i.e. 992 rows. A trip contains two entries (start and stop), so that's 496 bus trips. If we assume 22 weekdays per month, over 5 months, that's 110 weekdays. 496 bus trips / 110 weekdays = ~4.5 bus trips per weekday over this 1 hour window. 60 min / 4.5 bus trips = ~13 min in between trips. That seems reasonable.

In [4]:
# To filter this all down, let's first group by date, then train number within the date.
grouped = filt_df.groupby(['SERVICE_DATE', 'TRAIN'])

# How we'll make the output (to convert to DataFrame): one list per column,
# each receiving exactly one value per (date, train) group.
bus_trips = {'date': [], 'train': [], 'is_complete_trip': [], 'start_time': [], 'end_time': []}

for (service_date, train), group in grouped:
    # If a group doesn't have 2 entries, then we can't calculate an elapsed time.
    # We'll flag them for now and deal with them later.
    # The flag is appended per group; assigning it (bus_trips[...] = True/False)
    # would replace the list with a single scalar that pandas then broadcasts
    # to every row, making the flag useless for filtering.
    bus_trips['is_complete_trip'].append(group.shape[0] == 2)
    bus_trips['date'].append(service_date)
    bus_trips['train'].append(train)

    # Arrival times recorded at the first (3637) and last (3633) stop.
    start_arrivals = group.loc[group['LOCATION_ID'] == 3637, 'ARRIVE_TIME']
    end_arrivals = group.loc[group['LOCATION_ID'] == 3633, 'ARRIVE_TIME']

    # Take the first matching arrival; a missing stop becomes NaN so the
    # column stays numeric and elapsed_time comes out as NaN for that trip.
    bus_trips['start_time'].append(start_arrivals.iloc[0] if not start_arrivals.empty else float('nan'))
    bus_trips['end_time'].append(end_arrivals.iloc[0] if not end_arrivals.empty else float('nan'))

trips_df = pd.DataFrame(bus_trips)
# Same units as ARRIVE_TIME; NaN wherever either endpoint is missing.
trips_df['elapsed_time'] = trips_df['end_time'] - trips_df['start_time']

In [None]:
# Scatter of each individual trip's duration against its service date,
# restricted to the rows flagged as complete trips.
complete_trips = trips_df[trips_df['is_complete_trip']]
every_trip_labels = {'date': 'Service date', 'elapsed_time': 'Trip duration [sec]'}

every_trip_fig = px.scatter(
    complete_trips,
    x='date',
    y='elapsed_time',
    labels=every_trip_labels,
    # Optional zoom on fall 2017:
    # range_x=[pd.to_datetime('01SEP2017', format='%d%b%Y'), pd.to_datetime('01DEC2017', format='%d%b%Y')],
    title='Every trip',
)
# Last expression in the cell, so the figure is rendered as the cell output.
every_trip_fig

In [None]:
# Mean trip duration per service date.
# Only 'elapsed_time' is aggregated: averaging every column of each group
# (group.mean()) is wasted work and raises on pandas >= 2.0 as soon as any
# column is non-numeric. NaN durations (trips missing an endpoint) are skipped
# by mean(); a date with no valid duration yields NaN.
# as_index=False keeps 'date' as a regular column, giving the same
# two-column ['date', 'elapsed_time'] layout the plot below expects.
avg_df = trips_df.groupby('date', as_index=False)['elapsed_time'].mean()

px.scatter(avg_df, x='date', y='elapsed_time', labels={'date': 'Service date', 'elapsed_time': 'Trip duration [sec]'},
#            range_x=[pd.to_datetime('01SEP2017', format='%d%b%Y'), pd.to_datetime('01DEC2017', format='%d%b%Y')],
           title='Average over a day')