In [1]:
import pandas as pd
%matplotlib notebook

# Follow Jeff's instruction from: https://github.com/hackoregon/2019-transportation-data-science/blob/master/notebooks/5.0-jab-stop-to-stop-analysis.ipynb 

# To use this notebook, make sure to copy the two .parquest.gzip files from Google Drive 
# (Team Transpo / Datasets & Notebooks / Toad (Congestion) / toad_stop_to_stop_durations )
# and put them in the <data science repo> / data / interim / TOAD folder. 
# (you will need to make the TOAD folder if it doesn't exist.)

# Or change the path to point to them however you want :)

In [2]:
stops_df = pd.read_parquet('../data/interim/TOAD/stop_to_stop.parquet.gzip')

In [3]:
stops_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53847000 entries, 1 to 54694745
Data columns (total 9 columns):
event_no_trip             int64
pattern_direction         object
line_id                   uint16
stop_type                 uint8
stop_id                   uint16
act_arr_time              datetime64[ns, UTC]
prev_stop_id              uint16
prev_stop_act_arr_time    datetime64[ns, UTC]
elapsed_time_seconds      uint16
dtypes: datetime64[ns, UTC](2), int64(1), object(1), uint16(4), uint8(1)
memory usage: 2.5+ GB


In [4]:
# Filter by only lines 10 and 14.  We only need stops 3641 and 3633 because stop 3641 will have elapsed_time_seconds for 3637 to 3641
# We need to do lines 10 and 14 separately because of difference in frequency and calculating the average later on 
lines_10_and_14 = stops_df[(stops_df.stop_id == 3641) | (stops_df.stop_id == 3633) & ((stops_df.line_id == 10) | (stops_df.line_id == 14))]

In [5]:
# Filter by only lines 10 and 14.  We only need stops 3641 and 3633 because stop 3641 will have elapsed_time_seconds for 3637 to 3641
# We need to do lines 10 and 14 separately because of difference in frequency and calculating the average later on 
line10 = stops_df[((stops_df.stop_id == 3641) | (stops_df.stop_id == 3633)) & (stops_df.line_id == 10)]
line14 = stops_df[((stops_df.stop_id == 3641) | (stops_df.stop_id == 3633)) & (stops_df.line_id == 14)]

In [6]:
# Filter to September 9am-10am for both 2017 and 2018
# Note that act_arr_time is in UTC so I'm using hour 16 which will filter to 9am-10am PST
line10_2017 = line10[(line10.act_arr_time >= '2017-09-01') & (line10.act_arr_time <= '2017-10-01')]
line10_2018 = line10[(line10.act_arr_time >= '2018-09-01') & (line10.act_arr_time <= '2018-10-01')]
line10_2017 = line10_2017[line10_2017['act_arr_time'].dt.hour == 16]
line10_2018 = line10_2018[line10_2018['act_arr_time'].dt.hour == 16]

line14_2017 = line14[(line14.act_arr_time >= '2017-09-01') & (line14.act_arr_time <= '2017-10-01')]
line14_2018 = line14[(line14.act_arr_time >= '2018-09-01') & (line14.act_arr_time <= '2018-10-01')]
line14_2017 = line14_2017[line14_2017['act_arr_time'].dt.hour == 16]
line14_2018 = line14_2018[line14_2018['act_arr_time'].dt.hour == 16]

In [7]:
# Filter to only weekdays 
line10_2017['weekday'] = line10_2017['act_arr_time'].apply(lambda x: x.weekday())
line10_2017 = line10_2017[line10_2017['weekday'] < 5 ]
line10_2018['weekday'] = line10_2018['act_arr_time'].apply(lambda x: x.weekday())
line10_2018 = line10_2018[line10_2018['weekday'] < 5 ]

line14_2017['weekday'] = line14_2017['act_arr_time'].apply(lambda x: x.weekday())
line14_2017 = line14_2017[line14_2017['weekday'] < 5 ]
line14_2018['weekday'] = line14_2018['act_arr_time'].apply(lambda x: x.weekday())
line14_2018 = line14_2018[line14_2018['weekday'] < 5 ]

In [8]:
# Check how many rows are in the dataframes
print(str(len(line10_2017)) + " rows for line 10 in 2017")
print(str(len(line10_2018)) + " rows for line 10 in 2018")
print(str(len(line14_2017)) + " rows for line 14 in 2017")
print(str(len(line14_2018)) + " rows for line 14 in 2018")

92 rows for line 10 in 2017
77 rows for line 10 in 2018
211 rows for line 14 in 2017
184 rows for line 14 in 2018


In [9]:
# Total number of minutes to pass through the Madison corridor for lines 10 and 14 in 2017
print("line 10 2017: " + str(line10_2017.elapsed_time_seconds.sum()/60) + " minutes")
print("line 10 2018: " + str(line10_2018.elapsed_time_seconds.sum()/60) + " minutes")
print("line 14 2017: " + str(line14_2017.elapsed_time_seconds.sum()/60) + " minutes")
print("line 14 2018: " + str(line14_2018.elapsed_time_seconds.sum()/60) + " minutes")

line 10 2017: 101.48333333333333 minutes
line 10 2018: 63.46666666666667 minutes
line 14 2017: 226.3 minutes
line 14 2018: 183.6 minutes


In [10]:
# Difference in minutes
print("Line 10 difference in minutes 2017 minus 2018: " + str(line10_2017.elapsed_time_seconds.sum()/60 - line10_2018.elapsed_time_seconds.sum()/60))
print("Line 14 difference in minutes 2017 minus 2018: " + str(line14_2017.elapsed_time_seconds.sum()/60 - line14_2018.elapsed_time_seconds.sum()/60))

Line 10 difference in minutes 2017 minus 2018: 38.016666666666666
Line 14 difference in minutes 2017 minus 2018: 42.70000000000002


In [11]:
# Average trip time in minutes for 2017 and 2018.  If each trip in the corridor is from stop 3637 to 3633 than one full trip should be the number of rows divided by
# 2 since we are only including stops 3641 and 3633.  This seems mostly correct but there are a few one-offs as you can see from the value counts: 
print("Line 10 2017 value counts:")
print(line10_2017.stop_id.value_counts())
print("\nLine 10 2018 value counts:")
print(line10_2018.stop_id.value_counts())
print("\nLine 14 2017 value counts:")
print(line14_2017.stop_id.value_counts())
print("\nLine 10 2018 value counts:")
print(line14_2018.stop_id.value_counts())
print("\nAverage time to get through the Madison corridor for line 10 in 2017: " + str(line10_2017.elapsed_time_seconds.sum()/60/(len(line10_2017)/2)) + " minutes")
print("Average time to get through the Madison corridor for line 10 in 2018: " + str(line10_2018.elapsed_time_seconds.sum()/60/(len(line10_2018)/2)) + " minutes")
print("Average time to get through the Madison corridor for line 14 in 2017: " + str(line14_2017.elapsed_time_seconds.sum()/60/(len(line14_2017)/2)) + " minutes")
print("Average time to get through the Madison corridor for line 14 in 2018: " + str(line14_2018.elapsed_time_seconds.sum()/60/(len(line14_2018)/2)) + " minutes")
print("Average time to get through the Madison corridor for lines 10 and 14 in 2017: " + str((line10_2017.elapsed_time_seconds.sum()/60/(len(line10_2017)/2) +
                                                line14_2017.elapsed_time_seconds.sum()/60/(len(line14_2017)/2))/2) + " minutes")
print("Average time to get through the Madison corridor for lines 10 and 14 in 2018: " + str((line10_2018.elapsed_time_seconds.sum()/60/(len(line10_2018)/2) +
                                                line14_2018.elapsed_time_seconds.sum()/60/(len(line14_2018)/2))/2) + " minutes")

Line 10 2017 value counts:
3633    47
3641    45
Name: stop_id, dtype: int64

Line 10 2018 value counts:
3633    39
3641    38
Name: stop_id, dtype: int64

Line 14 2017 value counts:
3641    107
3633    104
Name: stop_id, dtype: int64

Line 10 2018 value counts:
3641    92
3633    92
Name: stop_id, dtype: int64

Average time to get through the Madison corridor for line 10 in 2017: 2.206159420289855 minutes
Average time to get through the Madison corridor for line 10 in 2018: 1.6484848484848484 minutes
Average time to get through the Madison corridor for line 14 in 2017: 2.1450236966824647 minutes
Average time to get through the Madison corridor for line 14 in 2018: 1.9956521739130435 minutes
Average time to get through the Madison corridor for lines 10 and 14 in 2017: 2.17559155848616 minutes
Average time to get through the Madison corridor for lines 10 and 14 in 2018: 1.822068511198946 minutes


In [12]:
# There was some improvement in travel time from 9am-10am, what about from 6am-10am overall?
# Filter to September 9am-10am for both 2017 and 2018
# Note that act_arr_time is in UTC so I'm using hour 16 which will filter to 9am-10am PST
line10_2017 = line10[(line10.act_arr_time >= '2017-09-01') & (line10.act_arr_time <= '2017-10-01')]
line10_2018 = line10[(line10.act_arr_time >= '2018-09-01') & (line10.act_arr_time <= '2018-10-01')]
line10_2017 = line10_2017[(line10_2017['act_arr_time'].dt.hour >= 16) & (line10_2017['act_arr_time'].dt.hour < 20)]
line10_2018 = line10_2018[(line10_2018['act_arr_time'].dt.hour >= 16) & (line10_2018['act_arr_time'].dt.hour < 20)]

line14_2017 = line14[(line14.act_arr_time >= '2017-09-01') & (line14.act_arr_time <= '2017-10-01')]
line14_2018 = line14[(line14.act_arr_time >= '2018-09-01') & (line14.act_arr_time <= '2018-10-01')]
line14_2017 = line14_2017[(line14_2017['act_arr_time'].dt.hour >= 16) & (line14_2017['act_arr_time'].dt.hour < 20)]
line14_2018 = line14_2018[(line14_2018['act_arr_time'].dt.hour >= 16) & (line14_2018['act_arr_time'].dt.hour < 20)]

In [13]:
# Filter to only weekdays 
line10_2017['weekday'] = line10_2017['act_arr_time'].apply(lambda x: x.weekday())
line10_2017 = line10_2017[line10_2017['weekday'] < 5 ]
line10_2018['weekday'] = line10_2018['act_arr_time'].apply(lambda x: x.weekday())
line10_2018 = line10_2018[line10_2018['weekday'] < 5 ]

line14_2017['weekday'] = line14_2017['act_arr_time'].apply(lambda x: x.weekday())
line14_2017 = line14_2017[line14_2017['weekday'] < 5 ]
line14_2018['weekday'] = line14_2018['act_arr_time'].apply(lambda x: x.weekday())
line14_2018 = line14_2018[line14_2018['weekday'] < 5 ]

In [14]:
# Check how many rows are in the dataframes
print(str(len(line10_2017)) + " rows for line 10 in 2017")
print(str(len(line10_2018)) + " rows for line 10 in 2018")
print(str(len(line14_2017)) + " rows for line 14 in 2017")
print(str(len(line14_2018)) + " rows for line 14 in 2018")

370 rows for line 10 in 2017
347 rows for line 10 in 2018
710 rows for line 14 in 2017
656 rows for line 14 in 2018


In [15]:
# Total number of minutes to pass through the Madison corridor for lines 10 and 14 in 2017
print("line 10 2017: " + str(line10_2017.elapsed_time_seconds.sum()/60) + " minutes")
print("line 10 2018: " + str(line10_2018.elapsed_time_seconds.sum()/60) + " minutes")
print("line 14 2017: " + str(line14_2017.elapsed_time_seconds.sum()/60) + " minutes")
print("line 14 2018: " + str(line14_2018.elapsed_time_seconds.sum()/60) + " minutes")

line 10 2017: 336.93333333333334 minutes
line 10 2018: 287.73333333333335 minutes
line 14 2017: 725.9666666666667 minutes
line 14 2018: 681.0666666666667 minutes


In [16]:
# Difference in minutes
print("Line 10 difference in minutes 2017 minus 2018: " + str(line10_2017.elapsed_time_seconds.sum()/60 - line10_2018.elapsed_time_seconds.sum()/60))
print("Line 14 difference in minutes 2017 minus 2018: " + str(line14_2017.elapsed_time_seconds.sum()/60 - line14_2018.elapsed_time_seconds.sum()/60))

Line 10 difference in minutes 2017 minus 2018: 49.19999999999999
Line 14 difference in minutes 2017 minus 2018: 44.89999999999998


In [17]:
# Average trip time in minutes for 2017 and 2018.  If each trip in the corridor is from stop 3637 to 3633 than one full trip should be the number of rows divided by
# 2 since we are only including stops 3641 and 3633.  This seems mostly correct but there are a few one-offs as you can see from the value counts: 
print("Line 10 2017 value counts:")
print(line10_2017.stop_id.value_counts())
print("\nLine 10 2018 value counts:")
print(line10_2018.stop_id.value_counts())
print("\nLine 14 2017 value counts:")
print(line14_2017.stop_id.value_counts())
print("\nLine 10 2018 value counts:")
print(line14_2018.stop_id.value_counts())
print("\nAverage time to get through the Madison corridor for line 10 in 2017: " + str(line10_2017.elapsed_time_seconds.sum()/60/(len(line10_2017)/2)) + " minutes")
print("Average time to get through the Madison corridor for line 10 in 2018: " + str(line10_2018.elapsed_time_seconds.sum()/60/(len(line10_2018)/2)) + " minutes")
print("Average time to get through the Madison corridor for line 14 in 2017: " + str(line14_2017.elapsed_time_seconds.sum()/60/(len(line14_2017)/2)) + " minutes")
print("Average time to get through the Madison corridor for line 14 in 2018: " + str(line14_2018.elapsed_time_seconds.sum()/60/(len(line14_2018)/2)) + " minutes")
print("Average time to get through the Madison corridor for lines 10 and 14 in 2017: " + str((line10_2017.elapsed_time_seconds.sum()/60/(len(line10_2017)/2) +
                                                line14_2017.elapsed_time_seconds.sum()/60/(len(line14_2017)/2))/2) + " minutes")
print("Average time to get through the Madison corridor for lines 10 and 14 in 2018: " + str((line10_2018.elapsed_time_seconds.sum()/60/(len(line10_2018)/2) +
                                                line14_2018.elapsed_time_seconds.sum()/60/(len(line14_2018)/2))/2) + " minutes")

Line 10 2017 value counts:
3633    186
3641    184
Name: stop_id, dtype: int64

Line 10 2018 value counts:
3633    174
3641    173
Name: stop_id, dtype: int64

Line 14 2017 value counts:
3641    355
3633    355
Name: stop_id, dtype: int64

Line 10 2018 value counts:
3641    329
3633    327
Name: stop_id, dtype: int64

Average time to get through the Madison corridor for line 10 in 2017: 1.8212612612612613 minutes
Average time to get through the Madison corridor for line 10 in 2018: 1.6584053794428435 minutes
Average time to get through the Madison corridor for line 14 in 2017: 2.044976525821596 minutes
Average time to get through the Madison corridor for line 14 in 2018: 2.0764227642276425 minutes
Average time to get through the Madison corridor for lines 10 and 14 in 2017: 1.9331188935414287 minutes
Average time to get through the Madison corridor for lines 10 and 14 in 2018: 1.867414071835243 minutes


In [18]:
# Check 10am
# Filter to September 9am-10am for both 2017 and 2018
# Note that act_arr_time is in UTC so I'm using hour 16 which will filter to 9am-10am PST
line10_2017 = line10[(line10.act_arr_time >= '2017-09-01') & (line10.act_arr_time <= '2017-10-01')]
line10_2018 = line10[(line10.act_arr_time >= '2018-09-01') & (line10.act_arr_time <= '2018-10-01')]
line10_2017 = line10_2017[line10_2017['act_arr_time'].dt.hour == 17]
line10_2018 = line10_2018[line10_2018['act_arr_time'].dt.hour == 17]

line14_2017 = line14[(line14.act_arr_time >= '2017-09-01') & (line14.act_arr_time <= '2017-10-01')]
line14_2018 = line14[(line14.act_arr_time >= '2018-09-01') & (line14.act_arr_time <= '2018-10-01')]
line14_2017 = line14_2017[line14_2017['act_arr_time'].dt.hour == 17]
line14_2018 = line14_2018[line14_2018['act_arr_time'].dt.hour == 17]

In [19]:
# Filter to only weekdays 
line10_2017['weekday'] = line10_2017['act_arr_time'].apply(lambda x: x.weekday())
line10_2017 = line10_2017[line10_2017['weekday'] < 5 ]
line10_2018['weekday'] = line10_2018['act_arr_time'].apply(lambda x: x.weekday())
line10_2018 = line10_2018[line10_2018['weekday'] < 5 ]

line14_2017['weekday'] = line14_2017['act_arr_time'].apply(lambda x: x.weekday())
line14_2017 = line14_2017[line14_2017['weekday'] < 5 ]
line14_2018['weekday'] = line14_2018['act_arr_time'].apply(lambda x: x.weekday())
line14_2018 = line14_2018[line14_2018['weekday'] < 5 ]

In [20]:
# Check how many rows are in the dataframes
print(str(len(line10_2017)) + " rows for line 10 in 2017")
print(str(len(line10_2018)) + " rows for line 10 in 2018")
print(str(len(line14_2017)) + " rows for line 14 in 2017")
print(str(len(line14_2018)) + " rows for line 14 in 2018")

80 rows for line 10 in 2017
78 rows for line 10 in 2018
153 rows for line 14 in 2017
170 rows for line 14 in 2018


In [21]:
# Total number of minutes to pass through the Madison corridor for lines 10 and 14 in 2017
print("line 10 2017: " + str(line10_2017.elapsed_time_seconds.sum()/60) + " minutes")
print("line 10 2018: " + str(line10_2018.elapsed_time_seconds.sum()/60) + " minutes")
print("line 14 2017: " + str(line14_2017.elapsed_time_seconds.sum()/60) + " minutes")
print("line 14 2018: " + str(line14_2018.elapsed_time_seconds.sum()/60) + " minutes")

line 10 2017: 80.36666666666666 minutes
line 10 2018: 65.38333333333334 minutes
line 14 2017: 156.68333333333334 minutes
line 14 2018: 196.13333333333333 minutes


In [22]:
# Difference in minutes
print("Line 10 difference in minutes 2017 minus 2018: " + str(line10_2017.elapsed_time_seconds.sum()/60 - line10_2018.elapsed_time_seconds.sum()/60))
print("Line 14 difference in minutes 2017 minus 2018: " + str(line14_2017.elapsed_time_seconds.sum()/60 - line14_2018.elapsed_time_seconds.sum()/60))

Line 10 difference in minutes 2017 minus 2018: 14.98333333333332
Line 14 difference in minutes 2017 minus 2018: -39.44999999999999


In [23]:
# Average trip time in minutes for 2017 and 2018.  If each trip in the corridor is from stop 3637 to 3633 than one full trip should be the number of rows divided by
# 2 since we are only including stops 3641 and 3633.  This seems mostly correct but there are a few one-offs as you can see from the value counts: 
print("Line 10 2017 value counts:")
print(line10_2017.stop_id.value_counts())
print("\nLine 10 2018 value counts:")
print(line10_2018.stop_id.value_counts())
print("\nLine 14 2017 value counts:")
print(line14_2017.stop_id.value_counts())
print("\nLine 10 2018 value counts:")
print(line14_2018.stop_id.value_counts())
print("\nAverage time to get through the Madison corridor for line 10 in 2017: " + str(line10_2017.elapsed_time_seconds.sum()/60/(len(line10_2017)/2)) + " minutes")
print("Average time to get through the Madison corridor for line 10 in 2018: " + str(line10_2018.elapsed_time_seconds.sum()/60/(len(line10_2018)/2)) + " minutes")
print("Average time to get through the Madison corridor for line 14 in 2017: " + str(line14_2017.elapsed_time_seconds.sum()/60/(len(line14_2017)/2)) + " minutes")
print("Average time to get through the Madison corridor for line 14 in 2018: " + str(line14_2018.elapsed_time_seconds.sum()/60/(len(line14_2018)/2)) + " minutes")
print("Average time to get through the Madison corridor for lines 10 and 14 in 2017: " + str((line10_2017.elapsed_time_seconds.sum()/60/(len(line10_2017)/2) +
                                                line14_2017.elapsed_time_seconds.sum()/60/(len(line14_2017)/2))/2) + " minutes")
print("Average time to get through the Madison corridor for lines 10 and 14 in 2018: " + str((line10_2018.elapsed_time_seconds.sum()/60/(len(line10_2018)/2) +
                                                line14_2018.elapsed_time_seconds.sum()/60/(len(line14_2018)/2))/2) + " minutes")

Line 10 2017 value counts:
3641    40
3633    40
Name: stop_id, dtype: int64

Line 10 2018 value counts:
3641    39
3633    39
Name: stop_id, dtype: int64

Line 14 2017 value counts:
3633    78
3641    75
Name: stop_id, dtype: int64

Line 10 2018 value counts:
3641    88
3633    82
Name: stop_id, dtype: int64

Average time to get through the Madison corridor for line 10 in 2017: 2.0091666666666663 minutes
Average time to get through the Madison corridor for line 10 in 2018: 1.6764957264957268 minutes
Average time to get through the Madison corridor for line 14 in 2017: 2.0481481481481483 minutes
Average time to get through the Madison corridor for line 14 in 2018: 2.3074509803921566 minutes
Average time to get through the Madison corridor for lines 10 and 14 in 2017: 2.0286574074074073 minutes
Average time to get through the Madison corridor for lines 10 and 14 in 2018: 1.9919733534439417 minutes


In [24]:
# Check 11am
# Filter to September 9am-10am for both 2017 and 2018
# Note that act_arr_time is in UTC so I'm using hour 16 which will filter to 9am-10am PST
line10_2017 = line10[(line10.act_arr_time >= '2017-09-01') & (line10.act_arr_time <= '2017-10-01')]
line10_2018 = line10[(line10.act_arr_time >= '2018-09-01') & (line10.act_arr_time <= '2018-10-01')]
line10_2017 = line10_2017[line10_2017['act_arr_time'].dt.hour == 18]
line10_2018 = line10_2018[line10_2018['act_arr_time'].dt.hour == 18]

line14_2017 = line14[(line14.act_arr_time >= '2017-09-01') & (line14.act_arr_time <= '2017-10-01')]
line14_2018 = line14[(line14.act_arr_time >= '2018-09-01') & (line14.act_arr_time <= '2018-10-01')]
line14_2017 = line14_2017[line14_2017['act_arr_time'].dt.hour == 18]
line14_2018 = line14_2018[line14_2018['act_arr_time'].dt.hour == 18]

In [25]:
# Filter to only weekdays 
line10_2017['weekday'] = line10_2017['act_arr_time'].apply(lambda x: x.weekday())
line10_2017 = line10_2017[line10_2017['weekday'] < 5 ]
line10_2018['weekday'] = line10_2018['act_arr_time'].apply(lambda x: x.weekday())
line10_2018 = line10_2018[line10_2018['weekday'] < 5 ]

line14_2017['weekday'] = line14_2017['act_arr_time'].apply(lambda x: x.weekday())
line14_2017 = line14_2017[line14_2017['weekday'] < 5 ]
line14_2018['weekday'] = line14_2018['act_arr_time'].apply(lambda x: x.weekday())
line14_2018 = line14_2018[line14_2018['weekday'] < 5 ]

In [26]:
# Check how many rows are in the dataframes
print(str(len(line10_2017)) + " rows for line 10 in 2017")
print(str(len(line10_2018)) + " rows for line 10 in 2018")
print(str(len(line14_2017)) + " rows for line 14 in 2017")
print(str(len(line14_2018)) + " rows for line 14 in 2018")

103 rows for line 10 in 2017
103 rows for line 10 in 2018
163 rows for line 14 in 2017
153 rows for line 14 in 2018


In [27]:
# Total number of minutes to pass through the Madison corridor for lines 10 and 14 in 2017
print("line 10 2017: " + str(line10_2017.elapsed_time_seconds.sum()/60) + " minutes")
print("line 10 2018: " + str(line10_2018.elapsed_time_seconds.sum()/60) + " minutes")
print("line 14 2017: " + str(line14_2017.elapsed_time_seconds.sum()/60) + " minutes")
print("line 14 2018: " + str(line14_2018.elapsed_time_seconds.sum()/60) + " minutes")

line 10 2017: 80.15 minutes
line 10 2018: 86.65 minutes
line 14 2017: 158.91666666666666 minutes
line 14 2018: 148.1 minutes


In [28]:
# Difference in minutes
print("Line 10 difference in minutes 2017 minus 2018: " + str(line10_2017.elapsed_time_seconds.sum()/60 - line10_2018.elapsed_time_seconds.sum()/60))
print("Line 14 difference in minutes 2017 minus 2018: " + str(line14_2017.elapsed_time_seconds.sum()/60 - line14_2018.elapsed_time_seconds.sum()/60))

Line 10 difference in minutes 2017 minus 2018: -6.5
Line 14 difference in minutes 2017 minus 2018: 10.816666666666663


In [29]:
# Average trip time in minutes for 2017 and 2018.  If each trip in the corridor is from stop 3637 to 3633 than one full trip should be the number of rows divided by
# 2 since we are only including stops 3641 and 3633.  This seems mostly correct but there are a few one-offs as you can see from the value counts: 
print("Line 10 2017 value counts:")
print(line10_2017.stop_id.value_counts())
print("\nLine 10 2018 value counts:")
print(line10_2018.stop_id.value_counts())
print("\nLine 14 2017 value counts:")
print(line14_2017.stop_id.value_counts())
print("\nLine 10 2018 value counts:")
print(line14_2018.stop_id.value_counts())
print("\nAverage time to get through the Madison corridor for line 10 in 2017: " + str(line10_2017.elapsed_time_seconds.sum()/60/(len(line10_2017)/2)) + " minutes")
print("Average time to get through the Madison corridor for line 10 in 2018: " + str(line10_2018.elapsed_time_seconds.sum()/60/(len(line10_2018)/2)) + " minutes")
print("Average time to get through the Madison corridor for line 14 in 2017: " + str(line14_2017.elapsed_time_seconds.sum()/60/(len(line14_2017)/2)) + " minutes")
print("Average time to get through the Madison corridor for line 14 in 2018: " + str(line14_2018.elapsed_time_seconds.sum()/60/(len(line14_2018)/2)) + " minutes")
print("Average time to get through the Madison corridor for lines 10 and 14 in 2017: " + str((line10_2017.elapsed_time_seconds.sum()/60/(len(line10_2017)/2) +
                                                line14_2017.elapsed_time_seconds.sum()/60/(len(line14_2017)/2))/2) + " minutes")
print("Average time to get through the Madison corridor for lines 10 and 14 in 2018: " + str((line10_2018.elapsed_time_seconds.sum()/60/(len(line10_2018)/2) +
                                                line14_2018.elapsed_time_seconds.sum()/60/(len(line14_2018)/2))/2) + " minutes")

Line 10 2017 value counts:
3641    53
3633    50
Name: stop_id, dtype: int64

Line 10 2018 value counts:
3641    52
3633    51
Name: stop_id, dtype: int64

Line 14 2017 value counts:
3641    84
3633    79
Name: stop_id, dtype: int64

Line 10 2018 value counts:
3633    78
3641    75
Name: stop_id, dtype: int64

Average time to get through the Madison corridor for line 10 in 2017: 1.5563106796116506 minutes
Average time to get through the Madison corridor for line 10 in 2018: 1.6825242718446602 minutes
Average time to get through the Madison corridor for line 14 in 2017: 1.9498977505112474 minutes
Average time to get through the Madison corridor for line 14 in 2018: 1.9359477124183007 minutes
Average time to get through the Madison corridor for lines 10 and 14 in 2017: 1.753104215061449 minutes
Average time to get through the Madison corridor for lines 10 and 14 in 2018: 1.8092359921314805 minutes


In [30]:
# The overall improvement isn't that impressive.  Look at disturbance delay for the same windows for comparison
delay_2017 = pd.read_csv('../data/interim/TOAD/hawthorne_disturbance_sep2017.csv')
delay_2018 = pd.read_csv('../data/interim/TOAD/hawthorne_disturbance_jul2018-dec2018.csv')

In [31]:
# Filter to only 9am
delay_2017_9am = delay_2017[(delay_2017.ACT_ARR_TIME >= 32400) & (delay_2017.ACT_ARR_TIME <= 36000)]

In [32]:
# Remove stop 7856
delay_2017_9am = delay_2017_9am[delay_2017_9am.start_location == 3637]

In [33]:
# Already only have weekdays, don't need this
# delay_2017['SERVICE_DATE'] =  pd.to_datetime(delay_2017['SERVICE_DATE'], format='%d%b%Y:00:00:00')
# delay_2017['weekday'] = delay_2017['SERVICE_DATE'].apply(lambda x: x.weekday())
# delay_2017 = delay_2017[delay_2017['weekday'] < 5 ]

In [34]:
delay_2017_9am.delay.sum()

3237

In [35]:
delay_2018['SERVICE_DATE'] = delay_2018['SERVICE_DATE'].str[:7]
delay_2018['SERVICE_DATE'] =  pd.to_datetime(delay_2018['SERVICE_DATE'])
delay_2018 = delay_2018[(delay_2018.SERVICE_DATE >= '2018-09-01') & (delay_2018.SERVICE_DATE < '2018-10-01')]

In [36]:
# Filter to only 9am
delay_2018_9am = delay_2018[(delay_2018.ACT_ARR_TIME >= 32400) & (delay_2018.ACT_ARR_TIME <= 36000)]

In [37]:
# Remove stop 7856
delay_2018_9am = delay_2018_9am[delay_2018_9am.start_location == 3637]

In [38]:
delay_2018_9am.delay.sum()

572

In [39]:
delay_2017_9am.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98 entries, 1969 to 2765
Data columns (total 17 columns):
SERVICE_DATE      98 non-null object
SERVICE_KEY       98 non-null object
TRAIN             98 non-null int64
TRIP_NUMBER       98 non-null int64
bus               98 non-null int64
ROUTE_NUMBER      98 non-null int64
DIRECTION         98 non-null int64
start_location    98 non-null int64
depart            98 non-null int64
end_location      98 non-null int64
arrive            98 non-null int64
ACT_ARR_TIME      98 non-null int64
ACT_DEP_TIME      98 non-null int64
STOP_TYPE         98 non-null int64
GPS_LONGITUDE     98 non-null float64
GPS_LATITUDE      98 non-null float64
delay             98 non-null int64
dtypes: float64(2), int64(13), object(2)
memory usage: 13.8+ KB


In [40]:
delay_2018_9am.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11 entries, 5119 to 6857
Data columns (total 17 columns):
SERVICE_DATE      11 non-null datetime64[ns]
SERVICE_KEY       11 non-null object
TRAIN             11 non-null int64
TRIP_NUMBER       11 non-null int64
bus               11 non-null int64
ROUTE_NUMBER      11 non-null int64
DIRECTION         11 non-null int64
start_location    11 non-null int64
depart            11 non-null int64
end_location      11 non-null int64
arrive            11 non-null int64
ACT_ARR_TIME      11 non-null int64
ACT_DEP_TIME      11 non-null int64
STOP_TYPE         11 non-null int64
GPS_LONGITUDE     11 non-null float64
GPS_LATITUDE      11 non-null float64
delay             11 non-null int64
dtypes: datetime64[ns](1), float64(2), int64(13), object(1)
memory usage: 1.5+ KB


In [41]:
# This confirms that there were way less disturbance stops during 9am-10am.  Check 6am-10am
# Filter to only 6am-10am
delay_2017_6_10am = delay_2017[(delay_2017.ACT_ARR_TIME >= 21600) & (delay_2017.ACT_ARR_TIME <= 36000)]
delay_2018_6_10am = delay_2018[(delay_2018.ACT_ARR_TIME >= 21600) & (delay_2018.ACT_ARR_TIME <= 36000)]
# Remove stop 7856
delay_2017_6_10am = delay_2017_6_10am[delay_2017_6_10am.start_location == 3637]
delay_2018_6_10am = delay_2018_6_10am[delay_2018_6_10am.start_location == 3637]

In [42]:
delay_2017_6_10am.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 574 entries, 1947 to 2768
Data columns (total 17 columns):
SERVICE_DATE      574 non-null object
SERVICE_KEY       574 non-null object
TRAIN             574 non-null int64
TRIP_NUMBER       574 non-null int64
bus               574 non-null int64
ROUTE_NUMBER      574 non-null int64
DIRECTION         574 non-null int64
start_location    574 non-null int64
depart            574 non-null int64
end_location      574 non-null int64
arrive            574 non-null int64
ACT_ARR_TIME      574 non-null int64
ACT_DEP_TIME      574 non-null int64
STOP_TYPE         574 non-null int64
GPS_LONGITUDE     574 non-null float64
GPS_LATITUDE      574 non-null float64
delay             574 non-null int64
dtypes: float64(2), int64(13), object(2)
memory usage: 80.7+ KB


In [43]:
delay_2018_6_10am.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 276 entries, 5114 to 7280
Data columns (total 17 columns):
SERVICE_DATE      276 non-null datetime64[ns]
SERVICE_KEY       276 non-null object
TRAIN             276 non-null int64
TRIP_NUMBER       276 non-null int64
bus               276 non-null int64
ROUTE_NUMBER      276 non-null int64
DIRECTION         276 non-null int64
start_location    276 non-null int64
depart            276 non-null int64
end_location      276 non-null int64
arrive            276 non-null int64
ACT_ARR_TIME      276 non-null int64
ACT_DEP_TIME      276 non-null int64
STOP_TYPE         276 non-null int64
GPS_LONGITUDE     276 non-null float64
GPS_LATITUDE      276 non-null float64
delay             276 non-null int64
dtypes: datetime64[ns](1), float64(2), int64(13), object(1)
memory usage: 38.8+ KB


In [44]:
delay_2017_6_10am.delay.sum()

16565

In [45]:
delay_2018_6_10am.delay.sum()

6842

In [46]:
# Check 10am-11am
# Filter to only 10am-11am
delay_2017_10am = delay_2017[(delay_2017.ACT_ARR_TIME >= 36000) & (delay_2017.ACT_ARR_TIME <= 39600)]
delay_2018_10am = delay_2018[(delay_2018.ACT_ARR_TIME >= 36000) & (delay_2018.ACT_ARR_TIME <= 39600)]
# Remove stop 7856
delay_2017_10am = delay_2017_10am[delay_2017_10am.start_location == 3637]
delay_2018_10am = delay_2018_10am[delay_2018_10am.start_location == 3637]

In [47]:
delay_2017_10am.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26 entries, 2051 to 2690
Data columns (total 17 columns):
SERVICE_DATE      26 non-null object
SERVICE_KEY       26 non-null object
TRAIN             26 non-null int64
TRIP_NUMBER       26 non-null int64
bus               26 non-null int64
ROUTE_NUMBER      26 non-null int64
DIRECTION         26 non-null int64
start_location    26 non-null int64
depart            26 non-null int64
end_location      26 non-null int64
arrive            26 non-null int64
ACT_ARR_TIME      26 non-null int64
ACT_DEP_TIME      26 non-null int64
STOP_TYPE         26 non-null int64
GPS_LONGITUDE     26 non-null float64
GPS_LATITUDE      26 non-null float64
delay             26 non-null int64
dtypes: float64(2), int64(13), object(2)
memory usage: 3.7+ KB


In [48]:
delay_2018_10am.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8 entries, 5260 to 7232
Data columns (total 17 columns):
SERVICE_DATE      8 non-null datetime64[ns]
SERVICE_KEY       8 non-null object
TRAIN             8 non-null int64
TRIP_NUMBER       8 non-null int64
bus               8 non-null int64
ROUTE_NUMBER      8 non-null int64
DIRECTION         8 non-null int64
start_location    8 non-null int64
depart            8 non-null int64
end_location      8 non-null int64
arrive            8 non-null int64
ACT_ARR_TIME      8 non-null int64
ACT_DEP_TIME      8 non-null int64
STOP_TYPE         8 non-null int64
GPS_LONGITUDE     8 non-null float64
GPS_LATITUDE      8 non-null float64
delay             8 non-null int64
dtypes: datetime64[ns](1), float64(2), int64(13), object(1)
memory usage: 1.1+ KB


In [49]:
delay_2017_10am.delay.sum()

1429

In [50]:
delay_2018_10am.delay.sum()

491

In [51]:
# Check 11am-12pm
# Filter to only 10am-11am
delay_2017_11am = delay_2017[(delay_2017.ACT_ARR_TIME >= 39600) & (delay_2017.ACT_ARR_TIME <= 43200)]
delay_2018_11am = delay_2018[(delay_2018.ACT_ARR_TIME >= 39600) & (delay_2018.ACT_ARR_TIME <= 43200)]
# Remove stop 7856
delay_2017_11am = delay_2017_11am[delay_2017_11am.start_location == 3637]
delay_2018_11am = delay_2018_11am[delay_2018_11am.start_location == 3637]

In [52]:
delay_2017_11am.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13 entries, 2206 to 2769
Data columns (total 17 columns):
SERVICE_DATE      13 non-null object
SERVICE_KEY       13 non-null object
TRAIN             13 non-null int64
TRIP_NUMBER       13 non-null int64
bus               13 non-null int64
ROUTE_NUMBER      13 non-null int64
DIRECTION         13 non-null int64
start_location    13 non-null int64
depart            13 non-null int64
end_location      13 non-null int64
arrive            13 non-null int64
ACT_ARR_TIME      13 non-null int64
ACT_DEP_TIME      13 non-null int64
STOP_TYPE         13 non-null int64
GPS_LONGITUDE     13 non-null float64
GPS_LATITUDE      13 non-null float64
delay             13 non-null int64
dtypes: float64(2), int64(13), object(2)
memory usage: 1.8+ KB


In [53]:
delay_2018_11am.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14 entries, 5256 to 7224
Data columns (total 17 columns):
SERVICE_DATE      14 non-null datetime64[ns]
SERVICE_KEY       14 non-null object
TRAIN             14 non-null int64
TRIP_NUMBER       14 non-null int64
bus               14 non-null int64
ROUTE_NUMBER      14 non-null int64
DIRECTION         14 non-null int64
start_location    14 non-null int64
depart            14 non-null int64
end_location      14 non-null int64
arrive            14 non-null int64
ACT_ARR_TIME      14 non-null int64
ACT_DEP_TIME      14 non-null int64
STOP_TYPE         14 non-null int64
GPS_LONGITUDE     14 non-null float64
GPS_LATITUDE      14 non-null float64
delay             14 non-null int64
dtypes: datetime64[ns](1), float64(2), int64(13), object(1)
memory usage: 2.0+ KB


In [54]:
delay_2017_11am.delay.sum()

575

In [55]:
delay_2018_11am.delay.sum()

597