In [1]:
import pandas as pd
pd.set_option("display.max_columns", 100)
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the headway data CSV; the path is relative to the notebook's working directory
wego = pd.read_csv('../data/headway_data_clean.csv')

In [3]:
# Parse the three timestamp columns, which share one "date time" layout, into datetime64
for time_col in ('SCHEDULED_TIME', 'ACTUAL_ARRIVAL_TIME', 'ACTUAL_DEPARTURE_TIME'):
    wego[time_col] = pd.to_datetime(wego[time_col], format='%Y-%m-%d %H:%M:%S')

# DATE is parsed separately because it uses a slash-separated year/month/day layout
wego['DATE'] = pd.to_datetime(wego['DATE'], format="%Y/%m/%d")


In [4]:
# Add a column holding the day of the week of DATE as an integer
# (pandas numbers the days Monday=0 through Sunday=6)
wego['DAYS_OF_THE_WEEK'] = wego['DATE'].dt.dayofweek


**Question 1: How much impact does being late or too spaced out at the first stop have downstream?**



*Does starting a trip late, or with too much spacing between buses, have an effect on the remainder of the trip?*

In [5]:
# Summary statistics of ADHERENCE for each TRIP_EDGE value.
# TRIP_EDGE == 1 is treated as the first stop later in this notebook;
# 0 and 2 are presumably the middle and last stops — TODO confirm.
adherence_by_edge = wego.groupby('TRIP_EDGE')['ADHERENCE']
adherence_by_edge.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
TRIP_EDGE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,207872.0,-3.692178,5.634956,-141.183333,-5.016666,-2.3,-0.583333,85.666666
1,65711.0,-2.760552,8.510429,-948.533333,-3.533333,-1.95,-0.833333,84.666666
2,65277.0,-2.016883,8.407097,-489.316666,-4.433333,-0.5,2.333333,88.383333


In [6]:
# Number of rows for each TRIP_EDGE value. This counts every row, including
# rows whose ADHERENCE is missing, so it can exceed the describe() counts.
wego['TRIP_EDGE'].value_counts()

0    215185
2     67728
1     67415
Name: TRIP_EDGE, dtype: int64

Question: **Does starting a trip late mean that the buses are bunched?**

In [15]:
# Keep only the columns needed to answer the question. The explicit .copy()
# makes this an independent frame, so later cells can convert or add columns
# on it without pandas raising SettingWithCopyWarning.
trip_adherence = wego[['TRIP_ID', 'CALENDAR_ID', 'TRIP_EDGE', 'ADHERENCE']].copy()


In [20]:
# Rows for the first stop of each trip (TRIP_EDGE == 1), with any row
# containing a missing value removed
is_first_stop = trip_adherence['TRIP_EDGE'] == 1
first_stop = trip_adherence.loc[is_first_stop].dropna()

In [26]:
# How often each distinct ADHERENCE value occurs at the first stop
first_stop['ADHERENCE'].value_counts()

 0.950000      1055
 0.966666      1021
-1.033333       313
-0.700000       307
-0.900000       306
               ... 
-17.750000        1
 8.883333         1
 8.900000         1
-948.533333       1
 13.366666        1
Name: ADHERENCE, Length: 2081, dtype: int64

In [9]:
# Count the first-stop rows with ADHERENCE >= 0, i.e. trips that started
# on time or early. .shape is (rows, columns); the first number is the count.
first_stop_mask = trip_adherence['TRIP_EDGE'] == 1
on_time_or_early = trip_adherence['ADHERENCE'] >= 0
trip_adherence.loc[first_stop_mask & on_time_or_early].shape

(6109, 4)

In [16]:
# Count the first-stop rows with ADHERENCE < 0, i.e. trips that started late.
# .shape is (rows, columns); the first number is the count.
first_stop_mask = trip_adherence['TRIP_EDGE'] == 1
started_late = trip_adherence['ADHERENCE'] < 0
trip_adherence.loc[first_stop_mask & started_late].shape

(59602, 4)

In [19]:
# Show the first-stop rows themselves for trips that began late (ADHERENCE < 0)
first_stop_mask = trip_adherence['TRIP_EDGE'] == 1
started_late = trip_adherence['ADHERENCE'] < 0
trip_adherence.loc[first_stop_mask & started_late]

Unnamed: 0,TRIP_ID,CALENDAR_ID,TRIP_EDGE,ADHERENCE
0,345104,120230801,1,-2.133333
4,345105,120230801,1,-1.583333
7,345106,120230801,1,-1.716666
11,345107,120230801,1,-1.316666
14,345108,120230801,1,-1.516666
...,...,...,...,...
350313,353445,120230930,1,-3.283333
350316,353446,120230930,1,-0.850000
350319,353447,120230930,1,-5.900000
350322,353448,120230930,1,-8.616666


In [None]:
# Rows with TRIP_EDGE == 0 (presumably mid-trip stops — TODO confirm),
# with any row containing a missing value removed
edge_is_0 = trip_adherence['TRIP_EDGE'] == 0
trip_adherence.loc[edge_is_0].dropna()

In [None]:
# (rows, columns) of the TRIP_EDGE == 0 rows whose ADHERENCE is zero or positive
edge_is_0 = trip_adherence['TRIP_EDGE'] == 0
adherence_nonnegative = trip_adherence['ADHERENCE'] >= 0
trip_adherence.loc[edge_is_0 & adherence_nonnegative].shape

In [None]:
# (rows, columns) of the TRIP_EDGE == 0 rows whose ADHERENCE is negative
edge_is_0 = trip_adherence['TRIP_EDGE'] == 0
adherence_negative = trip_adherence['ADHERENCE'] < 0
trip_adherence.loc[edge_is_0 & adherence_negative].shape

In [None]:
# Rows with TRIP_EDGE == 2 (presumably the last stop of a trip — TODO confirm),
# with any row containing a missing value removed
edge_is_2 = trip_adherence['TRIP_EDGE'] == 2
trip_adherence.loc[edge_is_2].dropna()

In [None]:
# (rows, columns) of the TRIP_EDGE == 2 rows whose ADHERENCE is zero or positive
edge_is_2 = trip_adherence['TRIP_EDGE'] == 2
adherence_nonnegative = trip_adherence['ADHERENCE'] >= 0
trip_adherence.loc[edge_is_2 & adherence_nonnegative].shape

In [None]:
# (rows, columns) of the TRIP_EDGE == 2 rows whose ADHERENCE is negative
edge_is_2 = trip_adherence['TRIP_EDGE'] == 2
adherence_negative = trip_adherence['ADHERENCE'] < 0
trip_adherence.loc[edge_is_2 & adherence_negative].shape

In [None]:
# Pivot to one row per trip with one column per TRIP_EDGE value; each cell is
# the mean ADHERENCE (pivot_table's default aggregation).
# The combined 'TRIP_IDS' key is only created further down the notebook, so the
# trip is identified here by its two source columns. That keeps this cell
# runnable in a top-to-bottom run instead of raising KeyError.
trips = trip_adherence.pivot_table(
    values='ADHERENCE',
    index=['TRIP_ID', 'CALENDAR_ID'],
    columns='TRIP_EDGE',
).reset_index(drop=True)

In [None]:
# First-stop rows whose ADHERENCE is zero or negative.
# NOTE(review): `<= 0` here and `>= 0` in the next cell both include rows that
# are exactly 0, so the two views overlap — confirm that is intended.
first_stop_mask = trip_adherence['TRIP_EDGE'] == 1
adherence_nonpositive = trip_adherence['ADHERENCE'] <= 0
trip_adherence.loc[first_stop_mask & adherence_nonpositive]

In [None]:
# First-stop rows whose ADHERENCE is zero or positive
first_stop_mask = trip_adherence['TRIP_EDGE'] == 1
adherence_nonnegative = trip_adherence['ADHERENCE'] >= 0
trip_adherence.loc[first_stop_mask & adherence_nonnegative]

In [None]:
# Column dtypes and non-null counts for trip_adherence
trip_adherence.info()

In [27]:
# Take an explicit copy before the column assignments in the cells below
# (presumably to avoid pandas' SettingWithCopyWarning — TODO confirm)
trip_adherence = trip_adherence.copy()

In [28]:
# Convert TRIP_ID to string so it can be concatenated with the CALENDAR_ID column
trip_adherence['TRIP_ID'] = trip_adherence['TRIP_ID'].astype(str)


In [29]:
# Convert CALENDAR_ID to string so it can be concatenated with the TRIP_ID column
trip_adherence['CALENDAR_ID'] = trip_adherence['CALENDAR_ID'].astype(str)


In [43]:
# Build one key per (TRIP_ID, CALENDAR_ID) pair by joining the two string
# columns end to end, then display the frame
trip_key = trip_adherence['TRIP_ID'] + trip_adherence['CALENDAR_ID']
trip_adherence['TRIP_IDS'] = trip_key
trip_adherence

Unnamed: 0,TRIP_ID,CALENDAR_ID,TRIP_EDGE,ADHERENCE,TRIP_IDS,row_num,stops
0,345104,120230801,1,-2.133333,345104120230801,1,1
1,345104,120230801,0,-2.450000,345104120230801,1,2
2,345104,120230801,0,-0.933333,345104120230801,1,3
3,345104,120230801,2,6.283333,345104120230801,1,4
4,345105,120230801,1,-1.583333,345105120230801,1,1
...,...,...,...,...,...,...,...
350323,353448,120230930,0,-8.433333,353448120230930,1,2
350324,353448,120230930,2,-11.300000,353448120230930,1,3
350325,353449,120230930,1,-4.316666,353449120230930,1,1
350326,353449,120230930,0,-22.083333,353449120230930,1,2


In [31]:
# Number the rows within each trip (TRIP_IDS) as 1, 2, 3, ... in their current
# row order; this assumes rows are already in stop order — TODO confirm.
# row_num is a constant helper column of ones kept alongside the numbering.
trip_adherence['row_num'] = 1
trip_adherence['stops'] = trip_adherence.groupby('TRIP_IDS').cumcount() + 1

In [45]:
# Display trip_adherence to check the TRIP_IDS, row_num and stops columns
trip_adherence


Unnamed: 0,TRIP_ID,CALENDAR_ID,TRIP_EDGE,ADHERENCE,TRIP_IDS,row_num,stops
0,345104,120230801,1,-2.133333,345104120230801,1,1
1,345104,120230801,0,-2.450000,345104120230801,1,2
2,345104,120230801,0,-0.933333,345104120230801,1,3
3,345104,120230801,2,6.283333,345104120230801,1,4
4,345105,120230801,1,-1.583333,345105120230801,1,1
...,...,...,...,...,...,...,...
350323,353448,120230930,0,-8.433333,353448120230930,1,2
350324,353448,120230930,2,-11.300000,353448120230930,1,3
350325,353449,120230930,1,-4.316666,353449120230930,1,1
350326,353449,120230930,0,-22.083333,353449120230930,1,2


In [32]:
#trip_adherence['stops'] = trip_adherence['stops'].astype(str)

In [33]:
# Reshape to wide form: one row per trip (TRIP_IDS), one column per stop number,
# with ADHERENCE as the cell value (pivot_table averages duplicates by default)
adherence_by_stop = trip_adherence.pivot_table(
    index='TRIP_IDS',
    columns='stops',
    values='ADHERENCE',
)
# Discard the TRIP_IDS index so the rows are simply numbered from 0
trip_adherence_pivot = adherence_by_stop.reset_index(drop=True)

In [41]:
# Display the wide table: one row per trip, one column per stop number
trip_adherence_pivot

stops,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,-2.133333,-2.450000,-0.933333,6.283333,,,,,,,,,,
1,-2.450000,-3.000000,-0.316666,8.016666,,,,,,,,,,
2,-0.766666,-1.050000,-0.233333,6.200000,,,,,,,,,,
3,-1.583333,0.950000,1.500000,,,,,,,,,,,
4,-3.116666,-2.650000,-1.916666,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66445,0.966666,2.200000,,,,,,,,,,,,
66446,-0.266666,-139.683333,,,,,,,,,,,,
66447,0.966666,-166.566666,,,,,,,,,,,,
66448,-2.566666,-5.816666,,,,,,,,,,,,


In [38]:
# Trips whose ADHERENCE at stop number 1 is negative (started late)
first_stop_adherence = trip_adherence_pivot[1]
trip_adherence_pivot.loc[first_stop_adherence < 0]

stops,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,-2.133333,-2.450000,-0.933333,6.283333,,,,,,,,,,
1,-2.450000,-3.000000,-0.316666,8.016666,,,,,,,,,,
2,-0.766666,-1.050000,-0.233333,6.200000,,,,,,,,,,
3,-1.583333,0.950000,1.500000,,,,,,,,,,,
4,-3.116666,-2.650000,-1.916666,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66440,-0.250000,3.100000,,,,,,,,,,,,
66441,-4.716666,-18.416666,,,,,,,,,,,,
66446,-0.266666,-139.683333,,,,,,,,,,,,
66448,-2.566666,-5.816666,,,,,,,,,,,,


In [39]:
# Trips whose ADHERENCE at stop number 1 is zero or positive (on time or early)
first_stop_adherence = trip_adherence_pivot[1]
trip_adherence_pivot.loc[first_stop_adherence >= 0]

stops,1,2,3,4,5,6,7,8,9,10,11,12,13,14
38,0.033333,-1.766666,-1.433333,0.016666,5.433333,,,,,,,,,
42,0.966666,-0.766666,3.916666,,,,,,,,,,,
43,0.966666,-0.466666,3.616666,,,,,,,,,,,
44,0.966666,0.933333,2.933333,,,,,,,,,,,
48,0.966666,0.016666,3.833333,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66442,0.966666,6.700000,,,,,,,,,,,,
66443,0.466666,-0.600000,,,,,,,,,,,,
66444,0.966666,-65.200000,,,,,,,,,,,,
66445,0.966666,2.200000,,,,,,,,,,,,


In [None]:
# Smallest stop number assigned (the per-trip numbering starts at 1).
# NOTE(review): this is the lowest stop index, not the fewest stops on any trip.
trip_adherence['stops'].min()

In [None]:
# Largest stop number assigned, i.e. the number of rows recorded for the longest trip
trip_adherence['stops'].max()

In [None]:
# Summary of the stops column (dtype and non-null count)
trip_adherence['stops'].info()

In [None]:
# Display the wide per-trip table again before splitting it by start status
trip_adherence_pivot

In [None]:
# The pivot has no column called 'Stops': 'stops' is the name of the column
# axis, and the column labels themselves are the integer stop numbers.
# Show those labels instead of indexing a missing key, which raised KeyError.
trip_adherence_pivot.columns

In [None]:
# Trips that started on time or early: ADHERENCE at stop 1 is zero or positive.
# This matches the ">= 0" definition used for the first-stop counts earlier in
# the notebook. Exact equality with 0 on a float column would keep almost no trips.
# reset_index() keeps the original pivot row position in an 'index' column.
start_on_time = trip_adherence_pivot[trip_adherence_pivot[1] >= 0].reset_index()

In [None]:
# Trips that started late: ADHERENCE at stop 1 is negative, the same "< 0"
# definition used for the late-start counts earlier in the notebook.
# Comparing with "!= 0" would also pull in early starts (positive values) and
# trips whose first-stop value is missing.
# reset_index() keeps the original pivot row position in an 'index' column.
start_late = trip_adherence_pivot[trip_adherence_pivot[1] < 0].reset_index()

In [None]:
# Rebind start_on_time to an explicit copy of itself
# (presumably so later edits cannot touch trip_adherence_pivot — TODO confirm this is needed)
start_on_time = start_on_time.copy()

In [None]:
# Rebind start_late to an explicit copy of itself
# (presumably so later edits cannot touch trip_adherence_pivot — TODO confirm this is needed)
start_late = start_late.copy()

In [None]:
#start_late.iloc[0]

In [None]:
# Rows of the full dataset where ROUTE_ABBR is 56 and STOPS is 20.
# NOTE(review): the lowercase 'stops' column was added to trip_adherence, not
# to wego — confirm that wego really has a 'STOPS' column.
route_is_56 = wego['ROUTE_ABBR'] == 56
stops_is_20 = wego['STOPS'] == 20
wego.loc[route_is_56 & stops_is_20]

In [None]:
# Column dtypes and non-null counts for the wide table; the non-null count of
# each stop column shows how many trips have a value at that stop number
trip_adherence_pivot.info()