In [1]:
import pandas as pd
pd.set_option("display.max_columns", 100)
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the headway CSV (the file name suggests it has already been cleaned).
# The path is relative, so it resolves against the notebook's working directory.
wego = pd.read_csv('../data/headway_data_clean.csv')

In [3]:
# Convert the time columns to datetime64 objects so the .dt accessor can be used later.
# The three event-time columns share one format, so they are converted in a loop
# instead of repeating the format string on every line.
TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'
timestamp_columns = ['SCHEDULED_TIME', 'ACTUAL_ARRIVAL_TIME', 'ACTUAL_DEPARTURE_TIME']
for column in timestamp_columns:
    wego[column] = pd.to_datetime(wego[column], format=TIMESTAMP_FORMAT)

# DATE uses its own slash-separated format.
wego['DATE'] = pd.to_datetime(wego['DATE'], format="%Y/%m/%d")

# Check the dtypes to confirm the conversion: every column should report datetime64[ns].
wego[timestamp_columns + ['DATE']].dtypes


In [4]:
# Add a numeric day-of-week column derived from DATE
# (pandas numbers the days from Monday = 0 through Sunday = 6).
wego['DAYS_OF_THE_WEEK'] = wego['DATE'].dt.dayofweek


**Question 1: How much impact does being late or too spaced out at the first stop have downstream?**



*Does starting a trip late, or with too large a gap between buses, have an effect on the remainder of the trip?*

In [5]:
# Lateness information on the first stop, middle stops and the last stop.
# TRIP_EDGE == 1 is treated as the first stop of a trip elsewhere in this notebook;
# judging by the sample rows, 0 marks a middle stop and 2 the last stop.
# NOTE(review): ADHERENCE looks like minutes, with negative values meaning late - confirm against the data dictionary.
wego.groupby('TRIP_EDGE')['ADHERENCE'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
TRIP_EDGE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,207872.0,-3.692178,5.634956,-141.183333,-5.016666,-2.3,-0.583333,85.666666
1,65711.0,-2.760552,8.510429,-948.533333,-3.533333,-1.95,-0.833333,84.666666
2,65277.0,-2.016883,8.407097,-489.316666,-4.433333,-0.5,2.333333,88.383333


In [6]:
# Check the dtype and non-null count of ADHERENCE; the difference between the
# number of entries and the non-null count is the number of missing values.
wego['ADHERENCE'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 350328 entries, 0 to 350327
Series name: ADHERENCE
Non-Null Count   Dtype  
--------------   -----  
338860 non-null  float64
dtypes: float64(1)
memory usage: 2.7 MB


In [7]:
# Rows recorded at the first stop of a trip (TRIP_EDGE code 1)
first_stop = wego.loc[wego['TRIP_EDGE'].eq(1)]

In [8]:
# Every row that is not a first stop (TRIP_EDGE code other than 1)
other_stops = wego.loc[wego['TRIP_EDGE'].ne(1)]

In [9]:
# Count how many rows fall under each TRIP_EDGE code
wego['TRIP_EDGE'].value_counts()

0    215185
2     67728
1     67415
Name: TRIP_EDGE, dtype: int64

Question: **Does starting a trip late mean that the buses are bunched?**

In [10]:
# Keep only the columns needed for this question: the trip and calendar
# identifiers, the stop-position flag and the adherence value.
adherence_columns = ['TRIP_ID', 'CALENDAR_ID', 'TRIP_EDGE', 'ADHERENCE']
trip_adherence = wego[adherence_columns]
trip_adherence

Unnamed: 0,TRIP_ID,CALENDAR_ID,TRIP_EDGE,ADHERENCE
0,345104,120230801,1,-2.133333
1,345104,120230801,0,-2.450000
2,345104,120230801,0,-0.933333
3,345104,120230801,2,6.283333
4,345105,120230801,1,-1.583333
...,...,...,...,...
350323,353448,120230930,0,-8.433333
350324,353448,120230930,2,-11.300000
350325,353449,120230930,1,-4.316666
350326,353449,120230930,0,-22.083333


In [11]:
# Take an independent copy so the column assignments in the following cells modify
# this frame only, not a selection derived from wego
# (presumably this is also what keeps pandas from raising SettingWithCopyWarning).
trip_adherence = trip_adherence.copy()

In [12]:
# Cast TRIP_ID to string so that it can be concatenated with the CALENDAR_ID column
trip_adherence['TRIP_ID'] = trip_adherence['TRIP_ID'].astype(str)


In [13]:
# Cast CALENDAR_ID to string so that it can be concatenated with the TRIP_ID column
trip_adherence['CALENDAR_ID'] = trip_adherence['CALENDAR_ID'].astype(str)


In [14]:
# Build a combined key by joining TRIP_ID and CALENDAR_ID end to end
# (CALENDAR_ID appears to encode the service date, so the key identifies one run of a trip).
# Both parts are cast to str right here: if the dtype-conversion cells were skipped
# or run out of order, `+` would otherwise silently ADD the two IDs as numbers.
# The cast changes nothing when the columns already hold strings.
trip_adherence['TRIP_IDS'] = (
    trip_adherence['TRIP_ID'].astype(str) + trip_adherence['CALENDAR_ID'].astype(str)
)
trip_adherence

Unnamed: 0,TRIP_ID,CALENDAR_ID,TRIP_EDGE,ADHERENCE,TRIP_IDS
0,345104,120230801,1,-2.133333,345104120230801
1,345104,120230801,0,-2.450000,345104120230801
2,345104,120230801,0,-0.933333,345104120230801
3,345104,120230801,2,6.283333,345104120230801
4,345105,120230801,1,-1.583333,345105120230801
...,...,...,...,...,...
350323,353448,120230930,0,-8.433333,353448120230930
350324,353448,120230930,2,-11.300000,353448120230930
350325,353449,120230930,1,-4.316666,353449120230930
350326,353449,120230930,0,-22.083333,353449120230930


In [16]:
# Number the stops within each trip: rows sharing a TRIP_IDS key are counted
# 1, 2, 3, ... in the order they appear in the frame.
# row_num is a helper column of ones; its running total per key is that counter.
trip_adherence['row_num'] = 1
ones_by_trip = trip_adherence['row_num'].groupby(trip_adherence['TRIP_IDS'])
trip_adherence['stops'] = ones_by_trip.cumsum()

In [19]:
# Display the frame to check the row_num and stops columns
trip_adherence


Unnamed: 0,TRIP_ID,CALENDAR_ID,TRIP_EDGE,ADHERENCE,TRIP_IDS,row_num,stops
0,345104,120230801,1,-2.133333,345104120230801,1,1
1,345104,120230801,0,-2.450000,345104120230801,1,2
2,345104,120230801,0,-0.933333,345104120230801,1,3
3,345104,120230801,2,6.283333,345104120230801,1,4
4,345105,120230801,1,-1.583333,345105120230801,1,1
...,...,...,...,...,...,...,...
350323,353448,120230930,0,-8.433333,353448120230930,1,2
350324,353448,120230930,2,-11.300000,353448120230930,1,3
350325,353449,120230930,1,-4.316666,353449120230930,1,1
350326,353449,120230930,0,-22.083333,353449120230930,1,2


In [24]:
# Reshape to wide format: one row per TRIP_IDS, one column per stop number,
# with ADHERENCE as the cell value.
# pivot_table aggregates with the mean by default; since stops is a running count
# within each TRIP_IDS, every cell is built from a single row.
# The result is only displayed here, not stored.
trip_adherence.pivot_table(values = 'ADHERENCE', index = 'TRIP_IDS', columns = 'stops')

stops,1,2,3,4,5,6,7,8,9,10,11,12,13,14
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
345104120230801,-2.133333,-2.450000,-0.933333,6.283333,,,,,,,,,,
345104120230802,-2.450000,-3.000000,-0.316666,8.016666,,,,,,,,,,
345104120230803,-0.766666,-1.050000,-0.233333,6.200000,,,,,,,,,,
345105120230801,-1.583333,0.950000,1.500000,,,,,,,,,,,
345105120230802,-3.116666,-2.650000,-1.916666,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354106120230925,0.966666,2.200000,,,,,,,,,,,,
354106120230926,-0.266666,-139.683333,,,,,,,,,,,,
354106120230927,0.966666,-166.566666,,,,,,,,,,,,
354106120230928,-2.566666,-5.816666,,,,,,,,,,,,


In [22]:
# Summary statistics of the stop counter; max is the largest number of rows
# recorded under any single TRIP_IDS key.
trip_adherence['stops'].describe()

count    350328.000000
mean          3.291099
std           1.770892
min           1.000000
25%           2.000000
50%           3.000000
75%           5.000000
max          21.000000
Name: stops, dtype: float64

In [None]:
# Number of distinct TRIP_ID values in the full dataset
wego['TRIP_ID'].nunique()