In [1]:
import pandas as pd
pd.set_option("display.max_columns", 100)
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the cleaned headway dataset; the path is relative to the notebook's working directory.
wego = pd.read_csv(filepath_or_buffer='../data/headway_data_clean.csv')

In [3]:
# Parse the text timestamp columns into datetime64 values.
# The three time-of-day columns share one format; DATE uses slash-separated year/month/day.
for ts_col in ('SCHEDULED_TIME', 'ACTUAL_ARRIVAL_TIME', 'ACTUAL_DEPARTURE_TIME'):
    wego[ts_col] = pd.to_datetime(wego[ts_col], format='%Y-%m-%d %H:%M:%S')

wego['DATE'] = pd.to_datetime(wego['DATE'], format='%Y/%m/%d')


In [4]:
# Add a column holding each record's day of the week as an integer
# (pandas convention: Monday=0 ... Sunday=6).
wego['DAYS_OF_THE_WEEK'] = wego['DATE'].dt.dayofweek


**Question 1: How much impact does being late or too spaced out at the first stop have downstream?**



*Does being late to start a trip, or being too spaced out between buses, have an effect on the remainder of the trip?*

In [5]:
# Discard every record that has no ADHERENCE value; later cells pivot on this column.
wego = wego.dropna(subset=['ADHERENCE'], how='any')

In [6]:
# Restrict the data to route 3 records travelling in the 'TO DOWNTOWN' direction.
is_route_3 = wego['ROUTE_ABBR'] == 3
is_to_downtown = wego['ROUTE_DIRECTION_NAME'] == 'TO DOWNTOWN'
route_3_to_downtown = wego.loc[is_route_3 & is_to_downtown]

In [7]:
# Work on an independent copy so the columns added below do not write into a slice of `wego`.
route_3_to_downtown = route_3_to_downtown.copy(deep=True)

In [8]:
# Cast both ID columns to strings so they can be joined into a single composite trip key.
for id_col in ('TRIP_ID', 'CALENDAR_ID'):
    route_3_to_downtown[id_col] = route_3_to_downtown[id_col].astype(str)

In [9]:
# Build a composite key (TRIP_ID followed directly by CALENDAR_ID) identifying one trip on one calendar day.
# NOTE(review): there is no separator between the two parts, so keys are only unambiguous
# if TRIP_ID values all have the same width — confirm against the data.
route_3_to_downtown['TRIP_IDS'] = route_3_to_downtown['TRIP_ID'].str.cat(
    route_3_to_downtown['CALENDAR_ID']
)

In [10]:
# Number the records within each trip: 1 for the first row of a TRIP_IDS group, 2 for the next, ...
# A constant helper column of ones is accumulated per group to produce the running count.
# NOTE(review): the numbering follows the current row order and only counts rows that
# survived the earlier ADHERENCE dropna — assumes rows are already in stop order; confirm.
route_3_to_downtown['row_num'] = 1
ones_by_trip = route_3_to_downtown.groupby('TRIP_IDS')['row_num']
route_3_to_downtown['stops'] = ones_by_trip.cumsum()

In [40]:
#creating a pivot table that will make the columns the stop #, the rows is each trip, and the values the adherence(amount of minutes late)
# Reshape to one row per trip (TRIP_IDS) and one column per stop number, holding ADHERENCE.
# If a trip has several records for the same stop number they are averaged
# (pivot_table's default aggregation, spelled out here).
route_3_to_downtown_pivot = route_3_to_downtown.pivot_table(
    index='TRIP_IDS',
    columns='stops',
    values='ADHERENCE',
    aggfunc='mean',
)

In [12]:
#.reset_index(drop = True)

In [41]:
# Preview the trip-by-stop adherence table (pandas truncates the display for long frames).
route_3_to_downtown_pivot

stops,1,2,3,4,5,6,7,8,9,10
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
346048120230801,-1.300000,-0.083333,0.416666,1.000000,4.100000,,,,,
346048120230802,-1.150000,-0.750000,-0.716666,0.150000,0.016666,,,,,
346048120230803,-18.716666,,,,,,,,,
346050120230801,-3.933333,-2.666666,-1.950000,-1.450000,1.533333,,,,,
346050120230802,0.966666,-2.150000,-1.650000,-1.683333,-0.033333,,,,,
...,...,...,...,...,...,...,...,...,...,...
351240120230925,-1.383333,-2.033333,0.616666,-1.833333,-5.600000,-1.300000,,,,
351240120230926,-5.850000,-3.483333,-2.033333,-0.666666,-0.133333,2.666666,,,,
351240120230927,-2.183333,-0.600000,-1.283333,-0.583333,-1.733333,-1.250000,,,,
351240120230928,-1.333333,-1.150000,-1.166666,-0.233333,-0.666666,-2.983333,,,,


In [42]:
# Look at the first five stop columns only, keeping trips that have a value in every one of them.
route_3_to_downtown_pivot.iloc[:, :5].dropna(axis=0, how='any')

stops,1,2,3,4,5
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
346048120230801,-1.300000,-0.083333,0.416666,1.000000,4.100000
346048120230802,-1.150000,-0.750000,-0.716666,0.150000,0.016666
346050120230801,-3.933333,-2.666666,-1.950000,-1.450000,1.533333
346050120230802,0.966666,-2.150000,-1.650000,-1.683333,-0.033333
346050120230803,-1.650000,-0.166666,-0.716666,0.516666,6.000000
...,...,...,...,...,...
351240120230925,-1.383333,-2.033333,0.616666,-1.833333,-5.600000
351240120230926,-5.850000,-3.483333,-2.033333,-0.666666,-0.133333
351240120230927,-2.183333,-0.600000,-1.283333,-0.583333,-1.733333
351240120230928,-1.333333,-1.150000,-1.166666,-0.233333,-0.666666


In [None]:
# Display the full trip-by-stop adherence table again (this cell has no recorded execution).
route_3_to_downtown_pivot

In [15]:
# Summarise the pivot table: number of trips and how many trips have a value at each stop number.
# The previous name `route_3_pivot` is not defined anywhere in this notebook and would raise
# NameError on a fresh kernel; the pivot built above is `route_3_to_downtown_pivot`.
route_3_to_downtown_pivot.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4203 entries, 346048120230801 to 351240120230929
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   1       4203 non-null   float64
 1   2       4184 non-null   float64
 2   3       4178 non-null   float64
 3   4       4164 non-null   float64
 4   5       4082 non-null   float64
 5   6       1902 non-null   float64
 6   7       3 non-null      float64
 7   8       3 non-null      float64
 8   9       2 non-null      float64
 9   10      1 non-null      float64
dtypes: float64(10)
memory usage: 361.2+ KB


In [16]:
#trips = trip_adherence.pivot_table(values = 'ADHERENCE', index = 'TRIP_IDS', columns = 'TRIP_EDGE').reset_index(drop = True)

In [17]:
#trip_adherence[(trip_adherence['TRIP_EDGE'] == 1) & (trip_adherence['ADHERENCE'] >= 0)]

In [18]:
#trip_adherence.info()

In [19]:
#trip_adherence = trip_adherence.copy()

In [20]:
#changing the dtype to string so that i can concat with the "CALENDAR ID" column
#trip_adherence['TRIP_ID'] = trip_adherence['TRIP_ID'].astype(str)


In [21]:
#changing the dtype to string so that i can concat with the "ROUTE ID" column
#trip_adherence['CALENDAR_ID'] = trip_adherence['CALENDAR_ID'].astype(str)


In [22]:
#creating a column that concats the two columns
#trip_adherence['TRIP_IDS'] = trip_adherence['TRIP_ID'] + trip_adherence['CALENDAR_ID']
#trip_adherence

In [23]:
#allows for each stop to be numbered in their respective route IDs
#trip_adherence['row_num'] = 1
#trip_adherence['stops'] = trip_adherence.groupby('TRIP_IDS')['row_num'].cumsum()

In [24]:
#trip_adherence


In [25]:
#trip_adherence['stops'] = trip_adherence['stops'].astype(str)

In [26]:
#creating a pivot table that will make the columns the stop #, the rows is each trip, and the values the adherence(amount of minutes late)
#trip_adherence_pivot = trip_adherence.pivot_table(values = 'ADHERENCE', index = 'TRIP_IDS', columns = 'stops').reset_index(drop = True)

In [27]:
#trip_adherence_pivot

In [28]:
#trip_adherence_pivot[trip_adherence_pivot[1] < 0]

In [29]:
#trip_adherence_pivot[trip_adherence_pivot[1] >= 0]

In [30]:
#the min amount of stops
#trip_adherence['stops'].min()

In [31]:
#the max amount of stops
#trip_adherence['stops'].max()

In [32]:
#Checking the total amount of stops
#trip_adherence['stops'].info()

In [33]:
#trip_adherence_pivot

In [34]:
#trip_adherence_pivot['Stops']

In [35]:
#start_on_time = trip_adherence_pivot[trip_adherence_pivot[1] == 0].reset_index()

In [36]:
#start_late = trip_adherence_pivot[trip_adherence_pivot[1] != 0].reset_index()

In [37]:
#start_late.iloc[0]

In [38]:
#wego.loc[
    #(wego['ROUTE_ABBR'] == 56) & (wego['STOPS']== 20)
    
    
#3]