In [1]:
import pandas as pd
pd.set_option("display.max_columns", 100)
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
wego = pd.read_csv('../data/headway_data_clean.csv')

In [3]:
# convert times to datetime64 objects, check dtype to confirm
wego['SCHEDULED_TIME'] = pd.to_datetime(wego['SCHEDULED_TIME'], format = '%Y-%m-%d %H:%M:%S')
wego['ACTUAL_ARRIVAL_TIME'] = pd.to_datetime(wego['ACTUAL_ARRIVAL_TIME'], format = '%Y-%m-%d %H:%M:%S')
wego['ACTUAL_DEPARTURE_TIME'] = pd.to_datetime(wego['ACTUAL_DEPARTURE_TIME'], format = '%Y-%m-%d %H:%M:%S')
wego['DATE'] = pd.to_datetime(wego['DATE'], format = "%Y/%m/%d")


In [4]:
#Create a column that gives each day of the week its on variable
wego['DAYS_OF_THE_WEEK'] = wego['DATE'].dt.weekday


**Question1: How much impact does being late or too spaced out at the first stop have downstream**



*Does being late to start a trip or too spaced out between buses have an affect on the remainder of the trip?*

In [5]:
#dropping all nan values in the adherence column
wego = wego.dropna(subset = 'ADHERENCE')

In [6]:
route_3_to_downtown = wego.loc[
    (wego['ROUTE_ABBR'] == 3) & (wego['ROUTE_DIRECTION_NAME']== 'TO DOWNTOWN')
]

In [7]:
route_3_to_downtown = route_3_to_downtown.copy()

In [8]:
#changing the dtype to string so that i can concat with the "CALENDAR ID" column
#changing the dtype to string so that i can concat with the "ROUTE ID" column
route_3_to_downtown['TRIP_ID'] = route_3_to_downtown['TRIP_ID'].astype(str)
route_3_to_downtown['CALENDAR_ID'] = route_3_to_downtown['CALENDAR_ID'].astype(str)

In [9]:
#creating a column that concats the two columns
route_3_to_downtown['TRIP_IDS'] = route_3_to_downtown['TRIP_ID'] + route_3_to_downtown['CALENDAR_ID']

In [10]:
#allows for each stop to be numbered in their respective route IDs
route_3_to_downtown['row_num'] = 1
route_3_to_downtown['stops'] = route_3_to_downtown.groupby('TRIP_IDS')['row_num'].cumsum()

In [11]:
#creating a pivot table that will make the columns the stop #, the rows is each trip, and the values the adherence(amount of minutes late)
route_3_to_downtown_pivot = route_3_to_downtown.pivot_table(values = 'ADHERENCE', index = 'TRIP_IDS', columns = 'stops')

In [12]:
#.reset_index(drop = True)

In [13]:
route_3_to_downtown_pivot

stops,1,2,3,4,5,6,7,8,9,10
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
346048120230801,-1.300000,-0.083333,0.416666,1.000000,4.100000,,,,,
346048120230802,-1.150000,-0.750000,-0.716666,0.150000,0.016666,,,,,
346048120230803,-18.716666,,,,,,,,,
346050120230801,-3.933333,-2.666666,-1.950000,-1.450000,1.533333,,,,,
346050120230802,0.966666,-2.150000,-1.650000,-1.683333,-0.033333,,,,,
...,...,...,...,...,...,...,...,...,...,...
351240120230925,-1.383333,-2.033333,0.616666,-1.833333,-5.600000,-1.300000,,,,
351240120230926,-5.850000,-3.483333,-2.033333,-0.666666,-0.133333,2.666666,,,,
351240120230927,-2.183333,-0.600000,-1.283333,-0.583333,-1.733333,-1.250000,,,,
351240120230928,-1.333333,-1.150000,-1.166666,-0.233333,-0.666666,-2.983333,,,,


In [14]:
route_3_new = route_3_to_downtown_pivot.iloc[:, 0:5].dropna()

In [15]:
route_3_new

stops,1,2,3,4,5
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
346048120230801,-1.300000,-0.083333,0.416666,1.000000,4.100000
346048120230802,-1.150000,-0.750000,-0.716666,0.150000,0.016666
346050120230801,-3.933333,-2.666666,-1.950000,-1.450000,1.533333
346050120230802,0.966666,-2.150000,-1.650000,-1.683333,-0.033333
346050120230803,-1.650000,-0.166666,-0.716666,0.516666,6.000000
...,...,...,...,...,...
351240120230925,-1.383333,-2.033333,0.616666,-1.833333,-5.600000
351240120230926,-5.850000,-3.483333,-2.033333,-0.666666,-0.133333
351240120230927,-2.183333,-0.600000,-1.283333,-0.583333,-1.733333
351240120230928,-1.333333,-1.150000,-1.166666,-0.233333,-0.666666


In [16]:
route_3_new[1].mean().round(2)

-2.82

In [17]:
route_3_new[5].mean().round(2)

-2.64

In [18]:
route_3_new[1].corr(route_3_new[5])

0.5004741699120517

In [19]:
route_3_to_downtown_pivot[1].corr(route_3_to_downtown_pivot[5])

0.5004741699120517

In [20]:
route_3_new = route_3_new.rename(columns = {1:"one", 2:'two', 3:'three', 4:'four', 5:"five"})

In [21]:
lm = smf.ols("five ~ one", data=route_3_new).fit()
lm.summary()

0,1,2,3
Dep. Variable:,five,R-squared:,0.25
Model:,OLS,Adj. R-squared:,0.25
Method:,Least Squares,F-statistic:,1363.0
Date:,"Tue, 14 Nov 2023",Prob (F-statistic):,9.15e-258
Time:,21:45:07,Log-Likelihood:,-12310.0
No. Observations:,4082,AIC:,24620.0
Df Residuals:,4080,BIC:,24640.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.5153,0.096,-5.351,0.000,-0.704,-0.327
one,0.7530,0.020,36.925,0.000,0.713,0.793

0,1,2,3
Omnibus:,4114.296,Durbin-Watson:,1.365
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1089936.058
Skew:,-4.345,Prob(JB):,0.0
Kurtosis:,82.578,Cond. No.,5.98


**On route 3 to downtown As you move from the first bus stop to the last one, the lateness tends to increase by approximately  units or **

In [22]:
#one coeff*(adherence value)-(intercept)
0.7530*(2)-0.5153

0.9907

**where y=mx+b, with y being arrival time... In order to be within 1 minute of arrival adherence(y), the bus needs to leave within x minutes of adherence from stop 1.**

**-1=(coeff one)x + intercept**

In [23]:
#-1 = (0.7530)(X)+(-0.5153)

In [24]:
(-0.5153-1)/(0.7530)

-2.0123505976095615

In [25]:
route_number = 3
route_direction = 'FROM DOWNTOWN'

route_df = wego.loc[
    (wego['ROUTE_ABBR'] == route_number) & (wego['ROUTE_DIRECTION_NAME']== route_direction)
]
route_df = route_df.copy()

In [26]:
#changing the dtype to string so that i can concat with the "CALENDAR ID" column
#changing the dtype to string so that i can concat with the "ROUTE ID" column

route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']
route_df['TRIP_ID'] = route_df['TRIP_ID'].astype(str)
route_df['CALENDAR_ID'] = route_df['CALENDAR_ID'].astype(str)
route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']

In [27]:
#allows for each stop to be numbered in their respective route IDs
route_df['row_num'] = 1
route_df['stops'] = route_df.groupby('TRIP_IDS')['row_num'].cumsum()

In [28]:
#creating a pivot table that will make the columns the stop #, the rows is each trip, and the values the adherence(amount of minutes late)
route_df_pivot = route_df.pivot_table(values = 'ADHERENCE', index = 'TRIP_IDS', columns = 'stops')

In [29]:
route_df_new = route_df_pivot.iloc[:, 0:5].dropna()

In [30]:
route_df_new[5].mean().round(2)

-5.81

In [31]:
route_df_new[1].mean().round(2)

-3.18

In [32]:
route_df_new[1].corr(route_df_new[5])

0.3829595646223628

In [33]:
route_df_new = route_df_new.rename(columns = {1:"one", 2:'two', 3:'three', 4:'four', 5:"five"})

In [34]:
lm = smf.ols("five ~ one", data=route_df_new).fit()
lm.summary()

0,1,2,3
Dep. Variable:,five,R-squared:,0.147
Model:,OLS,Adj. R-squared:,0.146
Method:,Least Squares,F-statistic:,704.5
Date:,"Tue, 14 Nov 2023",Prob (F-statistic):,2.23e-143
Time:,21:45:07,Log-Likelihood:,-13502.0
No. Observations:,4101,AIC:,27010.0
Df Residuals:,4099,BIC:,27020.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-3.2013,0.141,-22.649,0.000,-3.478,-2.924
one,0.8184,0.031,26.542,0.000,0.758,0.879

0,1,2,3
Omnibus:,3491.723,Durbin-Watson:,1.356
Prob(Omnibus):,0.0,Jarque-Bera (JB):,318025.061
Skew:,-3.554,Prob(JB):,0.0
Kurtosis:,45.552,Cond. No.,6.52


In [35]:
#one coeff*(adherence value)-(intercept)
0.8184*(-1)-(-3.2013)

2.3829

**On route 3 from downtown As you move from the first bus stop to the last one, the lateness tends to increase by approximately 0.8184 units or 49 seconds.**

***ROUTE 7 FROM DOWNTOWN***

In [36]:
wego['ROUTE_ABBR'].unique()

array([22, 23,  3,  7, 50, 52, 55, 56])

In [37]:
route_number = 7
route_direction = 'FROM DOWNTOWN'

route_df = wego.loc[
    (wego['ROUTE_ABBR'] == route_number) & (wego['ROUTE_DIRECTION_NAME']== route_direction)
]
route_df = route_df.copy()

In [38]:
#changing the dtype to string so that i can concat with the "CALENDAR ID" column
#changing the dtype to string so that i can concat with the "ROUTE ID" column

route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']
route_df['TRIP_ID'] = route_df['TRIP_ID'].astype(str)
route_df['CALENDAR_ID'] = route_df['CALENDAR_ID'].astype(str)
route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']

In [39]:
#allows for each stop to be numbered in their respective route IDs
route_df['row_num'] = 1
route_df['stops'] = route_df.groupby('TRIP_IDS')['row_num'].cumsum()

In [40]:
#creating a pivot table that will make the columns the stop #, the rows is each trip, and the values the adherence(amount of minutes late)
route_df_pivot = route_df.pivot_table(values = 'ADHERENCE', index = 'TRIP_IDS', columns = 'stops')

In [41]:
route_df

Unnamed: 0,CALENDAR_ID,SERVICE_ABBR,ADHERENCE_ID,DATE,ROUTE_ABBR,BLOCK_ABBR,OPERATOR,TRIP_ID,OVERLOAD_ID,ROUTE_DIRECTION_NAME,TIME_POINT_ABBR,ROUTE_STOP_SEQUENCE,TRIP_EDGE,LATITUDE,LONGITUDE,SCHEDULED_TIME,ACTUAL_ARRIVAL_TIME,ACTUAL_DEPARTURE_TIME,ADHERENCE,SCHEDULED_HDWY,ACTUAL_HDWY,HDWY_DEV,ADJUSTED_EARLY_COUNT,ADJUSTED_LATE_COUNT,ADJUSTED_ONTIME_COUNT,STOP_CANCELLED,PREV_SCHED_STOP_CANCELLED,IS_RELIEF,BLOCK_STOP_ORDER,DWELL_IN_MINS,DAYS_OF_THE_WEEK,TRIP_IDS,row_num,stops
6136,120230801,1,99465617,2023-08-01,7,700,3144,348421,0,FROM DOWNTOWN,HBHS,5.0,2,36.107575,-86.812719,2023-08-01 05:40:00,2023-08-01 05:09:44,2023-08-01 05:09:44,30.266666,,,,0,0,1,0,,0,27,0.000000,1,348421120230801,1,1
6140,120230801,1,99465621,2023-08-01,7,700,3144,348423,0,FROM DOWNTOWN,MCC5_9,3.0,1,36.167091,-86.781923,2023-08-01 06:15:00,2023-08-01 05:20:00,2023-08-01 05:27:29,47.516666,10.0,,,1,0,0,0,0.0,0,50,7.483333,1,348423120230801,1,1
6141,120230801,1,99465622,2023-08-01,7,700,3144,348423,0,FROM DOWNTOWN,21BK,4.0,0,36.138881,-86.800622,2023-08-01 06:30:00,2023-08-01 05:34:24,2023-08-01 05:34:24,55.600000,10.0,,,1,0,0,0,0.0,0,61,0.000000,1,348423120230801,1,2
6142,120230801,1,99465623,2023-08-01,7,700,3144,348423,0,FROM DOWNTOWN,HBHS,5.0,2,36.107575,-86.812719,2023-08-01 06:42:00,2023-08-01 05:42:03,2023-08-01 05:42:03,59.950000,,,,1,0,0,0,,0,75,0.000000,1,348423120230801,1,3
6146,120230801,1,99465627,2023-08-01,7,700,3144,348425,0,FROM DOWNTOWN,MCC5_9,3.0,1,36.167091,-86.781923,2023-08-01 07:30:00,2023-08-01 06:07:11,2023-08-01 06:23:28,66.533333,15.0,18.016666,3.016666,1,0,0,0,0.0,0,98,16.283333,1,348425120230801,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350319,120230930,2,100406606,2023-09-30,7,702,2312,353447,0,FROM DOWNTOWN,MCC5_9,3.0,1,36.167091,-86.781923,2023-09-30 21:45:00,2023-09-30 21:43:52,2023-09-30 21:50:54,-5.900000,30.0,36.716666,6.716666,0,0,1,0,0.0,0,504,7.033333,5,353447120230930,1,1
350321,120230930,2,100406608,2023-09-30,7,702,2312,353447,0,FROM DOWNTOWN,HBHS,5.0,2,36.107575,-86.812719,2023-09-30 22:11:00,2023-09-30 22:22:39,2023-09-30 22:22:39,-11.650000,,,,0,1,0,0,,0,529,0.000000,5,353447120230930,1,2
350325,120230930,2,100406612,2023-09-30,7,702,2312,353449,0,FROM DOWNTOWN,MCC5_9,3.0,1,36.167091,-86.781923,2023-09-30 22:45:00,2023-09-30 22:49:19,2023-09-30 22:49:19,-4.316666,30.0,31.866666,1.866666,0,0,1,0,0.0,0,552,0.000000,5,353449120230930,1,1
350326,120230930,2,100406613,2023-09-30,7,702,2312,353449,0,FROM DOWNTOWN,21BK,4.0,0,36.138881,-86.800622,2023-09-30 22:59:00,2023-09-30 23:21:05,2023-09-30 23:21:05,-22.083333,30.0,47.500000,17.500000,0,1,0,0,0.0,0,563,0.000000,5,353449120230930,1,2


In [42]:
route_df_new = route_df_pivot.iloc[:, 0:3].dropna()


In [43]:
route_df_new[1].mean().round(2)

-2.62

In [44]:
route_df_new[3].mean().round(2)

-4.19

In [45]:
route_df_new[1].corr(route_df_new[3])

0.6616917036779562

In [46]:
route_df_new = route_df_new.rename(columns = {1:"one", 2:'two', 3:'three'})

In [47]:
lm = smf.ols("three ~ one", data=route_df_new).fit()
lm.summary()

0,1,2,3
Dep. Variable:,three,R-squared:,0.438
Model:,OLS,Adj. R-squared:,0.438
Method:,Least Squares,F-statistic:,2317.0
Date:,"Tue, 14 Nov 2023",Prob (F-statistic):,0.0
Time:,21:45:08,Log-Likelihood:,-9503.5
No. Observations:,2977,AIC:,19010.0
Df Residuals:,2975,BIC:,19020.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1.5649,0.121,-12.933,0.000,-1.802,-1.328
one,1.0030,0.021,48.136,0.000,0.962,1.044

0,1,2,3
Omnibus:,1556.017,Durbin-Watson:,1.532
Prob(Omnibus):,0.0,Jarque-Bera (JB):,29208.58
Skew:,-2.047,Prob(JB):,0.0
Kurtosis:,17.789,Cond. No.,6.55


In [48]:
(-1) - (-1.5649)/(1.0030)

0.560219341974078

In [49]:
#y = -1.5649 + 1.0030 * x

In [50]:
#-1.5649 + 1.0030

**On route 7 from downtown As you move from the first bus stop to the last one, the lateness tends to increase by approximately **

**ROUTE 7 TO DOWNTOWN**

In [51]:
route_number = 7
route_direction = 'TO DOWNTOWN'

route_df = wego.loc[
    (wego['ROUTE_ABBR'] == route_number) & (wego['ROUTE_DIRECTION_NAME']== route_direction)
]
route_df = route_df.copy()

In [52]:
#changing the dtype to string so that i can concat with the "CALENDAR ID" column
#changing the dtype to string so that i can concat with the "ROUTE ID" column

route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']
route_df['TRIP_ID'] = route_df['TRIP_ID'].astype(str)
route_df['CALENDAR_ID'] = route_df['CALENDAR_ID'].astype(str)
route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']

In [53]:
#allows for each stop to be numbered in their respective route IDs
route_df['row_num'] = 1
route_df['stops'] = route_df.groupby('TRIP_IDS')['row_num'].cumsum()

In [54]:
#creating a pivot table that will make the columns the stop #, the rows is each trip, and the values the adherence(amount of minutes late)
route_df_pivot = route_df.pivot_table(values = 'ADHERENCE', index = 'TRIP_IDS', columns = 'stops')

In [55]:
route_df_new = route_df_pivot.iloc[:, 0:3].dropna()

In [56]:
route_df_new[1].mean().round(2)

-2.19

In [57]:
route_df_new[3].mean().round(2)

-2.34

In [58]:
route_df_new[1].corr(route_df_new[3])

0.6279235315819696

In [59]:
route_df_new = route_df_new.rename(columns = {1:"one", 2:'two', 3:'three'})

In [60]:
lm = smf.ols("three ~ one", data=route_df_new).fit()
lm.summary()

0,1,2,3
Dep. Variable:,three,R-squared:,0.394
Model:,OLS,Adj. R-squared:,0.394
Method:,Least Squares,F-statistic:,1560.0
Date:,"Tue, 14 Nov 2023",Prob (F-statistic):,2.8799999999999998e-263
Time:,21:45:08,Log-Likelihood:,-7635.7
No. Observations:,2399,AIC:,15280.0
Df Residuals:,2397,BIC:,15290.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.3846,0.129,-2.980,0.003,-0.638,-0.132
one,0.8931,0.023,39.501,0.000,0.849,0.937

0,1,2,3
Omnibus:,1418.509,Durbin-Watson:,1.178
Prob(Omnibus):,0.0,Jarque-Bera (JB):,40534.66
Skew:,-2.274,Prob(JB):,0.0
Kurtosis:,22.617,Cond. No.,6.21


In [61]:
(-1) - (-0.3846)/(0.8931)

-0.56936513268391

**On route 7 to downtown As you move from the first bus stop to the last one, the lateness tends to increase by approximately **

**ROUTE 22 TO DOWTOWN**

In [62]:
route_number = 22
route_direction = 'TO DOWNTOWN'

route_df = wego.loc[
    (wego['ROUTE_ABBR'] == route_number) & (wego['ROUTE_DIRECTION_NAME']== route_direction)
]
route_df = route_df.copy()

In [63]:
#changing the dtype to string so that i can concat with the "CALENDAR ID" column
#changing the dtype to string so that i can concat with the "ROUTE ID" column

route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']
route_df['TRIP_ID'] = route_df['TRIP_ID'].astype(str)
route_df['CALENDAR_ID'] = route_df['CALENDAR_ID'].astype(str)
route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']

In [64]:
#allows for each stop to be numbered in their respective route IDs
route_df['row_num'] = 1
route_df['stops'] = route_df.groupby('TRIP_IDS')['row_num'].cumsum()

In [65]:
#creating a pivot table that will make the columns the stop #, the rows is each trip, and the values the adherence(amount of minutes late)
route_df_pivot = route_df.pivot_table(values = 'ADHERENCE', index = 'TRIP_IDS', columns = 'stops')

In [66]:
wego[wego['ROUTE_ABBR']==22].head(20)

Unnamed: 0,CALENDAR_ID,SERVICE_ABBR,ADHERENCE_ID,DATE,ROUTE_ABBR,BLOCK_ABBR,OPERATOR,TRIP_ID,OVERLOAD_ID,ROUTE_DIRECTION_NAME,TIME_POINT_ABBR,ROUTE_STOP_SEQUENCE,TRIP_EDGE,LATITUDE,LONGITUDE,SCHEDULED_TIME,ACTUAL_ARRIVAL_TIME,ACTUAL_DEPARTURE_TIME,ADHERENCE,SCHEDULED_HDWY,ACTUAL_HDWY,HDWY_DEV,ADJUSTED_EARLY_COUNT,ADJUSTED_LATE_COUNT,ADJUSTED_ONTIME_COUNT,STOP_CANCELLED,PREV_SCHED_STOP_CANCELLED,IS_RELIEF,BLOCK_STOP_ORDER,DWELL_IN_MINS,DAYS_OF_THE_WEEK
0,120230801,1,99457890,2023-08-01,22,2200,1040,345104,0,TO DOWNTOWN,MHSP,14.0,1,36.181248,-86.847705,2023-08-01 04:42:00,2023-08-01 04:37:38,2023-08-01 04:44:08,-2.133333,,,,0,0,1,0,0.0,0,2,6.5,1
1,120230801,1,99457891,2023-08-01,22,2200,1040,345104,0,TO DOWNTOWN,ELIZ,10.0,0,36.193454,-86.839981,2023-08-01 04:46:00,2023-08-01 04:48:27,2023-08-01 04:48:27,-2.45,,,,0,0,1,0,0.0,0,9,0.0,1
2,120230801,1,99457892,2023-08-01,22,2200,1040,345104,0,TO DOWNTOWN,CV23,5.0,0,36.182177,-86.814445,2023-08-01 04:54:00,2023-08-01 04:54:56,2023-08-01 04:54:56,-0.933333,,,,0,0,1,0,0.0,0,19,0.0,1
3,120230801,1,99457893,2023-08-01,22,2200,1040,345104,0,TO DOWNTOWN,MCC5_10,1.0,2,36.167091,-86.781923,2023-08-01 05:10:00,2023-08-01 05:03:43,2023-08-01 05:03:43,6.283333,,,,0,0,1,0,,0,35,0.0,1
4,120230801,1,99457894,2023-08-01,22,2200,1040,345105,0,FROM DOWNTOWN,MCC5_10,1.0,1,36.167091,-86.781923,2023-08-01 05:15:00,2023-08-01 05:03:43,2023-08-01 05:16:35,-1.583333,,,,0,0,1,0,0.0,0,36,12.866666,1
5,120230801,1,99457895,2023-08-01,22,2200,1040,345105,0,FROM DOWNTOWN,CV23,5.0,0,36.18348,-86.81422,2023-08-01 05:25:00,2023-08-01 05:24:03,2023-08-01 05:24:03,0.95,,,,0,0,1,0,0.0,0,51,0.0,1
6,120230801,1,99457896,2023-08-01,22,2200,1040,345105,0,FROM DOWNTOWN,MHSP,14.0,2,36.181248,-86.847705,2023-08-01 05:32:00,2023-08-01 05:30:30,2023-08-01 05:30:30,1.5,,,,0,0,1,0,,0,62,0.0,1
7,120230801,1,99457897,2023-08-01,22,2200,1040,345106,0,TO DOWNTOWN,MHSP,14.0,1,36.181248,-86.847705,2023-08-01 05:42:00,2023-08-01 05:30:30,2023-08-01 05:43:43,-1.716666,35.0,37.666666,2.666666,0,0,1,0,0.0,0,63,13.216666,1
8,120230801,1,99457898,2023-08-01,22,2200,1040,345106,0,TO DOWNTOWN,ELIZ,10.0,0,36.193454,-86.839981,2023-08-01 05:46:00,2023-08-01 05:47:55,2023-08-01 05:47:55,-1.916666,35.0,36.966666,1.966666,0,0,1,0,0.0,0,70,0.0,1
9,120230801,1,99457899,2023-08-01,22,2200,1040,345106,0,TO DOWNTOWN,CV23,5.0,0,36.182177,-86.814445,2023-08-01 05:54:00,2023-08-01 05:54:11,2023-08-01 05:54:11,-0.183333,15.0,14.516666,-0.483334,0,0,1,0,0.0,0,80,0.0,1


In [67]:
route_df_new = route_df_pivot.iloc[:, 0:3].dropna()

In [68]:
route_df_new[1].mean().round(2)

-1.07

In [69]:
route_df_new[3].mean().round(2)

2.27

In [70]:
route_df_new_4 = route_df_pivot.iloc[:, 0:4].dropna()

In [71]:
route_df_new[1].corr(route_df_new[3])

0.33794273617741843

In [72]:
 route_df_new_4[1].corr(route_df_new_4[4])

0.09990982527956851

In [73]:
route_df_new = route_df_new.rename(columns = {1:"one", 2:'two', 3:'three'})

In [74]:
route_df_new_4 = route_df_new_4.rename(columns = {1:"one", 2:'two', 3:'three', 4:'four'})

In [75]:
lm = smf.ols("three ~ one", data=route_df_new).fit()
lm.summary()

0,1,2,3
Dep. Variable:,three,R-squared:,0.114
Model:,OLS,Adj. R-squared:,0.114
Method:,Least Squares,F-statistic:,475.5
Date:,"Tue, 14 Nov 2023",Prob (F-statistic):,2.96e-99
Time:,21:45:08,Log-Likelihood:,-10366.0
No. Observations:,3690,AIC:,20740.0
Df Residuals:,3688,BIC:,20750.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.8366,0.071,39.923,0.000,2.697,2.976
one,0.5276,0.024,21.806,0.000,0.480,0.575

0,1,2,3
Omnibus:,5502.476,Durbin-Watson:,1.432
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7857204.019
Skew:,-8.499,Prob(JB):,0.0
Kurtosis:,228.421,Cond. No.,3.21


**On route 22 to downtown As you move from the first bus stop to the last one, the lateness tends to increase by approximately .This is when the bus has only three stops**

In [76]:
lm = smf.ols("four ~ one", data=route_df_new_4).fit()
lm.summary()

0,1,2,3
Dep. Variable:,four,R-squared:,0.01
Model:,OLS,Adj. R-squared:,0.008
Method:,Least Squares,F-statistic:,6.342
Date:,"Tue, 14 Nov 2023",Prob (F-statistic):,0.012
Time:,21:45:08,Log-Likelihood:,-2486.1
No. Observations:,631,AIC:,4976.0
Df Residuals:,629,BIC:,4985.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.1214,0.528,2.123,0.034,0.084,2.159
one,0.4951,0.197,2.518,0.012,0.109,0.881

0,1,2,3
Omnibus:,1506.474,Durbin-Watson:,1.84
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6809779.127
Skew:,-21.398,Prob(JB):,0.0
Kurtosis:,510.127,Cond. No.,2.92


**On route 22 to downtown As you move from the first bus stop to the last one, the lateness tends to increase by approximately . This is when the bus has only four stops**

**ROUTE 22 FROM DOWNTOWN**

In [77]:
route_number = 22
route_direction = 'FROM DOWNTOWN'

route_df = wego.loc[
    (wego['ROUTE_ABBR'] == route_number) & (wego['ROUTE_DIRECTION_NAME']== route_direction)
]
route_df = route_df.copy()

In [78]:
#changing the dtype to string so that i can concat with the "CALENDAR ID" column
#changing the dtype to string so that i can concat with the "ROUTE ID" column

route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']
route_df['TRIP_ID'] = route_df['TRIP_ID'].astype(str)
route_df['CALENDAR_ID'] = route_df['CALENDAR_ID'].astype(str)
route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']

In [79]:
#allows for each stop to be numbered in their respective route IDs
route_df['row_num'] = 1
route_df['stops'] = route_df.groupby('TRIP_IDS')['row_num'].cumsum()

In [80]:
#creating a pivot table that will make the columns the stop #, the rows is each trip, and the values the adherence(amount of minutes late)
route_df_pivot = route_df.pivot_table(values = 'ADHERENCE', index = 'TRIP_IDS', columns = 'stops')

In [81]:
route_df_new = route_df_pivot.iloc[:, 0:3].dropna()

In [82]:
route_df_new[1].mean().round(2)

-2.19

In [83]:
route_df_new[3].mean().round(2)

-0.42

In [84]:
route_df_new[1].corr(route_df_new[3])

0.5094255159904719

In [85]:
route_df_new = route_df_new.rename(columns = {1:"one", 2:'two', 3:'three'})

In [86]:
lm = smf.ols("three ~ one", data=route_df_new).fit()
lm.summary()

0,1,2,3
Dep. Variable:,three,R-squared:,0.26
Model:,OLS,Adj. R-squared:,0.259
Method:,Least Squares,F-statistic:,1277.0
Date:,"Tue, 14 Nov 2023",Prob (F-statistic):,4.05e-240
Time:,21:45:08,Log-Likelihood:,-10453.0
No. Observations:,3647,AIC:,20910.0
Df Residuals:,3645,BIC:,20920.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.4205,0.087,16.279,0.000,1.249,1.592
one,0.8412,0.024,35.741,0.000,0.795,0.887

0,1,2,3
Omnibus:,4487.716,Durbin-Watson:,1.184
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3832327.891
Skew:,-5.861,Prob(JB):,0.0
Kurtosis:,161.374,Cond. No.,4.72


**On route 22 from downtown As you move from the first bus stop to the last one, the lateness tends to increase by approximately .**

**ROUTE 23 FROM DOWNTOWN**

In [87]:
route_number = 23
route_direction = 'FROM DOWNTOWN'

route_df = wego.loc[
    (wego['ROUTE_ABBR'] == route_number) & (wego['ROUTE_DIRECTION_NAME']== route_direction)
]
route_df = route_df.copy()

In [88]:
#changing the dtype to string so that i can concat with the "CALENDAR ID" column
#changing the dtype to string so that i can concat with the "ROUTE ID" column

route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']
route_df['TRIP_ID'] = route_df['TRIP_ID'].astype(str)
route_df['CALENDAR_ID'] = route_df['CALENDAR_ID'].astype(str)
route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']

In [89]:
#allows for each stop to be numbered in their respective route IDs
route_df['row_num'] = 1
route_df['stops'] = route_df.groupby('TRIP_IDS')['row_num'].cumsum()

In [90]:
#creating a pivot table that will make the columns the stop #, the rows is each trip, and the values the adherence(amount of minutes late)
route_df_pivot = route_df.pivot_table(values = 'ADHERENCE', index = 'TRIP_IDS', columns = 'stops')
route_df_pivot

stops,1,2,3,4,5,6,7,8,9,10
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
345435120230801,-3.233333,-2.000000,1.333333,,,,,,,
345435120230802,-2.250000,-1.400000,1.150000,,,,,,,
345435120230803,-2.600000,-1.716666,0.933333,,,,,,,
345437120230801,-5.016666,-4.516666,-5.050000,-0.533333,,,,,,
345437120230802,-3.533333,-3.883333,-5.183333,1.383333,,,,,,
...,...,...,...,...,...,...,...,...,...,...
350603120230903,-1.316666,-0.216666,0.366666,-1.816666,-0.533333,-1.033333,0.400000,,,
350603120230904,-1.500000,-1.516666,-1.950000,-0.050000,-2.066666,-5.316666,-8.566666,,,
350603120230910,-0.666666,-0.983333,-1.383333,1.000000,-0.300000,-1.233333,-1.416666,,,
350603120230917,-1.200000,-1.300000,-1.283333,-0.316666,-0.716666,1.133333,,,,


In [91]:
route_df_new = route_df_pivot.iloc[:, 0:4].dropna()
route_df_new

stops,1,2,3,4
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
345437120230801,-5.016666,-4.516666,-5.050000,-0.533333
345437120230802,-3.533333,-3.883333,-5.183333,1.383333
345437120230803,-3.833333,-3.016666,-1.466666,2.416666
345439120230801,-5.550000,-4.900000,-4.616666,-0.033333
345439120230802,-3.700000,-2.283333,-2.050000,2.650000
...,...,...,...,...
350603120230903,-1.316666,-0.216666,0.366666,-1.816666
350603120230904,-1.500000,-1.516666,-1.950000,-0.050000
350603120230910,-0.666666,-0.983333,-1.383333,1.000000
350603120230917,-1.200000,-1.300000,-1.283333,-0.316666


In [92]:
route_df_new[1].mean().round(2)

-2.48

In [93]:
route_df_new[4].mean().round(2)

-0.99

In [94]:
route_df_new[1].corr(route_df_new[4])

0.4210687531874183

In [95]:
route_df_new = route_df_new.rename(columns = {1:"one", 2:'two', 3:'three', 4:'four'})

In [96]:
lm = smf.ols("four ~ one", data=route_df_new).fit()
lm.summary()

0,1,2,3
Dep. Variable:,four,R-squared:,0.177
Model:,OLS,Adj. R-squared:,0.177
Method:,Least Squares,F-statistic:,854.5
Date:,"Tue, 14 Nov 2023",Prob (F-statistic):,2.7899999999999998e-170
Time:,21:45:08,Log-Likelihood:,-11640.0
No. Observations:,3967,AIC:,23280.0
Df Residuals:,3965,BIC:,23300.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.8166,0.095,8.591,0.000,0.630,1.003
one,0.7272,0.025,29.232,0.000,0.678,0.776

0,1,2,3
Omnibus:,2834.794,Durbin-Watson:,1.268
Prob(Omnibus):,0.0,Jarque-Bera (JB):,478289.542
Skew:,-2.461,Prob(JB):,0.0
Kurtosis:,56.567,Cond. No.,5.18


**On route 23 from downtown As you move from the first bus stop to the last one, the lateness tends to increase by approximately .**

**ROUTE 23 TO DOWNTOWN**

In [97]:
route_number = 23
route_direction = 'TO DOWNTOWN'

route_df = wego.loc[
    (wego['ROUTE_ABBR'] == route_number) & (wego['ROUTE_DIRECTION_NAME']== route_direction)
]
route_df = route_df.copy()

In [98]:
#changing the dtype to string so that i can concat with the "CALENDAR ID" column
#changing the dtype to string so that i can concat with the "ROUTE ID" column

route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']
route_df['TRIP_ID'] = route_df['TRIP_ID'].astype(str)
route_df['CALENDAR_ID'] = route_df['CALENDAR_ID'].astype(str)
route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']

In [99]:
#allows for each stop to be numbered in their respective route IDs
route_df['row_num'] = 1
route_df['stops'] = route_df.groupby('TRIP_IDS')['row_num'].cumsum()


In [100]:
#creating a pivot table that will make the columns the stop #, the rows is each trip, and the values the adherence(amount of minutes late)
route_df_pivot = route_df.pivot_table(values = 'ADHERENCE', index = 'TRIP_IDS', columns = 'stops')
route_df_pivot

stops,1,2,3,4,5,6
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
345436120230801,-6.100000,-2.266666,-3.433333,2.050000,,
345436120230802,-6.200000,-2.933333,-3.450000,0.533333,,
345436120230803,-3.950000,0.683333,0.483333,6.066666,,
345438120230801,-7.266666,0.016666,0.750000,6.000000,,
345438120230802,-5.083333,0.300000,0.700000,3.983333,,
...,...,...,...,...,...,...
350602120230903,-0.633333,-1.350000,-2.916666,3.816666,,
350602120230904,-0.733333,-3.100000,-3.700000,0.583333,,
350602120230910,-0.816666,-0.216666,0.600000,4.033333,,
350602120230917,-0.683333,0.266666,-1.166666,3.833333,,


In [101]:
route_df_new = route_df_pivot.iloc[:, 0:4].dropna()
route_df_new

stops,1,2,3,4
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
345436120230801,-6.100000,-2.266666,-3.433333,2.050000
345436120230802,-6.200000,-2.933333,-3.450000,0.533333
345436120230803,-3.950000,0.683333,0.483333,6.066666
345438120230801,-7.266666,0.016666,0.750000,6.000000
345438120230802,-5.083333,0.300000,0.700000,3.983333
...,...,...,...,...
350602120230903,-0.633333,-1.350000,-2.916666,3.816666
350602120230904,-0.733333,-3.100000,-3.700000,0.583333
350602120230910,-0.816666,-0.216666,0.600000,4.033333
350602120230917,-0.683333,0.266666,-1.166666,3.833333


In [102]:
route_df_new[1].mean().round(2)

-1.96

In [103]:
route_df_new[4].mean().round(2)

2.19

In [104]:
route_df_new[1].corr(route_df_new[4])

0.3598073185926023

In [105]:
route_df_new = route_df_new.rename(columns = {1:"one", 2:'two', 3:'three', 4:'four'})

In [106]:
lm = smf.ols("four ~ one", data=route_df_new).fit()
lm.summary()

0,1,2,3
Dep. Variable:,four,R-squared:,0.129
Model:,OLS,Adj. R-squared:,0.129
Method:,Least Squares,F-statistic:,577.6
Date:,"Tue, 14 Nov 2023",Prob (F-statistic):,4.1599999999999996e-119
Time:,21:45:08,Log-Likelihood:,-10087.0
No. Observations:,3886,AIC:,20180.0
Df Residuals:,3884,BIC:,20190.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.0471,0.063,48.246,0.000,2.923,3.171
one,0.4387,0.018,24.033,0.000,0.403,0.474

0,1,2,3
Omnibus:,1906.316,Durbin-Watson:,1.389
Prob(Omnibus):,0.0,Jarque-Bera (JB):,41097.041
Skew:,-1.841,Prob(JB):,0.0
Kurtosis:,18.5,Cond. No.,4.32


**On route 23 to downtown As you move from the first bus stop to the last one, the lateness tends to increase by approximately .**

**ROUTE 50 TO DOWNTOWN**

In [107]:
route_number = 50
route_direction = 'TO DOWNTOWN'

route_df = wego.loc[
    (wego['ROUTE_ABBR'] == route_number) & (wego['ROUTE_DIRECTION_NAME']== route_direction)
]
route_df = route_df.copy()

In [108]:
#changing the dtype to string so that i can concat with the "CALENDAR ID" column
#changing the dtype to string so that i can concat with the "ROUTE ID" column

route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']
route_df['TRIP_ID'] = route_df['TRIP_ID'].astype(str)
route_df['CALENDAR_ID'] = route_df['CALENDAR_ID'].astype(str)
route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']

In [109]:
#allows for each stop to be numbered in their respective route IDs
route_df['row_num'] = 1
route_df['stops'] = route_df.groupby('TRIP_IDS')['row_num'].cumsum()

In [110]:
#creating a pivot table that will make the columns the stop #, the rows is each trip, and the values the adherence(amount of minutes late)
route_df_pivot = route_df.pivot_table(values = 'ADHERENCE', index = 'TRIP_IDS', columns = 'stops')
route_df_pivot

stops,1,2,3,4,5,6,7,8,9
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
346804120230801,-1.316666,-0.400000,-0.466666,0.266666,-0.200000,4.783333,,,
346804120230802,-1.116666,-0.466666,-0.916666,-0.583333,-0.483333,3.650000,,,
346804120230803,-0.916666,-0.900000,-1.916666,-1.083333,-0.283333,3.283333,,,
346806120230801,-2.466666,-0.633333,-1.483333,-1.516666,-0.266666,3.966666,,,
346806120230802,-0.700000,-0.483333,-0.366666,-0.166666,-0.133333,2.433333,,,
...,...,...,...,...,...,...,...,...,...
354098120230925,0.950000,7.666666,,,,,,,
354098120230926,-1.733333,-480.866666,,,,,,,
354098120230927,-1.100000,,,,,,,,
354098120230928,-1.066666,-0.966666,,,,,,,


In [111]:
route_df_new = route_df_pivot.iloc[:, 0:6].dropna()
route_df_new

stops,1,2,3,4,5,6
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
346804120230801,-1.316666,-0.400000,-0.466666,0.266666,-0.200000,4.783333
346804120230802,-1.116666,-0.466666,-0.916666,-0.583333,-0.483333,3.650000
346804120230803,-0.916666,-0.900000,-1.916666,-1.083333,-0.283333,3.283333
346806120230801,-2.466666,-0.633333,-1.483333,-1.516666,-0.266666,3.966666
346806120230802,-0.700000,-0.483333,-0.366666,-0.166666,-0.133333,2.433333
...,...,...,...,...,...,...
351939120230925,-2.116666,-0.866666,0.083333,0.866666,0.883333,0.000000
351939120230926,-0.616666,-4.200000,-3.300000,-1.966666,-1.050000,-1.150000
351939120230927,-14.916666,-16.133333,-16.266666,-14.466666,-13.800000,-9.900000
351939120230928,-3.800000,-4.133333,-2.666666,-2.166666,-2.250000,-3.750000


In [112]:
route_df_new[1].mean().round(2)

-2.24

In [113]:
route_df_new[6].mean().round(2)

-0.49

In [114]:
route_df_new[1].corr(route_df_new[6])

0.41171053524783435

In [115]:
route_df_new = route_df_new.rename(columns = {1:"one", 2:'two', 3:'three', 4:'four', 5:'five', 6:'six'})

In [116]:
lm = smf.ols("six ~ one", data=route_df_new).fit()
lm.summary()

0,1,2,3
Dep. Variable:,six,R-squared:,0.17
Model:,OLS,Adj. R-squared:,0.169
Method:,Least Squares,F-statistic:,711.3
Date:,"Tue, 14 Nov 2023",Prob (F-statistic):,9.12e-143
Time:,21:45:08,Log-Likelihood:,-10228.0
No. Observations:,3487,AIC:,20460.0
Df Residuals:,3485,BIC:,20470.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.1209,0.098,11.442,0.000,0.929,1.313
one,0.7214,0.027,26.670,0.000,0.668,0.774

0,1,2,3
Omnibus:,2434.132,Durbin-Watson:,1.359
Prob(Omnibus):,0.0,Jarque-Bera (JB):,150617.11
Skew:,-2.667,Prob(JB):,0.0
Kurtosis:,34.752,Cond. No.,4.75


**On route 50 to downtown As you move from the first bus stop to the last one, the lateness tends to increase by approximately 0.7214 units or 43 seconds.**

**ROUTE 50 FROM DOWNTOWN**

In [117]:
route_number = 50
route_direction = 'FROM DOWNTOWN'

route_df = wego.loc[
    (wego['ROUTE_ABBR'] == route_number) & (wego['ROUTE_DIRECTION_NAME']== route_direction)
]
route_df = route_df.copy()

In [118]:
#changing the dtype to string so that i can concat with the "CALENDAR ID" column
#changing the dtype to string so that i can concat with the "ROUTE ID" column

route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']
route_df['TRIP_ID'] = route_df['TRIP_ID'].astype(str)
route_df['CALENDAR_ID'] = route_df['CALENDAR_ID'].astype(str)
route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']

In [119]:
#allows for each stop to be numbered in their respective route IDs
route_df['row_num'] = 1
route_df['stops'] = route_df.groupby('TRIP_IDS')['row_num'].cumsum()

In [120]:
#creating a pivot table that will make the columns the stop #, the rows is each trip, and the values the adherence(amount of minutes late)
route_df_pivot = route_df.pivot_table(values = 'ADHERENCE', index = 'TRIP_IDS', columns = 'stops')
route_df_pivot

stops,1,2,3,4,5,6,7,8,9,10,11,12
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
346805120230801,-2.700000,-2.383333,-1.750000,-1.900000,-1.933333,1.566666,,,,,,
346805120230802,-2.183333,-0.616666,-0.200000,-0.216666,-0.583333,3.016666,,,,,,
346805120230803,-2.883333,-1.500000,-0.383333,-0.400000,-0.366666,3.416666,,,,,,
346807120230801,-3.683333,-3.666666,-4.466666,-4.233333,-3.166666,-2.066666,,,,,,
346807120230802,-2.133333,-0.383333,-0.233333,-0.766666,-0.316666,2.316666,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
353986120230925,-8.966666,-5.966666,,,,,,,,,,
353986120230926,-12.083333,,,,,,,,,,,
353986120230927,-15.133333,,,,,,,,,,,
353986120230928,-17.866666,,,,,,,,,,,


In [121]:
route_df_new = route_df_pivot.iloc[:, 0:6].dropna()
route_df_new

stops,1,2,3,4,5,6
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
346805120230801,-2.700000,-2.383333,-1.750000,-1.900000,-1.933333,1.566666
346805120230802,-2.183333,-0.616666,-0.200000,-0.216666,-0.583333,3.016666
346805120230803,-2.883333,-1.500000,-0.383333,-0.400000,-0.366666,3.416666
346807120230801,-3.683333,-3.666666,-4.466666,-4.233333,-3.166666,-2.066666
346807120230802,-2.133333,-0.383333,-0.233333,-0.766666,-0.316666,2.316666
...,...,...,...,...,...,...
351940120230921,-3.966666,-2.683333,-3.983333,-3.450000,-2.550000,-0.683333
351940120230925,-1.233333,-4.283333,-5.083333,-5.900000,-4.783333,-4.350000
351940120230926,-3.866666,-2.900000,-4.316666,-6.816666,-9.000000,-6.766666
351940120230927,-4.116666,-1.933333,-4.016666,-5.766666,-4.366666,-1.466666


In [122]:
route_df_new[1].mean().round(2)

-3.08

In [123]:
route_df_new[6].mean().round(2)

-0.59

In [124]:
route_df_new[1].corr(route_df_new[6])

0.49250134631881737

In [125]:
route_df_new = route_df_new.rename(columns = {1:"one", 2:'two', 3:'three', 4:'four', 5:'five', 6:'six'})

In [126]:
lm = smf.ols("six ~ one", data=route_df_new).fit()
lm.summary()

0,1,2,3
Dep. Variable:,six,R-squared:,0.243
Model:,OLS,Adj. R-squared:,0.242
Method:,Least Squares,F-statistic:,1120.0
Date:,"Tue, 14 Nov 2023",Prob (F-statistic):,3.4700000000000004e-213
Time:,21:45:08,Log-Likelihood:,-9991.7
No. Observations:,3498,AIC:,19990.0
Df Residuals:,3496,BIC:,20000.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.8447,0.102,18.126,0.000,1.645,2.044
one,0.7890,0.024,33.459,0.000,0.743,0.835

0,1,2,3
Omnibus:,3632.306,Durbin-Watson:,1.489
Prob(Omnibus):,0.0,Jarque-Bera (JB):,966182.533
Skew:,-4.574,Prob(JB):,0.0
Kurtosis:,83.903,Cond. No.,6.34


**On route 50 from downtown As you move from the first bus stop to the last one, the lateness tends to increase by approximately .**

**ROUTE 52 TO DOWNTOWN**

In [127]:
route_number = 52
route_direction = 'TO DOWNTOWN'

route_df = wego.loc[
    (wego['ROUTE_ABBR'] == route_number) & (wego['ROUTE_DIRECTION_NAME']== route_direction)
]
route_df = route_df.copy()

In [128]:
#changing the dtype to string so that i can concat with the "CALENDAR ID" column
#changing the dtype to string so that i can concat with the "ROUTE ID" column

route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']
route_df['TRIP_ID'] = route_df['TRIP_ID'].astype(str)
route_df['CALENDAR_ID'] = route_df['CALENDAR_ID'].astype(str)
route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']

In [129]:
#allows for each stop to be numbered in their respective route IDs
route_df['row_num'] = 1
route_df['stops'] = route_df.groupby('TRIP_IDS')['row_num'].cumsum()

In [130]:
#creating a pivot table that will make the columns the stop #, the rows is each trip, and the values the adherence(amount of minutes late)
route_df_pivot = route_df.pivot_table(values = 'ADHERENCE', index = 'TRIP_IDS', columns = 'stops')
route_df_pivot

stops,1,2,3,4,5,6,7
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
347114120230801,-1.766666,-2.533333,-0.900000,-1.166666,-0.200000,,
347114120230802,-0.616666,0.183333,2.166666,1.400000,0.200000,,
347114120230803,-6.700000,-4.583333,-4.050000,-2.916666,-1.100000,,
347116120230801,-1.383333,0.416666,0.716666,1.083333,5.966666,,
347116120230802,-1.816666,-2.516666,-1.500000,-1.233333,3.566666,,
...,...,...,...,...,...,...,...
352309120230925,-0.616666,-0.066666,0.100000,-0.550000,-0.766666,0.983333,
352309120230926,-0.266666,-0.216666,0.433333,-0.733333,-0.733333,-0.150000,
352309120230927,-0.416666,0.100000,0.400000,-0.783333,-0.600000,-4.650000,
352309120230928,-32.666666,-32.500000,-29.050000,-30.500000,-38.300000,-38.100000,


In [131]:
route_df_new = route_df_pivot.iloc[:, 0:5].dropna()
route_df_new

stops,1,2,3,4,5
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
347114120230801,-1.766666,-2.533333,-0.900000,-1.166666,-0.200000
347114120230802,-0.616666,0.183333,2.166666,1.400000,0.200000
347114120230803,-6.700000,-4.583333,-4.050000,-2.916666,-1.100000
347116120230801,-1.383333,0.416666,0.716666,1.083333,5.966666
347116120230802,-1.816666,-2.516666,-1.500000,-1.233333,3.566666
...,...,...,...,...,...
352309120230925,-0.616666,-0.066666,0.100000,-0.550000,-0.766666
352309120230926,-0.266666,-0.216666,0.433333,-0.733333,-0.733333
352309120230927,-0.416666,0.100000,0.400000,-0.783333,-0.600000
352309120230928,-32.666666,-32.500000,-29.050000,-30.500000,-38.300000


In [132]:
route_df_new[1].mean().round(2)

-2.46

In [133]:
route_df_new[5].mean().round(2)

-2.69

In [134]:
route_df_new[1].corr(route_df_new[5])

0.6550333324092471

In [135]:
route_df_new = route_df_new.rename(columns = {1:"one", 2:'two', 3:'three', 4:'four', 5:'five'})

In [136]:
lm = smf.ols("five ~ one", data=route_df_new).fit()
lm.summary()

0,1,2,3
Dep. Variable:,five,R-squared:,0.429
Model:,OLS,Adj. R-squared:,0.429
Method:,Least Squares,F-statistic:,3223.0
Date:,"Tue, 14 Nov 2023",Prob (F-statistic):,0.0
Time:,21:45:08,Log-Likelihood:,-12961.0
No. Observations:,4290,AIC:,25930.0
Df Residuals:,4288,BIC:,25940.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.5048,0.085,-5.938,0.000,-0.672,-0.338
one,0.8862,0.016,56.767,0.000,0.856,0.917

0,1,2,3
Omnibus:,5989.871,Durbin-Watson:,1.721
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4323776.65
Skew:,-7.695,Prob(JB):,0.0
Kurtosis:,157.765,Cond. No.,6.15


In [137]:
(-1) - (-0.5048)/(0.8862)

-0.4303768900925299

**On route 52 to downtown As you move from the first bus stop to the last one, the lateness tends to increase by approximately .**

**ROUTE 52 FROM DOWNTOWN**

In [138]:
route_number = 52
route_direction = 'FROM DOWNTOWN'

route_df = wego.loc[
    (wego['ROUTE_ABBR'] == route_number) & (wego['ROUTE_DIRECTION_NAME']== route_direction)
]
route_df = route_df.copy()

In [139]:
#changing the dtype to string so that i can concat with the "CALENDAR ID" column
#changing the dtype to string so that i can concat with the "ROUTE ID" column

route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']
route_df['TRIP_ID'] = route_df['TRIP_ID'].astype(str)
route_df['CALENDAR_ID'] = route_df['CALENDAR_ID'].astype(str)
route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']

In [140]:
#allows for each stop to be numbered in their respective route IDs
route_df['row_num'] = 1
route_df['stops'] = route_df.groupby('TRIP_IDS')['row_num'].cumsum()

In [141]:
#creating a pivot table that will make the columns the stop #, the rows is each trip, and the values the adherence(amount of minutes late)
route_df_pivot = route_df.pivot_table(values = 'ADHERENCE', index = 'TRIP_IDS', columns = 'stops')
route_df_pivot

stops,1,2,3,4,5,6,7,8,9,10
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
347115120230801,-0.033333,0.283333,1.566666,4.633333,9.100000,,,,,
347115120230802,-2.766666,-2.150000,-2.216666,1.200000,5.866666,,,,,
347115120230803,3.900000,-2.433333,-1.666666,-1.033333,2.000000,,,,,
347117120230801,-3.966666,-1.500000,-2.133333,-0.016666,2.800000,,,,,
347117120230802,-3.333333,-3.716666,-4.333333,-1.316666,1.133333,,,,,
...,...,...,...,...,...,...,...,...,...,...
352310120230925,-1.833333,0.600000,-1.483333,-3.233333,-3.033333,-0.850000,,,,
352310120230926,0.100000,-5.483333,-6.466666,-6.850000,-7.500000,-8.533333,,,,
352310120230927,-1.833333,-2.950000,-4.316666,-11.633333,-10.566666,-6.916666,,,,
352310120230928,-30.683333,-25.533333,-26.383333,-30.516666,-29.333333,-25.066666,,,,


In [142]:
route_df_new = route_df_pivot.iloc[:, 0:5].dropna()
route_df_new

stops,1,2,3,4,5
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
347115120230801,-0.033333,0.283333,1.566666,4.633333,9.100000
347115120230802,-2.766666,-2.150000,-2.216666,1.200000,5.866666
347115120230803,3.900000,-2.433333,-1.666666,-1.033333,2.000000
347117120230801,-3.966666,-1.500000,-2.133333,-0.016666,2.800000
347117120230802,-3.333333,-3.716666,-4.333333,-1.316666,1.133333
...,...,...,...,...,...
352310120230925,-1.833333,0.600000,-1.483333,-3.233333,-3.033333
352310120230926,0.100000,-5.483333,-6.466666,-6.850000,-7.500000
352310120230927,-1.833333,-2.950000,-4.316666,-11.633333,-10.566666
352310120230928,-30.683333,-25.533333,-26.383333,-30.516666,-29.333333


In [143]:
route_df_new[1].mean().round(2)

-3.9

In [144]:
route_df_new[5].mean().round(2)

-4.69

In [145]:
route_df_new[1].corr(route_df_new[5])

0.5315501754711262

In [146]:
route_df_new = route_df_new.rename(columns = {1:"one", 2:'two', 3:'three', 4:'four', 5:'five'})

In [147]:
lm = smf.ols("five ~ one", data=route_df_new).fit()
lm.summary()

0,1,2,3
Dep. Variable:,five,R-squared:,0.283
Model:,OLS,Adj. R-squared:,0.282
Method:,Least Squares,F-statistic:,1690.0
Date:,"Tue, 14 Nov 2023",Prob (F-statistic):,7.84e-312
Time:,21:45:09,Log-Likelihood:,-15035.0
No. Observations:,4294,AIC:,30070.0
Df Residuals:,4292,BIC:,30090.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1.2117,0.149,-8.144,0.000,-1.503,-0.920
one,0.8895,0.022,41.113,0.000,0.847,0.932

0,1,2,3
Omnibus:,5896.44,Durbin-Watson:,1.627
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6603697.801
Skew:,-7.254,Prob(JB):,0.0
Kurtosis:,194.57,Cond. No.,8.41


**On route 52 from downtown As you move from the first bus stop to the last one, the lateness tends to increase by approximately 0.8895  or 53 .**

In [148]:
0.8895*(-2)-1.2117

-2.9907

In [149]:
0.8895*(2)-1.2117

0.5672999999999999

**ROUTE 55 TO DOWNTOWN**

In [150]:
route_number = 55
route_direction = 'TO DOWNTOWN'

route_df = wego.loc[
    (wego['ROUTE_ABBR'] == route_number) & (wego['ROUTE_DIRECTION_NAME']== route_direction)
]
route_df = route_df.copy()

In [151]:
#changing the dtype to string so that i can concat with the "CALENDAR ID" column
#changing the dtype to string so that i can concat with the "ROUTE ID" column

route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']
route_df['TRIP_ID'] = route_df['TRIP_ID'].astype(str)
route_df['CALENDAR_ID'] = route_df['CALENDAR_ID'].astype(str)
route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']

In [152]:
#allows for each stop to be numbered in their respective route IDs
route_df['row_num'] = 1
route_df['stops'] = route_df.groupby('TRIP_IDS')['row_num'].cumsum()

In [153]:
#creating a pivot table that will make the columns the stop #, the rows is each trip, and the values the adherence(amount of minutes late)
route_df_pivot = route_df.pivot_table(values = 'ADHERENCE', index = 'TRIP_IDS', columns = 'stops')
route_df_pivot

stops,1,2,3,4,5,6,7,8,9,10,11
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
347480120230801,-0.900000,-0.616666,-2.233333,-0.733333,-2.283333,-1.216666,,,,,
347480120230802,-1.583333,-2.183333,-3.050000,-2.450000,-3.833333,-1.883333,,,,,
347480120230803,-1.983333,-1.733333,-2.066666,-0.350000,-2.866666,-0.733333,,,,,
347482120230801,-2.150000,-1.566666,-3.350000,-1.600000,-1.350000,1.116666,,,,,
347482120230802,-2.133333,-2.016666,-7.033333,-5.833333,-4.833333,-2.333333,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
353988120230925,-2.200000,8.366666,,,,,,,,,
353988120230926,-2.116666,3.466666,,,,,,,,,
353988120230927,-3.783333,2.116666,,,,,,,,,
353988120230928,-2.900000,5.033333,,,,,,,,,


In [154]:
route_df_new = route_df_pivot.iloc[:, 0:6].dropna()
route_df_new

stops,1,2,3,4,5,6
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
347480120230801,-0.900000,-0.616666,-2.233333,-0.733333,-2.283333,-1.216666
347480120230802,-1.583333,-2.183333,-3.050000,-2.450000,-3.833333,-1.883333
347480120230803,-1.983333,-1.733333,-2.066666,-0.350000,-2.866666,-0.733333
347482120230801,-2.150000,-1.566666,-3.350000,-1.600000,-1.350000,1.116666
347482120230802,-2.133333,-2.016666,-7.033333,-5.833333,-4.833333,-2.333333
...,...,...,...,...,...,...
352725120230926,-5.150000,-6.750000,-9.900000,-7.783333,-10.116666,-11.316666
352725120230927,-1.433333,-0.066666,0.200000,1.216666,-0.600000,-1.316666
352725120230928,-33.266666,-32.350000,-32.300000,-28.416666,-27.466666,-27.250000
352725120230929,-2.283333,-3.316666,-6.583333,-4.283333,-4.316666,-11.316666


In [155]:
route_df_new[1].mean().round(2)

-2.7

In [156]:
route_df_new[6].mean().round(2)

-3.61

In [157]:
route_df_new[1].corr(route_df_new[6])

0.5392054671520023

In [158]:
route_df_new = route_df_new.rename(columns = {1:"one", 2:'two', 3:'three', 4:'four', 5:'five', 6:'six'})

In [159]:
lm = smf.ols("six ~ one", data=route_df_new).fit()
lm.summary()

0,1,2,3
Dep. Variable:,six,R-squared:,0.291
Model:,OLS,Adj. R-squared:,0.291
Method:,Least Squares,F-statistic:,2011.0
Date:,"Tue, 14 Nov 2023",Prob (F-statistic):,0.0
Time:,21:45:09,Log-Likelihood:,-16183.0
No. Observations:,4908,AIC:,32370.0
Df Residuals:,4906,BIC:,32380.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1.1796,0.108,-10.925,0.000,-1.391,-0.968
one,0.8993,0.020,44.845,0.000,0.860,0.939

0,1,2,3
Omnibus:,5056.379,Durbin-Watson:,1.532
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1127254.731
Skew:,-4.606,Prob(JB):,0.0
Kurtosis:,76.671,Cond. No.,6.28


**On route 55 to downtown As you move from the first bus stop to the last one, the lateness tends to increase by approximately 0.8993 units or 53 seconds.**

**ROUTE 55 FROM DOWNTOWN**

In [160]:
route_number = 55
route_direction = 'FROM DOWNTOWN'

route_df = wego.loc[
    (wego['ROUTE_ABBR'] == route_number) & (wego['ROUTE_DIRECTION_NAME']== route_direction)
]
route_df = route_df.copy()

In [161]:
#changing the dtype to string so that i can concat with the "CALENDAR ID" column
#changing the dtype to string so that i can concat with the "ROUTE ID" column

route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']
route_df['TRIP_ID'] = route_df['TRIP_ID'].astype(str)
route_df['CALENDAR_ID'] = route_df['CALENDAR_ID'].astype(str)
route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']

In [162]:
#allows for each stop to be numbered in their respective route IDs
route_df['row_num'] = 1
route_df['stops'] = route_df.groupby('TRIP_IDS')['row_num'].cumsum()

In [163]:
#creating a pivot table that will make the columns the stop #, the rows is each trip, and the values the adherence(amount of minutes late)
route_df_pivot = route_df.pivot_table(values = 'ADHERENCE', index = 'TRIP_IDS', columns = 'stops')
route_df_pivot

stops,1,2,3,4,5,6,7,8,9,10,11,12
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
347481120230801,-1.866666,-2.600000,-2.300000,-2.683333,-2.716666,-0.633333,,,,,,
347481120230802,-1.666666,0.000000,-0.533333,-3.300000,-3.416666,-0.433333,,,,,,
347481120230803,-1.483333,-1.950000,-3.616666,-6.866666,-7.883333,-5.683333,,,,,,
347483120230801,-4.483333,-2.866666,-2.750000,-2.800000,-2.750000,-6.683333,,,,,,
347483120230802,-1.516666,-2.950000,-1.216666,-6.350000,-7.616666,-7.683333,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
352727120230925,-11.033333,-6.333333,,,,,,,,,,
352727120230926,-11.050000,-6.583333,,,,,,,,,,
352727120230927,-11.083333,-6.233333,,,,,,,,,,
352727120230928,-11.216666,-8.583333,,,,,,,,,,


In [164]:
route_df_new = route_df_pivot.iloc[:, 0:6].dropna()
route_df_new

stops,1,2,3,4,5,6
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
347481120230801,-1.866666,-2.600000,-2.300000,-2.683333,-2.716666,-0.633333
347481120230802,-1.666666,0.000000,-0.533333,-3.300000,-3.416666,-0.433333
347481120230803,-1.483333,-1.950000,-3.616666,-6.866666,-7.883333,-5.683333
347483120230801,-4.483333,-2.866666,-2.750000,-2.800000,-2.750000,-6.683333
347483120230802,-1.516666,-2.950000,-1.216666,-6.350000,-7.616666,-7.683333
...,...,...,...,...,...,...
352726120230922,-4.783333,-13.950000,-12.066666,-9.583333,-9.516666,-14.950000
352726120230925,-6.133333,-6.133333,-3.233333,-3.016666,-7.533333,-8.100000
352726120230927,-2.200000,-3.233333,-5.533333,-17.816666,-23.983333,-27.166666
352726120230928,-27.333333,-24.600000,-24.700000,-25.500000,-28.650000,-28.700000


In [165]:
route_df_new[1].mean().round(2)

-3.76

In [166]:
route_df_new[6].mean().round(2)

-5.65

In [167]:
route_df_new[1].corr(route_df_new[6])

0.6780651426728795

In [168]:
route_df_new = route_df_new.rename(columns = {1:"one", 2:'two', 3:'three', 4:'four', 5:'five', 6:'six'})

In [169]:
lm = smf.ols("six ~ one", data=route_df_new).fit()
lm.summary()

0,1,2,3
Dep. Variable:,six,R-squared:,0.46
Model:,OLS,Adj. R-squared:,0.46
Method:,Least Squares,F-statistic:,4157.0
Date:,"Tue, 14 Nov 2023",Prob (F-statistic):,0.0
Time:,21:45:09,Log-Likelihood:,-16504.0
No. Observations:,4886,AIC:,33010.0
Df Residuals:,4884,BIC:,33030.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1.5983,0.119,-13.388,0.000,-1.832,-1.364
one,1.0783,0.017,64.472,0.000,1.046,1.111

0,1,2,3
Omnibus:,1848.247,Durbin-Watson:,1.492
Prob(Omnibus):,0.0,Jarque-Bera (JB):,13269.172
Skew:,-1.628,Prob(JB):,0.0
Kurtosis:,10.388,Cond. No.,8.44


In [170]:
(-1) - (-1.5983)/(1.0783)

0.4822405638505054

**On route 55 from downtown As you move from the first bus stop to the last one, the lateness tends to increase by approximately 1.0783 units or 1 minute.**

**ROUTE 56 TO DOWNTOWN**

In [171]:
route_number = 56
route_direction = 'TO DOWNTOWN'

route_df = wego.loc[
    (wego['ROUTE_ABBR'] == route_number) & (wego['ROUTE_DIRECTION_NAME']== route_direction)
]
route_df = route_df.copy()

In [172]:
#changing the dtype to string so that i can concat with the "CALENDAR ID" column
#changing the dtype to string so that i can concat with the "ROUTE ID" column

route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']
route_df['TRIP_ID'] = route_df['TRIP_ID'].astype(str)
route_df['CALENDAR_ID'] = route_df['CALENDAR_ID'].astype(str)
route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']

In [173]:
#allows for each stop to be numbered in their respective route IDs
route_df['row_num'] = 1
route_df['stops'] = route_df.groupby('TRIP_IDS')['row_num'].cumsum()

In [174]:
#creating a pivot table that will make the columns the stop #, the rows is each trip, and the values the adherence(amount of minutes late)
route_df_pivot = route_df.pivot_table(values = 'ADHERENCE', index = 'TRIP_IDS', columns = 'stops')
route_df_pivot

stops,1,2,3,4,5,6,7,8,9,10
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
347902120230801,-1.283333,-0.900000,-0.433333,-1.316666,3.700000,,,,,
347902120230802,-1.050000,-1.800000,-0.850000,-1.033333,4.566666,,,,,
347902120230803,-0.783333,-1.166666,-0.583333,-0.283333,5.033333,,,,,
347904120230801,-1.050000,-1.383333,-0.316666,-4.450000,0.583333,,,,,
347904120230802,-1.433333,2.283333,-0.300000,-0.350000,2.933333,,,,,
...,...,...,...,...,...,...,...,...,...,...
354106120230925,0.966666,2.200000,,,,,,,,
354106120230926,-0.266666,-139.683333,,,,,,,,
354106120230927,0.966666,-166.566666,,,,,,,,
354106120230928,-2.566666,-5.816666,,,,,,,,


In [175]:
route_df_new = route_df_pivot.iloc[:, 0:5].dropna()
route_df_new

stops,1,2,3,4,5
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
347902120230801,-1.283333,-0.900000,-0.433333,-1.316666,3.700000
347902120230802,-1.050000,-1.800000,-0.850000,-1.033333,4.566666
347902120230803,-0.783333,-1.166666,-0.583333,-0.283333,5.033333
347904120230801,-1.050000,-1.383333,-0.316666,-4.450000,0.583333
347904120230802,-1.433333,2.283333,-0.300000,-0.350000,2.933333
...,...,...,...,...,...
353121120230925,-7.000000,-3.266666,-4.583333,-4.083333,-0.366666
353121120230926,-2.450000,-3.433333,-4.233333,-2.733333,1.766666
353121120230927,0.400000,-2.550000,-3.250000,-8.100000,-5.166666
353121120230928,-4.066666,-4.633333,-5.550000,-7.466666,-4.700000


In [176]:
route_df_new[1].mean().round(2)

-1.74

In [177]:
route_df_new[5].mean().round(2)

1.56

In [178]:
route_df_new[1].corr(route_df_new[5])

0.20485574437922915

In [179]:
route_df_new = route_df_new.rename(columns = {1:"one", 2:'two', 3:'three', 4:'four', 5:'five'})

In [180]:
lm = smf.ols("five ~ one", data=route_df_new).fit()
lm.summary()

0,1,2,3
Dep. Variable:,five,R-squared:,0.042
Model:,OLS,Adj. R-squared:,0.042
Method:,Least Squares,F-statistic:,211.6
Date:,"Tue, 14 Nov 2023",Prob (F-statistic):,6.050000000000001e-47
Time:,21:45:09,Log-Likelihood:,-13575.0
No. Observations:,4832,AIC:,27150.0
Df Residuals:,4830,BIC:,27170.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.0103,0.066,30.668,0.000,1.882,2.139
one,0.2590,0.018,14.546,0.000,0.224,0.294

0,1,2,3
Omnibus:,1399.743,Durbin-Watson:,1.51
Prob(Omnibus):,0.0,Jarque-Bera (JB):,51363.477
Skew:,-0.696,Prob(JB):,0.0
Kurtosis:,18.912,Cond. No.,4.25


**On route 56 to downtown As you move from the first bus stop to the last one, the lateness tends to increase by approximately 0.2590 units or 15 seconds.**

**ROUTE 56 FROM DOWNTOWN**

In [181]:
route_number = 56
route_direction = 'FROM DOWNTOWN'

route_df = wego.loc[
    (wego['ROUTE_ABBR'] == route_number) & (wego['ROUTE_DIRECTION_NAME']== route_direction)
]
route_df = route_df.copy()

In [182]:
#changing the dtype to string so that i can concat with the "CALENDAR ID" column
#changing the dtype to string so that i can concat with the "ROUTE ID" column

route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']
route_df['TRIP_ID'] = route_df['TRIP_ID'].astype(str)
route_df['CALENDAR_ID'] = route_df['CALENDAR_ID'].astype(str)
route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']

In [183]:
#allows for each stop to be numbered in their respective route IDs
route_df['row_num'] = 1
route_df['stops'] = route_df.groupby('TRIP_IDS')['row_num'].cumsum()

In [184]:
#creating a pivot table that will make the columns the stop #, the rows is each trip, and the values the adherence(amount of minutes late)
route_df_pivot = route_df.pivot_table(values = 'ADHERENCE', index = 'TRIP_IDS', columns = 'stops')
route_df_pivot

stops,1,2,3,4,5,6,7,8,9,10,11,12,13
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
347903120230801,-1.733333,-1.433333,-0.866666,-1.650000,-2.966666,-3.333333,-0.150000,,,,,,
347903120230802,-1.733333,-0.266666,-0.516666,-0.633333,-2.350000,-2.683333,0.500000,,,,,,
347903120230803,-4.350000,-3.283333,-4.083333,-2.800000,-3.000000,-2.283333,0.950000,,,,,,
347905120230801,-3.950000,-1.616666,-0.983333,-0.466666,0.000000,-1.983333,0.200000,,,,,,
347905120230802,0.950000,-1.800000,-0.900000,-0.500000,-0.483333,-0.833333,2.083333,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
353123120230925,-10.633333,-0.883333,,,,,,,,,,,
353123120230926,-10.650000,-2.350000,,,,,,,,,,,
353123120230927,-12.333333,-3.983333,,,,,,,,,,,
353123120230928,-10.650000,-1.766666,,,,,,,,,,,


In [185]:
route_df_new = route_df_pivot.iloc[:, 0:6].dropna()
route_df_new

stops,1,2,3,4,5,6
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
347903120230801,-1.733333,-1.433333,-0.866666,-1.650000,-2.966666,-3.333333
347903120230802,-1.733333,-0.266666,-0.516666,-0.633333,-2.350000,-2.683333
347903120230803,-4.350000,-3.283333,-4.083333,-2.800000,-3.000000,-2.283333
347905120230801,-3.950000,-1.616666,-0.983333,-0.466666,0.000000,-1.983333
347905120230802,0.950000,-1.800000,-0.900000,-0.500000,-0.483333,-0.833333
...,...,...,...,...,...,...
353122120230925,-3.516666,-2.183333,-2.850000,-0.966666,-3.666666,-3.633333
353122120230926,0.950000,-2.100000,-3.266666,-3.200000,-4.266666,-4.466666
353122120230927,-5.783333,-5.050000,-7.733333,-6.066666,-5.966666,-5.966666
353122120230928,-2.800000,-3.183333,-4.816666,-4.066666,-3.666666,-5.216666


In [186]:
route_df_new[1].mean().round(2)

-3.11

In [187]:
route_df_new[6].mean().round(2)

-4.91

In [188]:
route_df_new[1].corr(route_df_new[6])

0.271434435935105

In [189]:
route_df_new = route_df_new.rename(columns = {1:"one", 2:'two', 3:'three', 4:'four', 5:'five', 6:'six'})

In [190]:
lm = smf.ols("six ~ one", data=route_df_new).fit()
lm.summary()

0,1,2,3
Dep. Variable:,six,R-squared:,0.074
Model:,OLS,Adj. R-squared:,0.073
Method:,Least Squares,F-statistic:,386.2
Date:,"Tue, 14 Nov 2023",Prob (F-statistic):,8.72e-83
Time:,21:45:10,Log-Likelihood:,-14811.0
No. Observations:,4857,AIC:,29630.0
Df Residuals:,4855,BIC:,29640.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-3.7480,0.094,-39.812,0.000,-3.933,-3.563
one,0.3739,0.019,19.651,0.000,0.337,0.411

0,1,2,3
Omnibus:,1459.184,Durbin-Watson:,1.797
Prob(Omnibus):,0.0,Jarque-Bera (JB):,86807.735
Skew:,-0.594,Prob(JB):,0.0
Kurtosis:,23.677,Cond. No.,6.46


In [191]:
0.3739

0.3739

**On route 56 from downtown As you move from the first bus stop to the last one, the lateness tends to increase by approximately 0.3739 units or 22 seconds.**

In [192]:
wego[wego['ROUTE_ABBR']==55].head(20)

Unnamed: 0,CALENDAR_ID,SERVICE_ABBR,ADHERENCE_ID,DATE,ROUTE_ABBR,BLOCK_ABBR,OPERATOR,TRIP_ID,OVERLOAD_ID,ROUTE_DIRECTION_NAME,TIME_POINT_ABBR,ROUTE_STOP_SEQUENCE,TRIP_EDGE,LATITUDE,LONGITUDE,SCHEDULED_TIME,ACTUAL_ARRIVAL_TIME,ACTUAL_DEPARTURE_TIME,ADHERENCE,SCHEDULED_HDWY,ACTUAL_HDWY,HDWY_DEV,ADJUSTED_EARLY_COUNT,ADJUSTED_LATE_COUNT,ADJUSTED_ONTIME_COUNT,STOP_CANCELLED,PREV_SCHED_STOP_CANCELLED,IS_RELIEF,BLOCK_STOP_ORDER,DWELL_IN_MINS,DAYS_OF_THE_WEEK
3853,120230801,1,99462915,2023-08-01,55,5500,1206,347480,0,TO DOWNTOWN,HHWM,11.0,1,36.052637,-86.654878,2023-08-01 04:23:00,2023-08-01 04:14:25,2023-08-01 04:23:54,-0.9,,,,0,0,1,0,0.0,0,2,9.483333,1
3854,120230801,1,99462916,2023-08-01,55,5500,1206,347480,0,TO DOWNTOWN,MXBELL,10.0,0,36.072852,-86.636862,2023-08-01 04:30:00,2023-08-01 04:30:37,2023-08-01 04:30:37,-0.616666,,,,0,0,1,0,0.0,0,9,0.0,1
3855,120230801,1,99462917,2023-08-01,55,5500,1206,347480,0,TO DOWNTOWN,MXDONEL,9.0,0,36.106276,-86.672801,2023-08-01 04:37:00,2023-08-01 04:39:14,2023-08-01 04:39:14,-2.233333,,,,0,0,1,0,0.0,0,18,0.0,1
3856,120230801,1,99462918,2023-08-01,55,5500,1206,347480,0,TO DOWNTOWN,MXTHOMP,8.0,0,36.127172,-86.711441,2023-08-01 04:44:00,2023-08-01 04:44:44,2023-08-01 04:44:44,-0.733333,,,,0,0,1,0,0.0,0,23,0.0,1
3857,120230801,1,99462919,2023-08-01,55,5500,1206,347480,0,TO DOWNTOWN,MXWHARF,6.0,0,36.14886,-86.762226,2023-08-01 04:54:00,2023-08-01 04:56:17,2023-08-01 04:56:17,-2.283333,,,,0,0,1,0,0.0,0,33,0.0,1
3858,120230801,1,99462920,2023-08-01,55,5500,1206,347480,0,TO DOWNTOWN,MCC4_15,5.0,2,36.167091,-86.781923,2023-08-01 05:07:00,2023-08-01 05:08:13,2023-08-01 05:08:13,-1.216666,,,,0,0,1,0,,0,43,0.0,1
3859,120230801,1,99462921,2023-08-01,55,5500,1206,347481,0,FROM DOWNTOWN,MCC4_15,5.0,1,36.167091,-86.781923,2023-08-01 05:15:00,2023-08-01 05:08:13,2023-08-01 05:16:52,-1.866666,,,,0,0,1,0,0.0,0,44,8.65,1
3860,120230801,1,99462922,2023-08-01,55,5500,1206,347481,0,FROM DOWNTOWN,MXWHARF,6.0,0,36.148839,-86.76271,2023-08-01 05:24:00,2023-08-01 05:24:56,2023-08-01 05:26:36,-2.6,,,,0,0,1,0,0.0,0,50,1.666666,1
3861,120230801,1,99462923,2023-08-01,55,5500,1206,347481,0,FROM DOWNTOWN,MXTHOMP,8.0,0,36.128164,-86.713199,2023-08-01 05:35:00,2023-08-01 05:37:18,2023-08-01 05:37:18,-2.3,,,,0,0,1,0,0.0,0,60,0.0,1
3862,120230801,1,99462924,2023-08-01,55,5500,1206,347481,0,FROM DOWNTOWN,MXDONEL,9.0,0,36.105615,-86.672004,2023-08-01 05:41:00,2023-08-01 05:43:41,2023-08-01 05:43:41,-2.683333,,,,0,0,1,0,0.0,0,66,0.0,1


In [193]:
route_df_new

stops,one,two,three,four,five,six
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
347903120230801,-1.733333,-1.433333,-0.866666,-1.650000,-2.966666,-3.333333
347903120230802,-1.733333,-0.266666,-0.516666,-0.633333,-2.350000,-2.683333
347903120230803,-4.350000,-3.283333,-4.083333,-2.800000,-3.000000,-2.283333
347905120230801,-3.950000,-1.616666,-0.983333,-0.466666,0.000000,-1.983333
347905120230802,0.950000,-1.800000,-0.900000,-0.500000,-0.483333,-0.833333
...,...,...,...,...,...,...
353122120230925,-3.516666,-2.183333,-2.850000,-0.966666,-3.666666,-3.633333
353122120230926,0.950000,-2.100000,-3.266666,-3.200000,-4.266666,-4.466666
353122120230927,-5.783333,-5.050000,-7.733333,-6.066666,-5.966666,-5.966666
353122120230928,-2.800000,-3.183333,-4.816666,-4.066666,-3.666666,-5.216666
