In [1]:
import pandas as pd
pd.set_option("display.max_columns", 100)
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the cleaned headway/adherence extract (path is relative to the notebook).
wego = pd.read_csv(filepath_or_buffer='../data/headway_data_clean.csv')

In [3]:
# Parse the timestamp columns into datetime64 so they support comparisons and
# the .dt accessor. The three stop-level columns share one full-timestamp
# format; DATE is parsed with a slash-separated date-only format.
for time_col in ('SCHEDULED_TIME', 'ACTUAL_ARRIVAL_TIME', 'ACTUAL_DEPARTURE_TIME'):
    wego[time_col] = pd.to_datetime(wego[time_col], format='%Y-%m-%d %H:%M:%S')
wego['DATE'] = pd.to_datetime(wego['DATE'], format='%Y/%m/%d')


In [4]:
# Add the day of the week of each service date as an integer
# (pandas convention: Monday=0 ... Sunday=6).
wego['DAYS_OF_THE_WEEK'] = wego['DATE'].dt.dayofweek


**Question 1: How much impact does being late or too spaced out at the first stop have downstream?**



*Does being late to start a trip, or too spaced out between buses, have an effect on the remainder of the trip?*

In [5]:
# Keep only the rows that have an ADHERENCE value; rows where it is NaN are
# discarded for the rest of the notebook.
wego = wego[wego['ADHERENCE'].notna()]

In [6]:
# Subset to route 3 rows travelling toward downtown.
route_3_to_downtown = wego[
    wego['ROUTE_ABBR'].eq(3)
    & wego['ROUTE_DIRECTION_NAME'].eq('TO DOWNTOWN')
]

In [7]:
# Work on an independent copy so that adding columns to the subset is not an
# assignment on a slice of wego.
route_3_to_downtown = route_3_to_downtown.copy(deep=True)

In [8]:
# Cast both id columns to str so that "+" on them concatenates text
# instead of adding numbers when the per-day trip key is built.
route_3_to_downtown = route_3_to_downtown.astype({'TRIP_ID': str, 'CALENDAR_ID': str})

In [9]:
# Per-day trip key: the TRIP_ID text immediately followed by the CALENDAR_ID text.
route_3_to_downtown['TRIP_IDS'] = route_3_to_downtown['TRIP_ID'].str.cat(
    route_3_to_downtown['CALENDAR_ID']
)

In [10]:
# Number the stops of each trip 1, 2, 3, ... in scheduled order.
# cumcount() replaces the scratch `row_num` column + cumsum(), and the stable
# sort by SCHEDULED_TIME means the numbering no longer depends on the rows
# already being in stop order (ties keep their existing order). The result is
# aligned back on the index, so the frame's own row order is unchanged.
# NOTE: rows without ADHERENCE were dropped earlier, so stop 1 is the first
# *recorded* stop of a trip, which may not be its scheduled first stop.
route_3_to_downtown['stops'] = (
    route_3_to_downtown.sort_values('SCHEDULED_TIME', kind='stable')
    .groupby('TRIP_IDS')
    .cumcount()
    + 1
)

In [11]:
# One row per trip (TRIP_IDS), one column per stop number, values = ADHERENCE
# in minutes (in the sample rows shown in this notebook, negative values are
# departures after the scheduled time). Each trip/stop pair occurs once, so
# the default mean aggregation simply passes the value through.
route_3_to_downtown_pivot = route_3_to_downtown.pivot_table(
    index='TRIP_IDS',
    columns='stops',
    values='ADHERENCE',
)

In [12]:
#.reset_index(drop = True)

In [13]:
# Preview the trip-by-stop adherence table (rows: TRIP_IDS, columns: stop number).
route_3_to_downtown_pivot

stops,1,2,3,4,5,6,7,8,9,10
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
346048120230801,-1.300000,-0.083333,0.416666,1.000000,4.100000,,,,,
346048120230802,-1.150000,-0.750000,-0.716666,0.150000,0.016666,,,,,
346048120230803,-18.716666,,,,,,,,,
346050120230801,-3.933333,-2.666666,-1.950000,-1.450000,1.533333,,,,,
346050120230802,0.966666,-2.150000,-1.650000,-1.683333,-0.033333,,,,,
...,...,...,...,...,...,...,...,...,...,...
351240120230925,-1.383333,-2.033333,0.616666,-1.833333,-5.600000,-1.300000,,,,
351240120230926,-5.850000,-3.483333,-2.033333,-0.666666,-0.133333,2.666666,,,,
351240120230927,-2.183333,-0.600000,-1.283333,-0.583333,-1.733333,-1.250000,,,,
351240120230928,-1.333333,-1.150000,-1.166666,-0.233333,-0.666666,-2.983333,,,,


In [14]:
# Keep the first five stop columns and only the trips with a value at all five.
route_3_new = route_3_to_downtown_pivot.iloc[:, :5].dropna(axis=0, how='any')

In [15]:
# Pearson correlation between adherence at stop 1 and at stop 5.
route_3_new[1].corr(route_3_new[5], method='pearson')

0.5004741699120517

In [16]:
# Same correlation taken straight from the unfiltered pivot; Series.corr
# ignores pairs where either value is missing.
route_3_to_downtown_pivot[1].corr(route_3_to_downtown_pivot[5], method='pearson')

0.5004741699120517

In [17]:
# Integer column labels cannot be used as names in a formula, so give the
# stop columns word names.
route_3_new = route_3_new.rename(
    columns=dict(zip([1, 2, 3, 4, 5], ['one', 'two', 'three', 'four', 'five']))
)

In [18]:
# Confirm the row count, column names and dtypes of the filtered table.
route_3_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4082 entries, 346048120230801 to 351240120230929
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   one     4082 non-null   float64
 1   two     4082 non-null   float64
 2   three   4082 non-null   float64
 3   four    4082 non-null   float64
 4   five    4082 non-null   float64
dtypes: float64(5)
memory usage: 191.3+ KB


In [19]:
# Simple linear regression of stop-5 adherence on stop-1 adherence;
# the trailing expression renders the fit summary.
lm = smf.ols(formula="five ~ one", data=route_3_new).fit()
lm.summary()

0,1,2,3
Dep. Variable:,five,R-squared:,0.25
Model:,OLS,Adj. R-squared:,0.25
Method:,Least Squares,F-statistic:,1363.0
Date:,"Wed, 08 Nov 2023",Prob (F-statistic):,9.15e-258
Time:,23:11:04,Log-Likelihood:,-12310.0
No. Observations:,4082,AIC:,24620.0
Df Residuals:,4080,BIC:,24640.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.5153,0.096,-5.351,0.000,-0.704,-0.327
one,0.7530,0.020,36.925,0.000,0.713,0.793

0,1,2,3
Omnibus:,4114.296,Durbin-Watson:,1.365
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1089936.058
Skew:,-4.345,Prob(JB):,0.0
Kurtosis:,82.578,Cond. No.,5.98


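**On route 3 to downtown, a one-minute difference in adherence at the first stop is associated with a difference of about 0.7530 minutes, in the same direction, at the fifth stop (R² = 0.25). The coefficient describes how much of the first-stop deviation carries through to the fifth stop, not how much lateness grows along the route.**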

In [20]:
# Select every recorded stop for one route/direction; .copy() detaches the
# subset from wego so new columns can be added to it.
route_number, route_direction = 3, 'FROM DOWNTOWN'
route_df = wego[
    wego['ROUTE_ABBR'].eq(route_number)
    & wego['ROUTE_DIRECTION_NAME'].eq(route_direction)
].copy()

In [21]:
# Build a per-day trip key. The same TRIP_ID appears under several
# CALENDAR_IDs, so the two are combined. Both columns are cast to str first
# so that "+" concatenates the text instead of adding the ids; the earlier
# pre-cast "+" was overwritten immediately and has been removed.
route_df['TRIP_ID'] = route_df['TRIP_ID'].astype(str)
route_df['CALENDAR_ID'] = route_df['CALENDAR_ID'].astype(str)
route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']

In [22]:
# Number the stops of each trip 1, 2, 3, ... in scheduled order.
# cumcount() replaces the scratch `row_num` column + cumsum(), and the stable
# sort by SCHEDULED_TIME means the numbering no longer depends on the rows
# already being in stop order (ties keep their existing order). The result is
# aligned back on the index, so the frame's own row order is unchanged.
# NOTE: rows without ADHERENCE were dropped earlier, so stop 1 is the first
# *recorded* stop of a trip, which may not be its scheduled first stop.
route_df['stops'] = (
    route_df.sort_values('SCHEDULED_TIME', kind='stable')
    .groupby('TRIP_IDS')
    .cumcount()
    + 1
)

In [23]:
# One row per trip (TRIP_IDS), one column per stop number, values = ADHERENCE
# in minutes. Each trip/stop pair occurs once, so the default mean
# aggregation simply passes the value through.
route_df_pivot = route_df.pivot_table(
    index='TRIP_IDS',
    columns='stops',
    values='ADHERENCE',
)

In [24]:
# Keep the first five stop columns and only the trips with a value at all five.
route_df_new = route_df_pivot.iloc[:, :5].dropna(axis=0, how='any')

In [25]:
# Pearson correlation between adherence at stop 1 and at stop 5.
route_df_new[1].corr(route_df_new[5], method='pearson')

0.3829595646223628

In [26]:
# Integer column labels cannot be used as names in a formula, so give the
# stop columns word names.
route_df_new = route_df_new.rename(
    columns=dict(zip([1, 2, 3, 4, 5], ['one', 'two', 'three', 'four', 'five']))
)

In [27]:
# Simple linear regression of stop-5 adherence on stop-1 adherence;
# the trailing expression renders the fit summary.
lm = smf.ols(formula="five ~ one", data=route_df_new).fit()
lm.summary()

0,1,2,3
Dep. Variable:,five,R-squared:,0.147
Model:,OLS,Adj. R-squared:,0.146
Method:,Least Squares,F-statistic:,704.5
Date:,"Wed, 08 Nov 2023",Prob (F-statistic):,2.23e-143
Time:,23:11:05,Log-Likelihood:,-13502.0
No. Observations:,4101,AIC:,27010.0
Df Residuals:,4099,BIC:,27020.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-3.2013,0.141,-22.649,0.000,-3.478,-2.924
one,0.8184,0.031,26.542,0.000,0.758,0.879

0,1,2,3
Omnibus:,3491.723,Durbin-Watson:,1.356
Prob(Omnibus):,0.0,Jarque-Bera (JB):,318025.061
Skew:,-3.554,Prob(JB):,0.0
Kurtosis:,45.552,Cond. No.,6.52


**On route 3 from downtown, a one-minute difference in adherence at the first stop is associated with a difference of about 0.8184 minutes, in the same direction, at the fifth stop (R² = 0.147).**

***ROUTE 7 FROM DOWNTOWN***

In [28]:
# List the distinct route numbers present in the data, in order of first appearance.
wego['ROUTE_ABBR'].drop_duplicates().to_numpy()

array([22, 23,  3,  7, 50, 52, 55, 56])

In [29]:
# Select every recorded stop for one route/direction; .copy() detaches the
# subset from wego so new columns can be added to it.
route_number, route_direction = 7, 'FROM DOWNTOWN'
route_df = wego[
    wego['ROUTE_ABBR'].eq(route_number)
    & wego['ROUTE_DIRECTION_NAME'].eq(route_direction)
].copy()

In [30]:
# Build a per-day trip key. The same TRIP_ID appears under several
# CALENDAR_IDs, so the two are combined. Both columns are cast to str first
# so that "+" concatenates the text instead of adding the ids; the earlier
# pre-cast "+" was overwritten immediately and has been removed.
route_df['TRIP_ID'] = route_df['TRIP_ID'].astype(str)
route_df['CALENDAR_ID'] = route_df['CALENDAR_ID'].astype(str)
route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']

In [31]:
# Number the stops of each trip 1, 2, 3, ... in scheduled order.
# cumcount() replaces the scratch `row_num` column + cumsum(), and the stable
# sort by SCHEDULED_TIME means the numbering no longer depends on the rows
# already being in stop order (ties keep their existing order). The result is
# aligned back on the index, so the frame's own row order is unchanged.
# NOTE: rows without ADHERENCE were dropped earlier, so stop 1 is the first
# *recorded* stop of a trip, which may not be its scheduled first stop.
route_df['stops'] = (
    route_df.sort_values('SCHEDULED_TIME', kind='stable')
    .groupby('TRIP_IDS')
    .cumcount()
    + 1
)

In [32]:
# One row per trip (TRIP_IDS), one column per stop number, values = ADHERENCE
# in minutes. Each trip/stop pair occurs once, so the default mean
# aggregation simply passes the value through.
route_df_pivot = route_df.pivot_table(
    index='TRIP_IDS',
    columns='stops',
    values='ADHERENCE',
)

In [53]:
# Keep the first three stop columns and only the trips with a value at all three.
route_df_new = route_df_pivot.iloc[:, :3].dropna(axis=0, how='any')


In [55]:
# Pearson correlation between adherence at stop 1 and at stop 3.
route_df_new[1].corr(route_df_new[3], method='pearson')

0.6616917036779562

In [57]:
# Integer column labels cannot be used as names in a formula, so give the
# stop columns word names.
route_df_new = route_df_new.rename(
    columns=dict(zip([1, 2, 3], ['one', 'two', 'three']))
)

In [59]:
# Simple linear regression of stop-3 adherence on stop-1 adherence;
# the trailing expression renders the fit summary.
lm = smf.ols(formula="three ~ one", data=route_df_new).fit()
lm.summary()

0,1,2,3
Dep. Variable:,three,R-squared:,0.438
Model:,OLS,Adj. R-squared:,0.438
Method:,Least Squares,F-statistic:,2317.0
Date:,"Wed, 08 Nov 2023",Prob (F-statistic):,0.0
Time:,23:21:21,Log-Likelihood:,-9503.5
No. Observations:,2977,AIC:,19010.0
Df Residuals:,2975,BIC:,19020.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1.5649,0.121,-12.933,0.000,-1.802,-1.328
one,1.0030,0.021,48.136,0.000,0.962,1.044

0,1,2,3
Omnibus:,1556.017,Durbin-Watson:,1.532
Prob(Omnibus):,0.0,Jarque-Bera (JB):,29208.58
Skew:,-2.047,Prob(JB):,0.0
Kurtosis:,17.789,Cond. No.,6.55


**On route 7 from downtown, a one-minute difference in adherence at the first stop is associated with a difference of about 1.0030 minutes, in the same direction, at the third stop (R² = 0.438).**

**ROUTE 7 TO DOWNTOWN**

In [63]:
# Select every recorded stop for one route/direction; .copy() detaches the
# subset from wego so new columns can be added to it.
route_number, route_direction = 7, 'TO DOWNTOWN'
route_df = wego[
    wego['ROUTE_ABBR'].eq(route_number)
    & wego['ROUTE_DIRECTION_NAME'].eq(route_direction)
].copy()

In [64]:
# Build a per-day trip key. The same TRIP_ID appears under several
# CALENDAR_IDs, so the two are combined. Both columns are cast to str first
# so that "+" concatenates the text instead of adding the ids; the earlier
# pre-cast "+" was overwritten immediately and has been removed.
route_df['TRIP_ID'] = route_df['TRIP_ID'].astype(str)
route_df['CALENDAR_ID'] = route_df['CALENDAR_ID'].astype(str)
route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']

In [65]:
# Number the stops of each trip 1, 2, 3, ... in scheduled order.
# cumcount() replaces the scratch `row_num` column + cumsum(), and the stable
# sort by SCHEDULED_TIME means the numbering no longer depends on the rows
# already being in stop order (ties keep their existing order). The result is
# aligned back on the index, so the frame's own row order is unchanged.
# NOTE: rows without ADHERENCE were dropped earlier, so stop 1 is the first
# *recorded* stop of a trip, which may not be its scheduled first stop.
route_df['stops'] = (
    route_df.sort_values('SCHEDULED_TIME', kind='stable')
    .groupby('TRIP_IDS')
    .cumcount()
    + 1
)

In [68]:
# One row per trip (TRIP_IDS), one column per stop number, values = ADHERENCE
# in minutes. Each trip/stop pair occurs once, so the default mean
# aggregation simply passes the value through.
route_df_pivot = route_df.pivot_table(
    index='TRIP_IDS',
    columns='stops',
    values='ADHERENCE',
)

In [69]:
# Keep the first three stop columns and only the trips with a value at all three.
route_df_new = route_df_pivot.iloc[:, :3].dropna(axis=0, how='any')

In [71]:
# Pearson correlation between adherence at stop 1 and at stop 3.
route_df_new[1].corr(route_df_new[3], method='pearson')

0.6279235315819696

In [72]:
# Integer column labels cannot be used as names in a formula, so give the
# stop columns word names.
route_df_new = route_df_new.rename(
    columns=dict(zip([1, 2, 3], ['one', 'two', 'three']))
)

In [73]:
# Simple linear regression of stop-3 adherence on stop-1 adherence;
# the trailing expression renders the fit summary.
lm = smf.ols(formula="three ~ one", data=route_df_new).fit()
lm.summary()

0,1,2,3
Dep. Variable:,three,R-squared:,0.394
Model:,OLS,Adj. R-squared:,0.394
Method:,Least Squares,F-statistic:,1560.0
Date:,"Wed, 08 Nov 2023",Prob (F-statistic):,2.8799999999999998e-263
Time:,23:28:53,Log-Likelihood:,-7635.7
No. Observations:,2399,AIC:,15280.0
Df Residuals:,2397,BIC:,15290.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.3846,0.129,-2.980,0.003,-0.638,-0.132
one,0.8931,0.023,39.501,0.000,0.849,0.937

0,1,2,3
Omnibus:,1418.509,Durbin-Watson:,1.178
Prob(Omnibus):,0.0,Jarque-Bera (JB):,40534.66
Skew:,-2.274,Prob(JB):,0.0
Kurtosis:,22.617,Cond. No.,6.21


**On route 7 to downtown, a one-minute difference in adherence at the first stop is associated with a difference of about 0.8931 minutes, in the same direction, at the third stop (R² = 0.394).**

**ROUTE 22 TO DOWNTOWN**

In [74]:
# Select every recorded stop for one route/direction; .copy() detaches the
# subset from wego so new columns can be added to it.
route_number, route_direction = 22, 'TO DOWNTOWN'
route_df = wego[
    wego['ROUTE_ABBR'].eq(route_number)
    & wego['ROUTE_DIRECTION_NAME'].eq(route_direction)
].copy()

In [75]:
# Build a per-day trip key. The same TRIP_ID appears under several
# CALENDAR_IDs, so the two are combined. Both columns are cast to str first
# so that "+" concatenates the text instead of adding the ids; the earlier
# pre-cast "+" was overwritten immediately and has been removed.
route_df['TRIP_ID'] = route_df['TRIP_ID'].astype(str)
route_df['CALENDAR_ID'] = route_df['CALENDAR_ID'].astype(str)
route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']

In [76]:
# Number the stops of each trip 1, 2, 3, ... in scheduled order.
# cumcount() replaces the scratch `row_num` column + cumsum(), and the stable
# sort by SCHEDULED_TIME means the numbering no longer depends on the rows
# already being in stop order (ties keep their existing order). The result is
# aligned back on the index, so the frame's own row order is unchanged.
# NOTE: rows without ADHERENCE were dropped earlier, so stop 1 is the first
# *recorded* stop of a trip, which may not be its scheduled first stop.
route_df['stops'] = (
    route_df.sort_values('SCHEDULED_TIME', kind='stable')
    .groupby('TRIP_IDS')
    .cumcount()
    + 1
)

In [78]:
# One row per trip (TRIP_IDS), one column per stop number, values = ADHERENCE
# in minutes. Each trip/stop pair occurs once, so the default mean
# aggregation simply passes the value through.
route_df_pivot = route_df.pivot_table(
    index='TRIP_IDS',
    columns='stops',
    values='ADHERENCE',
)

In [81]:
# Peek at the first 20 raw rows for route 22 to see how its trips are laid out.
wego.loc[wego['ROUTE_ABBR'].eq(22)].head(20)

Unnamed: 0,CALENDAR_ID,SERVICE_ABBR,ADHERENCE_ID,DATE,ROUTE_ABBR,BLOCK_ABBR,OPERATOR,TRIP_ID,OVERLOAD_ID,ROUTE_DIRECTION_NAME,TIME_POINT_ABBR,ROUTE_STOP_SEQUENCE,TRIP_EDGE,LATITUDE,LONGITUDE,SCHEDULED_TIME,ACTUAL_ARRIVAL_TIME,ACTUAL_DEPARTURE_TIME,ADHERENCE,SCHEDULED_HDWY,ACTUAL_HDWY,HDWY_DEV,ADJUSTED_EARLY_COUNT,ADJUSTED_LATE_COUNT,ADJUSTED_ONTIME_COUNT,STOP_CANCELLED,PREV_SCHED_STOP_CANCELLED,IS_RELIEF,BLOCK_STOP_ORDER,DWELL_IN_MINS,DAYS_OF_THE_WEEK
0,120230801,1,99457890,2023-08-01,22,2200,1040,345104,0,TO DOWNTOWN,MHSP,14.0,1,36.181248,-86.847705,2023-08-01 04:42:00,2023-08-01 04:37:38,2023-08-01 04:44:08,-2.133333,,,,0,0,1,0,0.0,0,2,6.5,1
1,120230801,1,99457891,2023-08-01,22,2200,1040,345104,0,TO DOWNTOWN,ELIZ,10.0,0,36.193454,-86.839981,2023-08-01 04:46:00,2023-08-01 04:48:27,2023-08-01 04:48:27,-2.45,,,,0,0,1,0,0.0,0,9,0.0,1
2,120230801,1,99457892,2023-08-01,22,2200,1040,345104,0,TO DOWNTOWN,CV23,5.0,0,36.182177,-86.814445,2023-08-01 04:54:00,2023-08-01 04:54:56,2023-08-01 04:54:56,-0.933333,,,,0,0,1,0,0.0,0,19,0.0,1
3,120230801,1,99457893,2023-08-01,22,2200,1040,345104,0,TO DOWNTOWN,MCC5_10,1.0,2,36.167091,-86.781923,2023-08-01 05:10:00,2023-08-01 05:03:43,2023-08-01 05:03:43,6.283333,,,,0,0,1,0,,0,35,0.0,1
4,120230801,1,99457894,2023-08-01,22,2200,1040,345105,0,FROM DOWNTOWN,MCC5_10,1.0,1,36.167091,-86.781923,2023-08-01 05:15:00,2023-08-01 05:03:43,2023-08-01 05:16:35,-1.583333,,,,0,0,1,0,0.0,0,36,12.866666,1
5,120230801,1,99457895,2023-08-01,22,2200,1040,345105,0,FROM DOWNTOWN,CV23,5.0,0,36.18348,-86.81422,2023-08-01 05:25:00,2023-08-01 05:24:03,2023-08-01 05:24:03,0.95,,,,0,0,1,0,0.0,0,51,0.0,1
6,120230801,1,99457896,2023-08-01,22,2200,1040,345105,0,FROM DOWNTOWN,MHSP,14.0,2,36.181248,-86.847705,2023-08-01 05:32:00,2023-08-01 05:30:30,2023-08-01 05:30:30,1.5,,,,0,0,1,0,,0,62,0.0,1
7,120230801,1,99457897,2023-08-01,22,2200,1040,345106,0,TO DOWNTOWN,MHSP,14.0,1,36.181248,-86.847705,2023-08-01 05:42:00,2023-08-01 05:30:30,2023-08-01 05:43:43,-1.716666,35.0,37.666666,2.666666,0,0,1,0,0.0,0,63,13.216666,1
8,120230801,1,99457898,2023-08-01,22,2200,1040,345106,0,TO DOWNTOWN,ELIZ,10.0,0,36.193454,-86.839981,2023-08-01 05:46:00,2023-08-01 05:47:55,2023-08-01 05:47:55,-1.916666,35.0,36.966666,1.966666,0,0,1,0,0.0,0,70,0.0,1
9,120230801,1,99457899,2023-08-01,22,2200,1040,345106,0,TO DOWNTOWN,CV23,5.0,0,36.182177,-86.814445,2023-08-01 05:54:00,2023-08-01 05:54:11,2023-08-01 05:54:11,-0.183333,15.0,14.516666,-0.483334,0,0,1,0,0.0,0,80,0.0,1


In [88]:
# Keep the first three stop columns and only the trips with a value at all three.
route_df_new = route_df_pivot.iloc[:, :3].dropna(axis=0, how='any')

In [89]:
# Second variant: first four stop columns, trips with a value at all four.
route_df_new_4 = route_df_pivot.iloc[:, :4].dropna(axis=0, how='any')

In [90]:
# Pearson correlation between adherence at stop 1 and at stop 3.
route_df_new[1].corr(route_df_new[3], method='pearson')

0.33794273617741843

In [91]:
 # Pearson correlation between adherence at stop 1 and at stop 4.
 route_df_new_4[1].corr(route_df_new_4[4], method='pearson')

0.09990982527956851

In [92]:
# Integer column labels cannot be used as names in a formula, so give the
# stop columns word names.
route_df_new = route_df_new.rename(
    columns=dict(zip([1, 2, 3], ['one', 'two', 'three']))
)

In [93]:
# Same renaming for the four-stop table.
route_df_new_4 = route_df_new_4.rename(
    columns=dict(zip([1, 2, 3, 4], ['one', 'two', 'three', 'four']))
)

In [94]:
# Simple linear regression of stop-3 adherence on stop-1 adherence;
# the trailing expression renders the fit summary.
lm = smf.ols(formula="three ~ one", data=route_df_new).fit()
lm.summary()

0,1,2,3
Dep. Variable:,three,R-squared:,0.114
Model:,OLS,Adj. R-squared:,0.114
Method:,Least Squares,F-statistic:,475.5
Date:,"Wed, 08 Nov 2023",Prob (F-statistic):,2.96e-99
Time:,23:38:38,Log-Likelihood:,-10366.0
No. Observations:,3690,AIC:,20740.0
Df Residuals:,3688,BIC:,20750.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.8366,0.071,39.923,0.000,2.697,2.976
one,0.5276,0.024,21.806,0.000,0.480,0.575

0,1,2,3
Omnibus:,5502.476,Durbin-Watson:,1.432
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7857204.019
Skew:,-8.499,Prob(JB):,0.0
Kurtosis:,228.421,Cond. No.,3.21


**On route 22 to downtown, a one-minute difference in adherence at the first stop is associated with a difference of about 0.5276 minutes, in the same direction, at the third stop (R² = 0.114). This model uses the trips that have at least three recorded stops.**

In [96]:
# Simple linear regression of stop-4 adherence on stop-1 adherence, fitted on
# the four-stop table; the trailing expression renders the fit summary.
lm = smf.ols(formula="four ~ one", data=route_df_new_4).fit()
lm.summary()

0,1,2,3
Dep. Variable:,four,R-squared:,0.01
Model:,OLS,Adj. R-squared:,0.008
Method:,Least Squares,F-statistic:,6.342
Date:,"Wed, 08 Nov 2023",Prob (F-statistic):,0.012
Time:,23:39:01,Log-Likelihood:,-2486.1
No. Observations:,631,AIC:,4976.0
Df Residuals:,629,BIC:,4985.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.1214,0.528,2.123,0.034,0.084,2.159
one,0.4951,0.197,2.518,0.012,0.109,0.881

0,1,2,3
Omnibus:,1506.474,Durbin-Watson:,1.84
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6809779.127
Skew:,-21.398,Prob(JB):,0.0
Kurtosis:,510.127,Cond. No.,2.92


**On route 22 to downtown, a one-minute difference in adherence at the first stop is associated with a difference of about 0.4951 minutes, in the same direction, at the fourth stop (R² = 0.01). This model uses only the 631 trips that have at least four recorded stops.**

**ROUTE 22 FROM DOWNTOWN**

In [97]:
# Select every recorded stop for one route/direction; .copy() detaches the
# subset from wego so new columns can be added to it.
route_number, route_direction = 22, 'FROM DOWNTOWN'
route_df = wego[
    wego['ROUTE_ABBR'].eq(route_number)
    & wego['ROUTE_DIRECTION_NAME'].eq(route_direction)
].copy()

In [98]:
# Build a per-day trip key. The same TRIP_ID appears under several
# CALENDAR_IDs, so the two are combined. Both columns are cast to str first
# so that "+" concatenates the text instead of adding the ids; the earlier
# pre-cast "+" was overwritten immediately and has been removed.
route_df['TRIP_ID'] = route_df['TRIP_ID'].astype(str)
route_df['CALENDAR_ID'] = route_df['CALENDAR_ID'].astype(str)
route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']

In [99]:
# Number the stops of each trip 1, 2, 3, ... in scheduled order.
# cumcount() replaces the scratch `row_num` column + cumsum(), and the stable
# sort by SCHEDULED_TIME means the numbering no longer depends on the rows
# already being in stop order (ties keep their existing order). The result is
# aligned back on the index, so the frame's own row order is unchanged.
# NOTE: rows without ADHERENCE were dropped earlier, so stop 1 is the first
# *recorded* stop of a trip, which may not be its scheduled first stop.
route_df['stops'] = (
    route_df.sort_values('SCHEDULED_TIME', kind='stable')
    .groupby('TRIP_IDS')
    .cumcount()
    + 1
)

In [101]:
# One row per trip (TRIP_IDS), one column per stop number, values = ADHERENCE
# in minutes. Each trip/stop pair occurs once, so the default mean
# aggregation simply passes the value through.
route_df_pivot = route_df.pivot_table(
    index='TRIP_IDS',
    columns='stops',
    values='ADHERENCE',
)

In [103]:
# Keep the first three stop columns and only the trips with a value at all three.
route_df_new = route_df_pivot.iloc[:, :3].dropna(axis=0, how='any')

In [104]:
# Pearson correlation between adherence at stop 1 and at stop 3.
route_df_new[1].corr(route_df_new[3], method='pearson')

0.5094255159904719

In [105]:
# Integer column labels cannot be used as names in a formula, so give the
# stop columns word names.
route_df_new = route_df_new.rename(
    columns=dict(zip([1, 2, 3], ['one', 'two', 'three']))
)

In [106]:
# Simple linear regression of stop-3 adherence on stop-1 adherence;
# the trailing expression renders the fit summary.
lm = smf.ols(formula="three ~ one", data=route_df_new).fit()
lm.summary()

0,1,2,3
Dep. Variable:,three,R-squared:,0.26
Model:,OLS,Adj. R-squared:,0.259
Method:,Least Squares,F-statistic:,1277.0
Date:,"Wed, 08 Nov 2023",Prob (F-statistic):,4.05e-240
Time:,23:45:30,Log-Likelihood:,-10453.0
No. Observations:,3647,AIC:,20910.0
Df Residuals:,3645,BIC:,20920.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.4205,0.087,16.279,0.000,1.249,1.592
one,0.8412,0.024,35.741,0.000,0.795,0.887

0,1,2,3
Omnibus:,4487.716,Durbin-Watson:,1.184
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3832327.891
Skew:,-5.861,Prob(JB):,0.0
Kurtosis:,161.374,Cond. No.,4.72


**On route 22 from downtown, a one-minute difference in adherence at the first stop is associated with a difference of about 0.8412 minutes, in the same direction, at the third stop (R² = 0.26).**

**ROUTE 23 FROM DOWNTOWN**

In [107]:
# Select every recorded stop for one route/direction; .copy() detaches the
# subset from wego so new columns can be added to it.
route_number, route_direction = 23, 'FROM DOWNTOWN'
route_df = wego[
    wego['ROUTE_ABBR'].eq(route_number)
    & wego['ROUTE_DIRECTION_NAME'].eq(route_direction)
].copy()

In [108]:
# Build a per-day trip key. The same TRIP_ID appears under several
# CALENDAR_IDs, so the two are combined. Both columns are cast to str first
# so that "+" concatenates the text instead of adding the ids; the earlier
# pre-cast "+" was overwritten immediately and has been removed.
route_df['TRIP_ID'] = route_df['TRIP_ID'].astype(str)
route_df['CALENDAR_ID'] = route_df['CALENDAR_ID'].astype(str)
route_df['TRIP_IDS'] = route_df['TRIP_ID'] + route_df['CALENDAR_ID']

In [109]:
# Number the stops of each trip 1, 2, 3, ... in scheduled order.
# cumcount() replaces the scratch `row_num` column + cumsum(), and the stable
# sort by SCHEDULED_TIME means the numbering no longer depends on the rows
# already being in stop order (ties keep their existing order). The result is
# aligned back on the index, so the frame's own row order is unchanged.
# NOTE: rows without ADHERENCE were dropped earlier, so stop 1 is the first
# *recorded* stop of a trip, which may not be its scheduled first stop.
route_df['stops'] = (
    route_df.sort_values('SCHEDULED_TIME', kind='stable')
    .groupby('TRIP_IDS')
    .cumcount()
    + 1
)

In [111]:
# One row per trip (TRIP_IDS), one column per stop number, values = ADHERENCE
# in minutes. Each trip/stop pair occurs once, so the default mean
# aggregation simply passes the value through.
route_df_pivot = route_df.pivot_table(
    index='TRIP_IDS',
    columns='stops',
    values='ADHERENCE',
)

In [112]:
# Preview the trip-by-stop adherence table to see how many stops the trips have.
route_df_pivot

stops,1,2,3,4,5,6,7,8,9,10
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
345435120230801,-3.233333,-2.000000,1.333333,,,,,,,
345435120230802,-2.250000,-1.400000,1.150000,,,,,,,
345435120230803,-2.600000,-1.716666,0.933333,,,,,,,
345437120230801,-5.016666,-4.516666,-5.050000,-0.533333,,,,,,
345437120230802,-3.533333,-3.883333,-5.183333,1.383333,,,,,,
...,...,...,...,...,...,...,...,...,...,...
350603120230903,-1.316666,-0.216666,0.366666,-1.816666,-0.533333,-1.033333,0.400000,,,
350603120230904,-1.500000,-1.516666,-1.950000,-0.050000,-2.066666,-5.316666,-8.566666,,,
350603120230910,-0.666666,-0.983333,-1.383333,1.000000,-0.300000,-1.233333,-1.416666,,,
350603120230917,-1.200000,-1.300000,-1.283333,-0.316666,-0.716666,1.133333,,,,


In [113]:
# Peek at the first 20 raw rows for route 23 to see how its trips are laid out.
wego.loc[wego['ROUTE_ABBR'].eq(23)].head(20)

Unnamed: 0,CALENDAR_ID,SERVICE_ABBR,ADHERENCE_ID,DATE,ROUTE_ABBR,BLOCK_ABBR,OPERATOR,TRIP_ID,OVERLOAD_ID,ROUTE_DIRECTION_NAME,TIME_POINT_ABBR,ROUTE_STOP_SEQUENCE,TRIP_EDGE,LATITUDE,LONGITUDE,SCHEDULED_TIME,ACTUAL_ARRIVAL_TIME,ACTUAL_DEPARTURE_TIME,ADHERENCE,SCHEDULED_HDWY,ACTUAL_HDWY,HDWY_DEV,ADJUSTED_EARLY_COUNT,ADJUSTED_LATE_COUNT,ADJUSTED_ONTIME_COUNT,STOP_CANCELLED,PREV_SCHED_STOP_CANCELLED,IS_RELIEF,BLOCK_STOP_ORDER,DWELL_IN_MINS,DAYS_OF_THE_WEEK
491,120230801,1,99458391,2023-08-01,23,2300,1893,345435,0,FROM DOWNTOWN,DWMRT,5.0,1,36.24406,-86.757403,2023-08-01 05:00:00,2023-08-01 04:50:15,2023-08-01 05:03:14,-3.233333,15.0,17.95,2.95,0,0,1,0,0.0,0,2,12.983333,1
492,120230801,1,99458392,2023-08-01,23,2300,1893,345435,0,FROM DOWNTOWN,EDBC,8.0,0,36.231512,-86.78049,2023-08-01 05:09:00,2023-08-01 05:11:00,2023-08-01 05:11:00,-2.0,15.0,15.45,0.45,0,0,1,0,0.0,0,21,0.0,1
493,120230801,1,99458393,2023-08-01,23,2300,1893,345435,0,FROM DOWNTOWN,DWMRT,6.0,2,36.244207,-86.760509,2023-08-01 05:18:00,2023-08-01 05:16:40,2023-08-01 05:16:40,1.333333,,,,0,0,1,0,,0,35,0.0,1
494,120230801,1,99458394,2023-08-01,23,2300,1893,345436,0,TO DOWNTOWN,DWMRT,5.0,1,36.244207,-86.760509,2023-08-01 05:25:00,2023-08-01 05:16:40,2023-08-01 05:31:06,-6.1,22.0,25.833333,3.833333,0,1,0,0,0.0,0,36,14.433333,1
495,120230801,1,99458395,2023-08-01,23,2300,1893,345436,0,TO DOWNTOWN,DKTL,4.0,0,36.204256,-86.769112,2023-08-01 05:39:00,2023-08-01 05:41:16,2023-08-01 05:41:16,-2.266666,22.0,22.966666,0.966666,0,0,1,0,0.0,0,51,0.0,1
496,120230801,1,99458396,2023-08-01,23,2300,1893,345436,0,TO DOWNTOWN,N1SP,3.0,0,36.175714,-86.774535,2023-08-01 05:45:00,2023-08-01 05:48:26,2023-08-01 05:48:26,-3.433333,22.0,22.933333,0.933333,0,0,1,0,0.0,0,62,0.0,1
497,120230801,1,99458397,2023-08-01,23,2300,1893,345436,0,TO DOWNTOWN,MCC4_24,2.0,2,36.167091,-86.781923,2023-08-01 05:54:00,2023-08-01 05:51:57,2023-08-01 05:51:57,2.05,,,,0,0,1,0,,0,66,0.0,1
498,120230801,1,99458398,2023-08-01,23,2300,1893,345437,0,FROM DOWNTOWN,MCC4_24,2.0,1,36.167091,-86.781923,2023-08-01 06:00:00,2023-08-01 05:51:57,2023-08-01 06:05:01,-5.016666,20.0,23.016666,3.016666,0,0,1,0,0.0,0,67,13.066666,1
499,120230801,1,99458399,2023-08-01,23,2300,1893,345437,0,FROM DOWNTOWN,N1SP,3.0,0,36.175433,-86.77432,2023-08-01 06:06:00,2023-08-01 06:10:31,2023-08-01 06:10:31,-4.516666,21.0,25.45,4.45,0,0,1,0,0.0,0,71,0.0,1
500,120230801,1,99458400,2023-08-01,23,2300,1893,345437,0,FROM DOWNTOWN,DKTL,4.0,0,36.205694,-86.768557,2023-08-01 06:13:00,2023-08-01 06:18:03,2023-08-01 06:18:03,-5.05,21.0,26.066666,5.066666,0,0,1,0,0.0,0,82,0.0,1


In [122]:
# Keep the first four stop columns and only the trips with a value at all
# four, then display the result.
route_df_new = route_df_pivot.iloc[:, :4].dropna(axis=0, how='any')
route_df_new

stops,1,2,3,4
TRIP_IDS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
345437120230801,-5.016666,-4.516666,-5.050000,-0.533333
345437120230802,-3.533333,-3.883333,-5.183333,1.383333
345437120230803,-3.833333,-3.016666,-1.466666,2.416666
345439120230801,-5.550000,-4.900000,-4.616666,-0.033333
345439120230802,-3.700000,-2.283333,-2.050000,2.650000
...,...,...,...,...
350603120230903,-1.316666,-0.216666,0.366666,-1.816666
350603120230904,-1.500000,-1.516666,-1.950000,-0.050000
350603120230910,-0.666666,-0.983333,-1.383333,1.000000
350603120230917,-1.200000,-1.300000,-1.283333,-0.316666


In [123]:
# Pearson correlation between adherence at stop 1 and at stop 4.
route_df_new[1].corr(route_df_new[4], method='pearson')

0.4210687531874183

In [124]:
# Integer column labels cannot be used as names in a formula, so give the
# stop columns word names.
route_df_new = route_df_new.rename(
    columns=dict(zip([1, 2, 3, 4], ['one', 'two', 'three', 'four']))
)

In [125]:
# Simple linear regression of stop-4 adherence on stop-1 adherence;
# the trailing expression renders the fit summary.
lm = smf.ols(formula="four ~ one", data=route_df_new).fit()
lm.summary()

0,1,2,3
Dep. Variable:,four,R-squared:,0.177
Model:,OLS,Adj. R-squared:,0.177
Method:,Least Squares,F-statistic:,854.5
Date:,"Wed, 08 Nov 2023",Prob (F-statistic):,2.7899999999999998e-170
Time:,23:56:27,Log-Likelihood:,-11640.0
No. Observations:,3967,AIC:,23280.0
Df Residuals:,3965,BIC:,23300.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.8166,0.095,8.591,0.000,0.630,1.003
one,0.7272,0.025,29.232,0.000,0.678,0.776

0,1,2,3
Omnibus:,2834.794,Durbin-Watson:,1.268
Prob(Omnibus):,0.0,Jarque-Bera (JB):,478289.542
Skew:,-2.461,Prob(JB):,0.0
Kurtosis:,56.567,Cond. No.,5.18


**On route 23 from downtown, a one-minute difference in adherence at the first stop is associated with a difference of about 0.7272 minutes, in the same direction, at the fourth stop (R² = 0.177).**

**ROUTE 23 TO DOWNTOWN**

In [None]:
#trips = trip_adherence.pivot_table(values = 'ADHERENCE', index = 'TRIP_IDS', columns = 'TRIP_EDGE').reset_index(drop = True)

In [None]:
#trip_adherence[(trip_adherence['TRIP_EDGE'] == 1) & (trip_adherence['ADHERENCE'] >= 0)]

In [None]:
#trip_adherence.info()

In [None]:
#trip_adherence = trip_adherence.copy()

In [None]:
#changing the dtype to string so that i can concat with the "CALENDAR ID" column
#trip_adherence['TRIP_ID'] = trip_adherence['TRIP_ID'].astype(str)


In [None]:
#changing the dtype to string so that i can concat with the "ROUTE ID" column
#trip_adherence['CALENDAR_ID'] = trip_adherence['CALENDAR_ID'].astype(str)


In [None]:
#creating a column that concats the two columns
#trip_adherence['TRIP_IDS'] = trip_adherence['TRIP_ID'] + trip_adherence['CALENDAR_ID']
#trip_adherence

In [None]:
#allows for each stop to be numbered in their respective route IDs
#trip_adherence['row_num'] = 1
#trip_adherence['stops'] = trip_adherence.groupby('TRIP_IDS')['row_num'].cumsum()

In [None]:
#trip_adherence


In [None]:
#trip_adherence['stops'] = trip_adherence['stops'].astype(str)

In [None]:
#creating a pivot table that will make the columns the stop #, the rows is each trip, and the values the adherence(amount of minutes late)
#trip_adherence_pivot = trip_adherence.pivot_table(values = 'ADHERENCE', index = 'TRIP_IDS', columns = 'stops').reset_index(drop = True)

In [None]:
#trip_adherence_pivot

In [None]:
#trip_adherence_pivot[trip_adherence_pivot[1] < 0]

In [None]:
#trip_adherence_pivot[trip_adherence_pivot[1] >= 0]

In [None]:
#the min amount of stops
#trip_adherence['stops'].min()

In [None]:
#the max amount of stops
#trip_adherence['stops'].max()

In [None]:
#Checking the total amount of stops
#trip_adherence['stops'].info()

In [None]:
#trip_adherence_pivot

In [None]:
#trip_adherence_pivot['Stops']

In [None]:
#start_on_time = trip_adherence_pivot[trip_adherence_pivot[1] == 0].reset_index()

In [None]:
#start_late = trip_adherence_pivot[trip_adherence_pivot[1] != 0].reset_index()

In [None]:
#start_late.iloc[0]

In [None]:
#wego.loc[
    #(wego['ROUTE_ABBR'] == 56) & (wego['STOPS']== 20)
    
    
#3]