In [1]:
import geopandas as gpd
import geopandas as gpd
import pandas as pd
import numpy as np
from pathlib import Path
from shapely import *

# Importing all the GTFS files

To save on the memory used, we don't import empty columns and use more efficient data types when possible


In [2]:
dir_GTFS= "/home/lubuntu/GSDMA_2024/Tec GTFS"
print("Loading stops")
stops = pd.read_csv(dir_GTFS+"/stops.txt", usecols=['stop_id','stop_name','stop_lat','stop_lon','zone_id','location_type'])
print("Loading stop_times")
stop_times = pd.read_csv(dir_GTFS+"/stop_times.txt", dtype={'service_id':'category',
                                                            'pickup_type':'category',
                                                            'drop_off_type':'int8',
                                                            'stop_sequence':'uint32'},
                        parse_dates=["arrival_time", "departure_time"], date_format="%H:%M")
print("Loading trips")
trips = pd.read_csv(dir_GTFS+"/trips.txt", dtype={'service_id':'category',
                                                  'trip_short_name':'category',
                                                  'direction_id':'int8'})
print("Loading shapes")
shapes = pd.read_csv(dir_GTFS+"/shapes.txt", dtype={'shape_id':'category',
                                                    'shape_pt_sequence':'uint32'})
print("Loading routes")
routes = pd.read_csv(dir_GTFS+"/routes.txt", usecols=['route_id','agency_id','route_short_name','route_long_name','route_type'],
                    dtype={'route_type':'uint8',
                          'agency_id':'category'})
#print("Loading agency")
#agency = pd.read_csv(dir_GTFS+"/agency.txt")
print("Loading calendar")
calendar = pd.read_csv(dir_GTFS+"/calendar.txt",dtype={'monday':'boolean',
                                                      'tuesday':'boolean',
                                                      'wednesday':'boolean',
                                                      'thursday':'boolean',
                                                      'friday':'boolean',
                                                      'saturday':'boolean',
                                                      'sunday':'boolean'},
                      parse_dates=["start_date", "end_date"], date_format="%Y%m%d")
#print("Loading calendar_dates")
#calendar_dates = pd.read_csv(dir_GTFS+"/calendar_dates.txt", dtype={'exception_type':'uint8'}, parse_dates=["date"], date_format="%Y%m%d")
print("Done !")

Loading stops
Loading stop_times
Loading trips
Loading shapes
Loading routes
Loading calendar
Done !


# Converting the dataframes to geodataframes
## Stops

In [3]:
geometry=gpd.points_from_xy(stops['stop_lon'], stops['stop_lat'], z=None, crs='epsg:4326')
geo_stops=gpd.GeoDataFrame(data=stops, geometry=geometry)
#geo_stops.sample(n=30).plot()
del stops

## Shapes

In [4]:
geometry=gpd.points_from_xy(shapes['shape_pt_lon'], shapes['shape_pt_lat'], z=None, crs='epsg:4326')
geo_shapes=gpd.GeoDataFrame(data=shapes, geometry=geometry)
#geo_shapes.sample(50).plot()
del shapes

In [5]:
#Sort the lines by id and Sequence so they are ordered
geo_shapes_sorted = geo_shapes.sort_values(by=['shape_id', 'shape_pt_sequence'])

#Group the points by 'route'
lines = geo_shapes_sorted.groupby('shape_id', observed=True).apply(lambda x: LineString(x.geometry.tolist()))
lines=lines.reset_index()
lines = gpd.GeoDataFrame(data=lines['shape_id'], geometry=lines[0], crs=geo_shapes.crs)
#lines.sample(20).explore()
del geo_shapes
del geo_shapes_sorted

  lines = geo_shapes_sorted.groupby('shape_id', observed=True).apply(lambda x: LineString(x.geometry.tolist()))


In [6]:
lines

Unnamed: 0,shape_id,geometry
0,B00050012,"LINESTRING (4.87717 50.72296, 4.87730 50.72357..."
1,B00050013,"LINESTRING (4.92584 50.80830, 4.92601 50.80819..."
2,B00060086,"LINESTRING (4.61307 50.66783, 4.61302 50.66768..."
3,B00060087,"LINESTRING (4.61307 50.66783, 4.61302 50.66768..."
4,B00060088,"LINESTRING (4.74619 50.79346, 4.74582 50.79333..."
...,...,...
6523,X99800096,"LINESTRING (5.34656 50.23010, 5.34674 50.23013..."
6524,X99800098,"LINESTRING (5.30987 50.18658, 5.30988 50.18658..."
6525,X99800101,"LINESTRING (5.26088 50.32912, 5.26086 50.32919..."
6526,X99800102,"LINESTRING (5.33441 50.36218, 5.33446 50.36225..."


# Network
Time when we can arrive to the bus_stops starting from a given bus_stop at a given time

Trips active on the given day

In [7]:
stop_times[(stop_times['departure_time']<'08:00:00')]#(stop_times['stop_id']=="H1ms290a")&

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type
334,40915329-C2024-choi-Samedi-09,07:08:00,07:08:00,Cbwdoua1,1,0,0
335,40915329-C2024-choi-Samedi-09,07:09:00,07:09:00,Cbweco1,2,0,0
336,40915329-C2024-choi-Samedi-09,07:10:00,07:10:00,Cbwegl1,3,0,0
337,40915329-C2024-choi-Samedi-09,07:11:00,07:11:00,Cmgbras1,4,0,0
338,40915329-C2024-choi-Samedi-09,07:12:00,07:12:00,Cmgpn1,5,0,0
...,...,...,...,...,...,...,...
5790232,45599644-X-2024-VA-Sem-Vac-11,07:41:00,07:41:00,X688aca,16,0,0
5790233,45599644-X-2024-VA-Sem-Vac-11,07:44:00,07:44:00,X684aaa,17,0,0
5790234,45599644-X-2024-VA-Sem-Vac-11,07:49:00,07:49:00,X684aba,18,0,0
5790235,45599644-X-2024-VA-Sem-Vac-11,07:52:00,07:52:00,X685afa,19,0,0


In [8]:
def find_next_stops(active_trips, active_stop_times, cur_stop_id, cur_time):
    #Extract trips stopping by the current bus stop that are active
    cur_stop_times=active_stop_times.query("(stop_id.isin(@cur_stop_id))"
                                    +"&(departure_time>@cur_time)"
                                    #+"&(trip_id.isin(@active_trips_id))"
                                     ).sort_values('departure_time')
    #Extract the other stops that can be reached with the trips
    other_stops_times=[]
    for row in cur_stop_times.itertuples(index=False):
        departure_seq=int(row.stop_sequence)
        other_stops_time=active_stop_times[((active_stop_times['trip_id']==row.trip_id)
                                      #&(stop_times['departure_time']>departure_time)
                                      &(active_stop_times['stop_sequence']>departure_seq)
                                     )]
        other_stops_times.append(other_stops_time)
        #active_stop_times=active_stop_times[~active_stop_times.eq(other_stops_time,axis=0).all(axis=1)]
        print(row.departure_time, end='\r')
    if(len(other_stops_times)>0):
        other_stops_times=pd.concat(other_stops_times)
    return other_stops_times

In [9]:
departure_date="2024-10-30"
departure_time="18:00:00"
departure_stop_name="MONS Lycée"
walk_time=pd.Timedelta(hours=2)

#Find the schedule of the day
week_day=pd.Timestamp(departure_date).day_name().lower()
active_services=calendar.query("(start_date<=@departure_date)&(@departure_date<=end_date)")
active_services=active_services[active_services[week_day]]
active_trips=trips[trips['service_id'].isin(active_services['service_id'])]
active_trips_id=active_trips['trip_id']
active_stop_times=stop_times[stop_times['trip_id'].isin(active_trips_id)]
#Extract the stops and the stop_id's
departure_stop=geo_stops.query("stop_name==@departure_stop_name")


departure_stop_id=departure_stop['stop_id']
found_stop_times=active_stop_times.query("(stop_id.isin(@departure_stop_id))"
                                     +"&(departure_time>@departure_time)"
                                     #+"&(trip_id.isin(@active_trips_id))"
                                      ).sort_values('departure_time').head(1)
found_stop_times['number_trips']=0
found_stop_times.loc[:, "intermediary_stops"] = np.array([['']] * len(found_stop_times))


other_stops_times=find_next_stops(active_trips, active_stop_times, departure_stop_id, departure_time)

# #Extract trips stopping by the departure bus stop that are active
# departure_stop_times=active_stop_times.query("(stop_id.isin(@departure_stop_id))"
#                                     +"&(departure_time>@departure_time)"
#                                     #+"&(trip_id.isin(@active_trips_id))"
#                                      ).sort_values('departure_time')
# #Extract the other stops that can be reached with the trips
# other_stops_times=[]
# for row in departure_stop_times.itertuples(index=False):
#     departure_seq=int(row.stop_sequence)
#     other_stops_time=active_stop_times[((active_stop_times['trip_id']==row.trip_id)
#                                   #&(stop_times['departure_time']>departure_time)
#                                   &(active_stop_times['stop_sequence']>departure_seq)
#                                  )]
#     other_stops_times.append(other_stops_time)
#     active_stop_times=active_stop_times[~active_stop_times.eq(other_stops_time,axis=0).all(axis=1)]
#     print(row.departure_time, end='\r')
other_stops_times['number_trips']=1
other_stops_times.loc[:, "intermediary_stops"] = [[departure_stop_id.iloc[0]]] * len(other_stops_times) #other_stops_times['intermediary_stops']=''
found_stop_times=pd.concat([found_stop_times,other_stops_times])
found_stop_times=found_stop_times.sort_values('arrival_time').drop_duplicates('stop_id')#We keep only the earliest arrival time to a stop.  
new_stop_times=found_stop_times[found_stop_times['number_trips']==1]
#Apply the same alghorithm to all the bus stops found 
for row in new_stop_times.itertuples(index=False):
    inter_stop_id=pd.Series(row.stop_id)
    print(inter_stop_id)
    inter_time=row.arrival_time
    previous_stops=row.intermediary_stops[:]
    other_stops_times=find_next_stops(active_trips, active_stop_times, inter_stop_id, inter_time)
    # inter_stop_times=active_stop_times.query("(stop_id==@inter_stop_id)"
    #                                 +"&(departure_time>@inter_time)"
    #                                  ).sort_values('departure_time')
    # other_stops_times2=[]
    # for row2 in inter_stop_times.itertuples(index=False):
    #     inter_seq=int(row2.stop_sequence)
    #     other_stops_time2=active_stop_times[((active_stop_times['trip_id']==row2.trip_id)
    #                                   #&(stop_times['departure_time']>departure_time)
    #                                   &(active_stop_times['stop_sequence']>inter_seq)
    #                                  )]
    #     other_stops_times2.append(other_stops_time2)
    #     active_stop_times=active_stop_times[~active_stop_times.eq(other_stops_time2,axis=0).all(axis=1)]
    #     print(row2.departure_time,end='\r')
    if(len(other_stops_times)>0):
        previous_stops.append(row.stop_id)
        other_stops_times['number_trips']=2
        other_stops_times['intermediary_stops']=[previous_stops] * len(other_stops_times)
        found_stop_times=pd.concat([found_stop_times,other_stops_times])
        found_stop_times=found_stop_times.sort_values('arrival_time').drop_duplicates('stop_id')
new_stop_times=found_stop_times[found_stop_times['number_trips']==2]
found_stop_times

0    H1ms299a
dtype: object
0    H1ms401a
dtype: object
0    H1ms401e
dtype: object
0    H1ms294a
dtype: object
0    H1ms254a
dtype: object
0    H1ms401c
dtype: object
0    H1ms296d
dtype: object
0    H1ms293a
dtype: object
0    H1ms312a
dtype: object
0    H1ms276a
dtype: object
0    H1hn202a
dtype: object
0    H1ci102a
dtype: object
0    H1mv243a
dtype: object
0    H1mv240a
dtype: object
0    H1ms254d
dtype: object
0    H1br135b
dtype: object
0    H1as154b
dtype: object
0    H1as103d
dtype: object
0    H1hg178b
dtype: object
0    H2pe163b
dtype: object
0    H1qy131b
dtype: object
0    H1qy132b
dtype: object
0    H1qy137b
dtype: object
0    H1hm174a
dtype: object
0    H1qy133b
dtype: object
0    H1qg138b
dtype: object
0    H1qy134b
dtype: object
0    H1gi118b
dtype: object
0    H1qy136b
dtype: object
0    H3bi110b
dtype: object
0    H1gc122b
dtype: object
0    H1gc124b
dtype: object
0    H1ry135a
dtype: object
0    H1go118c
dtype: object
0    H1bg110b
dtype: object
0    H2ep145b
dtype:

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,number_trips,intermediary_stops
4032825,44639737-H_2024-H24_P3-Sem-Vac-10,18:04:00,18:04:00,H1ms290a,40,0,0,0,
4032826,44639737-H_2024-H24_P3-Sem-Vac-10,18:05:00,18:05:00,H1ms299a,41,0,0,1,[H1ms290a]
4032827,44639737-H_2024-H24_P3-Sem-Vac-10,18:07:00,18:07:00,H1ms401a,42,1,0,1,[H1ms290a]
3976122,44638094-H_2024-H24_P3-Sem-Vac-10,18:17:00,18:17:00,H1ms303a,2,0,0,2,"[H1ms290a, H1ms401a]"
3974809,44638039-H_2024-H24_P3-Sem-Vac-10,18:17:00,18:17:00,H1ms360a,2,0,0,2,"[H1ms290a, H1ms401a]"
...,...,...,...,...,...,...,...,...,...
4016455,44639242-H_2024-H24_P3-Sem-Vac-10,22:03:00,22:03:00,H1ba102b,27,0,0,2,"[H1ms290a, H1ms401e]"
905311,42963216-C2024-choi-Sem-Cong-40,22:04:00,22:04:00,Csschat2,38,0,0,2,"[H1ms290a, Cbmind2]"
905312,42963216-C2024-choi-Sem-Cong-40,22:04:00,22:04:00,Cssgare2,39,0,0,2,"[H1ms290a, Cbmind2]"
905313,42963216-C2024-choi-Sem-Cong-40,22:04:00,22:04:00,Csspn2,40,0,0,2,"[H1ms290a, Cbmind2]"


In [None]:
departure_date="2024-11-29"
departure_time="13:00:00"
departure_stop_name="MONS Lycée"
walk_time=pd.Timedelta(hours=2)

#Find the schedule of the day
week_day=pd.Timestamp(departure_date).day_name().lower()
active_services=calendar.query("(start_date<=@departure_date)&(@departure_date<=end_date)")
active_services=active_services[active_services[week_day]]
active_trips=trips[trips['service_id'].isin(active_services['service_id'])]
active_trips_id=active_trips['trip_id']
active_stop_times=stop_times[stop_times['trip_id'].isin(active_trips_id)]
#Extract the stops and the stop_id's
departure_stop=geo_stops.query("stop_name==@departure_stop_name")


departure_stop_id=departure_stop['stop_id']
found_stop_times=active_stop_times.query("(stop_id.isin(@departure_stop_id))"
                                     +"&(departure_time>@departure_time)"
                                     #+"&(trip_id.isin(@active_trips_id))"
                                      ).sort_values('departure_time').head(1)
found_stop_times['number_trips']=0
found_stop_times.loc[:, "intermediary_stops"] = np.array([['']] * len(found_stop_times))


other_stops_times=find_next_stops(active_trips, active_stop_times, departure_stop_id, departure_time)

other_stops_times['number_trips']=1
other_stops_times.loc[:, "intermediary_stops"] = [[departure_stop_id.iloc[0]]] * len(other_stops_times) #other_stops_times['intermediary_stops']=''
found_stop_times=pd.concat([found_stop_times,other_stops_times])
found_stop_times=found_stop_times.sort_values('arrival_time').drop_duplicates('stop_id')#We keep only the earliest arrival time to a stop.  

#Apply the same alghorithm to all the bus stops found 
for i in range(2, 5):
    print(f"Trips with {i} transfers")
    new_stop_times=found_stop_times[found_stop_times['number_trips']==(i-1)]
    r=0
    for row in new_stop_times.itertuples(index=False):
        r=r+1
        inter_stop_id=pd.Series(row.stop_id)
        print(f"{r}/{new_stop_times.shape[0]}",end='\r')
        inter_time=row.arrival_time
        previous_stops=row.intermediary_stops[:]
        other_stops_times=find_next_stops(active_trips, active_stop_times, inter_stop_id, inter_time)
    
        if(len(other_stops_times)>0):
            previous_stops.append(row.stop_id)
            other_stops_times['number_trips']=i
            other_stops_times['intermediary_stops']=[previous_stops] * len(other_stops_times)
            found_stop_times=pd.concat([found_stop_times,other_stops_times])
            found_stop_times=found_stop_times.sort_values('arrival_time').drop_duplicates('stop_id')
found_stop_times

Trips with 2 transfers
Trips with 3 transfers
527/9180

In [None]:
[previous_stops]* len(other_stops_times)

In [None]:
previous_stops.append(row.stop_id)

In [None]:
geo_stops.sort_values('stop_name').iloc[18935:19085]

In [None]:
geo_stops.merge(found_stop_times, on="stop_id", how='right').explore(column="arrival_time", cmap='magma_r')