In [None]:
import geopandas as gpd
import geopandas as gpd
import pandas as pd
import numpy as np
from pathlib import Path
from shapely import *
import time

# Importing all the GTFS files

To save on the memory used, we don't import empty columns and use more efficient data types when possible


In [None]:
# Folder holding the unzipped GTFS feed.
# NOTE(review): absolute, machine-specific path; adjust it before running elsewhere.
dir_GTFS= "/home/lubuntu/GSDMA_2024/Tec GTFS"

# Only the listed columns of stops.txt are kept to save memory.
print("Loading stops")
stops = pd.read_csv(dir_GTFS+"/stops.txt", usecols=['stop_id','stop_name','stop_lat','stop_lon','zone_id','location_type'])

# Times are kept as strings here; a later cell converts them to timedelta.
# stop_sequence is read as uint16 rather than int8: GTFS only requires a
# non-negative integer, and int8 tops out at 127, which a long trip can exceed.
# Later cells compare on stop_sequence, so an overflowed value would corrupt them.
print("Loading stop_times")
stop_times = pd.read_csv(dir_GTFS+"/stop_times.txt", dtype={'service_id':'category',
                                                            'pickup_type':'category',
                                                            'drop_off_type':'category',
                                                            'stop_sequence':'uint16',
                                                            'departure_time':'string',
                                                            'arrival_time':'string'}
                        )#,parse_dates=["arrival_time", "departure_time"], date_format="%H:%M")

print("Loading trips")
trips = pd.read_csv(dir_GTFS+"/trips.txt", dtype={'service_id':'category',
                                                  'trip_short_name':'category',
                                                  'direction_id':'int8'})

print("Loading shapes")
shapes = pd.read_csv(dir_GTFS+"/shapes.txt", dtype={'shape_id':'category',
                                                    'shape_pt_sequence':'uint32'})

print("Loading routes")
routes = pd.read_csv(dir_GTFS+"/routes.txt", usecols=['route_id','agency_id','route_short_name','route_long_name','route_type'],
                    dtype={'route_type':'uint8',
                          'agency_id':'category'})

#print("Loading agency")
#agency = pd.read_csv(dir_GTFS+"/agency.txt")

# Weekday flags become nullable booleans so they can be used directly as masks;
# the validity dates are parsed from the YYYYMMDD format.
print("Loading calendar")
calendar = pd.read_csv(dir_GTFS+"/calendar.txt",dtype={'monday':'boolean',
                                                      'tuesday':'boolean',
                                                      'wednesday':'boolean',
                                                      'thursday':'boolean',
                                                      'friday':'boolean',
                                                      'saturday':'boolean',
                                                      'sunday':'boolean'},
                      parse_dates=["start_date", "end_date"], date_format="%Y%m%d")

# calendar_dates.txt (service exceptions) is not loaded.
#print("Loading calendar_dates")
#calendar_dates = pd.read_csv(dir_GTFS+"/calendar_dates.txt", dtype={'exception_type':'uint8'}, parse_dates=["date"], date_format="%Y%m%d")

# Converting the dataframes to geodataframes
## Stops

In [None]:
# Turn the stop coordinates into WGS84 (EPSG:4326) points and attach them
# to the stops table as the geometry column.
geometry = gpd.points_from_xy(x=stops['stop_lon'], y=stops['stop_lat'], crs='epsg:4326')
geo_stops = gpd.GeoDataFrame(stops, geometry=geometry)
# Quick visual check (disabled): geo_stops.sample(n=30).plot()

# Drop the reference to the plain table; only geo_stops is used from here on.
del stops

## Shapes

In [None]:
# Every row of shapes.txt is one vertex of a route shape: build one WGS84
# (EPSG:4326) point per row and attach it as the geometry column.
geometry = gpd.points_from_xy(x=shapes['shape_pt_lon'], y=shapes['shape_pt_lat'], crs='epsg:4326')
geo_shapes = gpd.GeoDataFrame(shapes, geometry=geometry)
# Quick visual check (disabled): geo_shapes.sample(50).plot()

# Drop the reference to the plain table; only geo_shapes is used from here on.
del shapes

In [None]:
# Order the vertices by shape and by position along the shape so that each
# group can be chained into a line in the right order.
geo_shapes_sorted = geo_shapes.sort_values(by=['shape_id', 'shape_pt_sequence'])


def _points_to_line(shape_points):
    """Chain the (already ordered) points of one shape into a LineString."""
    return LineString(shape_points.geometry.tolist())


# One LineString per shape_id; after reset_index the unnamed result column is 0.
lines = (
    geo_shapes_sorted
    .groupby('shape_id', observed=True)
    .apply(_points_to_line)
    .reset_index()
)
lines = gpd.GeoDataFrame(data=lines['shape_id'], geometry=lines[0], crs=geo_shapes.crs)
# Quick visual check (disabled): lines.sample(20).explore()

# The point tables are no longer needed once the lines are built.
del geo_shapes, geo_shapes_sorted

In [None]:
# Display the GeoDataFrame holding one LineString per shape_id.
lines

# Direct trips

Find the services corresponding to the given date

In [None]:
# Travel date (ISO format) for which the running services are looked up.
departure_date="2024-10-30"

# Lower-case English day name; it matches the weekday columns of calendar.txt.
week_day=pd.Timestamp(departure_date).day_name().lower()

# Services whose validity period contains the date...
active_services=calendar.query("(start_date<=@departure_date)&(@departure_date<=end_date)")
# ...and that run on that day of the week.
# NOTE(review): calendar_dates.txt exceptions are not loaded, so added or removed
# service days are not taken into account here.
active_services=active_services[active_services[week_day]]
active_services

## Find the direct trips from one station to the other
For that, find all the trips from the departure station and all the trips to the arrival station.
Then find the trips in common

In [None]:
# NOTE: this cell compares times as strings, so it must run while
# stop_times['departure_time'] / ['arrival_time'] are still strings, i.e.
# before the cell that converts them to timedelta.
departure_time="16:00"
departure_stop_name="MONS Place de Flandre"
arrival_stop_name="NIMY Limite"#"SOIGNIES SNCB"

#Find the active services
week_day=pd.Timestamp(departure_date).day_name().lower()
active_services=calendar.query("(start_date<=@departure_date)&(@departure_date<=end_date)")
active_services=active_services[active_services[week_day]]

#Extract the stops and the stop_id's (several stops may share the same name)
departure_stop=geo_stops.query("stop_name==@departure_stop_name")
arrival_stop=geo_stops.query("stop_name==@arrival_stop_name")

departure_stop_id=departure_stop['stop_id']
arrival_stop_id=arrival_stop['stop_id']

#Extract the stop times after the requested departure time
departure_stop_times=stop_times.query("(stop_id.isin(@departure_stop_id))&(departure_time>@departure_time)")
arrival_stop_times=stop_times.query("stop_id.isin(@arrival_stop_id)&arrival_time>@departure_time")

#Find the trips serving both stops
common_trips = pd.Series(np.intersect1d(departure_stop_times['trip_id'].values,arrival_stop_times['trip_id'].values))
#Keep only the trips whose service runs on the requested date
common_active_trips = trips[(trips['trip_id'].isin(common_trips))& (trips['service_id'].isin(active_services['service_id']))]

final_departure_stop_times=departure_stop_times[departure_stop_times['trip_id'].isin(common_active_trips["trip_id"])].sort_values('departure_time')
final_arrival_stop_times=arrival_stop_times[arrival_stop_times['trip_id'].isin(common_active_trips["trip_id"])].sort_values('arrival_time')

#Pair each boarding with the alightings of the same trip
#(columns suffixed _x come from the departure stop, _y from the arrival stop)
possible_times=pd.merge(final_departure_stop_times,final_arrival_stop_times, on='trip_id', how='inner')
#A trip serving both stops is only usable if it reaches the departure stop first:
#without this check, trips running in the opposite direction are kept as well
possible_times=possible_times[possible_times['stop_sequence_x']<possible_times['stop_sequence_y']]
possible_times

In [None]:
# The first row of possible_times is taken as the best option.
best_time = possible_times.iloc[0]

# Trip record of that option and the shape(s) it follows.
best_trip = trips.loc[trips['trip_id'] == best_time.trip_id]
best_lines = lines.loc[lines['shape_id'].isin(best_trip['shape_id'])]

# Draw the two stops and the route on the same interactive map.
ax = departure_stop.explore()
arrival_stop.explore(m=ax)
best_lines.explore(m=ax)

# Trips with transfers

Find the services corresponding to the given date

In [None]:
# Travel date (ISO format) for which the running services are looked up.
departure_date="2024-10-30"

# Lower-case English day name; it matches the weekday columns of calendar.txt.
week_day=pd.Timestamp(departure_date).day_name().lower()

# Services whose validity period contains the date. The day-of-week filter is
# left disabled here; the next code cell recomputes active_services with it.
active_services=calendar.query("(start_date<=@departure_date)&(@departure_date<=end_date)")
#active_services=active_services[active_services[week_day]]

## Find the trips with one transfer from one station to the other
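First naïve method:
1. Find all the trips passing through each of the two stops and check whether they have a trip in common.
2. If not, list all the stops reachable with these trips and check whether they have a stop in common.
3. If not, more than one transfer is needed; this case is not handled yet.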


In [None]:
# NOTE: this cell compares times as strings, so it must run while
# stop_times['departure_time'] / ['arrival_time'] are still strings, i.e.
# before the cell that converts them to timedelta.
departure_time="10:00"
departure_stop_name="BRAINE-LE-COMTE Gare - Quai 2"
arrival_stop_name="NIVELLES Gare - Quai 3"#"SOIGNIES SNCB"

#Find the active services
week_day=pd.Timestamp(departure_date).day_name().lower()
active_services=calendar.query("(start_date<=@departure_date)&(@departure_date<=end_date)")
active_services=active_services[active_services[week_day]]

#Extract the stops and the stop_id's (several stops may share the same name)
departure_stop=geo_stops.query("stop_name==@departure_stop_name")
arrival_stop=geo_stops.query("stop_name==@arrival_stop_name")

departure_stop_id=departure_stop['stop_id']
arrival_stop_id=arrival_stop['stop_id']

#Extract the stop times after the requested departure time
departure_stop_times=stop_times.query("(stop_id.isin(@departure_stop_id))&(departure_time>@departure_time)")
arrival_stop_times=stop_times.query("stop_id.isin(@arrival_stop_id)&arrival_time>@departure_time")

#Trips serving the departure stop / the arrival stop
departure_trip_ids=departure_stop_times['trip_id'].values
arrival_trip_ids=arrival_stop_times['trip_id'].values

#Find the trips serving both stops
common_trips = pd.Series(np.intersect1d(departure_trip_ids, arrival_trip_ids))
#Keep only the trips whose service runs on the requested date
common_active_trips = trips[(trips['trip_id'].isin(common_trips))& (trips['service_id'].isin(active_services['service_id']))]

if common_active_trips.empty:
    print("No direct trips")
    print("Searching trips with transfers...")
    #Every stop served by a trip through the departure stop / the arrival stop
    #NOTE(review): these candidates are not restricted to the active services,
    #and common_stops is only computed, not used further in this cell
    stop_times_linked_departure=stop_times.query("(trip_id.isin(@departure_trip_ids))&(departure_time>@departure_time)")
    stop_times_linked_arrival=stop_times.query("(trip_id.isin(@arrival_trip_ids))&(arrival_time>@departure_time)")
    stop_times_linked_departure_ids=stop_times_linked_departure['stop_id'].values
    stop_times_linked_arrival_ids=stop_times_linked_arrival['stop_id'].values
    #Stops where a transfer between the two groups of trips is possible
    common_stops = pd.Series(np.intersect1d(stop_times_linked_departure_ids, stop_times_linked_arrival_ids))

else:
    final_departure_stop_times=departure_stop_times[departure_stop_times['trip_id'].isin(common_active_trips["trip_id"])].sort_values('departure_time')
    final_arrival_stop_times=arrival_stop_times[arrival_stop_times['trip_id'].isin(common_active_trips["trip_id"])].sort_values('arrival_time')

    #Pair each boarding with the alightings of the same trip
    #(columns suffixed _x come from the departure stop, _y from the arrival stop)
    possible_times=pd.merge(final_departure_stop_times,final_arrival_stop_times, on='trip_id', how='inner')
    #A trip serving both stops is only usable if it reaches the departure stop first:
    #without this check, trips running in the opposite direction are kept as well
    possible_times=possible_times[possible_times['stop_sequence_x']<possible_times['stop_sequence_y']]

## Bruteforcesque method


Some arrival and departure times are above 24 hours (e.g. 24:03:00). They belong to bus trips that started on the previous day and are still running after midnight, so they are still considered active even though their service day is not today.
To handle those times, the columns 'departure_time' and 'arrival_time' are stored as timedelta rather than as datetime.

In [None]:
# Replace the textual HH:MM:SS times by timedeltas counted from the start of
# the service day, so that values such as 24:03:00 remain representable.
for time_column in ('arrival_time', 'departure_time'):
    stop_times[time_column] = pd.to_timedelta(stop_times[time_column])

In [None]:
# Stop times falling after midnight of the service day
# (the author counted 7645 such rows on this feed).
stop_times[stop_times['arrival_time'] > pd.Timedelta(hours=24)]

In [None]:
def compute_walk_time(A_stop, B_stop, walk_speed=4):
    """Estimate walking times from straight-line ("as the crow flies") distances.

    For the moment the distance is simply divided by the walking speed; the
    length of the shortest walkable path is meant to replace it later.

    Parameters
    ----------
    A_stop : geopandas object exposing ``.distance``
        Origin stop(s). Callers in this notebook project them to the Belgian
        Lambert 2008 system (EPSG:3812) first, so distances are in metres.
    B_stop : GeoDataFrame or single geometry
        Destination. When a GeoDataFrame is given, only the first row of both
        ``A_stop`` and ``B_stop`` is used.
    walk_speed : float, default 4
        Walking speed in km/h.

    Returns
    -------
    pandas.Series of Timedelta
        One walking time per row of ``A_stop`` (a single value when ``B_stop``
        is a GeoDataFrame).

    Raises
    ------
    ValueError
        If ``walk_speed`` is not strictly positive.
    """
    if walk_speed <= 0:
        raise ValueError("walk_speed must be strictly positive")

    # Two frames cannot be compared row against row in a meaningful way here,
    # so only the first stop of each is kept.
    if isinstance(B_stop, gpd.GeoDataFrame):
        B_stop = B_stop.head(1)
        A_stop = A_stop.head(1)

    birdfly_dist = A_stop.distance(B_stop, align=False)
    metres_per_minute = 1000 * walk_speed / 60
    # Distance / speed gives minutes as floats; convert them to Timedelta.
    return pd.to_timedelta(birdfly_dist / metres_per_minute, unit='min')

In [None]:
# Pseudo stop_times table describing "walk from the departure stop to every
# stop of the network": same columns as stop_times, trip_id set to 'walking'.
# Uses departure_stop as left by the previously executed search cell.
departure_point = departure_stop.to_crs(3812).iloc[0].geometry
walk_durations = compute_walk_time(geo_stops.to_crs(3812), departure_point)

walked_stop_times = pd.DataFrame(columns=stop_times.columns)
walked_stop_times['stop_id'] = geo_stops['stop_id']
walked_stop_times['trip_id'] = 'walking'
# Walking has no dwell time: arrival and departure are the same.
walked_stop_times['arrival_time'] = walk_durations
walked_stop_times['departure_time'] = walk_durations
walked_stop_times

In [None]:
def find_next_stops(best_arrival_time, active_trips, active_stop_times, cur_stop_id, cur_time, walk_range=1000):
    """List the stops directly reachable from the current stop, by vehicle or on foot.

    Parameters
    ----------
    best_arrival_time : Timedelta
        Best arrival time at the destination known so far; used as an upper
        bound on the departures and arrivals that are worth keeping.
    active_trips : DataFrame
        Unused; kept so that existing calls remain valid.
    active_stop_times : DataFrame
        Stop times of the trips running on the travel date (times as timedelta).
    cur_stop_id : Series (or other list-like) of stop ids
        Stop(s) currently stood at. Only the first matching stop is used as the
        origin of the walking search.
    cur_time : Timedelta
        Time at which the traveller is at the current stop.
    walk_range : float, default 1000
        Maximum distance in metres between the current stop and the stops
        reached on foot.

    Returns
    -------
    DataFrame
        Stop-time rows of the reachable stops; walking legs carry
        ``trip_id == 'walking'``. An empty ``pd.DataFrame()`` when nothing is found.

    Notes
    -----
    Reads the notebook-level ``geo_stops`` and ``stop_times`` tables and calls
    ``compute_walk_time``.
    """
    # Departures from the current stop that can still be caught and that leave
    # before the best known arrival time.
    catchable = (active_stop_times['stop_id'].isin(cur_stop_id)
                 & (active_stop_times['departure_time'] > cur_time)
                 & (active_stop_times['departure_time'] < best_arrival_time))
    cur_stop_times = active_stop_times[catchable].sort_values('departure_time')

    # Narrow the table once to the boardable trips instead of scanning the
    # whole table again for every trip.
    boardable = active_stop_times[active_stop_times['trip_id'].isin(cur_stop_times['trip_id'])]

    # For each boardable trip, keep the stops served after the boarding stop.
    # The arrival bound is best_arrival_time: the previous bound relied on a
    # notebook global (walk_time) defined in a later cell, and was never
    # tighter than best_arrival_time, which the search cell already applies to
    # active_stop_times before calling this function.
    other_stops_times = []
    for row in cur_stop_times.itertuples(index=False):
        departure_seq = int(row.stop_sequence)
        downstream = boardable[(boardable['trip_id'] == row.trip_id)
                               & (boardable['arrival_time'] < best_arrival_time)
                               & (boardable['stop_sequence'] > departure_seq)]
        other_stops_times.append(downstream)

    # Add the stops reachable on foot. Distances are measured in the Belgian
    # Lambert 2008 system (EPSG:3812).
    # NOTE(review): the whole stop table is reprojected on every call; a
    # pre-projected copy would avoid that.
    cur_stop_proj = geo_stops[geo_stops['stop_id'].isin(cur_stop_id)].to_crs(3812)
    cur_point = cur_stop_proj.iloc[0].geometry
    meas_geo_stops = geo_stops.to_crs(3812)
    walked_stops = meas_geo_stops[meas_geo_stops.within(cur_stop_proj.buffer(walk_range).iloc[0])]

    walked_stop_times = pd.DataFrame(columns=stop_times.columns)
    walked_stop_times['stop_id'] = walked_stops['stop_id']
    walked_stop_times['trip_id'] = 'walking'
    walked_stop_times['arrival_time'] = cur_time + compute_walk_time(walked_stops, cur_point)
    # Walking has no dwell time: arrival and departure are the same.
    walked_stop_times['departure_time'] = walked_stop_times['arrival_time']
    walked_stop_times['stop_sequence'] = 1
    walked_stop_times['pickup_type'] = 0
    walked_stop_times['drop_off_type'] = 0

    if walked_stop_times.size > 0:
        other_stops_times.append(walked_stop_times)

    if len(other_stops_times) == 0:
        print("No next stops found")
        return pd.DataFrame()
    return pd.concat(other_stops_times)
def explore_node(geo_stops, active_trips, active_stop_times, found_stop_times, best_arrival_time, best_path, cur_stop_time, max_trans=6):
    """Expand one reached stop and update the best known arrival time.

    Parameters
    ----------
    geo_stops : GeoDataFrame
        Stops with their geometry, joined on ``stop_id``.
    active_trips : DataFrame
        Forwarded to ``find_next_stops``.
    active_stop_times : DataFrame
        Stop times of the trips running on the travel date.
    found_stop_times : DataFrame
        Unused; kept so that existing calls remain valid.
    best_arrival_time : Timedelta
        Best arrival time at the destination known so far.
    best_path : list
        Path associated with ``best_arrival_time``.
    cur_stop_time : DataFrame
        One-row frame describing the stop to expand; must provide the columns
        ``number_trips``, ``stop_id``, ``arrival_time`` and ``intermediary_stops``.
    max_trans : int, default 6
        Nodes whose leg count would exceed this value are not expanded.

    Returns
    -------
    tuple (DataFrame, Timedelta, list)
        The newly reached stop times (an empty ``pd.DataFrame()`` when there is
        nothing to add), the possibly improved best arrival time and the
        matching best path.

    Notes
    -----
    Reads the notebook-level ``arrival_stop`` frame, which must be defined
    before the call, and calls ``find_next_stops`` and ``compute_walk_time``.
    """
    num_trans = (cur_stop_time['number_trips'] + 1).iloc[0]
    if num_trans > max_trans:
        print("Max number of transfers reached")
        return pd.DataFrame(), best_arrival_time, best_path

    cur_stop_id = cur_stop_time['stop_id']
    cur_time = cur_stop_time['arrival_time'].iloc[0]
    # Copy so that the path stored on the current row is not modified.
    previous_stops = cur_stop_time['intermediary_stops'].iloc[0].copy()

    # Stops directly linked to the current stop.
    other_stops_times = find_next_stops(best_arrival_time, active_trips, active_stop_times, cur_stop_id, cur_time)
    if other_stops_times.size == 0:
        return pd.DataFrame(), best_arrival_time, best_path

    # Record the path leading to the new stops (all rows share the same list).
    previous_stops.append(cur_stop_id.iloc[0])
    other_stops_times['number_trips'] = num_trans
    other_stops_times['intermediary_stops'] = [previous_stops] * len(other_stops_times)  # TODO: also include the trips used (or walking)

    # Arrival time at the destination when finishing on foot from each new stop.
    # The right join keeps one row per new stop time, in the same order.
    other_stops = geo_stops.merge(other_stops_times.reset_index(), on='stop_id', how='right')
    new_walk_times = compute_walk_time(other_stops.to_crs(3812), arrival_stop.to_crs(3812).iloc[0].geometry)  # TODO: Optimize: store arrival_stop in crs 3812
    final_arrival_times = other_stops['arrival_time'] + new_walk_times

    new_best_arrival_time = final_arrival_times.min()
    if new_best_arrival_time < best_arrival_time:
        # NOTE(review): the path ends with a walking leg ('w') but does not
        # name the stop that gives the new best time.
        best_path = [previous_stops, 'w']
        best_arrival_time = new_best_arrival_time

    # TODO: Filter out the stops from which you would arrive later at the arrival stop than the best path even in bus in bird fly
    # Assigned by position: the row labels mix stop_times and geo_stops labels
    # and may repeat, so label alignment cannot be relied upon.
    other_stops_times['final_arrival_time'] = final_arrival_times.to_numpy()
    # Stops reached after the best arrival time cannot improve the result.
    other_stops_times = other_stops_times[other_stops_times['arrival_time'] < best_arrival_time]
    return other_stops_times, best_arrival_time, best_path

In [None]:
# Look up stops by a fragment of their name (first ten matches).
geo_stops.loc[geo_stops['stop_name'].str.contains('Kennedy')].head(10)

In [None]:
# Search parameters
departure_date="2024-11-05"
departure_time="08:00:00"
departure_stop_name="MONS Lycée"#"MONS Place de Flandre"#
arrival_stop_name="SOIGNIES Place du Jeu de Balle"

start_time = time.time()
#Extract the stops and the stop_id's
departure_stop=geo_stops.query("stop_name==@departure_stop_name")
arrival_stop=geo_stops.query("stop_name==@arrival_stop_name")

arrival_stop_id=arrival_stop['stop_id']
departure_stop_id=departure_stop['stop_id']

#The walking time between the departure and the arrival is computed:
#walking the whole way is the initial solution to beat
walk_time=compute_walk_time(departure_stop.to_crs(3812), arrival_stop.to_crs(3812))
walk_time=walk_time.mean()

departure_time=pd.to_timedelta(departure_time)
best_arrival_time=departure_time+walk_time
print(f"First arrival time: {best_arrival_time}")
best_path=['w']

#Find the schedule of the day
week_day=pd.Timestamp(departure_date).day_name().lower()
active_services=calendar.query("(start_date<=@departure_date)&(@departure_date<=end_date)")
active_services=active_services[active_services[week_day]]
active_trips=trips[trips['service_id'].isin(active_services['service_id'])]
active_trips_id=active_trips['trip_id']
active_stop_times=stop_times[stop_times['trip_id'].isin(active_trips_id)]
#Stop times later than the best known arrival cannot be part of a better path
active_stop_times=active_stop_times[active_stop_times['arrival_time']<best_arrival_time]

#Find the stoptimes of the departure stop
found_stop_times=active_stop_times[((active_stop_times['stop_id'].isin(departure_stop_id))&
                                     (active_stop_times['departure_time']>departure_time)&
                                     (active_stop_times['departure_time']<(best_arrival_time))
                                     #(active_stop_times[trip_id].isin(active_trips_id))
                                      )].sort_values('departure_time')
#TODO: Add other stops within walkable distance

#print(found_stop_times)
found_stop_times['number_trips']=0
found_stop_times.loc[:, "intermediary_stops"] = np.array([['']] * len(found_stop_times))
found_stop_times['final_arrival_time']=best_arrival_time

#First expansion: everything directly reachable from the departure stop
other_stops_times=find_next_stops(best_arrival_time, active_trips, active_stop_times, departure_stop_id, departure_time)

found_stop_times['explored']=True
other_stops_times['number_trips']=1
other_stops_times.loc[:, "intermediary_stops"] = [[departure_stop_id.iloc[0]]] * len(other_stops_times) #other_stops_times['intermediary_stops']=''
#TODO: Add other stops within walkable distance
other_stops_times=other_stops_times.sort_values('arrival_time').drop_duplicates('stop_id')#We keep only the earliest arrival time to a stop.
other_stops_times['explored']=False

#Compare the walking time from the new stops
other_stops=geo_stops.merge(other_stops_times.reset_index(), on='stop_id',how='right', )
new_walk_times=compute_walk_time(other_stops.to_crs(3812), arrival_stop.to_crs(3812).iloc[0].geometry)
other_stops['final_arrival_time']=other_stops['arrival_time']+new_walk_times
other_stops=other_stops.set_index('index')
new_best_arrival_time=other_stops['final_arrival_time'].min()
if(new_best_arrival_time<best_arrival_time):
    print(f"New best arrival time: {new_best_arrival_time}")
    best_path=[(departure_stop_id.iloc[0], 'w')]
    best_arrival_time=new_best_arrival_time
    #Drop the scheduled stops later than the current best arrival time 
    active_stop_times=active_stop_times[active_stop_times['arrival_time']<best_arrival_time]

other_stops_times['final_arrival_time']=other_stops['final_arrival_time']
#ignore_index=True: the rows come from stop_times and from geo_stops (walking
#legs), whose row labels overlap. With repeated labels, marking the current row
#as explored through .loc in the loop below would also mark unrelated rows.
found_stop_times=pd.concat([found_stop_times,other_stops_times], ignore_index=True)
found_stop_times=found_stop_times.sort_values('arrival_time').drop_duplicates('stop_id')#We keep only the earliest arrival time to a stop.
found_stop_times=found_stop_times.sort_values('final_arrival_time')

new_stop_times=found_stop_times[found_stop_times['explored']==False]#The found stops not already explored


#Best-first exploration: always expand the unexplored stop with the most
#promising final arrival time, until no unexplored stop remains
while(new_stop_times.size>0):
    new_stop_times=found_stop_times[found_stop_times['explored']==False]#The found stops not already explored
    cur_stop_time=new_stop_times.iloc[0:1,:]

    other_stops_times, new_best_arrival_time, new_best_path=explore_node(geo_stops, active_trips, active_stop_times, found_stop_times, best_arrival_time, best_path, cur_stop_time)
    other_stops_times['explored']=False
    found_stop_times.loc[cur_stop_time.index,'explored']=True

    #Drop the scheduled stops later than the current best arrival time
    if(new_best_arrival_time<best_arrival_time):
        print(f"New best arrival time: {new_best_arrival_time}")
        best_arrival_time=new_best_arrival_time
        best_path=new_best_path
        active_stop_times=active_stop_times[active_stop_times['arrival_time']<best_arrival_time]
        found_stop_times=found_stop_times[found_stop_times['arrival_time']<best_arrival_time]

    #Fresh row labels again, for the same reason as above
    found_stop_times=pd.concat([found_stop_times,other_stops_times], ignore_index=True)
    found_stop_times=found_stop_times.sort_values('arrival_time')
    found_stop_times=found_stop_times.drop_duplicates('stop_id')#We keep only the earliest arrival time to a stop.
    found_stop_times=found_stop_times.sort_values('final_arrival_time')
    new_stop_times=found_stop_times[found_stop_times['explored']==False]#The found stops not already explored
    #Progress: explored stops / found stops
    print(f"{found_stop_times.shape[0]-new_stop_times.shape[0]}/{found_stop_times.shape[0]}")
end_time = time.time()
print(f"Best arrival time:{best_arrival_time}")
print(f"Best path:{best_path}")
print(f"It took {end_time-start_time} seconds to compute")

In [None]:
# Display the stops reached by the search, as left by the search cell.
found_stop_times

In [None]:
# Express the time columns as a number of seconds so that they can be used as
# a numeric colour scale. The columns are overwritten in place, so this cell
# can only be run once after the search.
for time_column in ('departure_time', 'final_arrival_time', 'arrival_time'):
    found_stop_times[time_column] = found_stop_times[time_column].apply(lambda delta: delta.total_seconds())

In [None]:
# Attach the geometry of each reached stop, order the stops by final arrival
# time and show them on an interactive map coloured by that time.
reached_stops = geo_stops.merge(found_stop_times, how='right')
reached_stops = reached_stops.sort_values('final_arrival_time')
reached_stops.explore(column='final_arrival_time')