Processing sample GTFS data for Gdańsk from November 2 to November 16, 2024

In [1]:
import pandas as pd
import math
import numpy as np
from shapely import Point, LineString
from sklearn.neighbors import KDTree
import folium 
import joblib
import ast
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors  
import random
import ipywidgets as widgets
from IPython.display import display

1) creating the `stops_modified` and `shapes_modified` data frames; additionally, we add the `shape_distance_traveled` and `stop_dist_traveled` columns, which are not provided directly in the source data

We create a `stops_modified` data frame linked to individual bus trips via `trip_id`, containing precise information about bus arrival times at stops and their geographic locations. In GTFS data, times beyond 24:00 (e.g., 26:34) indicate that the trip continued past midnight into the following calendar day. Therefore, we adjust the date by adding one day and convert the time to the standard 24-hour format using modulo 24.

In [2]:
trips = pd.read_table("gdansk_04_18_11_2024\\trips.txt", sep = ",")
stops = pd.read_table("gdansk_04_18_11_2024\\stops.txt", sep = ",")
stop_times = pd.read_table("gdansk_04_18_11_2024\\stop_times.txt", sep = ",")
calendar_dates = pd.read_table("gdansk_04_18_11_2024\\calendar_dates.txt", sep = ",")

def adjust_time_vectorized(df):
    
    df['date'] = pd.to_datetime(df['date'], format = '%Y%m%d')
    arrival_hours1 = df['arrival_time'].str[:2].astype(int) 
    invalid_times1 = arrival_hours1 >= 24
    arrival_hours2 = df['departure_time'].str[:2].astype(int) 
    invalid_times2 = arrival_hours2 >= 24
    adjusted_dates = df['date'] + pd.Timedelta(days=1)
    df.loc[invalid_times1, 'arrival_time'] = adjusted_dates[invalid_times1].astype(str) + ' ' + (arrival_hours1[invalid_times1] % 24).astype(str).str.zfill(2) + df['arrival_time'].str[2:]
    df.loc[invalid_times2, 'departure_time'] = adjusted_dates[invalid_times2].astype(str) + ' ' + (arrival_hours2[invalid_times2] % 24).astype(str).str.zfill(2) + df['departure_time'].str[2:]
    df.loc[~invalid_times1, 'arrival_time'] = df['date'].astype(str) + ' ' + df['arrival_time']
    df.loc[~invalid_times2, 'departure_time'] = df['date'].astype(str) + ' ' + df['departure_time']
    return df


stops_result = pd.merge(stop_times.iloc[:, [0,1,2,3,4]], stops.iloc[:, [0,2,3]], on='stop_id')
stops_result = stops_result.merge(trips[['trip_id', 'shape_id', 'service_id']], on='trip_id')
stops_result = stops_result.merge(calendar_dates, on='service_id', how='left').dropna()
stops_result = adjust_time_vectorized(stops_result)


stops_result.to_csv('gdansk_04_18_11_2024\\stops_modified.txt', index = False)

#sample 
stops_result.iloc[39:45]



Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_lat,stop_lon,shape_id,service_id,date,exception_type
39,456202411030243_102_456-22,2024-11-03 03:07:00,2024-11-03 03:07:00,1743,15,54.324457,18.605851,456_64432,120241102,2024-11-02,1
40,456202411030243_102_456-22,2024-11-03 03:08:00,2024-11-03 03:08:00,1232,16,54.327313,18.611176,456_64432,120241102,2024-11-02,1
41,456202411030243_102_456-22,2024-11-03 03:09:00,2024-11-03 03:09:00,1230,17,54.329075,18.61755,456_64432,120241102,2024-11-02,1
42,456202411030243_102_456-22,2024-11-03 03:10:00,2024-11-03 03:10:00,1228,18,54.330152,18.626578,456_64432,120241102,2024-11-02,1
43,456202411030243_102_456-22,2024-11-03 03:12:00,2024-11-03 03:12:00,1206,19,54.332401,18.634878,456_64432,120241102,2024-11-02,1
44,456202411030243_102_456-22,2024-11-03 03:13:00,2024-11-03 03:13:00,1204,20,54.335933,18.63666,456_64432,120241102,2024-11-02,1


Creating `shapes_modified` data frame by adding the `shape_dist_traveled` column to `shapes` df. This allows us to precisely determine the position of each shape point as a percentage of the total route distance.

In [3]:
def euclidian_distance_in_km(wsp1, wsp2): # due to the small area, we simplify the calculations and use Euclidean distance 
    delta_lat = wsp1[0] - wsp2[0]
    delta_lon = wsp1[1] - wsp2[1]  
    # conversion to kilometers
    delta_lat_km = delta_lat * 111.32  # 1 degree of latitude ≈ 111.32 km
    delta_lon_km = delta_lon * 69.5    # 1 degree of longitude ≈ 69.5 km near Warsaw   
    
    return math.sqrt(delta_lat_km**2 + delta_lon_km**2)


shapes = pd.read_csv("gdansk_04_18_11_2024\\shapes.txt").sort_values(['shape_id', 'shape_pt_sequence'])
groups = shapes.groupby('shape_id')

shape_distance = np.array([])
for shape_id, group in groups:

    distances, distance, bearing = np.zeros(len(group)), 0, np.zeros(len(group)) 
    for i in range(1, len(group)):
        
        distance += euclidian_distance_in_km((group.shape_pt_lat.iloc[i], group.shape_pt_lon.iloc[i]), (group.shape_pt_lat.iloc[i-1], group.shape_pt_lon.iloc[i-1]))
        distances[i] = distance

    shape_distance = np.append(shape_distance, distances)


shapes['shape_dist_traveled'] = shape_distance #creating new column
shapes.to_csv("gdansk_04_18_11_2024\\shapes_modified.txt", index = False)

We add the `stop_dist_traveled` column to the `stops` data frame. Our goal is to precisely determine the percentage of the route at which each stop is located for a given trip. This will later allow us, for example, to answer questions like whether the bus has already passed a specific stop.

In [4]:
df = { #resulting df
        'stop_id': [],
        'shape_id': [],
        'stop_dist_traveled': [],
    }

def point_to_segment_distance(coords1, coords2, coords3): # calculating the Euclidian distance from a bus point to a line formed by consecutive points from shapes 
    px, py = coords1
    x1, y1 = coords2
    x2, y2 = coords3

    px *= 111.32
    x1 *= 111.32
    x2 *= 111.32
    py *= 69.5
    y1 *= 69.5
    y2 *= 69.5

    point = Point(px, py)
    line = LineString([(x1, y1), (x2, y2)])
    
    return point.distance(line)



unique_stops = stops_result.drop_duplicates(['stop_id', 'shape_id']).sort_values(['shape_id', 'stop_sequence']) # unique stops based on their geographic location to avoid duplicating calculations
groups = unique_stops.groupby('shape_id') #grouping by shape_id 


for shape_id, group in groups:

    shape_pom = shapes.loc[shapes['shape_id'] == shape_id].drop_duplicates('shape_dist_traveled') 
    coordinates = shape_pom.loc[:,['shape_pt_lat', 'shape_pt_lon']].values.tolist() #coordinates of shape points for given shape_id
    tree = KDTree(np.array(coordinates[:-1])) #the goal is to find 5 nearest neighbors among shape points for bus stop locations to precisely determine the accumulated distance (using k-d tree)

    st_coordinates = group.loc[:,['stop_lat', 'stop_lon']].values.tolist() # stops coordinates
    query_point = np.array(st_coordinates)
    indices = tree.query(query_point, k=5)[1] 
    
    for i in range(len(group)): # iterating over the stops of a given trip_id

        if i==0:
            df['stop_id'].append(group.stop_id.iloc[i]) 
            df['shape_id'].append(shape_id) 
            df['stop_dist_traveled'].append(0) #we start accumulated distance from 0
            continue

        array = []    
        for j in range(len(indices[i])): #iterating over neighbours of i-th stop     
            
            # Calculating the distance from the i-th stop to the segment in pairs_df, determined by the starting point that is the nearest
            # neighbor of the stop. This allows us to identify the segment to which the stop belongs.
            dist = point_to_segment_distance(st_coordinates[i], coordinates[indices[i][j]], coordinates[indices[i][j]+1]) * 1000 
            array.append(dist)
            
        
        # We take the accumulated distance (shape_dist_traveled) of the starting point of the segment
        # to which the stop is assigned and apply a correction for the distance between this point and the actual location of the stop.
        distance = shape_pom.shape_dist_traveled.iloc[indices[i][array.index(min(array))]] + euclidian_distance_in_km(st_coordinates[i], coordinates[indices[i][array.index(min(array))]]) 
        df['stop_id'].append(group.stop_id.iloc[i]) 
        df['shape_id'].append(shape_id) 
        df['stop_dist_traveled'].append(distance) 
    

df = pd.DataFrame(df)
stops_result = stops_result.merge(df, on = ['stop_id', 'shape_id']) #connecting with main df
stops_result.to_csv("gdansk_04_18_11_2024\\stops_modified.txt", index = False)  



In [5]:
#sample 
stops_result.iloc[39:45]

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_lat,stop_lon,shape_id,service_id,date,exception_type,stop_dist_traveled
39,456202411030243_102_456-22,2024-11-03 03:07:00,2024-11-03 03:07:00,1743,15,54.324457,18.605851,456_64432,120241102,2024-11-02,1,12.713805
40,456202411030243_102_456-22,2024-11-03 03:08:00,2024-11-03 03:08:00,1232,16,54.327313,18.611176,456_64432,120241102,2024-11-02,1,13.223143
41,456202411030243_102_456-22,2024-11-03 03:09:00,2024-11-03 03:09:00,1230,17,54.329075,18.61755,456_64432,120241102,2024-11-02,1,13.708618
42,456202411030243_102_456-22,2024-11-03 03:10:00,2024-11-03 03:10:00,1228,18,54.330152,18.626578,456_64432,120241102,2024-11-02,1,14.362225
43,456202411030243_102_456-22,2024-11-03 03:12:00,2024-11-03 03:12:00,1206,19,54.332401,18.634878,456_64432,120241102,2024-11-02,1,15.080542
44,456202411030243_102_456-22,2024-11-03 03:13:00,2024-11-03 03:13:00,1204,20,54.335933,18.63666,456_64432,120241102,2024-11-02,1,15.49427


simple visualization of bus stops for 456202411030243_102_456-22 (`trip_id`) with `stop_dist_traveled` information:

In [None]:
stops = pd.read_csv('gdansk_04_18_11_2024\\stops_modified.txt')
stops = stops[stops['trip_id'] == '456202411030243_102_456-22']
coords = stops.loc[:,['stop_lat', 'stop_lon']].values.tolist()
map = folium.Map(location=[54.352, 18.646], zoom_start=13) #settings for Gdansk

for i in range(len(coords)):
    txt = f'{str(i)}, {stops.stop_dist_traveled.iloc[i]}'
    folium.Marker(location=coords[i],
                popup=txt,
                icon=folium.Icon(color='red') 
                ).add_to(map)

map.save('gdansk_04_18_11_2024\\stops_example.html')
display(map) #red -> bus stops

2) dividing the bus network into shared segments

Creating the smallest possible segmentation of the bus network – we connect consecutive points from `shapes_modified` to form individual segments. This is a technical step; we are not introducing any new data, just representing the existing shape points as segments.

In [7]:

shapes = pd.read_csv("gdansk_04_18_11_2024\\shapes_modified.txt", sep = ",")
trips = pd.read_csv("gdansk_04_18_11_2024\\trips.txt", sep = ",")

pom = pd.merge(shapes, (trips.iloc[:,[0,6]]), on = "shape_id").drop_duplicates()
all_pairs = [] #resulting df
groups = pom.groupby('shape_id')
for shape_id, group in groups:

    group = group.sort_values(by='shape_pt_sequence') #ensuring that we follow the route in the correct order
    unique_id = 0
    
    for i in range(len(group) - 1): #iterating through all geographically distinct routes
        
        pair = {
            'code': unique_id,
            'distance1': group.iloc[i]['shape_dist_traveled'], #distance to the line beginning
            'distance2': group.iloc[i + 1]['shape_dist_traveled'], #distance to the line end
            'distance': group.iloc[i + 1]['shape_dist_traveled'] - group.iloc[i]['shape_dist_traveled'],
            'shape_id': shape_id, 
            'route_id': group.iloc[i]['route_id'],
            'latitude1': group.iloc[i]['shape_pt_lat'],
            'longitude1': group.iloc[i]['shape_pt_lon'],
            'coords1': [group.iloc[i]['shape_pt_lat'], group.iloc[i]['shape_pt_lon']], #coordinates of the line beginning
            'latitude2': group.iloc[i + 1]['shape_pt_lat'],
            'longitude2': group.iloc[i + 1]['shape_pt_lon'],
            'coords2': [group.iloc[i + 1]['shape_pt_lat'], group.iloc[i + 1]['shape_pt_lon']], #coordinates of the line end
        }
        all_pairs.append(pair)
        unique_id += 1

pairs_df = pd.DataFrame(all_pairs) #creating final df

pairs_df['geo_id'] = pairs_df['latitude1'].astype(str) + '_' + pairs_df['longitude1'].astype(str) + '_' + pairs_df['latitude2'].astype(str) + '_' + pairs_df['longitude2'].astype(str) # creating an ID that identifies the segment’s location
pairs_df['count'] = pairs_df.groupby('geo_id')['geo_id'].transform('count')
pairs_df = pairs_df.sort_values(by=['shape_id', 'code'])
wszystkie = pairs_df.groupby('geo_id').agg(all_small_lines=('shape_id', list)).reset_index() #???
pairs_df = pd.merge(pairs_df, wszystkie, on='geo_id')

pairs_df['coords1'] = pairs_df['coords1'].apply(lambda x: list(float(v) for v in x))
pairs_df['coords2'] = pairs_df['coords2'].apply(lambda x: list(float(v) for v in x)) #converting coordinates to float
pairs_df.to_csv("gdansk_04_18_11_2024\\shapes_pairs.txt", index=False)

#sample
pairs_df.iloc[5:10]

Unnamed: 0,code,distance1,distance2,distance,shape_id,route_id,latitude1,longitude1,coords1,latitude2,longitude2,coords2,geo_id,count,all_small_lines
5,5,0.172964,0.17664,0.003677,100_190859,100,54.355856,18.645474,"[54.355856179457, 18.645474189103]",54.355829,18.645445,"[54.355828639981, 18.645444981933]",54.355856179457_18.645474189103_54.35582863998...,49,"[100_190859, 100_192474, 100_193241, 100_19398..."
6,6,0.17664,0.18686,0.010219,100_190859,100,54.355829,18.645445,"[54.355828639981, 18.645444981933]",54.355738,18.645419,"[54.355738241974, 18.645419370882]",54.355828639981_18.645444981933_54.35573824197...,471,"[100_190859, 100_192474, 100_193241, 100_19398..."
7,7,0.18686,0.249744,0.062884,100_190859,100,54.355738,18.645419,"[54.355738241974, 18.645419370882]",54.355178,18.645297,"[54.355178494384, 18.645297482135]",54.355738241974_18.645419370882_54.35517849438...,471,"[100_190859, 100_192474, 100_193241, 100_19398..."
8,8,0.249744,0.315765,0.066021,100_190859,100,54.355178,18.645297,"[54.355178494384, 18.645297482135]",54.354592,18.645162,"[54.354591507201, 18.645161764059]",54.355178494384_18.645297482135_54.35459150720...,471,"[100_190859, 100_192474, 100_193241, 100_19398..."
9,9,0.315765,0.322843,0.007078,100_190859,100,54.354592,18.645162,"[54.354591507201, 18.645161764059]",54.354528,18.64515,"[54.354528348588, 18.645149986699]",54.354591507201_18.645161764059_54.35452834858...,471,"[100_190859, 100_192474, 100_193241, 100_19398..."


merging segments with the same `count` and the same `all_lines` list (combining smaller segments from the finest division of the bus network into larger segments by iterating through each `shape_id`)

In [8]:

shapes_pairs = pd.read_csv("gdansk_04_18_11_2024\\shapes_pairs.txt") 
groups = shapes_pairs.groupby('shape_id')

result = []
for shape_id, group in groups:
    info = [shape_id] # starting each information set with shape_id
    distance = 0

    for i in range(len(group)):
        
        if i == 0: # beginning
            
            info.append([float(group.iloc[0]['latitude1']), float(group.iloc[0]['longitude1'])]) # starting so we add coords of segment beginning
            
            pom = group.iloc[0]['count'] # segment cutoff control
            pom2 = group.iloc[0]['all_small_lines'] # double segment cutoff control (sometimes depending on count isnt enough e.g. roundabouts)
            
            distance = group.iloc[0]['distance'] # accumulating the total dist of segment

        elif i != (len(group) - 1): # middle 
            
            if group.iloc[i]['count'] == pom and group.iloc[i]['all_small_lines'] == pom2 and distance < 0.6: 
                info.append([float(group.iloc[i]['latitude1']), float(group.iloc[i]['longitude1'])])
                
                distance += group.iloc[i]['distance']

            else: # if the segment is longer than 0.6km -> cutoff
                
                info.append([float(group.iloc[i]['latitude1']), float(group.iloc[i]['longitude1'])]) #segment end
                info.append(str(info[1][0]) + '_' + str(info[1][1]) + '_' + str(info[-1][0])+ '_' + str(info[-1][1])) # creating unique segment id
                info.append(pom)
                info.append(distance)
                
                result.append(info) # full info set
                info = [shape_id] # new set
                info.append([float(group.iloc[i]['latitude1']), float(group.iloc[i]['longitude1'])])
                
                #info update
                pom = group.iloc[i]['count']
                pom2 = group.iloc[i]['all_small_lines']
                distance = group.iloc[i]['distance'] 
        
        else: # end
            
            if group.iloc[i]['count'] == pom and group.iloc[i]['all_small_lines'] == pom2: #last line from group belongs to considered segment (we don't cutoff)
                
                #updating and adding info set
                info.append([float(group.iloc[i]['latitude1']), float(group.iloc[i]['longitude1'])])
                info.append([float(group.iloc[i]['latitude2']), float(group.iloc[i]['longitude2'])])
                distance += group.iloc[i]['distance']
                info.append(str(info[1][0]) + '_' + str(info[1][1]) + '_' + str(info[-1][0])+ '_' + str(info[-1][1])) #segment id
                info.append(pom)
                info.append(distance)
                
            else: #cutoff

                info.append([float(group.iloc[i]['latitude1']), float(group.iloc[i]['longitude1'])])
                info.append(str(info[1][0]) + '_' + str(info[1][1]) + '_' + str(info[-1][0])+ '_' + str(info[-1][1]))
                info.append(pom)
                info.append(distance)
                result.append(info)
                info = [shape_id]
                
                #info set for single last line
                info.append([float(group.iloc[i]['latitude1']), float(group.iloc[i]['longitude1'])])
                info.append([float(group.iloc[i]['latitude2']), float(group.iloc[i]['longitude2'])])
                info.append(str(info[1][0]) + '_' + str(info[1][1]) + '_' + str(info[-1][0])+ '_' + str(info[-1][1]))
                pom = group.iloc[i]['count']
                info.append(pom)
                info.append(distance)
            
            
            result.append(info)



# function for converting list into proper df format 
def convert_list_to_dataframe(data):
    converted_data = []
    for item in data:
        shape_id = item[0]
        start = item[1]
        end = item[-4]
        count = item[-2]
        id = item[-3]
        dystans = item[-1]
        intermediate_data = item[1:-3]  # this will collect items between start and end
    
        converted_data.append([shape_id, start, end, count, dystans, id, intermediate_data])
    
    df = pd.DataFrame(converted_data, columns=['shape_id', 'beginning', 'end', 'count', 'distance', 'id', 'coordinates'])
    return df

# list to df conversion
df = convert_list_to_dataframe(result)
df.to_csv("gdansk_04_18_11_2024\\segments.txt", index=False) 

#sample
df.iloc[5:10]

Unnamed: 0,shape_id,beginning,end,count,distance,id,coordinates
5,100_190859,"[54.351773852814, 18.643583907758]","[54.351762930454, 18.646708662796]",110,0.219604,54.351773852814_18.643583907758_54.35176293045...,"[[54.351773852814, 18.643583907758], [54.35177..."
6,100_190859,"[54.351762930454, 18.646708662796]","[54.350233677554, 18.65496775518]",5,0.617689,54.351762930454_18.646708662796_54.35023367755...,"[[54.351762930454, 18.646708662796], [54.35179..."
7,100_190859,"[54.350233677554, 18.65496775518]","[54.353308551841, 18.657762913813]",5,0.499945,54.350233677554_18.65496775518_54.353308551841...,"[[54.350233677554, 18.65496775518], [54.350137..."
8,100_190859,"[54.353308551841, 18.657762913813]","[54.353479766367, 18.657783951833]",10,0.499945,54.353308551841_18.657762913813_54.35347976636...,"[[54.353308551841, 18.657762913813], [54.35347..."
9,100_190860,"[54.353308551841, 18.657762913813]","[54.353479766367, 18.657783951833]",10,0.019116,54.353308551841_18.657762913813_54.35347976636...,"[[54.353308551841, 18.657762913813], [54.35347..."


Finally, we add the accumulated initial and final distance to the `segments` data frame. Then, we remove unnecessary records that may have been created due to merging. Specifically, we check whether the `initial distance` of the (i+1)-th segment is equal to the `final distance` of the i-th segment to ensure there are no uncontrolled jumps.

In [9]:

shapes = pd.read_csv("gdansk_04_18_11_2024\\shapes_modified.txt")
shapes['coordinates2'] = shapes.apply(lambda row: (row['shape_pt_lat'], row['shape_pt_lon']), axis=1)
shapes = shapes[['shape_id', 'coordinates2', 'shape_dist_traveled']]

segments = pd.read_csv("gdansk_04_18_11_2024\\segments.txt") #zle
segments['beginning'] = segments['beginning'].apply(eval).apply(tuple)
segments['end'] = segments['end'].apply(eval).apply(tuple)

#creating beginning distance
segments = segments.merge(shapes, left_on=['shape_id', 'beginning'], right_on= ['shape_id', 'coordinates2']) 
segments.rename(columns={'shape_dist_traveled': 'initial_distance'}, inplace=True)
segments.drop(columns=['coordinates2'], inplace=True)


#creating final distance
segments = segments.merge(shapes, left_on=['shape_id', 'end'], right_on= ['shape_id', 'coordinates2']) #dalej mergujemy po grupach dystansy
segments.rename(columns={'shape_dist_traveled': 'final_distance'}, inplace=True) 
segments.drop(columns=['coordinates2'], inplace=True)

segments = segments[segments.initial_distance < segments.final_distance]

segments_help = pd.DataFrame()
for shape_id, group in segments.groupby('shape_id'): #usuwamy zbedne rekordy 
    
    check = group.final_distance.iloc[0]
    remove_indx = []
    
    for i in range(1, len(group)):
        
        if group.initial_distance.iloc[i] != check:
            remove_indx.append(group.index[i])
        else:
            check = group.final_distance.iloc[i]
    
    group = group.drop(index=remove_indx)
    segments_help = pd.concat([segments_help, group], ignore_index=True)


segments = segments_help



segments = segments.sort_values(by = ['shape_id', 'initial_distance'])
segments.to_csv("gdansk_04_18_11_2024\\segments.txt", index=False) #updating file

#sample
segments.iloc[5:10]


Unnamed: 0,shape_id,beginning,end,count,distance,id,coordinates,initial_distance,final_distance
5,100_190859,"(54.351773852814, 18.643583907758)","(54.351762930454, 18.646708662796)",110,0.219604,54.351773852814_18.643583907758_54.35176293045...,"[[54.351773852814, 18.643583907758], [54.35177...",0.664721,0.884324
6,100_190859,"(54.351762930454, 18.646708662796)","(54.350233677554, 18.65496775518)",5,0.617689,54.351762930454_18.646708662796_54.35023367755...,"[[54.351762930454, 18.646708662796], [54.35179...",0.884324,1.502013
7,100_190859,"(54.350233677554, 18.65496775518)","(54.353308551841, 18.657762913813)",5,0.499945,54.350233677554_18.65496775518_54.353308551841...,"[[54.350233677554, 18.65496775518], [54.350137...",1.502013,2.001959
8,100_190859,"(54.353308551841, 18.657762913813)","(54.353479766367, 18.657783951833)",10,0.499945,54.353308551841_18.657762913813_54.35347976636...,"[[54.353308551841, 18.657762913813], [54.35347...",2.001959,2.021074
9,100_190860,"(54.353308551841, 18.657762913813)","(54.353479766367, 18.657783951833)",10,0.019116,54.353308551841_18.657762913813_54.35347976636...,"[[54.353308551841, 18.657762913813], [54.35347...",0.0,0.019116


visualization of divided bus network into segments:

In [None]:

segments = pd.read_csv("gdansk_04_18_11_2024\\segments.txt")
segments = segments.drop_duplicates('id')  


# creatig colours
num_colors = len(segments)
colors = plt.cm.get_cmap("nipy_spectral", num_colors)  
color_list = [mcolors.to_hex(colors(i)) for i in range(num_colors)]  

# creating map
map = folium.Map(location=[54.352, 18.646], zoom_start=13)

for i, row in enumerate(segments.itertuples()):
    count_value = row.count

    if pd.isna(count_value):  #or count_value > 220:
        continue  # omitting nan

    coords = ast.literal_eval(row.coordinates)
    color = color_list[i % num_colors]  # colour cycle

    folium.PolyLine(coords, color=color, weight=5).add_to(map)

display(map)
map.save('gdansk_04_18_11_2024\\segments_map.html')


for greater readability:

In [None]:

segments = pd.read_csv("gdansk_04_18_11_2024\\segments.txt")
segments = segments.drop_duplicates('id')  

unique_counts = sorted(segments['count'].unique())

# creating a slider that moves only between existing count values
slider = widgets.SelectionSlider(
    options=unique_counts, 
    value=unique_counts[0],  
    description="Count:",
    continuous_update=False  
)

# function to update the map based on the selected count value
def update_map(selected_count):
    map = folium.Map(location=[54.352, 18.646], zoom_start=13)

    for _, row in segments.iterrows():
        if row['count'] != selected_count:  # display only segments with the exact count
            continue

        coords = ast.literal_eval(row['coordinates'])
        color = "#{:06x}".format(random.randint(0, 0xFFFFFF))  # Random color

        folium.PolyLine(coords, color=color, weight=5).add_to(map)

    display(map)

# Connect the slider to the function
widgets.interactive(update_map, selected_count=slider)



3) Creating a file (`kdtree_with_shape_ids.joblib`) containing k-d trees, shape point coordinates, accumulated distances, and segment_id with shape_id as the key. This will be useful in step 4 to speed up the search process.

In [12]:
pairs_df = pd.read_csv("gdansk_04_18_11_2024\\shapes_pairs.txt")
pairs_df = pairs_df.sort_values(['shape_id', 'code'])


kdtree_dict = {}
for shape_id, group in pairs_df.groupby('shape_id'):
    
    
    group['coords1'] = group['coords1'].apply(ast.literal_eval).to_list() #coordinates of segment starting points
    coor3 = group['coords1'].to_list()
    dist1 = group['distance1'].to_list()
    tree = KDTree(coor3)

    coor3.append(group['coords2'].iloc[-1])
    dist1.append(group['distance2'].iloc[-1])

    kdtree_dict[shape_id] = [tree, coor3, dist1]
    
joblib.dump(kdtree_dict, 'gdansk_04_18_11_2024\\kdtree_with_shape_ids.joblib')


['gdansk_04_18_11_2024\\kdtree_with_shape_ids.joblib']