In [None]:
import pandas as pd

# Read GTFS stop_times.txt; every column is loaded as a string so identifiers
# keep their exact text and no dtype is inferred.
stop_times_df = pd.read_csv("gtfs/stop_times.txt", dtype=str, low_memory=False)

# Convert both time columns to pandas Timedelta, which can represent GTFS times
# past 24:00:00. Values that cannot be parsed become NaT (errors="coerce").
# arrival_time_td is created here as well: the recorded output of this cell lists
# it, but the previous code only produced departure_time_td, so a fresh
# Restart & Run All did not reproduce that output.
for time_col in ("arrival_time", "departure_time"):
    stop_times_df[f"{time_col}_td"] = pd.to_timedelta(stop_times_df[time_col], errors="coerce")

# Quick check. DataFrame.info() prints its report itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" to the output.
stop_times_df.info()
print(stop_times_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49066 entries, 0 to 49065
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype          
---  ------             --------------  -----          
 0   trip_id            49066 non-null  object         
 1   stop_id            49066 non-null  object         
 2   stop_sequence      49066 non-null  object         
 3   stop_headsign      49066 non-null  object         
 4   arrival_time       49066 non-null  object         
 5   departure_time     49066 non-null  object         
 6   timepoint          49066 non-null  object         
 7   arrival_time_td    49066 non-null  timedelta64[ns]
 8   departure_time_td  49066 non-null  timedelta64[ns]
dtypes: object(7), timedelta64[ns](2)
memory usage: 3.4+ MB
None
                     trip_id           stop_id stop_sequence stop_headsign  \
0  C01_route_1_0000001_16500  mxc_C01_P_STP_01             0      Libertad   
1  C01_route_1_0000001_16500  mxc_C01_P_STP_02         

In [None]:
# Group by trip_id and produce lists of stop_ids and timedeltas between consecutive stops
def _agg_trip(g):
    g = g.copy()
    g['stop_sequence_int'] = pd.to_numeric(g['stop_sequence'], errors='coerce')
    g = g.sort_values('stop_sequence_int')
    stops = g['stop_id'].tolist()
    times = g['departure_time_td']
    deltas = times.diff().iloc[1:].dt.total_seconds().astype(int).tolist()  # list of int seconds (length = len(stops)-1)
    return pd.Series({'stop_ids': stops, 'deltas': deltas, 'num_stops': len(stops)})

# Aggregate every trip into one row (stop list, inter-stop gaps, stop count).
# Only the columns _agg_trip reads are selected, so the grouping column trip_id
# is not handed to the applied function.
# NOTE(review): the echoed code line in the recorded output looks like the tail
# of the pandas warning about groupby.apply operating on the grouping columns;
# the explicit selection avoids that code path - confirm on a fresh run.
trip_columns = ['stop_id', 'stop_sequence', 'departure_time_td']
transit_stops_df = (
    stop_times_df.groupby('trip_id')[trip_columns]
    .apply(_agg_trip)
    .reset_index()
)

# quick check
print(transit_stops_df.head())

   trip_id                                           stop_ids  \
0    A03_1  [MM_A03_1, MM_A03_2, MM_A03_3, MM_A03_4, MM_A0...   
1    A03_2  [MM_A03_17, MM_A03_19, MM_A03_20, MM_C138_47, ...   
2    A05_1  [MM_A05_26, MM_A05_2, MM_A05_3, MM_A05_4, MM_A...   
3    A05_2  [MM_A05_14, MM_A05_15, MM_A05_16, MM_A05_17, M...   
4  A06_1_1  [MM_A06_1, MM_A06_2, MM_A06_4, MM_A06_5, MM_A0...   

                                              deltas  num_stops  
0  [74, 60, 49, 66, 57, 47, 49, 49, 47, 47, 46, 5...         17  
1  [95, 53, 55, 49, 51, 47, 47, 48, 49, 49, 47, 5...         18  
2  [50, 61, 51, 57, 43, 44, 86, 60, 55, 49, 62, 102]         13  
3   [83, 62, 50, 52, 60, 85, 44, 42, 57, 48, 68, 50]         13  
4  [51, 44, 51, 44, 41, 38, 38, 48, 46, 39, 44, 4...         29  


  transit_stops_df = stop_times_df.groupby('trip_id').apply(_agg_trip).reset_index()


In [None]:
# Summarise the per-trip table: row count, columns, non-null counts and dtypes.
transit_stops_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 989 entries, 0 to 988
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   trip_id    989 non-null    object
 1   stop_ids   989 non-null    object
 2   deltas     989 non-null    object
 3   num_stops  989 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 31.0+ KB


In [None]:
# Original GTFS source, kept for reference only:
#trips_df = pd.read_csv("gtfs/trips.txt", dtype=str, low_memory=False)
# Load the corrected copy instead: trips.txt was modified with Excel to fix
# issues with some shape allocations. Every column is read as a string.
# NOTE(review): a trip_headsign printed further down shows mojibake
# ('Mision de los Vi√±edos'), which suggests the Excel round trip mis-decoded
# the UTF-8 text - confirm against the original trips.txt.
trips_df = pd.read_excel("gtfs/trips_df_fixed.xlsx", dtype=str)

In [None]:
# Distinct service_id values present in trips_df (shown as the cell result).
set(trips_df['service_id'])

{'LD', 'LS', 'LV'}

In [None]:
# Choose one representative trip per shape_id, preferring service_id in the
# order 'LD' (highest), then 'LS', then 'LV'.
# Priority rank per service_id (a lower rank wins).
priority_map = {'LD': 0, 'LS': 1, 'LV': 2}

# Work on a copy of trips_df so the loaded table is left untouched.
# Trips without a shape_id are dropped first: they cannot represent any shape,
# and drop_duplicates would otherwise keep one of them as a row whose shape_id
# is missing, inflating the shape count printed below.
tmp = trips_df.dropna(subset=['shape_id']).copy()
# service_id values absent from priority_map get rank 999, so such a trip is
# only chosen when its shape has no trip with a known service_id.
tmp['service_prio'] = tmp['service_id'].map(priority_map).fillna(999).astype(int)

# Sort by shape_id, then rank, so the preferred service comes first within each
# shape; keeping the first row per shape_id selects the representative trip.
tmp = tmp.sort_values(['shape_id', 'service_prio'])
rep = tmp.drop_duplicates(subset='shape_id', keep='first').reset_index(drop=True)

# DataFrame with the assignments. .copy() makes it an independent frame, so
# columns assigned to it later do not act on a slice of rep.
transit_df = rep[['shape_id', 'route_id', 'trip_id', 'trip_headsign']].copy()

print(f"Assigned route_id for {len(transit_df)} shapes.")
print(transit_df.head())

Assigned route_id for 488 shapes.
  shape_id route_id                    trip_id          trip_headsign
0   C01_r1      C01                 C01_trip_1   Centro Metropolitano
1   C01_r2      C01                 C01_trip_2     San Juan de Ocotan
2   C02_r1      C02                 C02_trip_1   Centro Metropolitano
3   C02_r2      C02                 C02_trip_2  Mision de los Vi√±edos
4   C03_r1      C03  C03_route_1_1111110_18000              El Fresno


In [None]:
# Load GTFS shapes.txt with every column read as a string.
shapes_df = pd.read_csv("gtfs/shapes.txt", dtype=str, low_memory=False)

In [None]:
# Load GTFS routes.txt with every column read as a string.
routes_df = pd.read_csv("gtfs/routes.txt", dtype=str, low_memory=False)

In [None]:
# Identifiers available in shapes_df / routes_df ...
set_shapes = set(shapes_df['shape_id'])
set_routes = set(routes_df['route_id'])

# ... and the identifiers actually referenced by the selected trips in transit_df.
set_used_shapes, set_used_routes = (
    set(transit_df[id_col]) for id_col in ('shape_id', 'route_id')
)

# Referenced identifiers that are not available; both printed sets should be empty.
print(set_used_shapes.difference(set_shapes))
print(set_used_routes.difference(set_routes))

set()
set()


In [None]:
# Shapes and routes that are available but not referenced by any selected trip,
# i.e. the ones that won't be used.
print(set_shapes.difference(set_used_shapes))
print(set_routes.difference(set_used_routes))

{'T06-03_r1', 'T06-03_r2', 'C119_r2', 'C63_r2', 'C119_r1', 'T13C-1_r2', 'T14A-C02_r1', 'T07-C01_r1', 'T07-C01_r2', 'T13B-V1_r2', 'T15-1_r2', 'T16B-C01_r2', 'T13A-C01_r1', 'T07-C04_r2', 'T15-C03_r2', 'T15-C02_r1', 'T14A-C03_r1', 'T15-C04_r1', 'T15-C03_r1', 'T16A-C01_r2', 'T13A-C01_r2', 'T13B-C01-1_r2', 'T13A-C02_r2', 'T15-C04_r2', 'T15-C06_r1', 'T15-C05_r2', 'T15-1_r1', 'T15-C05_r1', 'T16B-C03_r2', 'T16A-C01_r1', 'T16B-C03_r1', 'T17-1_r1', 'T13C-2_r2', 'T07-C04_r1', 'T15-C06_r2', 'T16B-1_r1', 'T13B-C02-1_r2', 'T16B-1_r2', 'T13C-2_r1', 'T14A-C03_r2', 'T13B-C01-1_r1', 'T16B-C01_r1', 'T13C-1_r1', 'T15-C02_r2', 'T14A-C01_r2', 'T15-C01_r1', 'T13A-C03_r1', 'T13B-V2_r1', 'T17-1_r2', 'T13A-C03_r2', 'T14A-C02_r2', 'T13B-C02-1_r1', 'T15-C01_r2', 'T13B-V1_r1', 'T13B-V2_r2', 'T13A-C02_r1', 'T14A-C01_r1'}
{'T13A-C03', 'T15-C05', 'T15-C01', 'T13C-2', 'C119', 'T15-C04', 'C63', 'T16B-1', 'T13B-V1', 'T14A-C01', 'T15-C03', 'T13B-V2', 'T13A-C02', 'T15-C02', 'T13B-C02-1', 'T17-1', 'T07-C04', 'T14A-C03', 'T

In [None]:
# Convert shapes_df into a GeoDataFrame of LineStrings (one per shape_id)
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, LineString


# Ensure numeric coordinate/sequence columns (values that cannot be parsed become
# NaN) and drop rows lacking any of them. The columns are indexed directly so a
# missing column raises a KeyError here instead of silently yielding no shapes.
# shapes_df itself is converted in place, as before.
for numeric_col in ('shape_pt_lat', 'shape_pt_lon', 'shape_pt_sequence'):
    shapes_df[numeric_col] = pd.to_numeric(shapes_df[numeric_col], errors='coerce')
shapes_pts = (
    shapes_df
    .dropna(subset=['shape_pt_lat', 'shape_pt_lon', 'shape_pt_sequence'])
    .sort_values(['shape_id', 'shape_pt_sequence'])
    .copy()
)

# Create point geometries (x = longitude, y = latitude) in one vectorised call
# instead of building a Point per row with DataFrame.apply.
shapes_pts['geometry'] = gpd.points_from_xy(shapes_pts['shape_pt_lon'], shapes_pts['shape_pt_lat'])
gdf_shapes_points = gpd.GeoDataFrame(shapes_pts, geometry='geometry', crs='EPSG:4326')

# Build one geometry per shape_id. The points are already ordered by
# (shape_id, shape_pt_sequence) and groupby keeps the row order inside each
# group, so no per-group re-sort is needed. groupby never yields an empty group,
# so every shape has at least one coordinate.
lines = []
for shape_id, grp in gdf_shapes_points.groupby('shape_id'):
    coords = list(zip(grp['shape_pt_lon'], grp['shape_pt_lat']))
    # A shape with a single point cannot form a line; it is kept as a Point.
    geom = LineString(coords) if len(coords) > 1 else Point(coords[0])
    lines.append({'shape_id': shape_id, 'geometry': geom, 'shape_pt_count': len(coords)})

gdf_shapes = gpd.GeoDataFrame(lines, crs='EPSG:4326')

# Optional: attach the remaining attributes of the first point of each shape.
# drop_duplicates keeps the whole first row per shape_id, whereas
# groupby().first() takes the first non-null value of every column separately
# and could therefore combine values from different points.
first_rows = shapes_pts.drop_duplicates(subset='shape_id', keep='first').reset_index(drop=True)
gdf_shapes = gdf_shapes.merge(
    first_rows.drop(columns=['shape_pt_lat', 'shape_pt_lon', 'geometry', 'shape_pt_sequence']),
    on='shape_id',
    how='left',
)

print('Number of shapes created:', len(gdf_shapes))
gdf_shapes.head()

Number of shapes created: 545


Unnamed: 0,shape_id,geometry,shape_pt_count
0,C01_r1,"LINESTRING (-103.45304 20.71202, -103.45321 20...",362
1,C01_r2,"LINESTRING (-103.34199 20.68155, -103.34199 20...",336
2,C02_r1,"LINESTRING (-103.21471 20.65078, -103.21486 20...",289
3,C02_r2,"LINESTRING (-103.35247 20.67278, -103.35158 20...",238
4,C03_r1,"LINESTRING (-103.32986 20.54759, -103.32955 20...",345


In [None]:
# Add to transit_df the geometry of each shape_id, stored as 'shape_geometry'.
# A 'shape_geometry' column left by a previous run of this cell is dropped first,
# so re-running the cell does not produce a second column with the same name.
# validate='many_to_one' makes the merge fail loudly if gdf_shapes ever holds
# more than one row per shape_id, which would otherwise duplicate trips.
transit_df = (
    transit_df
    .drop(columns=['shape_geometry'], errors='ignore')
    .merge(gdf_shapes[['shape_id', 'geometry']], on='shape_id', how='left', validate='many_to_one')
    .rename(columns={'geometry': 'shape_geometry'})
)
transit_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 488 entries, 0 to 487
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   shape_id        488 non-null    object  
 1   route_id        488 non-null    object  
 2   trip_id         488 non-null    object  
 3   trip_headsign   488 non-null    object  
 4   shape_geometry  488 non-null    geometry
dtypes: geometry(1), object(4)
memory usage: 19.2+ KB
