In [1]:
import pandas as pd

# Read GTFS stop_times.txt; read everything as string to avoid dtype surprises
# (IDs and "HH:MM:SS" times must not be reinterpreted as numbers).
df = pd.read_csv("gtfs/stop_times.txt", dtype=str, low_memory=False)

# If present, convert arrival/departure times to pandas Timedelta (handles >24:00:00).
# errors="coerce" turns any unparseable value into NaT instead of raising.
for col in ("arrival_time", "departure_time"):
    if col in df.columns:
        df[col + "_td"] = pd.to_timedelta(df[col], errors="coerce")

# Quick check
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line after the report).
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49066 entries, 0 to 49065
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype          
---  ------             --------------  -----          
 0   trip_id            49066 non-null  object         
 1   stop_id            49066 non-null  object         
 2   stop_sequence      49066 non-null  object         
 3   stop_headsign      49066 non-null  object         
 4   arrival_time       49066 non-null  object         
 5   departure_time     49066 non-null  object         
 6   timepoint          49066 non-null  object         
 7   arrival_time_td    49066 non-null  timedelta64[ns]
 8   departure_time_td  49066 non-null  timedelta64[ns]
dtypes: object(7), timedelta64[ns](2)
memory usage: 3.4+ MB
None
                     trip_id           stop_id stop_sequence stop_headsign  \
0  C01_route_1_0000001_16500  mxc_C01_P_STP_01             0      Libertad   
1  C01_route_1_0000001_16500  mxc_C01_P_STP_02         

In [2]:
# Group by trip_id and produce lists of stop_ids and timedeltas between consecutive stops
def _agg_trip(g):
    g = g.copy()
    g['stop_sequence_int'] = pd.to_numeric(g['stop_sequence'], errors='coerce')
    g = g.sort_values('stop_sequence_int')
    stops = g['stop_id'].tolist()
    times = g['departure_time_td']
    deltas = times.diff().iloc[1:].tolist()  # list of pandas.Timedelta (length = len(stops)-1)
    return pd.Series({'stop_ids': stops, 'deltas': deltas, 'num_stops': len(stops)})

# Build one row per trip_id by applying _agg_trip to each trip's rows.
# Only the columns the aggregator reads are selected after the groupby: this
# keeps the grouping column out of the frames handed to apply(), which avoids
# the pandas deprecation warning about apply operating on grouping columns and
# gives the same result on older and newer pandas versions.
trip_stops = (
    df.groupby('trip_id')[['stop_id', 'stop_sequence', 'departure_time_td']]
    .apply(_agg_trip)
    .reset_index()
)

# quick check
print(trip_stops.head())

   trip_id                                           stop_ids  \
0    A03_1  [MM_A03_1, MM_A03_2, MM_A03_3, MM_A03_4, MM_A0...   
1    A03_2  [MM_A03_17, MM_A03_19, MM_A03_20, MM_C138_47, ...   
2    A05_1  [MM_A05_26, MM_A05_2, MM_A05_3, MM_A05_4, MM_A...   
3    A05_2  [MM_A05_14, MM_A05_15, MM_A05_16, MM_A05_17, M...   
4  A06_1_1  [MM_A06_1, MM_A06_2, MM_A06_4, MM_A06_5, MM_A0...   

                                              deltas  num_stops  
0  [0 days 00:01:14, 0 days 00:01:00, 0 days 00:0...         17  
1  [0 days 00:01:35, 0 days 00:00:53, 0 days 00:0...         18  
2  [0 days 00:00:50, 0 days 00:01:01, 0 days 00:0...         13  
3  [0 days 00:01:23, 0 days 00:01:02, 0 days 00:0...         13  
4  [0 days 00:00:51, 0 days 00:00:44, 0 days 00:0...         29  


  trip_stops = df.groupby('trip_id').apply(_agg_trip).reset_index()


In [3]:
# Summarize the per-trip table: row count, column dtypes and non-null counts.
trip_stops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 989 entries, 0 to 988
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   trip_id    989 non-null    object
 1   stop_ids   989 non-null    object
 2   deltas     989 non-null    object
 3   num_stops  989 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 31.0+ KB


In [4]:
# Load GTFS trips.txt, reading every column as a string.
trips_df = pd.read_csv("gtfs/trips.txt", dtype=str, low_memory=False)

In [5]:
# Show the distinct service_id values that occur in trips_df.
set(trips_df['service_id'])

{'D', 'LD', 'LS', 'LV', 'S', 'SD'}

In [6]:
# Pick a single representative trip for every shape_id.
# Preference follows service_id: 'LD' first, then 'LS', then 'LV'
# (a smaller number means a higher priority). Any other service_id
# falls back to rank 999 and is only chosen when nothing better exists.
priority_map = {'LD': 0, 'LS': 1, 'LV': 2}

# Attach the rank on a new frame so trips_df itself stays untouched.
tmp = trips_df.assign(
    svc_prio=trips_df['service_id'].map(priority_map).fillna(999).astype(int)
)

# Order by shape and then by rank: the first row of each shape is the preferred trip.
tmp = tmp.sort_values(by=['shape_id', 'svc_prio'])
rep = tmp.drop_duplicates(subset='shape_id', keep='first').reset_index(drop=True)

# Lookup: shape_id -> route_id of the chosen trip (shape_id is unique in rep).
shape_route_map = dict(zip(rep['shape_id'], rep['route_id']))

# Table with the chosen assignment per shape.
shape_routes = rep[['shape_id', 'route_id', 'trip_id', 'trip_headsign']]

print(f"Assigned route_id for {len(shape_routes)} shapes.")
print(shape_routes.head())

Assigned route_id for 466 shapes.
  shape_id route_id                    trip_id          trip_headsign
0   C01_r1      C01                 C01_trip_1   Centro Metropolitano
1   C01_r2      C01                 C01_trip_2     San Juan de Ocotan
2   C02_r1      C02                 C02_trip_1   Centro Metropolitano
3   C02_r2      C02                 C02_trip_2  Mision de los ViÃ±edos
4   C03_r1      C03  C03_route_1_1111110_18000              El Fresno


In [7]:
# Load GTFS shapes.txt, reading every column as a string.
shapes_df = pd.read_csv("gtfs/shapes.txt", dtype=str, low_memory=False)

In [8]:
# Load GTFS routes.txt, reading every column as a string.
routes_df = pd.read_csv("gtfs/routes.txt", dtype=str, low_memory=False)

In [9]:
# Referential check: every shape_id / route_id picked in shape_routes should
# also exist in shapes.txt / routes.txt respectively.

# Shapes: identifiers declared in shapes_df vs. identifiers actually used.
set_shapes = set(shapes_df['shape_id'])
set_used_shapes = set(shape_routes['shape_id'])

# Routes: identifiers declared in routes_df vs. identifiers actually used.
set_routes = set(routes_df['route_id'])
set_used_routes = set(shape_routes['route_id'])

# Anything printed here is used by a trip but missing from its reference table.
print(set_used_shapes.difference(set_shapes))
print(set_used_routes.difference(set_routes))

{'LM_V02_r2', 'LM_V03_r1', 'LM_C03_r2', 'LM_V02_r1', 'LM_V01_r2', 'LM_C02_r2', 'LM_C03_r1', 'LM_C01_r1', 'LM_C02_r1', 'LM_V01_r1', 'LM_C01_r2', 'LM_V03_r2'}
set()
