In [26]:
# Add the src directory to Python path so we can import from src modules
import sys
sys.path.append('../src')

# Import existing BigQuery utilities
from bigquery.bigquery_client import _bq_client
from common.logging_utils import logger
import pandas as pd

# Configuration

In [27]:
# Set your GCP project and dataset
# PROJECT_ID and DATASET are interpolated into every fully-qualified
# `project.dataset.table` name in the queries of this notebook.
PROJECT_ID = "regal-dynamo-470908-v9"
DATASET = "auckland_data_dev"
# Service day to build the schedule for, as an integer in YYYYMMDD form
# (it is split with // and % into year, month and day further down).
SERVICE_DATE = 20250914  # 14 September 2025

# Step 1: Get Feed Hash

In [30]:
# Pick the feed whose validity window covers SERVICE_DATE; when several
# match, the one with the latest ingestion_timestamp wins (LIMIT 1).
client = _bq_client(PROJECT_ID)
query = f"""
SELECT feed_hash FROM `{PROJECT_ID}.{DATASET}.sc_feed_info` 
WHERE feed_start_date <= {SERVICE_DATE} 
AND feed_end_date >= {SERVICE_DATE} 
ORDER BY ingestion_timestamp DESC LIMIT 1
"""
feed_df = client.query(query).to_dataframe()

# Handle the "nothing matched" case first so later cells can test
# APPLICABLE_FEED_HASH for truthiness.
if feed_df.empty:
    APPLICABLE_FEED_HASH = None
    logger.info("No feed found")
else:
    APPLICABLE_FEED_HASH = feed_df['feed_hash'].iloc[0]
    logger.info(f"Feed hash: {APPLICABLE_FEED_HASH}")

INFO - Feed hash: 9c9a42de469af40e4367e411c387577a


# Step 2: Get Active Services

In [36]:
if not APPLICABLE_FEED_HASH:
    # Without a feed there is nothing to resolve.
    active_services = []
    logger.info("No feed hash")
else:
    from datetime import date

    # Split the YYYYMMDD integer into its parts and derive the lowercase
    # weekday name, which is used as a column name in sc_calendar.
    date_obj = date(SERVICE_DATE // 10000, SERVICE_DATE // 100 % 100, SERVICE_DATE % 100)
    day_name = date_obj.strftime('%A').lower()

    # Services scheduled by the weekly calendar for this weekday
    calendar_query = f"""
    SELECT service_id FROM `{PROJECT_ID}.{DATASET}.sc_calendar`
    WHERE feed_hash = '{APPLICABLE_FEED_HASH}'
    AND start_date <= {SERVICE_DATE} AND end_date >= {SERVICE_DATE}
    AND {day_name} = 1
    """
    # Services explicitly added on this date (exception_type = 1)
    added_query = f"""
    SELECT service_id FROM `{PROJECT_ID}.{DATASET}.sc_calendar_dates`
    WHERE feed_hash = '{APPLICABLE_FEED_HASH}'
    AND date = {SERVICE_DATE} AND exception_type = 1
    """
    # Services explicitly removed on this date (exception_type = 2)
    removed_query = f"""
    SELECT service_id FROM `{PROJECT_ID}.{DATASET}.sc_calendar_dates`
    WHERE feed_hash = '{APPLICABLE_FEED_HASH}'
    AND date = {SERVICE_DATE} AND exception_type = 2
    """

    # Run the three lookups in the same order: calendar, added, removed.
    calendar_df = client.query(calendar_query).to_dataframe()
    regular_services = set(calendar_df['service_id'])

    added_df = client.query(added_query).to_dataframe()
    added_services = set(added_df['service_id'])

    removed_df = client.query(removed_query).to_dataframe()
    removed_services = set(removed_df['service_id'])

    # Active = (regular ∪ added) \ removed, materialised as a list
    active_services = list(
        regular_services.union(added_services).difference(removed_services)
    )
    logger.info(f"Active service ids: {len(active_services)}")

INFO - Active services: 16


# Step 3: Get Trips

In [43]:
if not APPLICABLE_FEED_HASH:
    trips_df = pd.DataFrame()
    logger.info("No feed hash")
else:
    # Pull every trip of the feed together with its route attributes;
    # the service filter is applied afterwards in pandas, not in SQL.
    trips_query = f"""
    SELECT
        t.service_id, t.route_id, r.route_short_name, r.route_type,
        t.trip_id, t.trip_headsign, t.direction_id, t.shape_id
    FROM `{PROJECT_ID}.{DATASET}.sc_trips` t
    LEFT JOIN `{PROJECT_ID}.{DATASET}.sc_routes` r
    ON t.route_id = r.route_id AND t.feed_hash = r.feed_hash
    WHERE t.feed_hash = '{APPLICABLE_FEED_HASH}'
    """
    all_trips_df = client.query(trips_query).to_dataframe()

    # Keep only trips whose service runs on the service date; with no
    # active services the result is a completely empty frame.
    trips_df = (
        all_trips_df.loc[all_trips_df['service_id'].isin(active_services)]
        if active_services
        else pd.DataFrame()
    )

    logger.info(f"All trips: {len(all_trips_df)}")
    logger.info(f"Filtered trips: {len(trips_df)}")

INFO - All trips: 29302
INFO - Filtered trips: 10493
INFO - Filtered trips: 10493


# Step 4: Get Stop Times

In [74]:
def convert_gtfs_time_to_utc(service_date, time_str, tz_name='Pacific/Auckland'):
    """Convert a GTFS time string to a timezone-aware UTC datetime.

    GTFS times are not wall-clock times: they are an elapsed duration
    measured from "noon minus 12h" of the service day. On ordinary days that
    reference point is local midnight; on daylight-saving transition days it
    is one hour off local midnight. Anchoring on local noon (which is never
    ambiguous or nonexistent) and doing the arithmetic in UTC keeps times
    correct across DST changes, including times past 24:00:00 that run into
    the following day.

    Args:
        service_date: Service day as an integer in YYYYMMDD form.
        time_str: GTFS time as ``H:MM:SS`` / ``HH:MM:SS``; hours may be 24 or
            more. ``None``, NaN/``pd.NA`` and the empty string are treated as
            missing.
        tz_name: IANA timezone name of the feed. Defaults to
            ``'Pacific/Auckland'``.

    Returns:
        A ``datetime`` with ``pytz.UTC`` tzinfo, or ``None`` when ``time_str``
        is missing.

    Raises:
        ValueError: If ``time_str`` is not three colon-separated integers or
            ``service_date`` does not encode a valid calendar date.
    """
    from datetime import datetime, timedelta
    import pytz

    # pd.isna goes first so pd.NA never reaches a truthiness test (which raises).
    if pd.isna(time_str) or not time_str:
        return None

    # Parse the time string; hours are allowed to exceed 23.
    hours, minutes, seconds = map(int, time_str.split(':'))

    # Split the YYYYMMDD integer into calendar parts.
    year = service_date // 10000
    month = (service_date % 10000) // 100
    day = service_date % 100

    # Reference instant: local noon of the service day minus 12 hours,
    # computed in UTC so a DST shift earlier that day is accounted for.
    local_tz = pytz.timezone(tz_name)
    noon_local = local_tz.localize(datetime(year, month, day, 12))
    service_day_start_utc = noon_local.astimezone(pytz.UTC) - timedelta(hours=12)

    # Add the GTFS time as a plain elapsed duration.
    return service_day_start_utc + timedelta(hours=hours, minutes=minutes, seconds=seconds)

if not trips_df.empty and APPLICABLE_FEED_HASH:
    # Get ALL stop times for this feed (no filtering in SQL); stop attributes
    # are attached with a LEFT JOIN so stop times without a matching stop row
    # are still returned.
    stop_times_query = f"""
    SELECT
        st.trip_id, st.stop_id, st.stop_sequence, s.stop_code, s.stop_name, st.stop_headsign, st.arrival_time, st.departure_time,
        s.stop_lat, s.stop_lon, st.shape_dist_traveled
    FROM `{PROJECT_ID}.{DATASET}.sc_stop_times` st
    LEFT JOIN `{PROJECT_ID}.{DATASET}.sc_stops` s
    ON st.stop_id = s.stop_id AND st.feed_hash = s.feed_hash
    WHERE st.feed_hash = '{APPLICABLE_FEED_HASH}'
    """
    stop_times_df = client.query(stop_times_query).to_dataframe()

    # Convert the GTFS time strings to UTC datetimes (shorter field names).
    stop_times_df['arrival'] = stop_times_df['arrival_time'].apply(
        lambda x: convert_gtfs_time_to_utc(SERVICE_DATE, x)
    )
    stop_times_df['departure'] = stop_times_df['departure_time'].apply(
        lambda x: convert_gtfs_time_to_utc(SERVICE_DATE, x)
    )

    # Epoch seconds. Missing values must be tested with pd.notna rather than
    # truthiness: if pandas stores the converted column as datetimes, a missing
    # entry is NaT, which is truthy and raises on .timestamp().
    stop_times_df['arrival_s'] = stop_times_df['arrival'].apply(
        lambda x: int(x.timestamp()) if pd.notna(x) else None
    )
    stop_times_df['departure_s'] = stop_times_df['departure'].apply(
        lambda x: int(x.timestamp()) if pd.notna(x) else None
    )

    # Inner join with the active trips: drops stop times of trips that do
    # not run on the service date.
    schedule_df = stop_times_df.merge(trips_df, on='trip_id', how='inner')

    logger.info(f"Stop times: {len(stop_times_df)}")
    logger.info(f"Schedule entries: {len(schedule_df)}")
else:
    # No trips (or no feed): leave empty frames so later cells can test .empty.
    stop_times_df = pd.DataFrame()
    schedule_df = pd.DataFrame()
    logger.info("No stop times")

INFO - Stop times: 870102
INFO - Schedule entries: 315112
INFO - Schedule entries: 315112


# Step 5: Build Schedule

In [75]:
if schedule_df.empty:
    schedule_df = pd.DataFrame()
    logger.info("No schedule data")
else:
    # Stamp each row with the service date, then order rows by route,
    # direction, trip and position within the trip.
    schedule_df = schedule_df.assign(service_date=SERVICE_DATE).sort_values(
        by=['route_short_name', 'direction_id', 'trip_id', 'stop_sequence']
    )

    # Preferred column layout, assembled from logical groups.
    logical_column_order = (
        # Identity
        ['service_date', 'service_id', 'route_id', 'trip_id', 'stop_id', 'stop_sequence']
        # Route info
        + ['route_short_name', 'route_type', 'direction_id']
        # Trip info
        + ['trip_headsign']
        # Stop info
        + ['stop_code', 'stop_name', 'stop_headsign']
        # Time fields (original GTFS + converted)
        + ['arrival_time', 'departure_time', 'arrival', 'departure', 'arrival_s', 'departure_s']
        # Other
        + ['stop_lat', 'stop_lon', 'shape_id', 'shape_dist_traveled']
    )

    # Columns missing from the frame are skipped rather than raising;
    # columns not named in the layout are dropped.
    existing_columns = [name for name in logical_column_order if name in schedule_df.columns]
    schedule_df = schedule_df.loc[:, existing_columns]

    logger.info(f"Final schedule: {len(schedule_df)} entries")
    logger.info(f"Columns: {list(schedule_df.columns)}")

INFO - Final schedule: 315112 entries
INFO - Columns: ['service_date', 'service_id', 'route_id', 'trip_id', 'stop_id', 'stop_sequence', 'route_short_name', 'route_type', 'direction_id', 'trip_headsign', 'stop_code', 'stop_name', 'stop_headsign', 'arrival_time', 'departure_time', 'arrival', 'departure', 'arrival_s', 'departure_s', 'stop_lat', 'stop_lon', 'shape_id', 'shape_dist_traveled']
INFO - Columns: ['service_date', 'service_id', 'route_id', 'trip_id', 'stop_id', 'stop_sequence', 'route_short_name', 'route_type', 'direction_id', 'trip_headsign', 'stop_code', 'stop_name', 'stop_headsign', 'arrival_time', 'departure_time', 'arrival', 'departure', 'arrival_s', 'departure_s', 'stop_lat', 'stop_lon', 'shape_id', 'shape_dist_traveled']


In [76]:
# Display the column index of the final schedule to confirm the column order.
schedule_df.columns

Index(['service_date', 'service_id', 'route_id', 'trip_id', 'stop_id',
       'stop_sequence', 'route_short_name', 'route_type', 'direction_id',
       'trip_headsign', 'stop_code', 'stop_name', 'stop_headsign',
       'arrival_time', 'departure_time', 'arrival', 'departure', 'arrival_s',
       'departure_s', 'stop_lat', 'stop_lon', 'shape_id',
       'shape_dist_traveled'],
      dtype='object')

In [77]:
# Display the first row of the schedule as a quick sanity check of the values.
# Note: .iloc[0] raises IndexError when schedule_df is empty.
schedule_df.iloc[0]

service_date                                        20250914
service_id                                          Sunday-1
route_id                                             105-202
trip_id                          1255-10501-22500-2-4d9a247b
stop_id                                        8496-67820052
stop_sequence                                              1
route_short_name                                         105
route_type                                                 3
direction_id                                               0
trip_headsign          Westmere To Britomart Via Richmond Rd
stop_code                                               8496
stop_name                                  Cox's Bay Reserve
stop_headsign                                      BRITOMART
arrival_time                                        06:15:00
departure_time                                      06:15:00
arrival                            2025-09-13 18:15:00+00:00
departure               

# Summary
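Built a daily stop-level schedule for `SERVICE_DATE` from GTFS data stored in BigQuery: the applicable feed is looked up, the services active on that date are resolved from the calendar and its exceptions, their trips are joined to stop times, and GTFS times are converted to UTC datetimes and epoch seconds.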