In [6]:
import time
import pandas as pd
from google.protobuf.json_format import MessageToDict
from urllib.request import urlopen
import gtfs_realtime_pb2

In [7]:
# Initialize the main DataFrame
df = pd.DataFrame()

# Initialize the DataFrame to store new trip information
new_trips_df = pd.DataFrame()


In [8]:
import csv

MAX_ROWS = 1000  # Maximum number of rows to keep in the DataFrame

# Create an empty DataFrame to store the delay data
delays_df = pd.DataFrame(columns=["trip_id", "stop_sequence", "arrival_delay"])

# Create an empty dictionary to store delays
delays = {}

while True:
    # Retrieve the data
    feed = gtfs_realtime_pb2.FeedMessage()
    response = urlopen(
        "https://s3.amazonaws.com/kcm-alerts-realtime-prod/tripupdates.pb"
    )
    feed.ParseFromString(response.read())

    # Convert protobuf feed to dictionary
    buses_dict = MessageToDict(feed)

    # Extract the relevant data and convert to DataFrame
    new_data = pd.json_normalize(buses_dict["entity"])

    # Iterate through each row in the new_data DataFrame
    for index, row in new_data.iterrows():
        try:
            trip_id = row["trip_update.trip.trip_id"]
            stop_time_updates = row["trip_update.stop_time_update"]

            # Iterate through each dictionary within the 'stop_time_updates' list
            for update in stop_time_updates:
                stop_sequence = update["stop_sequence"]

                # Check if the 'arrival' key exists in the dictionary
                if "arrival" in update:
                    arrival_delay = update["arrival"]["delay"]
                else:
                    arrival_delay = None

                # Check if the trip_id exists in the delays dictionary
                if trip_id in delays:
                    # Append the delay to the existing list
                    delays[trip_id].append((stop_sequence, arrival_delay))
                else:
                    # Create a new list for the trip_id and add the delay
                    delays[trip_id] = [(stop_sequence, arrival_delay)]

                # Append the delay information to the delays_df DataFrame
                delays_df = delays_df.append(
                    {
                        "trip_id": trip_id,
                        "stop_sequence": stop_sequence,
                        "arrival_delay": arrival_delay,
                    },
                    ignore_index=True,
                )

        except KeyError:
            # Handle missing keys gracefully
            continue

    # Check if the DataFrame exceeds the maximum number of rows
    if len(delays_df) > MAX_ROWS:
        # Save the delay data to a CSV file
        delays_df.to_csv("combined_data.csv", mode="a", header=False, index=False)

        # Clear the delay DataFrame to free up memory
        delays_df = pd.DataFrame(columns=["trip_id", "stop_sequence", "arrival_delay"])

    # Wait for 30 seconds before refreshing the data
    time.sleep(30)

# Save any remaining delay data to the CSV file before exiting
if not delays_df.empty:
    delays_df.to_csv("combined_data.csv", mode="a", header=False, index=False)

KeyboardInterrupt: 