# DS4420 Transit Data EDA
Exploring GTFS Schedule data for the MBTA. Loading data, observing features and preparing for analysis.

In [None]:
# imports
import pandas as pd
import numpy as np
import requests
from datetime import datetime, timedelta
import time

## Date Range

In [9]:
# MBTA API endpoint for alerts
url = "https://api-v3.mbta.com/alerts"

# Parameters to get alerts
params = {
    "api_key": "f8a9ad97579d4ed2978147f7187eced5",  # Replace with your API key
    "page[limit]": 700  # Adjust as needed
}

# Fetch alerts data
response = requests.get(url, params=params)
alerts_data = response.json()

# Extract 'created_at' dates from the alerts
dates = []
for alert in alerts_data.get('data', []):
    created_at = alert.get('attributes', {}).get('created_at')
    if created_at:
        dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
        dates.append(dt)

# Compute min and max dates if available
if dates:
    min_date = min(dates)
    max_date = max(dates)
    print("Minimum alert date:", min_date)
    print("Maximum alert date:", max_date)
else:
    print("No alert dates found.")


Minimum alert date: 2021-05-25 17:08:09-04:00
Maximum alert date: 2025-04-12 10:45:29-04:00


## predicted data for T by stop id

In [13]:
# MBTA API endpoint for alerts
url = "https://api-v3.mbta.com/predictions?filter[stop]=70010" # predictions for stop 70010, e.g., Orange Line Ruggles in the direction of Forest Hills

# Parameters to get alerts
params = {
    "api_key": "f8a9ad97579d4ed2978147f7187eced5",  # Replace with your API key
    "page[limit]": 700  # Adjust as needed
}

# Fetch predictions data
response = requests.get(url, params=params)
predictions_data = response.json()

# Extract relevant information from the predictions
predictions = []
for prediction in predictions_data.get('data', []):
    attributes = prediction.get('attributes', {})
    predictions.append({
        'arrival_time': attributes.get('arrival_time'),
        'departure_time': attributes.get('departure_time'),
        'departure_uncertainty': attributes.get('departure_uncertainty'),
        'direction_id': attributes.get('direction_id'),
        'status': attributes.get('status'),
        'stop_sequence': attributes.get('stop_sequence'),
        'schedule_relationship': attributes.get('schedule_relationship')
    })

# Display the predictions
if predictions:
    for pred in predictions:
        print(pred)
else:
    print("No predictions found.")

{'arrival_time': '2025-04-12T10:58:02-04:00', 'departure_time': '2025-04-12T10:58:56-04:00', 'departure_uncertainty': 60, 'direction_id': 0, 'status': None, 'stop_sequence': 140, 'schedule_relationship': None}
{'arrival_time': '2025-04-12T11:06:14-04:00', 'departure_time': '2025-04-12T11:07:08-04:00', 'departure_uncertainty': 60, 'direction_id': 0, 'status': None, 'stop_sequence': 140, 'schedule_relationship': None}
{'arrival_time': '2025-04-12T11:12:58-04:00', 'departure_time': '2025-04-12T11:13:52-04:00', 'departure_uncertainty': 60, 'direction_id': 0, 'status': None, 'stop_sequence': 140, 'schedule_relationship': None}
{'arrival_time': '2025-04-12T11:20:01-04:00', 'departure_time': '2025-04-12T11:20:55-04:00', 'departure_uncertainty': 120, 'direction_id': 0, 'status': None, 'stop_sequence': 140, 'schedule_relationship': None}
{'arrival_time': '2025-04-12T11:27:59-04:00', 'departure_time': '2025-04-12T11:28:53-04:00', 'departure_uncertainty': 120, 'direction_id': 0, 'status': None, '

## Scheduled Data for T by stop id

In [None]:
url="https://api-v3.mbta.com//schedules?filter[stop]=70010" # actual schedule for stop 70010, e.g., Orange Line Ruggles in the direction of Forest Hills

# Parameters to get alerts
params = {
    "api_key": "f8a9ad97579d4ed2978147f7187eced5",  # Replace with your API key
    "page[limit]": 700  # Adjust as needed
}
# Fetch predictions data
response = requests.get(url, params=params)
schedules_data = response.json()
# Extract relevant information from the predictions
schedules = []
for schedule in schedules_data.get('data', []):
    attributes = schedule.get('attributes', {})
    schedules.append({
        'arrival_time': attributes.get('arrival_time'),
        'departure_time': attributes.get('departure_time'),
        'departure_uncertainty': attributes.get('departure_uncertainty'),
        'direction_id': attributes.get('direction_id'),
        'status': attributes.get('status'),
        'stop_sequence': attributes.get('stop_sequence'),
        'schedule_relationship': attributes.get('schedule_relationship')
    })
# Display the predictions
if schedules:
    for schedule in schedules:
        print(schedule)
else:
    print("No schedules found.")


## calculating expected arrival delay

In [64]:
# index into schedules where the arrival time contains the same time as the first prediction with an error of +/- 3 minutes (when MBTA considers a train is "on time")
def find_matching_schedule(predictions, schedules):
    for i, pred in enumerate(predictions):
        arrival_time = pred['arrival_time']
        arrival_time_dt = datetime.fromisoformat(arrival_time.replace('Z', '+00:00'))
        # Check if the arrival time is within +/- 3 minutes of any schedule
        for j, schedule in enumerate(schedules):
            schedule_arrival_time = schedule['arrival_time']
            schedule_arrival_time_dt = datetime.fromisoformat(schedule_arrival_time.replace('Z', '+00:00'))
            if abs((arrival_time_dt - schedule_arrival_time_dt).total_seconds()) <= 180:
                return i, j  # Return the indices of the matching prediction and schedule
    return None, None  # Return None if no match is found
# Find the indices of the matching prediction and schedule
# Ensure predictions and schedules are not empty
if predictions and schedules:
    matching_pred_index, matching_sched_index = find_matching_schedule(predictions, schedules)
    if matching_sched_index is not None:
        print(f"Matching schedule index: {matching_sched_index}")
        print(f"Matching schedule: {schedules[matching_sched_index]}")
        print(f"Associated prediction: {predictions[matching_pred_index]}")
    else:
        print("No matching schedule found.")

Matching schedule index: 38
Matching schedule: {'arrival_time': '2025-04-12T10:56:00-04:00', 'departure_time': '2025-04-12T10:56:00-04:00', 'departure_uncertainty': None, 'direction_id': 0, 'status': None, 'stop_sequence': 140, 'schedule_relationship': None}
Associated prediction: {'arrival_time': '2025-04-12T10:58:02-04:00', 'departure_time': '2025-04-12T10:58:56-04:00', 'departure_uncertainty': 60, 'direction_id': 0, 'status': None, 'stop_sequence': 140, 'schedule_relationship': None}


In [77]:
# now, calculate the difference between the predicted arrival time and the actual arrival time
def calculate_time_difference_within_tolerance(predictions, schedules):
    time_differences = []
    for pred in predictions:
        arrival_time = pred['arrival_time']
        arrival_time_dt = datetime.fromisoformat(arrival_time.replace('Z', '+00:00'))
        for schedule in schedules:
            schedule_arrival_time = schedule['arrival_time']
            schedule_arrival_time_dt = datetime.fromisoformat(schedule_arrival_time.replace('Z', '+00:00'))
            # Check if the times are within +/- 3 minutes
            if abs((arrival_time_dt - schedule_arrival_time_dt).total_seconds()) <= 180:
                time_difference = (schedule_arrival_time_dt - arrival_time_dt).total_seconds() / 60  # Convert to minutes
                time_differences.append((schedule_arrival_time, arrival_time, time_difference))
                break  # Stop checking once a match is found
    return time_differences

# Calculate the time differences
if predictions and schedules:
    time_differences_within_tolerance = calculate_time_difference_within_tolerance(predictions, schedules)
    # print("Time differences within tolerance (in minutes):")
    # print(time_differences_within_tolerance)

# store this value in a numpy array with the associated scheduled arrival time and predicted arrival time
def create_time_difference_array_within_tolerance(predictions, schedules):
    time_diff_array = []
    for pred in predictions:
        arrival_time = pred['arrival_time']
        arrival_time_dt = datetime.fromisoformat(arrival_time.replace('Z', '+00:00'))
        for schedule in schedules:
            schedule_arrival_time = schedule['arrival_time']
            schedule_arrival_time_dt = datetime.fromisoformat(schedule_arrival_time.replace('Z', '+00:00'))
            # Check if the times are within +/- 3 minutes
            if abs((arrival_time_dt - schedule_arrival_time_dt).total_seconds()) <= 180:
                time_difference = (schedule_arrival_time_dt - arrival_time_dt).total_seconds()
                time_diff_array.append([schedule_arrival_time, time_difference])
                break  # Stop checking once a match is found
    return np.array(time_diff_array)

# Create the time difference array
if predictions and schedules:
    time_diff_array_within_tolerance = create_time_difference_array_within_tolerance(predictions, schedules)
    print("Time difference array of scheduled arrival times with tolerance (in seconds):")
    print(time_diff_array_within_tolerance)

Time difference array of scheduled arrival times with tolerance (in seconds):
[['2025-04-12T10:56:00-04:00' '-122.0']
 ['2025-04-12T11:04:00-04:00' '-134.0']
 ['2025-04-12T11:12:00-04:00' '-58.0']
 ['2025-04-12T11:20:00-04:00' '-1.0']
 ['2025-04-12T11:28:00-04:00' '1.0']
 ['2025-04-12T11:37:00-04:00' '-1.0']
 ['2025-04-12T11:45:00-04:00' '-1.0']
 ['2025-04-12T11:53:00-04:00' '-1.0']
 ['2025-04-12T12:01:00-04:00' '-1.0']
 ['2025-04-12T12:09:00-04:00' '-1.0']]


In [63]:
# sanity check, make suere there are no other arrival times in the schedule witihin +/- 3 minutes of the associated prediction
def find_other_matching_schedules(predictions, schedules, matching_sched_index):
    for i, schedule in enumerate(schedules):
        if i == matching_sched_index:
            continue  # Skip the matching schedule
        schedule_arrival_time = schedule['arrival_time']
        schedule_arrival_time_dt = datetime.fromisoformat(schedule_arrival_time.replace('Z', '+00:00'))
        # Check if the arrival time is within +/- 3 minutes of the associated prediction
        for j, pred in enumerate(predictions):
            arrival_time = pred['arrival_time']
            arrival_time_dt = datetime.fromisoformat(arrival_time.replace('Z', '+00:00'))
            if abs((arrival_time_dt - schedule_arrival_time_dt).total_seconds()) <= 180:
                return i  # Return the index of the other matching schedule
    return None  # Return None if no other match is found

## Weather data: 

In [44]:
url = "https://api.open-meteo.com/v1/forecast?latitude=42.3584&longitude=-71.0598&hourly=temperature_2m,precipitation,rain,showers,snowfall,snow_depth&minutely_15=temperature_2m,snowfall,precipitation,rain&past_days=14&forecast_days=16&wind_speed_unit=mph&temperature_unit=fahrenheit&precipitation_unit=inch"
# Fetch weather data
response = requests.get(url)
weather_data = response.json()


# Restructure the weather data
weather = []
for i, time in enumerate(weather_data['minutely_15']['time']):
    weather.append({
        'time': time,
        'temperature_2m': weather_data['minutely_15']['temperature_2m'][i],
        'precipitation': weather_data['minutely_15']['precipitation'][i],
    })

# change dictionaries to rows
weather_array = np.array([list(d.values()) for d in weather])

# add bias
weather_array = np.insert(weather_array, 0, 1, axis=1)

In [45]:
weather_array

array([['1', '2025-03-29T00:00', '54.1', '0.0'],
       ['1', '2025-03-29T00:15', '53.6', '0.0'],
       ['1', '2025-03-29T00:30', '52.4', '0.0'],
       ...,
       ['1', '2025-04-27T23:15', '68.8', '0.0'],
       ['1', '2025-04-27T23:30', '68.0', '0.0'],
       ['1', '2025-04-27T23:45', '67.3', '0.0']], dtype='<U32')

# creating x and y

In [87]:
print(time_diff_array_within_tolerance)
print(f"train delay shape: {time_diff_array_within_tolerance.shape}")

[['2025-04-12T10:56:00-04:00' '-122.0']
 ['2025-04-12T11:04:00-04:00' '-134.0']
 ['2025-04-12T11:12:00-04:00' '-58.0']
 ['2025-04-12T11:20:00-04:00' '-1.0']
 ['2025-04-12T11:28:00-04:00' '1.0']
 ['2025-04-12T11:37:00-04:00' '-1.0']
 ['2025-04-12T11:45:00-04:00' '-1.0']
 ['2025-04-12T11:53:00-04:00' '-1.0']
 ['2025-04-12T12:01:00-04:00' '-1.0']
 ['2025-04-12T12:09:00-04:00' '-1.0']]
train delay shape: (10, 2)


In [86]:
print(weather_array)
print(f"Weather data shape: {weather_array.shape}")

[['1' '2025-03-29T00:00' '54.1' '0.0']
 ['1' '2025-03-29T00:15' '53.6' '0.0']
 ['1' '2025-03-29T00:30' '52.4' '0.0']
 ...
 ['1' '2025-04-27T23:15' '68.8' '0.0']
 ['1' '2025-04-27T23:30' '68.0' '0.0']
 ['1' '2025-04-27T23:45' '67.3' '0.0']]
Weather data shape: (2880, 4)


In [107]:
# match the index of the weather array to the index of the time difference array by comparing the time of the weather array to the time of the time difference array.
# if the arrival time is within 15 minutes past the weather time, then that arrival time gets that weather data associated with it
def match_weather_to_time_diff(weather_array, time_diff_array_within_tolerance):
    matched_data = []
    for i, weather in enumerate(weather_array):
        weather_time = datetime.fromisoformat(weather[1].replace('Z', '+00:00')).astimezone(None)
        for j, time_diff in enumerate(time_diff_array_within_tolerance):
            schedule_arrival_time = datetime.fromisoformat(time_diff[0].replace('Z', '+00:00')).astimezone(None)
            # Calculate actual time by adding delay to scheduled time
            delay_seconds = float(time_diff[1])
            actual_arrival_time = schedule_arrival_time + timedelta(seconds=abs(delay_seconds))
            
            if abs((schedule_arrival_time - weather_time).total_seconds()) <= 900:
                matched_data.append({
                    'bias': weather[0],                   # Bias from weather data
                    'precip': weather[3],                 # Precipitation from weather data
                    'temp': weather[2],                   # Temperature from weather data
                    'delay': time_diff[1],                # Time difference (delay)
                    'scheduled_time': time_diff[0],       # Scheduled arrival time
                    'actual_time': actual_arrival_time.isoformat()  # Calculated actual arrival time
                })
                break  # Stop checking once a match is found
    return matched_data

# Match the weather data to the time difference array
if time_diff_array_within_tolerance.size > 0 and weather_array.size > 0:
    matched_weather_data = match_weather_to_time_diff(weather_array, time_diff_array_within_tolerance)
    # print("Matched weather data:")
    for data in matched_weather_data:
        # print(data)
        pass
else:
    print("No matched weather data found.")

# create a numpy array from the data
def create_matched_weather_array(matched_weather_data):
    matched_weather_array = []
    for data in matched_weather_data:
        matched_weather_array.append([
            data['bias'],           # Bias
            data['precip'],         # Precipitation
            data['temp'],           # Temperature
            data['delay'],          # Delay (time difference)
        ])
    return np.array(matched_weather_array)

# Create the matched weather array
if matched_weather_data:
    matched_weather_array = create_matched_weather_array(matched_weather_data)
    print("Matched weather array:")
    print(matched_weather_array)
else:
    print("No matched weather data found.")

Matched weather array:
[['1' '0.024' '34.3' '-122.0']
 ['1' '0.0' '34.2' '-122.0']
 ['1' '0.02' '33.8' '-134.0']
 ['1' '0.016' '33.9' '-1.0']
 ['1' '0.008' '34.0' '-1.0']
 ['1' '0.0' '34.3' '-1.0']
 ['1' '0.004' '34.0' '-1.0']]


In [95]:
matched_weather_array

array([['2025-04-12T10:56:00-04:00', '1', '2025-04-12T10:45', '34.3',
        '0.024'],
       ['2025-04-12T10:56:00-04:00', '1', '2025-04-12T11:00', '34.2',
        '0.0'],
       ['2025-04-12T11:04:00-04:00', '1', '2025-04-12T11:15', '33.8',
        '0.02'],
       ['2025-04-12T11:20:00-04:00', '1', '2025-04-12T11:30', '33.9',
        '0.016'],
       ['2025-04-12T11:37:00-04:00', '1', '2025-04-12T11:45', '34.0',
        '0.008'],
       ['2025-04-12T11:45:00-04:00', '1', '2025-04-12T12:00', '34.3',
        '0.0'],
       ['2025-04-12T12:01:00-04:00', '1', '2025-04-12T12:15', '34.0',
        '0.004']], dtype='<U25')