## Testing Access of CTA Bus Tracker

In [1]:
pip install requests

Note: you may need to restart the kernel to use updated packages.



### Pulls
Limit of 100,000 pulls/day/key. There are 107 bus routes. We can query up to 10 routes at once.

### Features

- vid: vehicle ID
- tmstmp: timestamp of the vehicle's last position update
- ...

In [2]:
# imports

import requests

In [3]:
# The API key is read from the environment so that it is never stored in the
# notebook. Set it before launching Jupyter, e.g. `export CTA_API_KEY=...`.
import os

api_key = os.environ["CTA_API_KEY"]

url_bus = f'https://www.ctabustracker.com/bustime/api/v3/?key={api_key}'
# Despite its name, this URL targets the getvehicles endpoint (routes 201 and 55).
url_predictions = f'https://www.ctabustracker.com/bustime/api/v3/getvehicles?key={api_key}&rt=201,55&format=json'

# A timeout keeps the cell from hanging indefinitely if the server stalls.
r = requests.get(url_predictions, timeout=30)

# print(r.text)



In [4]:
# Decode the JSON body of the response fetched above into Python objects.
bus = r.json()

In [5]:
bus

{'bustime-response': {'vehicle': [{'vid': '1570',
    'tmstmp': '20260218 17:31',
    'lat': '41.793087005615234',
    'lon': '-87.73565673828125',
    'hdg': '90',
    'pid': 5424,
    'rt': '55',
    'des': 'Museum of Science & Industry',
    'pdist': 5991,
    'dly': False,
    'tatripid': '204604',
    'origtatripno': '273985436',
    'tablockid': '55 -651',
    'zone': '',
    'mode': 1,
    'psgld': 'N/A',
    'stst': 61710,
    'stsd': '2026-02-18'},
   {'vid': '8057',
    'tmstmp': '20260218 17:30',
    'lat': '41.79317855834961',
    'lon': '-87.72671055793762',
    'hdg': '85',
    'pid': 5424,
    'rt': '55',
    'des': 'Museum of Science & Industry',
    'pdist': 8426,
    'dly': False,
    'tatripid': '204605',
    'origtatripno': '273985437',
    'tablockid': '55 -659',
    'zone': '',
    'mode': 1,
    'psgld': 'N/A',
    'stst': 62280,
    'stsd': '2026-02-18'},
   {'vid': '8019',
    'tmstmp': '20260218 17:30',
    'lat': '41.79350357055664',
    'lon': '-87.705194091

In [None]:
# import requests
# import pandas as pd
# import time
# from datetime import datetime
# from zoneinfo import ZoneInfo

# ####### Main Functions - Pedro's Version #######

# ##############################
# # FUNCTION 1 - API EXTRACTING
# ##############################

# def get_api(url):
#   r = requests.get(url_predictions)

#   if r.ok:
#     data = r.json()
#     print('API Request complete')
#     return data

#   else:
#     raise ValueError('Could not complete API Request')

# ##############################
# # FUNCTION 2 - SAVING DATA
# ##############################

# def save_data(data):
#   try:
#     vehicles = data['bustime-response']['vehicle']
#     df = pd.DataFrame(vehicles)

#     chicago_tz = ZoneInfo("America/Chicago")
#     timestamp = datetime.now(chicago_tz).strftime("%Y-%m-%d_%H-%M-%S")
#     filename = f"{timestamp}_bus_data.csv"

#     df.to_csv(filename, index=False)
#     print(f'Data saved to {filename}')

#     return filename, len(df)

#   except:
#     raise ValueError('No active vehicle data')


# ##############################
# # FUNCTION 3 - MAIN RUN
# ##############################
# def main(url, sleep_time=30, runtime_seconds=3600):
#   n_calls = 0
#   try:
#     start_time = time.time()
#     end_time = start_time + runtime_seconds

#     while time.time() - start_time < end_time:
#       data = get_api(url)
#       filename, n_rows = save_data(data)
#       n_calls += 1
#       print(f'{n_rows} rows were saved to {filename}')

#       time.sleep(sleep_time)

#   except Exception as e:
#       print(f"[{n_calls+1}] Error: {e}")


In [6]:
import os
import requests
import pandas as pd
import time
from datetime import datetime
from zoneinfo import ZoneInfo

# Time zone applied to every timestamp the collector generates
# (output file name, sweep log lines, and the `pulled_at` column).
CHICAGO_TZ = ZoneInfo("America/Chicago")

def chunk_list(xs, n=10):
    """Split ``xs`` into consecutive slices holding at most ``n`` items each.

    The slices keep the original order; the last one may be shorter than ``n``.
    """
    pieces = []
    for start in range(0, len(xs), n):
        pieces.append(xs[start:start + n])
    return pieces

def get_routes(api_key):
    """Return the route identifiers reported by the getroutes endpoint.

    Args:
        api_key: Key interpolated into the request URL.

    Returns:
        A list with the ``"rt"`` value of every route entry that has one.

    Raises:
        ValueError: If the response contains no routes; the message carries
            whatever the response holds under ``"error"``.
        Exception: Whatever ``raise_for_status()`` raises on an HTTP error.
    """
    print("Getting routes...")
    endpoint = (
        "https://www.ctabustracker.com/bustime/api/v3/getroutes"
        f"?key={api_key}&format=json"
    )
    response = requests.get(endpoint, timeout=30)
    response.raise_for_status()
    payload = response.json()

    route_entries = payload.get("bustime-response", {}).get("routes", [])
    if route_entries:
        route_ids = []
        for entry in route_entries:
            # Entries without an "rt" key are skipped rather than failing.
            if "rt" in entry:
                route_ids.append(entry["rt"])
        return route_ids

    err = payload.get("bustime-response", {}).get("error", [])
    raise ValueError(f"No routes returned. Error: {err}")

def get_api(url):
    """GET ``url`` (30 s timeout) and return the decoded JSON body.

    Raises:
        ValueError: If the response's ``ok`` flag is false; the message
            includes the HTTP status code.
    """
    response = requests.get(url, timeout=30)
    if not response.ok:
        raise ValueError(f"API request failed (status={response.status_code})")
    return response.json()

def append_vehicles_to_csv(data, outfile, pulled_at, rt_chunk):
    """Append the vehicles found in one API response to ``outfile``.

    Args:
        data: Decoded JSON response; vehicles are read from
            ``data["bustime-response"]["vehicle"]``.
        outfile: Path of the CSV file to append to (created if missing).
        pulled_at: Value stored in the ``pulled_at`` column of every row.
        rt_chunk: Value stored in the ``rt_chunk`` column of every row.

    Returns:
        The number of rows appended (0 when the response has no vehicles).
    """
    vehicles = data.get("bustime-response", {}).get("vehicle", None)
    if not vehicles:
        # no vehicles is normal sometimes; don't crash the whole run
        return 0

    df = pd.DataFrame(vehicles)

    # add metadata columns so you can trace pulls later
    df["pulled_at"] = pulled_at
    df["rt_chunk"] = rt_chunk

    # An existing but empty file still needs a header row.
    has_header = os.path.exists(outfile) and os.path.getsize(outfile) > 0

    if has_header:
        # Rows are appended without a header, so they must follow the column
        # order already on disk; otherwise a response whose keys differ in
        # order or content would silently shift values under the wrong header.
        existing_cols = list(pd.read_csv(outfile, nrows=0).columns)
        dropped = [col for col in df.columns if col not in existing_cols]
        if dropped:
            print(f"Warning: columns not in the header of {outfile} were dropped: {dropped}")
        df = df.reindex(columns=existing_cols)

    df.to_csv(outfile, mode="a", header=not has_header, index=False)
    return len(df)

def main(api_key, per_chunk_sleep=5, per_sweep_sleep=30, runtime_hours=1200, break_hours=300, out_dir="."):
    """Poll getvehicles for every route and append all results to one CSV file.

    The route list is fetched once and split into groups of 10. Each "sweep"
    requests every group in turn, sleeping between requests and between
    sweeps, until the runtime budget is used up. A failed request is logged
    and skipped so that a single error does not end the run.

    Args:
        api_key: Key interpolated into the request URLs.
        per_chunk_sleep: Seconds to sleep after each group request.
        per_sweep_sleep: Seconds to sleep after each full sweep.
        runtime_hours: Total time to keep polling, in hours.
        break_hours: Currently unused; kept so existing callers keep working.
        out_dir: Directory for the output CSV; created if it does not exist.

    Returns:
        Path of the CSV file that rows were appended to.
    """
    routes = get_routes(api_key)
    chunks = chunk_list(routes, n=10)

    # Create the output directory up front. Without it every append would fail
    # inside the per-call `except` below and the run would collect nothing.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    start_stamp = datetime.now(CHICAGO_TZ).strftime("%Y-%m-%d_%H-%M-%S")
    outfile = os.path.join(out_dir, f"bus_data_{start_stamp}_chicago.csv")

    print(f"Found {len(routes)} routes -> {len(chunks)} chunks")
    print(f"Writing EVERYTHING to one file:\n  {outfile}\n")
    print(f"Chunk sleep: {per_chunk_sleep}s | Sweep sleep: {per_sweep_sleep}s | Runtime: {runtime_hours} hours\n")

    start = time.time()
    end = start + runtime_hours*3600

    sweep_num = 0
    call_num = 0
    total_rows = 0

    while time.time() < end:
        sweep_num += 1
        print(f"--- Sweep {sweep_num} @ {datetime.now(CHICAGO_TZ).strftime('%Y-%m-%d %H:%M:%S %Z')} ---")

        for i, chunk in enumerate(chunks):
            # Stop mid-sweep once the runtime budget is exhausted.
            if time.time() >= end:
                break

            rt_param = ",".join(chunk)
            url = (
                "https://www.ctabustracker.com/bustime/api/v3/getvehicles"
                f"?key={api_key}&rt={rt_param}&format=json"
            )

            pulled_at = datetime.now(CHICAGO_TZ).strftime("%Y-%m-%d %H:%M:%S %Z")

            try:
                data = get_api(url)
                n_rows = append_vehicles_to_csv(data, outfile, pulled_at=pulled_at, rt_chunk=rt_param)
                call_num += 1
                total_rows += n_rows
                print(f"[Call {call_num}] chunk {i+1}/{len(chunks)}: appended {n_rows} rows (total {total_rows})")
            except Exception as e:
                # Deliberately broad: log the failure and move on to the next group.
                call_num += 1
                print(f"[Call {call_num}] chunk {i+1}/{len(chunks)} ERROR: {e}")

            # Never sleep past the end of the runtime budget.
            if time.time() < end:
                time.sleep(min(per_chunk_sleep, max(0, end - time.time())))

        if time.time() < end:
            sleep_now = min(per_sweep_sleep, max(0, end - time.time()))
            print(f"--- Sweep {sweep_num} complete. Sleeping {sleep_now:.0f}s ---\n")
            time.sleep(sleep_now)

    print(f"\nDone. Sweeps: {sweep_num}, calls: {call_num}, total rows written: {total_rows}")
    print(f"Output file: {outfile}")
    return outfile


In [None]:
# Read the API key from the environment instead of hard-coding it in the
# notebook (set it before launching Jupyter, e.g. `export CTA_API_KEY=...`).
key = os.environ["CTA_API_KEY"]
main(key)

# Calculating Delay Percentages

In [None]:
# Change file manually for now
# (the name follows the `bus_data_<start stamp>_chicago.csv` pattern written by main()).
file = pd.read_csv('bus_data_2026-02-18_17-31-35_chicago.csv')

## Analysis 1: Percentage of Delayed Observations per Route

In [None]:
def delay_percentage(df, route_col="rt", delay_col="dly"):
    '''
    Compute, per route, the share of observations flagged as delayed
    (delayed observations / total observations).

    Args:
        df: Observations, one row per vehicle reading. Not modified.
        route_col: Name of the column holding the route identifier.
        delay_col: Name of the column holding the delay flag.

    Returns:
        A DataFrame with columns ``route_col``, ``delayed_obs``, ``total_obs``
        and ``delay_pct`` (0-100), sorted by ``delay_pct`` in descending order.
    '''

    # Work on a copy so the caller's frame is left untouched, matching the
    # other analysis functions in this notebook.
    d = df.copy()

    # Ensuring that delays are read as booleans
    d[delay_col] = d[delay_col].astype(bool)

    delay = (d.groupby(route_col)[delay_col]
             .agg(delayed_obs="sum", total_obs="count")
             .assign(delay_pct=lambda x: 100 * x["delayed_obs"] / x["total_obs"])
             .reset_index()
             .sort_values("delay_pct", ascending=False))

    return delay

In [17]:
delay_stats = delay_percentage(file)
delay_stats.head()


Unnamed: 0,rt,delayed_obs,total_obs,delay_pct
1,100,3,12,25.0
73,59,3,24,12.5
100,84,2,18,11.111111
3,106,1,10,10.0
41,26,3,30,10.0


## Analysis 2: Delay rate per bus; Average per Route

In [None]:
def delay_per_bus_route(df, route_col="rt", vid_col="vid", delay_col="dly"):
    '''
    Compute each vehicle's delay rate, then average those rates per route.

    Every vehicle counts equally within its route, regardless of how many
    observations it has. The result holds ``route_col`` and
    ``delay_pct_vehicle_weighted`` (0-100), sorted in descending order.
    The input frame is not modified.
    '''
    frame = df.copy()

    # Make sure the delay flag is boolean before averaging it
    frame[delay_col] = frame[delay_col].astype(bool)

    # Step 1: share of delayed observations for each (route, vehicle) pair
    vehicle_share = frame.groupby([route_col, vid_col])[delay_col].mean()
    vehicle_share = vehicle_share.reset_index(name="delay_share_vid")

    # Step 2: unweighted mean of the vehicle shares within each route, as a percentage
    route_share = vehicle_share.groupby(route_col)["delay_share_vid"].mean()
    route_pct = (route_share * 100).reset_index(name="delay_pct_vehicle_weighted")

    return route_pct.sort_values("delay_pct_vehicle_weighted", ascending=False)


In [None]:
a1 = delay_per_bus_route(file)
a1.head()

Unnamed: 0,rt,delay_pct_vehicle_weighted
3,106,25.0
1,100,25.0
73,59,12.5
100,84,11.111111
41,26,10.0


## Analysis 3: Share of vehicles delayed by Route at each Sweep

In [None]:
def delay_vehicles_delayed(df, route_col="rt", timestamp_col="tmstmp", vid_col="vid", delay_col="dly"):
    '''
    Compute, per route, the average share of vehicles delayed at each timestamp.

    For every (route, timestamp) pair the share of distinct vehicles flagged
    as delayed is calculated; those shares are then averaged per route.

    Args:
        df: Observations, one row per vehicle reading. Not modified.
        route_col: Name of the column holding the route identifier.
        timestamp_col: Name of the column holding the observation timestamp.
        vid_col: Name of the column holding the vehicle identifier.
        delay_col: Name of the column holding the delay flag.

    Returns:
        A DataFrame with columns ``route_col`` and ``delay_pct_pull_weighted``
        (0-100), sorted by that percentage in descending order.
    '''

    d = df.copy()

    # Ensuring delay is read as boolean
    d[delay_col] = d[delay_col].astype(bool)

    # Dropping duplicates if no new observations
    d = d.drop_duplicates([route_col, timestamp_col, vid_col])

    per_pull = (d.groupby([route_col, timestamp_col])[delay_col]
                .mean()
                .reset_index(name="delay_share_tmstmp"))

    # The result column and the sort key must carry the same name; they
    # previously differed, which made sort_values raise KeyError.
    out = (per_pull.groupby(route_col)["delay_share_tmstmp"]
           .mean()
           .mul(100)
           .reset_index(name="delay_pct_pull_weighted")
           .sort_values("delay_pct_pull_weighted", ascending=False))

    return out


In [None]:
a2 = delay_vehicles_delayed(file, route_col="rt", vid_col="vid", delay_col="dly")

a2.head()

Unnamed: 0,rt,delay_pct_pull_weighted
1,100,25.0
73,59,12.5
100,84,11.111111
41,26,10.0
3,106,8.333333


In [21]:
file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3825 entries, 0 to 3824
Data columns (total 20 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   vid           3825 non-null   int64  
 1   tmstmp        3825 non-null   object 
 2   lat           3825 non-null   float64
 3   lon           3825 non-null   float64
 4   hdg           3825 non-null   int64  
 5   pid           3825 non-null   int64  
 6   rt            3825 non-null   object 
 7   des           3825 non-null   object 
 8   pdist         3825 non-null   int64  
 9   dly           3825 non-null   bool   
 10  tatripid      3825 non-null   int64  
 11  origtatripno  3825 non-null   int64  
 12  tablockid     3825 non-null   object 
 13  zone          0 non-null      float64
 14  mode          3825 non-null   int64  
 15  psgld         0 non-null      float64
 16  stst          3825 non-null   int64  
 17  stsd          3825 non-null   object 
 18  pulled_at     3825 non-null 