In [1]:
import pandas as pd
import numpy as np
import requests
import zipfile 
import kaggle   


# https://open.toronto.ca/dataset/ttc-bus-delay-data/

In [2]:
# toronto Open Data is stored in a CKAN instance. It's APIs are documented here:
# https://docs.ckan.org/en/latest/api/

# to hit our API, the URL is:
base_url = "https://ckan0.cf.opendata.inter.prod-toronto.ca"

# datasets are called "packages". Each package can contain many "resources"
# to retrieve the metadata for this package and its resources, use the package name in this page's URL:
url = base_url + "/api/3/action/package_show"
params = { "id": "ttc-bus-delay-data"}
package = requests.get(url, params = params).json()

# to get resource data:
for idx, resource in enumerate(package["result"]["resources"]):

    # to get metadata for non datastore_active resources:
    if not resource["datastore_active"]:
        url = base_url + "/api/3/action/resource_show?id=" + resource["id"]
        resource_metadata = requests.get(url).json()
        
        # use the "url" attribute to download this file


In [21]:
# read in data and parse datetime from separate date and time columns

ttc_delay = pd.read_excel(resource_metadata['result']['url'], parse_dates=[['Date','Time']])

  ttc_delay = pd.read_excel(resource_metadata['result']['url'], parse_dates=[['Date','Time']])


In [22]:
ttc_delay.head()

Unnamed: 0,Date_Time,Route,Day,Location,Incident,Min Delay,Min Gap,Direction,Vehicle
0,2023-01-01 02:30:00,91,Sunday,WOODBINE AND MORTIMER,Diversion,81,111,,8772
1,2023-01-01 02:34:00,69,Sunday,WARDEN STATION,Security,22,44,S,8407
2,2023-01-01 03:06:00,35,Sunday,JANE STATION,Cleaning - Unsanitary,30,60,N,1051
3,2023-01-01 03:14:00,900,Sunday,KIPLING STATION,Security,17,17,,3334
4,2023-01-01 03:43:00,85,Sunday,MEADOWALE LOOP,Security,1,1,,1559


In [5]:
# get basic description of dataset

ttc_delay.describe()

Unnamed: 0,Date,Min Delay,Min Gap,Vehicle
count,34981,34981.0,34981.0,34981.0
mean,2023-05-07 23:25:42.563105536,20.823418,33.46611,5499.208685
min,2023-01-01 00:00:00,0.0,0.0,0.0
25%,2023-03-05 00:00:00,8.0,16.0,3130.0
50%,2023-05-10 00:00:00,11.0,21.0,7924.0
75%,2023-07-12 00:00:00,20.0,40.0,8547.0
max,2023-08-31 00:00:00,998.0,992.0,91024.0
std,,53.451456,55.58887,3847.960427


In [6]:
# check shape of dataset

ttc_delay.shape

(34981, 10)

In [7]:
# get idea of how many nulls there are in dataset

ttc_delay.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34981 entries, 0 to 34980
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       34981 non-null  datetime64[ns]
 1   Route      34609 non-null  object        
 2   Time       34981 non-null  object        
 3   Day        34981 non-null  object        
 4   Location   34981 non-null  object        
 5   Incident   34981 non-null  object        
 6   Min Delay  34981 non-null  int64         
 7   Min Gap    34981 non-null  int64         
 8   Direction  28412 non-null  object        
 9   Vehicle    34981 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(6)
memory usage: 2.7+ MB


In [8]:
# count unique values in incident type 

ttc_delay.Incident.value_counts()

Incident
Mechanical                          12509
Operations - Operator                6050
Security                             3225
Collision - TTC                      2599
Diversion                            2376
General Delay                        2047
Emergency Services                   1953
Utilized Off Route                   1464
Cleaning - Unsanitary                1292
Investigation                         814
Vision                                416
Road Blocked - NON-TTC Collision      190
Held By                                46
Name: count, dtype: int64

In [9]:
# count unique values in delay location

ttc_delay.Location.value_counts()

Location
KENNEDY STATION           740
KIPLING STATION           672
FINCH STATION             605
WILSON STATION            593
PIONEER VILLAGE STATIO    551
                         ... 
STEELES AND BARMARC         1
PRINCES' BLVD AND NOVA      1
YORK MILLS AND              1
MCOWAN AND HWY 7            1
PORT UNION RD AND LAWR      1
Name: count, Length: 8485, dtype: int64

In [26]:
# preparing for tableau 

ttc_delay['Date'] = pd.to_datetime(ttc_delay['Date_Time']).dt.date
ttc_delay['Time'] = pd.to_datetime(ttc_delay['Date_Time']).dt.time

In [27]:
ttc_delay.head()

Unnamed: 0,Date_Time,Route,Day,Location,Incident,Min Delay,Min Gap,Direction,Vehicle,Date,Time
0,2023-01-01 02:30:00,91,Sunday,WOODBINE AND MORTIMER,Diversion,81,111,,8772,2023-01-01,02:30:00
1,2023-01-01 02:34:00,69,Sunday,WARDEN STATION,Security,22,44,S,8407,2023-01-01,02:34:00
2,2023-01-01 03:06:00,35,Sunday,JANE STATION,Cleaning - Unsanitary,30,60,N,1051,2023-01-01,03:06:00
3,2023-01-01 03:14:00,900,Sunday,KIPLING STATION,Security,17,17,,3334,2023-01-01,03:14:00
4,2023-01-01 03:43:00,85,Sunday,MEADOWALE LOOP,Security,1,1,,1559,2023-01-01,03:43:00


In [25]:
ttc_delay.to_excel('ttc_delay_2023.xlsx', sheet_name='Data')

In [None]:
# correlation plot and EDA