# Comparing GTFS stops and OSM public transport points
General Transit Feed Specification (GTFS) data will be used for public transport analysis wherever it is available. However, GTFS data is not available for all cities, so an additional analysis using OSM public transport data needs to be undertaken for all cities.

### The following key-value tags were used to identify public transport stops in OSM destination datasets:   

| Key | Value |
| --- | --- |
| aos_nodes_30m_line | point |
| public_transport | platform |
| public_transport | stop_position | 
| highway | bus_stop | 
| highway | platform |  
| railway | platform | 
| public_transport | station | 
| amenity | ferry_terminal |  
| railway | tram_stop | 
| railway | stop | 

### The following criteria are used to determine frequent PT stops using GTFS datasets:  
**Four transit modes:**
- For train, stop = station (not platform)   
- For tram and bus, stop = physical stop (stops on opposite sides of the road are considered separately)  
- For ferry, stop = platform/wharf  
- Coach stops are not included by design (but would be excluded based on frequency criteria in any case)  

**Stops operate on a daily basis?**
- Operates during normal daytime hours, from 7am to 7pm  
- Has a maximum headway less than or equal to 20 min
- Operates on a usual weekday (Monday to Friday). (Note: for some cities it is difficult to determine a “usual weekday” – some modes run different timetables on different weekdays, and timetable changes are introduced frequently.)  

**Stops operate on a “usual weekday”?**
- Select feeds from 2019 and aim for the same season or school term (e.g. Spring-Summer school term time) to ensure comparability and consistency as much as possible.   

- Select a usual or representative week within the feed period that fully captures the PT services provided in a city. This temporal filtering approach is adopted from [Kujala et al. (2018) A collection of public transport network data sets for 25 cities](https://www.nature.com/articles/sdata201889#Sec21). The goal is to obtain as 'usual' a week as possible (excluding public holidays etc.), which should contain at least 0.9 (the default) of the total maximum of trips.   



In [7]:
import pandas as pd
import geopandas as gpd
import os
import time   
import networkx as nx
import osmnx as ox
import numpy as np
import matplotlib.pyplot as plt

import calendar
import datetime
from datetime import timedelta

import urbanaccess as ua

# module with functions to calculate the average headway during the timeframe over departure
import gtfs_headway_analysis as gha
import gtfs_config


%matplotlib inline


In [2]:
import warnings
import matplotlib.cbook
#warnings.filterwarnings("ignore",category=matplotlib.cbook.mplDeprecation)


In [24]:
# Study cities, written compactly as (cityname, region, crs) triples and
# expanded below into the list of dicts used by the rest of the notebook.
_city_specs = (
    ('adelaide', 'au', 'epsg:7845'),
    ('auckland', 'nz', 'epsg:2193'),
    ('baltimore', 'us', 'epsg:32618'),
    ('bangkok', 'th', 'epsg:32647'),
    ('barcelona', 'es', 'epsg:25831'),
    ('belfast', 'gb', 'epsg:29902'),
    ('bern', 'ch', 'epsg:32633'),
    ('chennai', 'in', 'epsg:32644'),
    ('cologne', 'de', 'epsg:32631'),
    ('ghent', 'be', 'epsg:32631'),
    ('graz', 'at', 'epsg:32633'),
    ('hanoi', 'vn', 'epsg:32648'),
    ('hong_kong', 'hk', 'epsg:32650'),
    ('lisbon', 'pt', 'epsg:3763'),
    ('melbourne', 'au', 'epsg:7845'),
    ('mexico_city', 'mx', 'epsg:32614'),
    ('odense', 'dk', 'epsg:32632'),
    ('olomouc', 'cz', 'epsg:32633'),
    ('phoenix', 'us', 'epsg:32612'),
    ('sao_paulo', 'br', 'epsg:32723'),
    ('seattle', 'us', 'epsg:32610'),
    ('sydney', 'au', 'epsg:7845'),
    ('valencia', 'es', 'epsg:25830'),
    ('vic', 'es', 'epsg:25831'),
)

# one dict per city, with keys in the order: cityname, region, crs
cities = [
    {'cityname': name, 'region': region_code, 'crs': crs_code}
    for name, region_code, crs_code in _city_specs
]

# Summarize OSM pt_any within 500m study region bbox

In [25]:
# Count the OSM 'pt_any' destinations for every study city.
#
# One row per city is collected in a plain list and the DataFrame is built once
# at the end: DataFrame.append was removed in pandas 2.0, and it re-copied the
# whole frame on every iteration.

# study region input datasource parameters (identical for every city)
project_year = 2019
distance = 1600

osm_rows = []
for city_params in cities:
    city = city_params['cityname']
    region = city_params['region']

    geopackagePath = '../data/input/{city}_{region}_{project_year}_{distance}m_buffer.gpkg'.format(
            city=city, region=region, project_year=project_year, distance=distance)

    # load the urban study region boundary and take its first geometry
    shape = gpd.GeoDataFrame.from_file(geopackagePath,
                  layer='urban_study_region')
    polygon = shape['geometry'].iloc[0]
    # buffer by 500 (in the layer's CRS units) to allow for edge effects
    polygon_buffered = polygon.buffer(500)
    # bounding box of the buffered region
    bbox = polygon_buffered.bounds

    # load the destinations inside the bbox and keep the public transport points
    destinations = gpd.read_file(geopackagePath, layer='destinations', bbox=bbox)
    gdf_pt = destinations[destinations['dest_name'] == 'pt_any']

    # counts are stored as strings, as before, so downstream output is unchanged
    osm_rows.append({'study_region': str(city),
                     'OSM_pt_any_counts': str(len(gdf_pt))})

# explicit columns keep the frame mergeable on 'study_region' even if no city was processed
df_osm_pt = pd.DataFrame(osm_rows, columns=['study_region', 'OSM_pt_any_counts'])


In [26]:
# display the per-city OSM pt_any counts
df_osm_pt

Unnamed: 0,study_region,OSM_pt_any_counts
0,adelaide,6765
1,auckland,5363
2,baltimore,3947
3,bangkok,1746
4,barcelona,5216
5,belfast,394
6,bern,1000
7,chennai,912
8,cologne,5365
9,ghent,2043


# Summarize GTFS stops within 500m study region bbox

In [18]:
# Summarise the GTFS stop headway layers: total stops and the number of stops
# with headway <= 30 and <= 20 for each configured city/feed entry.
#
# Rows are collected in a list and the DataFrame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and it re-copied the whole frame
# on every iteration.

# geopackage path from which to load the gtfs processing layers
gtfs_gpkgPath = 'gtfs_frequent_transit_headway_202006_python.gpkg'

# cities with gtfs frequent stops data (lisbon appears as entries lisbon_1 .. lisbon_7)
gtfs_cities = ['adelaide', 'melbourne', 'sydney', 'phoenix', 'baltimore', 'auckland',
               'seattle', 'cologne', 'lisbon_1', 'lisbon_2', 'lisbon_3', 'lisbon_4',
               'lisbon_5', 'lisbon_6', 'lisbon_7']

GTFS = gtfs_config.GTFS
gtfs_rows = []
for city in gtfs_cities:
    # look the city's config up once and read everything from it
    city_config = GTFS[city]
    start_date = city_config['start_date_mmdd']
    end_date = city_config['end_date_mmdd']
    gtfs_provider = city_config['gtfs_provider']

    # load the stop headway layer; the layer name encodes city, provider and date range
    stop_frequent_gdf = gpd.GeoDataFrame.from_file(gtfs_gpkgPath,
                    layer='{}_stops_headway_{}_{}_{}'.format(
                        city, gtfs_provider, start_date, end_date),
                    driver='GPKG')

    headway = stop_frequent_gdf['headway']
    gtfs_stops_tot = len(stop_frequent_gdf)
    gtfs_stops_20min = len(stop_frequent_gdf[headway <= 20])
    gtfs_stops_30min = len(stop_frequent_gdf[headway <= 30])

    # counts are stored as strings, as before, so downstream output is unchanged
    gtfs_rows.append({'study_region': str(city),
                      'gtfs_provider': str(gtfs_provider),
                      'gtfs_pt_tot_counts': str(gtfs_stops_tot),
                      'gtfs_pt_30min_counts': str(gtfs_stops_30min),
                      'gtfs_pt_20min_counts': str(gtfs_stops_20min)})

# explicit columns keep the frame mergeable on 'study_region' even if no city was processed
df_gtfs_pt = pd.DataFrame(gtfs_rows, columns=['study_region', 'gtfs_provider',
                                              'gtfs_pt_tot_counts',
                                              'gtfs_pt_30min_counts',
                                              'gtfs_pt_20min_counts'])


In [21]:
# Collapse the lisbon_1 .. lisbon_7 entries into a single 'lisbon' study region
# so that they match the OSM table's study_region values.
#
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: that chained in-place call
# acts on a temporary object under pandas copy-on-write and would leave
# df_gtfs_pt unchanged.
lisbon_feeds = {'lisbon_{}'.format(n): 'lisbon' for n in range(1, 8)}
df_gtfs_pt['study_region'] = df_gtfs_pt['study_region'].replace(lisbon_feeds)
df_gtfs_pt

Unnamed: 0,study_region,gtfs_provider,gtfs_pt_tot_counts,gtfs_pt_30min_counts,gtfs_pt_20min_counts
0,adelaide,AdelaideMetro,6441,4290,2610
1,melbourne,PublicTransportVictoria,15998,11366,7463
2,sydney,NSW,21976,11368,6936
3,phoenix,Valleymetro,5956,4276,1555
4,baltimore,MarylandMTA,2990,2035,1465
5,auckland,AucklandTransport,5398,3825,2258
6,seattle,KingCountyMetro,5952,4061,2888
7,cologne,VRS,1104,871,738
8,lisbon,carris,2061,1428,1009
9,lisbon,metro-de-lisboa,49,0,0


# Merge to compare GTFS and OSM public transport stops counts

In [28]:
# inner-join the OSM and GTFS summaries on study region so that the
# stop counts from both sources sit side by side for direct comparison
df_osm_gtfs_pt = df_osm_pt.merge(df_gtfs_pt, how='inner', on='study_region')
df_osm_gtfs_pt

Unnamed: 0,study_region,OSM_pt_any_counts,gtfs_provider,gtfs_pt_tot_counts,gtfs_pt_30min_counts,gtfs_pt_20min_counts
0,adelaide,6765,AdelaideMetro,6441,4290,2610
1,auckland,5363,AucklandTransport,5398,3825,2258
2,baltimore,3947,MarylandMTA,2990,2035,1465
3,cologne,5365,VRS,1104,871,738
4,lisbon,3647,carris,2061,1428,1009
5,lisbon,3647,metro-de-lisboa,49,0,0
6,lisbon,3647,Fertagus,4,4,4
7,lisbon,3647,MTS,4,0,0
8,lisbon,3647,Soflusa,1,1,1
9,lisbon,3647,transtejo,3,3,2


# Summarize GTFS stops by agency and route types within 500m study region bbox
This is to check which transit agencies are in the GTFS data for each city, and whether these reflect the real-world situation.

In [None]:
# Count GTFS stops per feed, agency and route type for every configured city.
#
# The per-city tables are collected in a list and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0, and it re-copied the accumulated
# frame on every iteration.

# get study region GTFS frequent stop parameters config
GTFS = gtfs_config.GTFS

agency_frames = []
for city, city_config in GTFS.items():
    gtfsfeed_path = city_config['gtfs_filename']
    bbox = city_config['bbox']

    # Load the GTFS feed with the UrbanAccess load module. Its loader returns the
    # stops labelled by unique feed, agency and route type, which lets us check
    # which transit agencies are present in each city.
    loaded_feeds = ua.gtfs.load.gtfsfeed_to_df(gtfsfeed_path=gtfsfeed_path, validation=True, bbox=bbox, remove_stops_outsidebbox=True)

    # number of stops for each (feed, agency, route type) combination
    df_stop_byagency_1 = loaded_feeds.stops.groupby(['unique_feed_id','unique_agency_id', 'route_type'])[['stop_id']].count()
    df_stop_byagency_1['study_region'] = city
    agency_frames.append(df_stop_byagency_1)

# pd.concat raises on an empty list, so fall back to an empty frame as the original did
df_stop_byagency = pd.concat(agency_frames) if agency_frames else pd.DataFrame()
    
    

In [7]:
# Flatten the grouped index into ordinary columns, then outer-join with the OSM
# pt_any counts so that study regions present in only one table are kept.
stops_by_agency_flat = df_stop_byagency.reset_index()
df_stop_byagency_osm = stops_by_agency_flat.merge(df_osm_pt, how='outer', on='study_region')

# export the combined table for inspection
df_stop_byagency_osm.to_csv('gtfs_stops_routetypes_agency.csv')