In [1]:
from pathlib import Path

import arcpy
from arcgis.features import GeoAccessor
import pandas as pd
import numpy as np

from gtfs_tools.gtfs import GtfsDataset
from gtfs_tools.utils.gtfs import add_agency_name_column, add_modality_descriptions, add_standarized_modality_column, add_location_descriptions

In [4]:
# root folder of the GTFS delivery; every other path in the notebook is derived from it
gtfs_parent = Path(r"D:\projects\gtfs-tools\data\raw\Esri_NL_gtfsnlzip_2024-08-13_16_01")

# directory that is handed to GtfsDataset
gtfs_dir = gtfs_parent.joinpath('gtfs')

# file geodatabase next to the GTFS directory, plus the two feature classes inside it
gdb_pth = gtfs_parent.joinpath('gtfs.gdb')
line_pth, stop_pth = (gdb_pth.joinpath(fc_name) for fc_name in ('lines', 'stops'))

In [5]:
# create the GTFS dataset object from the GTFS directory
gtfs = GtfsDataset(gtfs_dir)

# bare expression so the notebook shows the dataset's repr as the cell output
gtfs

GtfsDataset: D:\projects\gtfs-tools\data\raw\Esri_NL_gtfsnlzip_2024-08-13_16_01\gtfs

## Schema Creation

### Schema Definitions

In [6]:
# Target schema for the route line feature class: column name -> pandas dtype string.
# The datetime columns carry an explicit [us] unit: pandas 2+ rejects the unit-less
# 'datetime64' alias in astype, and [us] matches both the stop schema below and the
# dtypes read back from the existing feature classes.
# NOTE(review): the existing lines feature class also has a route_text_color field
# that is not listed here - confirm whether it belongs in this schema.
line_dtype_dict = {
    'shape_id': 'string',
    'route_id': 'string',
    'agency_id': 'string',
    'agency_name': 'string',
    'route_short_name': 'string',
    'route_long_name': 'string',
    'route_desc': 'string',
    'route_type': 'string',
    'route_url': 'string',
    'route_color': 'string',
    'route_type_text': 'string',
    'esri_route_type_carto': 'string',
    'esri_route_type_carto_desc': 'string',
    'esri_contributor': 'string',
    'esri_date_received': 'datetime64[us]',
    'esri_date_processed': 'datetime64[us]',
    'esri_excluded': 'Int32'
}

# Target schema for the stop feature class: column name -> pandas dtype string.
# Key order is the column order used when the stops data frame is reorganized.
stop_dtype_dict = {
    'stop_id': 'string',
    'stop_code': 'string',
    'stop_name': 'string',
    'tts_stop_name': 'string',
    'stop_desc': 'string',
    'stop_lat': 'Float64',
    'stop_lon': 'Float64',
    'zone_id': 'string',
    'stop_url': 'string',
    'location_type': 'string',
    'parent_station': 'string',
    'stop_timezone': 'string',
    'wheelchair_boarding': 'string',
    'level_id': 'string',
    'platform_code': 'string',
    'esri_contributor': 'string',
    'esri_date_received': 'datetime64[us]',
    'esri_stop_type': 'string',
    'esri_stop_type_desc': 'string',
    'esri_stop_type_carto': 'string',
    'esri_stop_type_carto_desc': 'string',
    'esri_location_type_desc': 'string',
    'agency_id': 'string',
    'agency_name': 'string',
    'esri_date_processed': 'datetime64[us]',
    'esri_excluded': 'Int32',
}

### Build Data

#### Stops

In [7]:
# get the spatially enabled stops data frame as provided by the dataset
# (no modality columns yet - the captured output lists only the stop fields and SHAPE)
stops_df = gtfs.stops.sedf

# inspect column dtypes / null counts, then preview the first rows
stops_df.info()
stops_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69945 entries, 0 to 69944
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   stop_id              69945 non-null  object  
 1   stop_code            57569 non-null  object  
 2   stop_name            69945 non-null  object  
 3   stop_lat             69945 non-null  Float64 
 4   stop_lon             69945 non-null  Float64 
 5   location_type        69945 non-null  object  
 6   parent_station       34694 non-null  object  
 7   stop_timezone        668 non-null    object  
 8   wheelchair_boarding  67256 non-null  float64 
 9   platform_code        3339 non-null   object  
 10  zone_id              12394 non-null  object  
 11  SHAPE                69945 non-null  geometry
dtypes: Float64(2), float64(1), geometry(1), object(8)
memory usage: 6.5+ MB


Unnamed: 0,stop_id,stop_code,stop_name,stop_lat,stop_lon,location_type,parent_station,stop_timezone,wheelchair_boarding,platform_code,zone_id,SHAPE
0,2323991,,Koln-Ehrenfeld,50.951582,6.917458,0,stoparea:177908,,,,IFF:kolne,"{""x"": 6.9174579, ""y"": 50.9515817, ""spatialRefe..."
1,2324425,,Sappemeer Oost,53.158998,6.795339,0,stoparea:18130,,,1.0,IFF:spm,"{""x"": 6.7953389883, ""y"": 53.1589979356, ""spati..."
2,2324426,,Sappemeer Oost,53.158848,6.796734,0,stoparea:18130,,,2.0,IFF:spm,"{""x"": 6.79673373699, ""y"": 53.1588483687, ""spat..."
3,2324427,,Sappemeer Oost,53.158834,6.796141,0,stoparea:18130,,,,IFF:spm,"{""x"": 6.7961409688, ""y"": 53.1588338945, ""spati..."
4,2380333,,Veenwouden,53.235293,5.989464,0,stoparea:377859,,,,IFF:vwd,"{""x"": 5.98946392536, ""y"": 53.2352926285, ""spat..."


In [8]:
# start creating the data frame by adding modality and descriptions to the spatially enabled stop points data frame
stops_df = (
    gtfs.stops.sedf
    .join(gtfs.stops.modalities, on='stop_id', how='left')
    .rename(columns={'route_type': 'esri_stop_type'})
)
stops_df = add_modality_descriptions(stops_df, modality_codes_column='esri_stop_type', description_column='esri_stop_type_desc')

# add location description column
stops_df = add_location_descriptions(stops_df)

# add standardized modalities with descriptions to the data
stops_df = add_standarized_modality_column(stops_df, modality_column='esri_stop_type', standardized_modality_column='esri_stop_type_carto')
stops_df = add_modality_descriptions(stops_df, modality_codes_column='esri_stop_type_carto', description_column='esri_stop_type_carto_desc')

# add the agency
# NOTE(review): a left merge returns one row per matching crosstab row, so a stop listed
# for several agencies would be repeated - confirm that is the intended result
stops_df = add_agency_name_column(stops_df.merge(gtfs._crosstab_stop_agency, on='stop_id', how='left'), gtfs.agency.data)

# set the default excluded value BEFORE looking for missing columns; otherwise the column is
# reported as missing and the fill loop below replaces the default with nulls.
# 0 (not excluded) is stored with the schema's integer dtype instead of a Python bool.
stops_df['esri_excluded'] = 0
stops_df['esri_excluded'] = stops_df['esri_excluded'].astype(stop_dtype_dict['esri_excluded'])

# schema columns the steps above did not produce (this also covers level_id when it is absent)
missing_cols = [c for c in stop_dtype_dict if c not in stops_df.columns]

# add any missing columns as typed nulls to ensure a consistent schema
for col in missing_cols:
    stops_df[col] = None
    stops_df[col] = stops_df[col].astype(stop_dtype_dict[col])

# reorganize to schema order, dropping any columns that are not part of the schema
stops_df = stops_df[list(stop_dtype_dict.keys()) + ['SHAPE']]

# set the geometry so the spatially enabled data frame works
stops_df.spatial.set_geometry('SHAPE')

print(f'Missing Columns: {missing_cols}')
stops_df.info()
stops_df.head()

NameError: name 'DELIJN' is not defined

In [None]:
# date the feed was received; the value lines up with the timestamp suffix of the source
# folder name (..._2024-08-13_16_01).
# pd.Timestamp is used because the datetime module is not among the notebook's imports,
# so datetime.datetime(...) raises a NameError on a fresh kernel.
stops_df['esri_date_received'] = pd.Timestamp(2024, 8, 13, 16, 1, 1)

# keep the column on the dtype declared for it in the stop schema
stops_df['esri_date_received'] = stops_df['esri_date_received'].astype(stop_dtype_dict['esri_date_received'])

stops_df.head()

In [12]:
# register SHAPE as the geometry column again before exporting
# NOTE(review): the build cell above already calls this - confirm the repeat is still needed
stops_df.spatial.set_geometry('SHAPE')

In [15]:
# create the file geodatabase only when arcpy reports that it does not exist yet
gdb_exists = arcpy.Exists(str(gdb_pth))
if not gdb_exists:
    # CreateFileGDB receives the containing folder and the geodatabase name as separate arguments
    arcpy.management.CreateFileGDB(str(gdb_pth.parent), gdb_pth.stem)

In [16]:
# export the stops to the geodatabase; the path is passed as a string, matching the other
# arcpy calls in this notebook and the documented type of the location argument
stops_df.spatial.to_featureclass(str(stop_pth))

'\\\\DevBA00007\\data\\gtfs_publishing\\raw\\esri_switzerland_gtfsfp202520240923zip_2024-09-26_11_58\\gtfs.gdb\\stops'

#### Route Lines

In [7]:
# start creating the data frame by adding agency name to the spatially enabled route lines data frame
routes_df = add_agency_name_column(gtfs.routes.sedf, gtfs.agency.data)

# add the modality descriptions to the data
routes_df = add_modality_descriptions(routes_df , modality_codes_column='route_type', description_column='route_type_text')

# add standardized modalities with descriptions to the data
routes_df = add_standarized_modality_column(routes_df, modality_column='route_type', standardized_modality_column='esri_route_type_carto')
routes_df = add_modality_descriptions(routes_df, modality_codes_column='esri_route_type_carto', description_column='esri_route_type_carto_desc')

# add default route color
routes_df['route_color'] = routes_df['route_color'].fillna("828282")

# get any missing columns and make a filter for the ones that do exist
missing_cols = [c for c in line_dtype_dict.keys() if c not in routes_df.columns]
keep_cols = [c for c in routes_df.columns if c in line_dtype_dict.keys()] + ['SHAPE']

# reorganize schema
routes_df = routes_df.loc[:,keep_cols]

# set the geometry so the spatially enabled data frame works
routes_df.spatial.set_geometry('SHAPE')

print(f'Missing Columns: {missing_cols}')
routes_df.info()
routes_df.head()

Missing Columns: ['esri_contributor', 'esri_date_received', 'esri_date_processed', 'esri_excluded']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   agency_id                   15 non-null     object  
 1   route_id                    15 non-null     object  
 2   route_short_name            15 non-null     object  
 3   route_long_name             15 non-null     object  
 4   route_desc                  0 non-null      float64 
 5   route_type                  15 non-null     object  
 6   route_url                   0 non-null      float64 
 7   route_color                 15 non-null     object  
 8   shape_id                    15 non-null     object  
 9   agency_name                 15 non-null     object  
 10  route_type_text             15 non-null     object  
 11  esri_route_type_carto       15 non-nul

Unnamed: 0,agency_id,route_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,shape_id,agency_name,route_type_text,esri_route_type_carto,esri_route_type_carto_desc,SHAPE
0,806,10099,FRA,Fraser/Black Line,,3,,2a2a2a,p_177613,The Lift,bus,3,bus,"{""paths"": [[[-105.76172716672248, 39.883615422..."
1,806,10099,FRA,Fraser/Black Line,,3,,2a2a2a,p_787965,The Lift,bus,3,bus,"{""paths"": [[[-105.784927, 39.917198], [-105.78..."
2,806,10099,FRA,Fraser/Black Line,,3,,2a2a2a,p_177614,The Lift,bus,3,bus,"{""paths"": [[[-105.812309, 39.945946], [-105.81..."
3,806,10096,RED,Rendezvous/Red Line,,3,,fe011f,p_177617,The Lift,bus,3,bus,"{""paths"": [[[-105.76172716672248, 39.883615422..."
4,806,10103,GR,Granby Regional Commuter,,3,,808080,p_177627,The Lift,bus,3,bus,"{""paths"": [[[-105.92525558889, 40.061079254010..."


## Data Introspection

In [7]:
# read the existing route line feature class back into a spatially enabled data frame
line_df = GeoAccessor.from_featureclass(line_pth)

# pandas dtype of every column as stored in the feature class
line_dtypes = line_df.dtypes

# bare expression so the dtypes are shown as the cell output
line_dtypes

OBJECTID                               Int64
shape_id                      string[python]
route_id                      string[python]
agency_id                     string[python]
agency_name                   string[python]
route_short_name              string[python]
route_long_name               string[python]
route_desc                    string[python]
route_type                    string[python]
route_url                     string[python]
route_color                   string[python]
route_text_color              string[python]
route_type_text               string[python]
esri_route_type_carto         string[python]
esri_route_type_carto_desc    string[python]
esri_contributor              string[python]
esri_date_received            datetime64[us]
esri_date_processed           datetime64[us]
esri_excluded                          Int32
SHAPE                               geometry
dtype: object

In [9]:
# read the existing stop feature class back into a spatially enabled data frame
stop_df = GeoAccessor.from_featureclass(stop_pth)

# pandas dtype of every column as stored in the feature class
stop_dtypes = stop_df.dtypes

# bare expression so the dtypes are shown as the cell output
stop_dtypes

OBJECTID                              Int64
stop_id                      string[python]
stop_code                    string[python]
stop_name                    string[python]
tts_stop_name                string[python]
stop_desc                    string[python]
stop_lat                            Float64
stop_lon                            Float64
zone_id                      string[python]
stop_url                     string[python]
location_type                string[python]
parent_station               string[python]
stop_timezone                string[python]
wheelchair_boarding          string[python]
level_id                     string[python]
platform_code                string[python]
esri_contributor             string[python]
esri_date_received           datetime64[us]
esri_stop_type               string[python]
esri_stop_type_desc          string[python]
esri_stop_type_carto         string[python]
esri_stop_type_carto_desc    string[python]
esri_location_type_desc      str

In [15]:
# print every stop column as a ready-to-paste "'name': 'dtype'," dictionary entry
for col, typ in stop_df.dtypes.items():
    # OBJECTID is left out of the printed entries
    if col == 'OBJECTID':
        continue
    print(f"'{col}': '{typ}',")

'stop_id': 'string',
'stop_code': 'string',
'stop_name': 'string',
'tts_stop_name': 'string',
'stop_desc': 'string',
'stop_lat': 'Float64',
'stop_lon': 'Float64',
'zone_id': 'string',
'stop_url': 'string',
'location_type': 'string',
'parent_station': 'string',
'stop_timezone': 'string',
'wheelchair_boarding': 'string',
'level_id': 'string',
'platform_code': 'string',
'esri_contributor': 'string',
'esri_date_received': 'datetime64[us]',
'esri_stop_type': 'string',
'esri_stop_type_desc': 'string',
'esri_stop_type_carto': 'string',
'esri_stop_type_carto_desc': 'string',
'esri_location_type_desc': 'string',
'agency_id': 'string',
'agency_name': 'string',
'esri_date_processed': 'datetime64[us]',
'esri_excluded': 'Int32',
'SHAPE': 'geometry',
