In [1]:
from pathlib import Path

from arcgis.features import GeoAccessor
import pandas as pd
import numpy as np

from gtfs_tools.gtfs import GtfsDataset
from gtfs_tools.utils.gtfs import add_agency_name_column, add_modality_descriptions, add_standarized_modality_column, add_location_descriptions

In [2]:
gtfs_parent = Path(r'\\DevBA00007\data\gtfs_publishing\interim\Grand_County_Colorado_Bus_winterparkcousgtfszip_2024-06-19_10_43_48')

gtfs_dir = gtfs_parent / 'gtfs'

gdb_pth = gtfs_parent / 'gtfs.gdb'
line_pth = gdb_pth / 'lines'
stop_pth = gdb_pth / 'stops'

In [3]:
gtfs = GtfsDataset(gtfs_dir)

gtfs

GtfsDataset: \\DevBA00007\data\gtfs_publishing\interim\Grand_County_Colorado_Bus_winterparkcousgtfszip_2024-06-19_10_43_48\gtfs

## Schema Creation

### Schema Definitions

In [4]:
line_dtype_dict = {
    'shape_id': 'string',
    'route_id': 'string',
    'agency_id': 'string',
    'agency_name': 'string',
    'route_short_name': 'string',
    'route_long_name': 'string',
    'route_desc': 'string',
    'route_type': 'string',
    'route_url': 'string',
    'route_color': 'string',
    'route_type_text': 'string',
    'esri_route_type_carto': 'string',
    'esri_route_type_carto_desc': 'string',
    'esri_contributor': 'string',
    'esri_date_received': 'datetime64',
    'esri_date_processed': 'datetime64',
    'esri_excluded': 'Int32'
}

stop_dtype_dict = {
    'stop_id': 'string',
    'stop_code': 'string',
    'stop_name': 'string',
    'tts_stop_name': 'string',
    'stop_desc': 'string',
    'stop_lat': 'Float64',
    'stop_lon': 'Float64',
    'zone_id': 'string',
    'stop_url': 'string',
    'location_type': 'string',
    'parent_station': 'string',
    'stop_timezone': 'string',
    'wheelchair_boarding': 'string',
    'level_id': 'string',
    'platform_code': 'string',
    'esri_contributor': 'string',
    'esri_date_received': 'datetime64[us]',
    'esri_stop_type': 'string',
    'esri_stop_type_desc': 'string',
    'esri_stop_type_carto': 'string',
    'esri_stop_type_carto_desc': 'string',
    'esri_location_type_desc': 'string',
    'agency_id': 'string',
    'agency_name': 'string',
    'esri_date_processed': 'datetime64[us]',
    'esri_excluded': 'Int32',
}

### Build Data

#### Stops

In [5]:
# get stops with modality
stops_df = gtfs.stops.sedf

stops_df.info()
stops_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   stop_id              128 non-null    object  
 1   stop_code            104 non-null    float64 
 2   platform_code        0 non-null      object  
 3   stop_name            128 non-null    object  
 4   stop_desc            0 non-null      float64 
 5   stop_lat             128 non-null    Float64 
 6   stop_lon             128 non-null    Float64 
 7   zone_id              0 non-null      float64 
 8   stop_url             0 non-null      float64 
 9   location_type        128 non-null    object  
 10  parent_station       0 non-null      object  
 11  stop_timezone        128 non-null    object  
 12  position             0 non-null      float64 
 13  direction            0 non-null      float64 
 14  wheelchair_boarding  128 non-null    int64   
 15  tts_stop_name        1 

Unnamed: 0,stop_id,stop_code,platform_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,position,direction,wheelchair_boarding,tts_stop_name,SHAPE
0,2436192,0.0,,Winter Park Resort,,39.883629,-105.761856,,,0,,America/Denver,,,0,,"{""x"": -105.7618562085, ""y"": 39.883629097501, ""..."
1,2436193,1.0,,Zephyr Lodge,,39.885233,-105.761376,,,0,,America/Denver,,,0,,"{""x"": -105.76137618519, ""y"": 39.885232743635, ..."
2,2436194,2.0,,Iron Horse,,39.880612,-105.760421,,,0,,America/Denver,,,0,,"{""x"": -105.7604213257, ""y"": 39.880612104385, ""..."
3,2436195,3.0,,The Vintage,,39.881755,-105.758128,,,0,,America/Denver,,,0,,"{""x"": -105.75812801914, ""y"": 39.881755303358, ..."
4,2436196,4.0,,Winter Park Mountain Lodge,,39.885524,-105.759543,,,0,,America/Denver,,,0,,"{""x"": -105.7595427238, ""y"": 39.885523796475, ""..."


In [6]:
# start creating the data frame by adding modalty and descriptions to the spatially enabled stop points data frame
stops_df = gtfs.stops.sedf.join(gtfs.stops.modalities, on='stop_id', how='left').rename(columns={'route_type': 'esri_stop_type'})
stops_df = add_modality_descriptions(stops_df, modality_codes_column='esri_stop_type', description_column='esri_stop_type_desc')

# add location description column
stops_df = add_location_descriptions(stops_df)

# add standardized modalities with descriptions to the data
stops_df = add_standarized_modality_column(stops_df, modality_column='esri_stop_type', standardized_modality_column='esri_stop_type_carto')
stops_df = add_modality_descriptions(stops_df, modality_codes_column='esri_stop_type_carto', description_column='esri_stop_type_carto_desc')

# add the agency
stops_df = add_agency_name_column(stops_df.merge(gtfs._crosstab_stop_agency, on='stop_id', how='left'), gtfs.agency.data)

# ensure level_id is added
if 'level_id' not in stops_df.columns:
    stops_df['level_id'] = None
    stops_df['level_id'] = stops_df['level_id'].astype('string')

# get any missing columns and make a filter for the ones that do exist
missing_cols = [c for c in stop_dtype_dict.keys() if c not in stops_df.columns]
keep_cols = [c for c in stop_dtype_dict.keys() if c in stops_df.columns] + ['SHAPE']

# reorganize schema and remove any potentially added duplicates
stops_df = stops_df[keep_cols]

# set the geometry so the spatially enabled data frame works
stops_df.spatial.set_geometry('SHAPE')

print(f'Missing Columns: {missing_cols}')
stops_df.info()
stops_df.head()

Missing Columns: ['esri_contributor', 'esri_date_received', 'esri_date_processed', 'esri_excluded']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   stop_id                    128 non-null    object  
 1   stop_code                  104 non-null    float64 
 2   stop_name                  128 non-null    object  
 3   tts_stop_name              1 non-null      object  
 4   stop_desc                  0 non-null      float64 
 5   stop_lat                   128 non-null    Float64 
 6   stop_lon                   128 non-null    Float64 
 7   zone_id                    0 non-null      float64 
 8   stop_url                   0 non-null      float64 
 9   location_type              128 non-null    object  
 10  parent_station             0 non-null      object  
 11  stop_timezone              128 non-null    object

Unnamed: 0,stop_id,stop_code,stop_name,tts_stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,...,level_id,platform_code,esri_stop_type,esri_stop_type_desc,esri_stop_type_carto,esri_stop_type_carto_desc,esri_location_type_desc,agency_id,agency_name,SHAPE
0,2436192,0.0,Winter Park Resort,,,39.883629,-105.761856,,,0,...,,,3.0,bus,3.0,bus,stop,806.0,The Lift,"{""x"": -105.7618562085, ""y"": 39.883629097501, ""..."
1,2436193,1.0,Zephyr Lodge,,,39.885233,-105.761376,,,0,...,,,,,,,stop,,,"{""x"": -105.76137618519, ""y"": 39.885232743635, ..."
2,2436194,2.0,Iron Horse,,,39.880612,-105.760421,,,0,...,,,,,,,stop,,,"{""x"": -105.7604213257, ""y"": 39.880612104385, ""..."
3,2436195,3.0,The Vintage,,,39.881755,-105.758128,,,0,...,,,,,,,stop,,,"{""x"": -105.75812801914, ""y"": 39.881755303358, ..."
4,2436196,4.0,Winter Park Mountain Lodge,,,39.885524,-105.759543,,,0,...,,,,,,,stop,,,"{""x"": -105.7595427238, ""y"": 39.885523796475, ""..."


#### Route Lines

In [7]:
# start creating the data frame by adding agency name to the spatially enabled route lines data frame
routes_df = add_agency_name_column(gtfs.routes.sedf, gtfs.agency.data)

# add the modality descriptions to the data
routes_df = add_modality_descriptions(routes_df , modality_codes_column='route_type', description_column='route_type_text')

# add standardized modalities with descriptions to the data
routes_df = add_standarized_modality_column(routes_df, modality_column='route_type', standardized_modality_column='esri_route_type_carto')
routes_df = add_modality_descriptions(routes_df, modality_codes_column='esri_route_type_carto', description_column='esri_route_type_carto_desc')

# add default route color
routes_df['route_color'] = routes_df['route_color'].fillna("828282")

# get any missing columns and make a filter for the ones that do exist
missing_cols = [c for c in line_dtype_dict.keys() if c not in routes_df.columns]
keep_cols = [c for c in routes_df.columns if c in line_dtype_dict.keys()] + ['SHAPE']

# reorganize schema
routes_df = routes_df.loc[:,keep_cols]

# set the geometry so the spatially enabled data frame works
routes_df.spatial.set_geometry('SHAPE')

print(f'Missing Columns: {missing_cols}')
routes_df.info()
routes_df.head()

Missing Columns: ['esri_contributor', 'esri_date_received', 'esri_date_processed', 'esri_excluded']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   agency_id                   15 non-null     object  
 1   route_id                    15 non-null     object  
 2   route_short_name            15 non-null     object  
 3   route_long_name             15 non-null     object  
 4   route_desc                  0 non-null      float64 
 5   route_type                  15 non-null     object  
 6   route_url                   0 non-null      float64 
 7   route_color                 15 non-null     object  
 8   shape_id                    15 non-null     object  
 9   agency_name                 15 non-null     object  
 10  route_type_text             15 non-null     object  
 11  esri_route_type_carto       15 non-nul

Unnamed: 0,agency_id,route_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,shape_id,agency_name,route_type_text,esri_route_type_carto,esri_route_type_carto_desc,SHAPE
0,806,10099,FRA,Fraser/Black Line,,3,,2a2a2a,p_177613,The Lift,bus,3,bus,"{""paths"": [[[-105.76172716672248, 39.883615422..."
1,806,10099,FRA,Fraser/Black Line,,3,,2a2a2a,p_787965,The Lift,bus,3,bus,"{""paths"": [[[-105.784927, 39.917198], [-105.78..."
2,806,10099,FRA,Fraser/Black Line,,3,,2a2a2a,p_177614,The Lift,bus,3,bus,"{""paths"": [[[-105.812309, 39.945946], [-105.81..."
3,806,10096,RED,Rendezvous/Red Line,,3,,fe011f,p_177617,The Lift,bus,3,bus,"{""paths"": [[[-105.76172716672248, 39.883615422..."
4,806,10103,GR,Granby Regional Commuter,,3,,808080,p_177627,The Lift,bus,3,bus,"{""paths"": [[[-105.92525558889, 40.061079254010..."


## Data Introspection

In [7]:
line_df = GeoAccessor.from_featureclass(line_pth)

line_dtypes = line_df.dtypes

line_dtypes

OBJECTID                               Int64
shape_id                      string[python]
route_id                      string[python]
agency_id                     string[python]
agency_name                   string[python]
route_short_name              string[python]
route_long_name               string[python]
route_desc                    string[python]
route_type                    string[python]
route_url                     string[python]
route_color                   string[python]
route_text_color              string[python]
route_type_text               string[python]
esri_route_type_carto         string[python]
esri_route_type_carto_desc    string[python]
esri_contributor              string[python]
esri_date_received            datetime64[us]
esri_date_processed           datetime64[us]
esri_excluded                          Int32
SHAPE                               geometry
dtype: object

In [9]:
stop_df = GeoAccessor.from_featureclass(stop_pth)

stop_dtypes = stop_df.dtypes

stop_dtypes

OBJECTID                              Int64
stop_id                      string[python]
stop_code                    string[python]
stop_name                    string[python]
tts_stop_name                string[python]
stop_desc                    string[python]
stop_lat                            Float64
stop_lon                            Float64
zone_id                      string[python]
stop_url                     string[python]
location_type                string[python]
parent_station               string[python]
stop_timezone                string[python]
wheelchair_boarding          string[python]
level_id                     string[python]
platform_code                string[python]
esri_contributor             string[python]
esri_date_received           datetime64[us]
esri_stop_type               string[python]
esri_stop_type_desc          string[python]
esri_stop_type_carto         string[python]
esri_stop_type_carto_desc    string[python]
esri_location_type_desc      str

In [15]:
for col, typ in zip(stop_df.columns, stop_df.dtypes):
    if col != 'OBJECTID':
        print(f"'{col}': '{typ}',")

'stop_id': 'string',
'stop_code': 'string',
'stop_name': 'string',
'tts_stop_name': 'string',
'stop_desc': 'string',
'stop_lat': 'Float64',
'stop_lon': 'Float64',
'zone_id': 'string',
'stop_url': 'string',
'location_type': 'string',
'parent_station': 'string',
'stop_timezone': 'string',
'wheelchair_boarding': 'string',
'level_id': 'string',
'platform_code': 'string',
'esri_contributor': 'string',
'esri_date_received': 'datetime64[us]',
'esri_stop_type': 'string',
'esri_stop_type_desc': 'string',
'esri_stop_type_carto': 'string',
'esri_stop_type_carto_desc': 'string',
'esri_location_type_desc': 'string',
'agency_id': 'string',
'agency_name': 'string',
'esri_date_processed': 'datetime64[us]',
'esri_excluded': 'Int32',
'SHAPE': 'geometry',
