In [1]:
from pathlib import Path

import arcpy
from arcgis.features import GeoAccessor
import pandas as pd
from sklearn.preprocessing import minmax_scale

from gtfs_tools.gtfs import GtfsDataset
from gtfs_tools.utils.gtfs import get_route_types_table, calculate_headway, add_modality_descriptions, add_standardized_modality_column
from gtfs_tools.utils import slugify

In [2]:
dir_prj = Path.cwd().parent
dir_data = dir_prj / 'data'

dir_raw = dir_data / 'raw'
dir_int = dir_data / 'interim'
dir_ext = dir_data / 'external'

gdb_int = dir_int / 'interim.gdb'
gdb_ext = dir_ext / 'external.gdb'

gtfs_parent_dir = dir_raw / 'gtfs_sf'

gtfs_factor_dir = dir_int / f'{gtfs_parent_dir.name}_factors_dist_decay.parquet'
gtfs_stops_dir = dir_int / f'{gtfs_parent_dir.name}_stops_dist_decay.parquet'

stops_fc = gdb_int / f'{gtfs_parent_dir.name}_stops'
poi_fc = gdb_ext / f'cbsa_sf_h3_10_cnt'
poi_poly_fc = gdb_ext / f'cbsa_sf_h3_10'

arcpy.env.overwriteOutput = True

## Assemble Trip Factor and Stops Data

In [4]:
gtfs_pth_lst = [pth.parent for pth in gtfs_parent_dir.glob('**/agency.txt')]

In [5]:
for gtfs_cache_dir in [gtfs_factor_dir, gtfs_stops_dir]:
    if not gtfs_cache_dir.exists():
        gtfs_cache_dir.mkdir(parents=True)

In [6]:
rt_typ_df = get_route_types_table()

rt_typ_df.info()
rt_typ_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92 entries, 0 to 91
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   route_type       92 non-null     object
 1   route_type_desc  92 non-null     object
 2   route_type_gtfs  92 non-null     object
dtypes: object(3)
memory usage: 2.3+ KB


Unnamed: 0,route_type,route_type_desc,route_type_gtfs
0,0,light rail,0
1,1,subway,1
2,2,rail,2
3,3,bus,3
4,4,ferry,4


In [68]:
%%time
for gtfs_pth in gtfs_pth_lst:

    # create a gtfs dataset 
    gtfs = GtfsDataset(gtfs_pth, standardize_route_types=True)

    # build the parquet part output path
    factor_pth = gtfs_factor_dir / ("factors_mdb_" + gtfs.gtfs_folder.name.split("=")[1] + ".part")
    stops_pth = gtfs_stops_dir / ("stops_mdb_" + gtfs.gtfs_folder.name.split("=")[1] + ".part")

    #### LATE BY STOP ####

    # get the needed columns for calculating overnight service
    tm_df = gtfs.stop_times.data.loc[:,['stop_id', 'arrival_time']]

    # calculate hour of the day by getting the timedelta hours and retaining the remainder when dividing by 24
    tm_df['hours'] = tm_df['arrival_time'].dt.components.hours % 24

    # calculate late night by evaluating if the stop time is after an hour in the evening or before an hour in the morning
    tm_df['late_night'] = (tm_df['hours'] <= 3) | (tm_df['hours'] >= 23)

    # get just the overnight boolean
    late_df = tm_df[['stop_id', 'late_night']].drop_duplicates()

    ### HEADWAY ###
    headway_df = gtfs.stop_times.headway[['stop_id', 'headway']].groupby('stop_id').mean().rename(columns={'headway': 'stop_headway_mean'})

    ### FIXED ROUTE TYPE AND AGENCY ###

    # get data frame of stops, routes and route types
    rt_df = gtfs.routes.data.loc[:,['route_id', 'route_type', 'agency_id']].drop_duplicates()

    # add modality descriptions
    rt_df = rt_df.merge(rt_typ_df[['route_type', 'route_type_desc']], on='route_type')

    # add the agency name
    rt_df = rt_df.merge(gtfs.agency.data[['agency_id', 'agency_name']], on='agency_id')

    # flag fixed types (not bus or school bus)
    rt_df['fixed_modality'] = ~rt_df['route_type'].isin(['3', '31'])

    # add factor for each route; fixed routes are weighted 3x more than bus routes
    rt_df['modality_factor'] = rt_df['fixed_modality'].apply(lambda val: 1 if val else 1/3)

    # create the combined factor dataframe
    factor_df = (gtfs.lookup_stop_trip
                 .merge(gtfs.lookup_stop_route, on='stop_id', how='left')
                 .merge(rt_df, on='route_id', how='left')
                 .merge(late_df, on='stop_id', how='left')
                 .merge(headway_df, on='stop_id', how='left')
                 .drop(columns=['agency_id'])
                 # .drop_duplicates(['stop_uid', 'trip_uid', 'route_uid'])
                )

    # create stop, trip and route uid column using the agency name
    agency_root = factor_df['agency_name'].apply(slugify)
    factor_df['stop_uid'] = agency_root + '_' + factor_df['stop_id'].apply(slugify)
    factor_df['trip_uid'] = agency_root + '_' + factor_df['trip_id'].apply(slugify)
    factor_df['route_uid'] = agency_root + '_' + factor_df['route_id'].apply(slugify)
    
    # create stops with uid and modality
    stops_df = (
        factor_df.drop(columns=['trip_id', 'trip_uid', 'route_id', 'route_uid'])
        .drop_duplicates()
        .reset_index(drop=True)
        .merge(gtfs.stops.sedf[['stop_id', 'SHAPE']], on='stop_id', how='left')
    )    
    
    #### SAVE OUTPUTS ####
    factor_df.to_parquet(factor_pth)
    stops_df.spatial.to_parquet(stops_pth)

Wall time: 5min 17s


In [69]:
factor_df.info()
factor_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80163 entries, 0 to 80162
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   stop_id            80163 non-null  object 
 1   trip_id            80163 non-null  object 
 2   route_id           80163 non-null  object 
 3   route_type         80163 non-null  object 
 4   route_type_desc    80163 non-null  object 
 5   agency_name        80163 non-null  object 
 6   fixed_modality     80163 non-null  bool   
 7   modality_factor    80163 non-null  float64
 8   late_night         80163 non-null  bool   
 9   stop_headway_mean  80116 non-null  float64
 10  stop_uid           80163 non-null  object 
 11  trip_uid           80163 non-null  object 
 12  route_uid          80163 non-null  object 
dtypes: bool(2), float64(2), object(9)
memory usage: 6.9+ MB


Unnamed: 0,stop_id,trip_id,route_id,route_type,route_type_desc,agency_name,fixed_modality,modality_factor,late_night,stop_headway_mean,stop_uid,trip_uid,route_uid
0,3000,272233,5,3,bus,San Joaquin Regional Transit District (RTD),False,0.333333,False,4.182609,san_joaquin_regional_transit_district_rtd_3000,san_joaquin_regional_transit_district_rtd_272233,san_joaquin_regional_transit_district_rtd_05
1,3000,272233,9,3,bus,San Joaquin Regional Transit District (RTD),False,0.333333,False,4.182609,san_joaquin_regional_transit_district_rtd_3000,san_joaquin_regional_transit_district_rtd_272233,san_joaquin_regional_transit_district_rtd_09
2,3000,272233,360,3,bus,San Joaquin Regional Transit District (RTD),False,0.333333,False,4.182609,san_joaquin_regional_transit_district_rtd_3000,san_joaquin_regional_transit_district_rtd_272233,san_joaquin_regional_transit_district_rtd_360
3,3000,272233,378,3,bus,San Joaquin Regional Transit District (RTD),False,0.333333,False,4.182609,san_joaquin_regional_transit_district_rtd_3000,san_joaquin_regional_transit_district_rtd_272233,san_joaquin_regional_transit_district_rtd_378
4,3000,272233,385,3,bus,San Joaquin Regional Transit District (RTD),False,0.333333,False,4.182609,san_joaquin_regional_transit_district_rtd_3000,san_joaquin_regional_transit_district_rtd_272233,san_joaquin_regional_transit_district_rtd_385


In [70]:
%%time
# bulk load stops
stops_df = pd.concat(GeoAccessor.from_parquet(pqt_pth) for pqt_pth in gtfs_stops_dir.glob('*.part'))

# set the geometry
stops_df.spatial.set_geometry('SHAPE')

stops_df.info()
stops_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 33397 entries, 0 to 908
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   stop_id            33397 non-null  object  
 1   route_type         32106 non-null  object  
 2   route_type_desc    32106 non-null  object  
 3   agency_name        32106 non-null  object  
 4   fixed_modality     32106 non-null  object  
 5   modality_factor    32106 non-null  float64 
 6   late_night         31930 non-null  object  
 7   stop_headway_mean  31087 non-null  float64 
 8   stop_uid           33397 non-null  object  
 9   SHAPE              33397 non-null  geometry
dtypes: float64(2), geometry(1), object(7)
memory usage: 2.8+ MB
Wall time: 16.2 s


Unnamed: 0,stop_id,route_type,route_type_desc,agency_name,fixed_modality,modality_factor,late_night,stop_headway_mean,stop_uid,SHAPE
0,ABB,3,bus,Coach Usa,False,0.333333,False,210.0,coach_usa_abb,"{""x"": -90.314667, ""y"": 44.928553, ""spatialRefe..."
1,ABE,2,rail,MARC,True,1.0,False,28.0,marc_abe,"{""x"": -76.16326, ""y"": 39.508447, ""spatialRefer..."
2,ABE,2,rail,Amtrak,True,1.0,False,28.0,amtrak_abe,"{""x"": -76.16326, ""y"": 39.508447, ""spatialRefer..."
3,ABQ,2,rail,Amtrak,True,1.0,False,100.666667,amtrak_abq,"{""x"": -106.647975, ""y"": 35.082061, ""spatialRef..."
4,ACA,2,rail,Amtrak,True,1.0,False,79.8,amtrak_aca,"{""x"": -121.816024, ""y"": 38.0177, ""spatialRefer..."


In [71]:
# save to a feature class
stops_df.spatial.to_featureclass(stops_fc)

'D:\\projects\\gtfs-tools\\data\\interim\\interim.gdb\\gtfs_sf_stops'

## Proximity - Near Table

In [3]:
stops_df = GeoAccessor.from_featureclass(stops_fc, fields=['OBJECTID', 'stop_uid']).drop_duplicates('stop_uid')

stops_df.info()
stops_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 24555 entries, 0 to 33396
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   OBJECTID  24555 non-null  Int64   
 1   stop_uid  24555 non-null  string  
 2   SHAPE     24555 non-null  geometry
dtypes: Int64(1), geometry(1), string(1)
memory usage: 791.3 KB


Unnamed: 0,OBJECTID,stop_uid,SHAPE
0,1,coach_usa_abb,"{""x"": -90.31466699999999, ""y"": 44.928553000000..."
1,2,marc_abe,"{""x"": -76.16325999999998, ""y"": 39.508447000000..."
2,3,amtrak_abe,"{""x"": -76.16325999999998, ""y"": 39.508447000000..."
3,4,amtrak_abq,"{""x"": -106.64797499999997, ""y"": 35.08206100000..."
4,5,amtrak_aca,"{""x"": -121.81602399999997, ""y"": 38.01770000000..."


In [4]:
poi_df = GeoAccessor.from_featureclass(poi_fc, fields=['OBJECTID', 'GRID_ID'])

poi_df.info()
poi_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 419510 entries, 0 to 419509
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype   
---  ------    --------------   -----   
 0   OBJECTID  419510 non-null  Int64   
 1   GRID_ID   419510 non-null  string  
 2   SHAPE     419510 non-null  geometry
dtypes: Int64(1), geometry(1), string(1)
memory usage: 10.0 MB


Unnamed: 0,OBJECTID,GRID_ID,SHAPE
0,1,8a2830100007fff,"{""x"": -122.18726885499996, ""y"": 38.03733354700..."
1,2,8a283010000ffff,"{""x"": -122.18832398399996, ""y"": 38.03816791200..."
2,3,8a2830100017fff,"{""x"": -122.18577712299998, ""y"": 38.03767358500..."
3,4,8a283010001ffff,"{""x"": -122.18683225399997, ""y"": 38.03850796400..."
4,5,8a2830100027fff,"{""x"": -122.18770544499995, ""y"": 38.03615912000..."


In [5]:
near_df = GeoAccessor.from_table(
    arcpy.analysis.GenerateNearTable(
        str(poi_fc), 
        near_features=str(stops_fc), 
        out_table='memory/near_tbl', 
        search_radius='5 miles', 
        method='geodesic'
    )[0]
)

near_df['dist_miles'] = near_df['NEAR_DIST'] * 0.00062137

near_df.drop(columns=['OBJECTID', 'NEAR_DIST'], inplace=True)

near_df = poi_df.join(near_df.set_index('IN_FID'), on='OBJECTID', how='left')

near_df['NEAR_FID'] = near_df['NEAR_FID'].astype('Int64')

near_df = near_df.join(stops_df.set_index('OBJECTID')['stop_uid'], on='NEAR_FID', how='left').loc[:,['GRID_ID', 'stop_uid', 'dist_miles']]

near_df.info()
near_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 419510 entries, 0 to 419509
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   GRID_ID     419510 non-null  string 
 1   stop_uid    332093 non-null  string 
 2   dist_miles  332093 non-null  float64
dtypes: float64(1), string(2)
memory usage: 9.6 MB


Unnamed: 0,GRID_ID,stop_uid,dist_miles
0,8a2830100007fff,soltrans_253,1.859308
1,8a283010000ffff,soltrans_253,1.851372
2,8a2830100017fff,soltrans_253,1.791282
3,8a283010001ffff,soltrans_253,1.781174
4,8a2830100027fff,soltrans_253,1.93797


### Calculate Distance Decay Factor

In [6]:
from typing import Union

import numpy as np

def get_sigmoid_distance_decay_index(
    distance: Union[float, int], steepness: Union[float, int], offset: Union[float, int]
) -> float:
    """
    Get sigmoid distance decay index.
    
    Args:
        distance: Distance to calculate decay for.
        steepness:
        offset:
    """
    distance_index = 1 / (1 + np.exp(steepness * (distance - offset)))

    return distance_index


def get_bus_stop_distance_decay_index(distance: Union[float, int]) -> float:
    """
    Get the distance decay coefficient for a bus stop.
    
    Args:
        distance: Walking distance in miles to the bus stop.
    """
    distance_index = get_sigmoid_distance_decay_index(distance, 5.8, 0.65)
    return distance_index


def get_light_rail_stop_distance_decay_index(distance: Union[float, int]) -> float:
    """
    Get the distance decay coefficient for a light rail stop.
    
    Args:
        distance: Walking distance in miles to the light rail stop or station.
    """
    distance_index = get_sigmoid_distance_decay_index(distance, 4.8, 1.3) * 0.98
    return distance_index

def get_distance_decay(distance: Union[float, int], modality: Union[str, int]):
    """
    Get correct distance decay based on modality code.
    
    Args:
        distance: Walking distance in miles to transit station.
        modality: Modality code for transit type being accessed.
    """
    # cast modality for consistency
    if isinstance(modality, int):
        modality = str(modality)
        
    # based on modality apply correct distance decay
    if modality in ['3', '31']:
        idx = get_bus_stop_distance_decay_index(distance)
    else:
        idx = get_light_rail_stop_distance_decay_index(distance)
        
    return idx    

In [7]:
factor_raw_df = pd.concat((pd.read_parquet(prt_pth) for prt_pth in gtfs_factor_dir.glob('*.part'))).drop(columns=['agency_name', 'stop_id', 'trip_id', 'route_id']).drop_duplicates()

factor_raw_df.info()
factor_raw_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 20205280 entries, 0 to 80162
Data columns (total 9 columns):
 #   Column             Dtype  
---  ------             -----  
 0   route_type         object 
 1   route_type_desc    object 
 2   fixed_modality     object 
 3   modality_factor    float64
 4   late_night         object 
 5   stop_headway_mean  float64
 6   stop_uid           object 
 7   trip_uid           object 
 8   route_uid          object 
dtypes: float64(2), object(7)
memory usage: 1.5+ GB


Unnamed: 0,route_type,route_type_desc,fixed_modality,modality_factor,late_night,stop_headway_mean,stop_uid,trip_uid,route_uid
0,3,bus,False,0.333333,False,210.0,coach_usa_abb,coach_usa_105680,coach_usa_42935
1,3,bus,False,0.333333,False,210.0,coach_usa_abb,coach_usa_105683,coach_usa_42935
2,2,rail,True,1.0,False,28.0,marc_abe,marc_101939,marc_42985
3,2,rail,True,1.0,False,28.0,amtrak_abe,amtrak_101939,amtrak_88
4,2,rail,True,1.0,False,28.0,marc_abe,marc_101941,marc_42985


In [8]:
factor_df = (
    near_df.merge(factor_raw_df, on='stop_uid', how='right')
    .dropna()
    .rename(columns={
        'GRID_ID': 'h3_idx',
        'route_type': 'modality',
        'route_type_desc': 'modality_desc'
    })
)

factor_df = factor_df[[
    'h3_idx', 'stop_uid', 'trip_uid', 'route_uid', 'modality', 'modality_factor', 'late_night', 'stop_headway_mean', 'dist_miles'
]]

factor_df['distance_decay_coeff'] = factor_df[['dist_miles', 'modality']].apply(lambda r: get_distance_decay(*r), axis=1)
factor_df['headway_factor'] = 1 - minmax_scale(factor_df['stop_headway_mean']) 

factor_df.info()
factor_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 81917689 entries, 319 to 90601577
Data columns (total 11 columns):
 #   Column                Dtype  
---  ------                -----  
 0   h3_idx                string 
 1   stop_uid              object 
 2   trip_uid              object 
 3   route_uid             object 
 4   modality              object 
 5   modality_factor       float64
 6   late_night            object 
 7   stop_headway_mean     float64
 8   dist_miles            float64
 9   distance_decay_coeff  float64
 10  headway_factor        float64
dtypes: float64(5), object(5), string(1)
memory usage: 7.3+ GB


Unnamed: 0,h3_idx,stop_uid,trip_uid,route_uid,modality,modality_factor,late_night,stop_headway_mean,dist_miles,distance_decay_coeff,headway_factor
319,8a2830cf5647fff,amtrak_aca,amtrak_105452,amtrak_26025,2,1.0,False,79.8,0.03974,0.977693,0.87456
320,8a2830cf5667fff,amtrak_aca,amtrak_105452,amtrak_26025,2,1.0,False,79.8,0.048358,0.977596,0.87456
321,8a2830cf5647fff,amtrak_aca,amtrak_105452,amtrak_26025,2,1.0,True,79.8,0.03974,0.977693,0.87456
322,8a2830cf5667fff,amtrak_aca,amtrak_105452,amtrak_26025,2,1.0,True,79.8,0.048358,0.977596,0.87456
323,8a2830cf5647fff,amtrak_aca,amtrak_105453,amtrak_26025,2,1.0,False,79.8,0.03974,0.977693,0.87456


### Bus Factors (less than 1.5 miles)

In [9]:
factor_bus_df = (
    factor_df[
        (
            (factor_df['modality'] == '3') 
            | (factor_df['modality'] == '31')
        )
        & (factor_df['dist_miles'] <= 1.5)
    ].sort_values(['h3_idx', 'trip_uid', 'dist_miles'], ascending=False)
)

factor_bus_df.info()
factor_bus_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 59799216 entries, 79091388 to 90317476
Data columns (total 11 columns):
 #   Column                Dtype  
---  ------                -----  
 0   h3_idx                string 
 1   stop_uid              object 
 2   trip_uid              object 
 3   route_uid             object 
 4   modality              object 
 5   modality_factor       float64
 6   late_night            object 
 7   stop_headway_mean     float64
 8   dist_miles            float64
 9   distance_decay_coeff  float64
 10  headway_factor        float64
dtypes: float64(5), object(5), string(1)
memory usage: 5.3+ GB


Unnamed: 0,h3_idx,stop_uid,trip_uid,route_uid,modality,modality_factor,late_night,stop_headway_mean,dist_miles,distance_decay_coeff,headway_factor
79091388,8a2836a6eadffff,livermore_amador_valley_transit_authority_4_100,livermore_amador_valley_transit_authority_865048,livermore_amador_valley_transit_authority_20x,3,0.333333,False,8.074627,1.436511,0.010336,0.987543
79094543,8a2836a6eadffff,livermore_amador_valley_transit_authority_4_100,livermore_amador_valley_transit_authority_865048,livermore_amador_valley_transit_authority_30r,3,0.333333,False,8.074627,1.436511,0.010336,0.987543
79085078,8a2836a6eadffff,livermore_amador_valley_transit_authority_4_100,livermore_amador_valley_transit_authority_865047,livermore_amador_valley_transit_authority_20x,3,0.333333,False,8.074627,1.436511,0.010336,0.987543
79088233,8a2836a6eadffff,livermore_amador_valley_transit_authority_4_100,livermore_amador_valley_transit_authority_865047,livermore_amador_valley_transit_authority_30r,3,0.333333,False,8.074627,1.436511,0.010336,0.987543
79078768,8a2836a6eadffff,livermore_amador_valley_transit_authority_4_100,livermore_amador_valley_transit_authority_865046,livermore_amador_valley_transit_authority_20x,3,0.333333,False,8.074627,1.436511,0.010336,0.987543


### Fixed (non-bus) Factors (less than 2.5 miles)

In [10]:
factor_fixed_df = (
    factor_df[
        ~(
            (factor_df['modality'] == '3') 
            | (factor_df['modality'] == '31')
        )
        & (factor_df['dist_miles'] <= 2.5)
    ].sort_values(['h3_idx', 'trip_uid', 'dist_miles'], ascending=False)
)

factor_fixed_df.info()
factor_fixed_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 2376156 entries, 313968 to 62163692
Data columns (total 11 columns):
 #   Column                Dtype  
---  ------                -----  
 0   h3_idx                string 
 1   stop_uid              object 
 2   trip_uid              object 
 3   route_uid             object 
 4   modality              object 
 5   modality_factor       float64
 6   late_night            object 
 7   stop_headway_mean     float64
 8   dist_miles            float64
 9   distance_decay_coeff  float64
 10  headway_factor        float64
dtypes: float64(5), object(5), string(1)
memory usage: 217.5+ MB


Unnamed: 0,h3_idx,stop_uid,trip_uid,route_uid,modality,modality_factor,late_night,stop_headway_mean,dist_miles,distance_decay_coeff,headway_factor
313968,8a2836a6b7affff,altamont_corridor_express_74827,altamont_corridor_express_ace10,altamont_corridor_express_acetrain,2,1.0,False,125.0,0.085897,0.977123,0.80336
313960,8a2836a6b7affff,altamont_corridor_express_74827,altamont_corridor_express_ace08,altamont_corridor_express_acetrain,2,1.0,False,125.0,0.085897,0.977123,0.80336
313952,8a2836a6b7affff,altamont_corridor_express_74827,altamont_corridor_express_ace07,altamont_corridor_express_acetrain,2,1.0,False,125.0,0.085897,0.977123,0.80336
313944,8a2836a6b7affff,altamont_corridor_express_74827,altamont_corridor_express_ace06,altamont_corridor_express_acetrain,2,1.0,False,125.0,0.085897,0.977123,0.80336
313936,8a2836a6b7affff,altamont_corridor_express_74827,altamont_corridor_express_ace05,altamont_corridor_express_acetrain,2,1.0,False,125.0,0.085897,0.977123,0.80336


### Reassemble Factor Table

In [11]:
factor_df = pd.concat((factor_bus_df, factor_fixed_df))

factor_df.info()
factor_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 62175372 entries, 79091388 to 62163692
Data columns (total 11 columns):
 #   Column                Dtype  
---  ------                -----  
 0   h3_idx                string 
 1   stop_uid              object 
 2   trip_uid              object 
 3   route_uid             object 
 4   modality              object 
 5   modality_factor       float64
 6   late_night            object 
 7   stop_headway_mean     float64
 8   dist_miles            float64
 9   distance_decay_coeff  float64
 10  headway_factor        float64
dtypes: float64(5), object(5), string(1)
memory usage: 5.6+ GB


Unnamed: 0,h3_idx,stop_uid,trip_uid,route_uid,modality,modality_factor,late_night,stop_headway_mean,dist_miles,distance_decay_coeff,headway_factor
79091388,8a2836a6eadffff,livermore_amador_valley_transit_authority_4_100,livermore_amador_valley_transit_authority_865048,livermore_amador_valley_transit_authority_20x,3,0.333333,False,8.074627,1.436511,0.010336,0.987543
79094543,8a2836a6eadffff,livermore_amador_valley_transit_authority_4_100,livermore_amador_valley_transit_authority_865048,livermore_amador_valley_transit_authority_30r,3,0.333333,False,8.074627,1.436511,0.010336,0.987543
79085078,8a2836a6eadffff,livermore_amador_valley_transit_authority_4_100,livermore_amador_valley_transit_authority_865047,livermore_amador_valley_transit_authority_20x,3,0.333333,False,8.074627,1.436511,0.010336,0.987543
79088233,8a2836a6eadffff,livermore_amador_valley_transit_authority_4_100,livermore_amador_valley_transit_authority_865047,livermore_amador_valley_transit_authority_30r,3,0.333333,False,8.074627,1.436511,0.010336,0.987543
79078768,8a2836a6eadffff,livermore_amador_valley_transit_authority_4_100,livermore_amador_valley_transit_authority_865046,livermore_amador_valley_transit_authority_20x,3,0.333333,False,8.074627,1.436511,0.010336,0.987543


### Consolidate into Stops

In [12]:
stops_factor_df = factor_df.drop_duplicates('stop_uid')

stops_factor_df

Unnamed: 0,h3_idx,stop_uid,trip_uid,route_uid,modality,modality_factor,late_night,stop_headway_mean,dist_miles,distance_decay_coeff,headway_factor
79091388,8a2836a6eadffff,livermore_amador_valley_transit_authority_4_100,livermore_amador_valley_transit_authority_865048,livermore_amador_valley_transit_authority_20x,3,0.333333,False,8.074627,1.436511,0.010336,0.987543
71497090,8a2836a6db77fff,livermore_amador_valley_transit_authority_3_049,livermore_amador_valley_transit_authority_864130,livermore_amador_valley_transit_authority_18,3,0.333333,False,40.000000,1.401283,0.012649,0.937254
72254980,8a2836a6da9ffff,livermore_amador_valley_transit_authority_4_030,livermore_amador_valley_transit_authority_863787,livermore_amador_valley_transit_authority_14,3,0.333333,False,43.000000,1.466565,0.008697,0.932528
71637341,8a2836a6d7b7fff,livermore_amador_valley_transit_authority_4_011,livermore_amador_valley_transit_authority_865004,livermore_amador_valley_transit_authority_30r,3,0.333333,False,14.731343,1.332104,0.018776,0.977057
72050271,8a2836a6d7affff,livermore_amador_valley_transit_authority_4_015,livermore_amador_valley_transit_authority_865004,livermore_amador_valley_transit_authority_30r,3,0.333333,False,14.731343,1.296708,0.022957,0.977057
...,...,...,...,...,...,...,...,...,...,...,...
66502007,8a28308142dffff,san_francisco_bay_ferry_7215,san_francisco_bay_ferry_t_5910478_b_83117_tn_0,san_francisco_bay_ferry_77075,4,1.000000,False,12.903226,0.062356,0.977429,0.979937
66303528,8a283080ddaffff,blue_gold_fleet_2483574,blue_gold_fleet_t_374224_b_28421_tn_0,blue_gold_fleet_11643,4,1.000000,False,19.358974,0.085666,0.977126,0.969768
66493201,8a283080dd17fff,san_francisco_bay_ferry_7210,san_francisco_bay_ferry_t_5902181_b_83034_tn_0,san_francisco_bay_ferry_19310,4,1.000000,False,42.916667,0.014861,0.977952,0.932659
66303252,8a283080d75ffff,alcatraz_cruises_hornblower_2483552,alcatraz_cruises_hornblower_t_372495_b_28416_tn_0,alcatraz_cruises_hornblower_11639,4,1.000000,False,35.000000,0.072908,0.977296,0.945130


### Get Trip Count by Point of Interest (H3 index)

In [13]:
trip_idx_df = factor_df[['h3_idx', 'trip_uid']].groupby(['h3_idx']).nunique().rename(columns={'trip_uid': 'trip_count'})

trip_idx_df['trip_count_factor'] = minmax_scale(trip_idx_df['trip_count'])

trip_idx_df.info()
trip_idx_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 174803 entries, 8a283010020ffff to 8a2836a6eadffff
Data columns (total 2 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   trip_count         174803 non-null  int64  
 1   trip_count_factor  174803 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 4.0 MB


Unnamed: 0_level_0,trip_count,trip_count_factor
h3_idx,Unnamed: 1_level_1,Unnamed: 2_level_1
8a283010020ffff,70,0.015554
8a283010022ffff,70,0.015554
8a2830100247fff,70,0.015554
8a283010024ffff,70,0.015554
8a2830100257fff,70,0.015554


In [14]:
factor_df = factor_df.merge(trip_idx_df, on='h3_idx', how='outer')

factor_df.info()
factor_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62175372 entries, 0 to 62175371
Data columns (total 13 columns):
 #   Column                Dtype  
---  ------                -----  
 0   h3_idx                string 
 1   stop_uid              object 
 2   trip_uid              object 
 3   route_uid             object 
 4   modality              object 
 5   modality_factor       float64
 6   late_night            object 
 7   stop_headway_mean     float64
 8   dist_miles            float64
 9   distance_decay_coeff  float64
 10  headway_factor        float64
 11  trip_count            int64  
 12  trip_count_factor     float64
dtypes: float64(6), int64(1), object(5), string(1)
memory usage: 6.0+ GB


Unnamed: 0,h3_idx,stop_uid,trip_uid,route_uid,modality,modality_factor,late_night,stop_headway_mean,dist_miles,distance_decay_coeff,headway_factor,trip_count,trip_count_factor
0,8a2836a6eadffff,livermore_amador_valley_transit_authority_4_100,livermore_amador_valley_transit_authority_865048,livermore_amador_valley_transit_authority_20x,3,0.333333,False,8.074627,1.436511,0.010336,0.987543,1078,0.246112
1,8a2836a6eadffff,livermore_amador_valley_transit_authority_4_100,livermore_amador_valley_transit_authority_865048,livermore_amador_valley_transit_authority_30r,3,0.333333,False,8.074627,1.436511,0.010336,0.987543,1078,0.246112
2,8a2836a6eadffff,livermore_amador_valley_transit_authority_4_100,livermore_amador_valley_transit_authority_865047,livermore_amador_valley_transit_authority_20x,3,0.333333,False,8.074627,1.436511,0.010336,0.987543,1078,0.246112
3,8a2836a6eadffff,livermore_amador_valley_transit_authority_4_100,livermore_amador_valley_transit_authority_865047,livermore_amador_valley_transit_authority_30r,3,0.333333,False,8.074627,1.436511,0.010336,0.987543,1078,0.246112
4,8a2836a6eadffff,livermore_amador_valley_transit_authority_4_100,livermore_amador_valley_transit_authority_865046,livermore_amador_valley_transit_authority_20x,3,0.333333,False,8.074627,1.436511,0.010336,0.987543,1078,0.246112


In [15]:
factor_df = factor_df[['h3_idx', 'stop_uid', 'headway_factor', 'trip_count_factor', 'modality_factor', 'late_night', 'distance_decay_coeff']]

factor_df.info()
factor_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62175372 entries, 0 to 62175371
Data columns (total 7 columns):
 #   Column                Dtype  
---  ------                -----  
 0   h3_idx                string 
 1   stop_uid              object 
 2   headway_factor        float64
 3   trip_count_factor     float64
 4   modality_factor       float64
 5   late_night            object 
 6   distance_decay_coeff  float64
dtypes: float64(4), object(2), string(1)
memory usage: 3.2+ GB


Unnamed: 0,h3_idx,stop_uid,headway_factor,trip_count_factor,modality_factor,late_night,distance_decay_coeff
0,8a2836a6eadffff,livermore_amador_valley_transit_authority_4_100,0.987543,0.246112,0.333333,False,0.010336
1,8a2836a6eadffff,livermore_amador_valley_transit_authority_4_100,0.987543,0.246112,0.333333,False,0.010336
2,8a2836a6eadffff,livermore_amador_valley_transit_authority_4_100,0.987543,0.246112,0.333333,False,0.010336
3,8a2836a6eadffff,livermore_amador_valley_transit_authority_4_100,0.987543,0.246112,0.333333,False,0.010336
4,8a2836a6eadffff,livermore_amador_valley_transit_authority_4_100,0.987543,0.246112,0.333333,False,0.010336


In [16]:
dist_coeff_df = factor_df[['h3_idx', 'stop_uid', 'distance_decay_coeff']].drop_duplicates().set_index(['h3_idx', 'stop_uid'])

dist_coeff_df

Unnamed: 0_level_0,Unnamed: 1_level_0,distance_decay_coeff
h3_idx,stop_uid,Unnamed: 2_level_1
8a2836a6eadffff,livermore_amador_valley_transit_authority_4_100,0.010336
8a2836a6eacffff,livermore_amador_valley_transit_authority_4_100,0.010734
8a2836a6e76ffff,livermore_amador_valley_transit_authority_4_100,0.009627
8a2836a6e75ffff,livermore_amador_valley_transit_authority_4_100,0.009393
8a2836a6e74ffff,livermore_amador_valley_transit_authority_4_100,0.012158
...,...,...
8a28308096cffff,golden_gate_transit_43002,0.432731
8a2830805b77fff,tideline_water_taxi_818729,0.974447
8a2830805b6ffff,tideline_water_taxi_818729,0.976256
8a2830805b57fff,tideline_water_taxi_818729,0.972534


In [17]:
modality_fctr_df = factor_df[['h3_idx', 'stop_uid', 'modality_factor']].groupby(['h3_idx', 'stop_uid']).sum()
headway_fctr_df = factor_df[['h3_idx', 'stop_uid', 'headway_factor']].groupby(['h3_idx', 'stop_uid']).mean()

h3_df = headway_fctr_df.join(modality_fctr_df).join(dist_coeff_df).reset_index().drop(columns='stop_uid')

h3_df.head()

Unnamed: 0,h3_idx,headway_factor,modality_factor,distance_decay_coeff
0,8a283010020ffff,0.956109,23.333333,0.007851
1,8a283010022ffff,0.956109,23.333333,0.007317
2,8a2830100247fff,0.956109,23.333333,0.017606
3,8a283010024ffff,0.956109,23.333333,0.027047
4,8a2830100257fff,0.956109,23.333333,0.01209


In [18]:
h3_df['transit_index'] = (h3_df['headway_factor'] + h3_df['modality_factor']) / 2 * h3_df['distance_decay_coeff']

h3_df = h3_df[['h3_idx', 'transit_index']].groupby('h3_idx').sum().reset_index()

h3_df

Unnamed: 0,h3_idx,transit_index
0,8a283010020ffff,0.095346
1,8a283010022ffff,0.088869
2,8a2830100247fff,0.213821
3,8a283010024ffff,0.328480
4,8a2830100257fff,0.146833
...,...,...
174798,8a2836a6e74ffff,4.374733
174799,8a2836a6e75ffff,3.379678
174800,8a2836a6e76ffff,3.464148
174801,8a2836a6eacffff,3.862436


In [19]:
h3_poly_df = GeoAccessor.from_featureclass(poi_poly_fc, fields=['GRID_ID']).rename(columns={'GRID_ID': 'h3_idx'})

h3_poly_df.info()
h3_poly_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 419510 entries, 0 to 419509
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype   
---  ------  --------------   -----   
 0   h3_idx  419510 non-null  string  
 1   SHAPE   419510 non-null  geometry
dtypes: geometry(1), string(1)
memory usage: 6.4 MB


Unnamed: 0,h3_idx,SHAPE
0,8a2830100007fff,"{""rings"": [[[-122.18706267999994, 38.036663945..."
1,8a283010000ffff,"{""rings"": [[[-122.18811780199997, 38.037498319..."
2,8a2830100017fff,"{""rings"": [[[-122.18557095499995, 38.037003980..."
3,8a283010001ffff,"{""rings"": [[[-122.18662607899995, 38.037838368..."
4,8a2830100027fff,"{""rings"": [[[-122.18749926999999, 38.035489514..."


In [20]:
h3_poly_df = h3_poly_df.merge(h3_df, on='h3_idx', how='outer')
h3_poly_df.spatial.set_geometry('SHAPE')
h3_poly_df['transit_index'] = h3_poly_df['transit_index'].fillna(0)

assert h3_poly_df.spatial.validate()
h3_poly_df.info()
h3_poly_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 419510 entries, 0 to 419509
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   h3_idx         419510 non-null  string  
 1   SHAPE          419510 non-null  geometry
 2   transit_index  419510 non-null  float64 
dtypes: float64(1), geometry(1), string(1)
memory usage: 9.6 MB


Unnamed: 0,h3_idx,SHAPE,transit_index
0,8a2830100007fff,"{""rings"": [[[-122.18706267999994, 38.036663945...",0.0
1,8a283010000ffff,"{""rings"": [[[-122.18811780199997, 38.037498319...",0.0
2,8a2830100017fff,"{""rings"": [[[-122.18557095499995, 38.037003980...",0.0
3,8a283010001ffff,"{""rings"": [[[-122.18662607899995, 38.037838368...",0.0
4,8a2830100027fff,"{""rings"": [[[-122.18749926999999, 38.035489514...",0.0


In [21]:
h3_poly_df.spatial.to_featureclass(gdb_int / f'cbsa_sf_h3_v01_idx')

'D:\\projects\\gtfs-tools\\data\\interim\\interim.gdb\\cbsa_sf_h3_v01_idx'