In [8]:
from pathlib import Path

import arcpy
from arcgis.features import GeoAccessor
import pandas as pd
from sklearn.preprocessing import minmax_scale

from gtfs_tools.gtfs import GtfsDataset
from gtfs_tools.utils.gtfs import calculate_headway, add_modality_descriptions, add_standardized_modality_column

In [2]:
dir_prj = Path.cwd().parent
dir_data = dir_prj / 'data'

dir_raw = dir_data / 'raw'
dir_int = dir_data / 'interim'
dir_ext = dir_data / 'external'

gdb_int = dir_int / 'interim.gdb'
gdb_ext = dir_ext / 'external.gdb'

gtfs_parent_dir = dir_raw / 'gtfs_sf'

gtfs_factor_dir = dir_int / f'{gtfs_parent_dir.name}_factors_dist_decay.parquet'
gtfs_stops_dir = dir_int / f'{gtfs_parent_dir.name}_stops_dist_decay.parquet'

stops_fc = gdb_int / f'{gtfs_parent_dir.name}_stops'
poi_fc = gdb_ext / f'cbsa_sf_h3_10_cnt'
poi_poly_fc = gdb_ext / f'cbsa_sf_h3_10'

arcpy.env.overwriteOutput = True

## Assemble Trip Factor and Stops Data

In [3]:
gtfs_pth_lst = [pth.parent for pth in gtfs_parent_dir.glob('**/agency.txt')]

In [4]:
for gtfs_cache_dir in [gtfs_factor_dir, gtfs_stops_dir]:
    if not gtfs_cache_dir.exists():
        gtfs_cache_dir.mkdir(parents=True)

In [5]:
for gtfs_pth in gtfs_pth_lst:

    # create a gtfs dataset 
    gtfs = GtfsDataset(gtfs_pth, standardize_route_types=True)

    # build the parquet part output path
    factor_pth = gtfs_factor_dir / ("factors_mdb_" + gtfs.gtfs_folder.name.split("=")[1] + ".part")
    stops_pth = gtfs_stops_dir / ("stops_mdb_" + gtfs.gtfs_folder.name.split("=")[1] + ".part")

    #### LATE BY STOP ####

    # get the needed columns for calculating overnight service
    tm_df = gtfs.stop_times.data.loc[:,['stop_id', 'arrival_time']]

    # calculate hour of the day by getting the timedelta hours and retaining the remainder when dividing by 24
    tm_df['hours'] = tm_df['arrival_time'].dt.components.hours % 24

    # calculate late night by evaluating if the stop time is after an hour in the evening or before an hour in the morning
    tm_df['late_night'] = (tm_df['hours'] <= 3) | (tm_df['hours'] >= 23)

    # get just the overnight boolean
    late_df = tm_df[['stop_id', 'late_night']].drop_duplicates()

    ### HEADWAY ###
    headway_df = gtfs.stop_times.headway[['stop_id', 'headway']].groupby('stop_id').mean().rename(columns={'headway': 'stop_headway_mean'})

    ### FIXED ROUTE TYPE AND AGENCY ###

    # get data frame of stops, routes and route types
    rt_df = gtfs.routes.data.loc[:,['route_id', 'route_type', 'agency_id']].drop_duplicates()

    # add modality descriptions
    rt_df = add_modality_descriptions(rt_df)

    # add the agency name
    rt_df = rt_df.merge(gtfs.agency.data[['agency_id', 'agency_name']], on='agency_id')

    # flag fixed types (not bus or school bus)
    rt_df['fixed_modality'] = ~rt_df['route_type'].isin(['3', '31'])

    # add factor for each route; fixed routes are weighted 3x more than bus routes
    rt_df['modality_factor'] = rt_df['fixed_modality'].apply(lambda val: 1 if val else 1/3)

    # create the combined factor dataframe
    factor_df = (gtfs._crosstab_stop_trip
                 .merge(gtfs._crosstab_stop_route, on='stop_id')
                 .merge(rt_df, on='route_id')
                 .merge(late_df, on='stop_id')
                 .merge(headway_df, on='stop_id')
                 .drop(columns=['agency_id'])
                )

    # create stop, trip and route uid column using the agency name
    agency_root = factor_df['agency_name'].str.lower().str.findall(r'\w+').str.join('') + '_'
    factor_df['stop_uid'] = agency_root + factor_df['stop_id']
    factor_df['trip_uid'] = agency_root + factor_df['trip_id']
    factor_df['route_uid'] = agency_root + factor_df['route_id']

    #### SAVE OUTPUT ####
    factor_df.to_parquet(factor_pth)

    # create stops with uid
    stops_df = (gtfs.stops.sedf[['stop_id', 'stop_name', 'SHAPE']]
                .merge(gtfs._crosstab_stop_agency, on='stop_id', how='left')
                .merge(gtfs.agency.data[['agency_id', 'agency_name']], on='agency_id', how='left')
                .drop(columns='agency_id')
               )

    stops_df['stop_uid'] = stops_df['agency_name'].str.lower().str.findall(r'\w+').str.join('') + '_' + stops_df['stop_id']
    # save stops
    stops_df.spatial.to_parquet(stops_pth)
    
    break

In [9]:
rt_df

Unnamed: 0,route_id,route_type,agency_id,route_type_desc,agency_name,fixed_modality,modality_factor
0,42935,3,1227,bus,Coach Usa,False,0.333333
1,18650,3,117,bus,Executive Transportation,False,0.333333
2,42920,3,1220,bus,Beeline Express,False,0.333333
3,43,3,43,bus,Arrow Trailways,False,0.333333
4,23565,3,147,bus,Roadrunner Shuttle,False,0.333333
...,...,...,...,...,...,...,...
58,42994,2,51,rail,Amtrak,True,1.000000
59,42948,2,1230,rail,Shore Line East,True,1.000000
60,42954,3,1232,bus,Amtrak Chartered Vehicle,False,0.333333
61,42958,3,1233,bus,Sunway Charters,False,0.333333


In [32]:
# bulk load stops
stops_df = pd.concat(GeoAccessor.from_parquet(pqt_pth) for pqt_pth in gtfs_stops_dir.glob('*.part'))

# set the geometry
stops_df.spatial.set_geometry('SHAPE')

stops_df.info()
stops_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 24022 entries, 0 to 908
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   stop_id      24022 non-null  object  
 1   stop_name    24022 non-null  object  
 2   SHAPE        24022 non-null  geometry
 3   agency_name  24022 non-null  object  
 4   stop_uid     24022 non-null  object  
dtypes: geometry(1), object(4)
memory usage: 1.1+ MB


Unnamed: 0,stop_id,stop_name,SHAPE,agency_name,stop_uid
0,ABB,Abbotsford-Colby,"{""x"": -90.314667, ""y"": 44.928553, ""spatialRefe...",Coach Usa,coachusa_ABB
1,CHP,Chippewa Falls Amtrak Bus Stop,"{""x"": -91.427794, ""y"": 44.883096, ""spatialRefe...",Coach Usa,coachusa_CHP
2,EUC,Eau Claire Amtrak Bus Stop,"{""x"": -91.506339, ""y"": 44.796558, ""spatialRefe...",Coach Usa,coachusa_EUC
3,GBY,Green Bay Amtrak Station,"{""x"": -88.00276, ""y"": 44.517053, ""spatialRefer...",Coach Usa,coachusa_GBY
4,MKE,Milwaukee,"{""x"": -87.917104, ""y"": 43.034518, ""spatialRefe...",Coach Usa,coachusa_MKE


In [6]:
# save to a feature class
stops_df.spatial.to_featureclass(stops_fc)

'D:\\projects\\gtfs-tools\\data\\interim\\interim.gdb\\gtfs_sf_stops'

## Proximity - Near Table

In [7]:
stops_df = GeoAccessor.from_featureclass(stops_fc)

stops_df.info()
stops_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24022 entries, 0 to 24021
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   OBJECTID     24022 non-null  Int64   
 1   stop_id      24022 non-null  string  
 2   stop_name    24022 non-null  string  
 3   agency_name  24022 non-null  string  
 4   stop_uid     24022 non-null  string  
 5   SHAPE        24022 non-null  geometry
dtypes: Int64(1), geometry(1), string(4)
memory usage: 1.1 MB


Unnamed: 0,OBJECTID,stop_id,stop_name,agency_name,stop_uid,SHAPE
0,1,ABB,Abbotsford-Colby,Coach Usa,coachusa_ABB,"{""x"": -90.31466699999999, ""y"": 44.928553000000..."
1,2,CHP,Chippewa Falls Amtrak Bus Stop,Coach Usa,coachusa_CHP,"{""x"": -91.42779399999995, ""y"": 44.883096000000..."
2,3,EUC,Eau Claire Amtrak Bus Stop,Coach Usa,coachusa_EUC,"{""x"": -91.50633899999997, ""y"": 44.796558000000..."
3,4,GBY,Green Bay Amtrak Station,Coach Usa,coachusa_GBY,"{""x"": -88.00275999999997, ""y"": 44.517053000000..."
4,5,MKE,Milwaukee,Coach Usa,coachusa_MKE,"{""x"": -87.91710399999994, ""y"": 43.034518000000..."


In [14]:
poi_df = GeoAccessor.from_featureclass(poi_fc)

poi_df.info()
poi_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 419510 entries, 0 to 419509
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype   
---  ------    --------------   -----   
 0   STORE_ID  419510 non-null  string  
 1   GRID_ID   419510 non-null  string  
 2   OBJECTID  419510 non-null  Int64   
 3   SHAPE     419510 non-null  geometry
dtypes: Int64(1), geometry(1), string(2)
memory usage: 13.2 MB


Unnamed: 0,STORE_ID,GRID_ID,OBJECTID,SHAPE
0,1,8a2830100007fff,1,"{""x"": -122.18726885499996, ""y"": 38.03733354700..."
1,2,8a283010000ffff,2,"{""x"": -122.18832398399996, ""y"": 38.03816791200..."
2,3,8a2830100017fff,3,"{""x"": -122.18577712299998, ""y"": 38.03767358500..."
3,4,8a283010001ffff,4,"{""x"": -122.18683225399997, ""y"": 38.03850796400..."
4,5,8a2830100027fff,5,"{""x"": -122.18770544499995, ""y"": 38.03615912000..."


In [15]:
near_df = GeoAccessor.from_table(
    arcpy.analysis.GenerateNearTable(
        str(poi_fc), 
        near_features=str(stops_fc), 
        out_table='memory/near_tbl', 
        search_radius='5 miles', 
        method='geodesic'
    )[0]
)

near_df['dist_miles'] = near_df['NEAR_DIST'] * 0.00062137

near_df.drop(columns=['OBJECTID', 'NEAR_DIST'], inplace=True)

near_df = poi_df[['OBJECTID', 'STORE_ID', 'GRID_ID']].join(near_df.set_index('IN_FID'), on='OBJECTID', how='left')

near_df['NEAR_FID'] = near_df['NEAR_FID'].astype('Int64')

near_df = near_df.join(stops_df.set_index('OBJECTID')['stop_uid'], on='NEAR_FID', how='left').loc[:,['GRID_ID', 'stop_uid', 'dist_miles']]

near_df.info()
near_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 419510 entries, 0 to 419509
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   GRID_ID     419510 non-null  string 
 1   stop_uid    329681 non-null  string 
 2   dist_miles  329681 non-null  float64
dtypes: float64(1), string(2)
memory usage: 9.6 MB


Unnamed: 0,GRID_ID,stop_uid,dist_miles
0,8a2830100007fff,soltrans_253,1.859308
1,8a283010000ffff,soltrans_253,1.851372
2,8a2830100017fff,soltrans_253,1.791282
3,8a283010001ffff,soltrans_253,1.781174
4,8a2830100027fff,soltrans_253,1.93797


### Calculate Distance Decay Factor

In [16]:
from typing import Union

import numpy as np

def get_sigmoid_distance_decay_index(
    distance: Union[float, int], steepness: Union[float, int], offset: Union[float, int]
) -> float:
    """
    Get sigmoid distance decay index.
    
    Args:
        distance: Distance to calculate decay for.
        steepness:
        offset:
    """
    distance_index = 1 / (1 + np.exp(steepness * (distance - offset)))

    return distance_index


def get_bus_stop_distance_decay_index(distance: Union[float, int]) -> float:
    """
    Get the distance decay coefficient for a bus stop.
    
    Args:
        distance: Walking distance in miles to the bus stop.
    """
    distance_index = get_sigmoid_distance_decay_index(distance, 5.8, 0.65)
    return distance_index


def get_light_rail_stop_distance_decay_index(distance: Union[float, int]) -> float:
    """
    Get the distance decay coefficient for a light rail stop.
    
    Args:
        distance: Walking distance in miles to the light rail stop or station.
    """
    distance_index = get_sigmoid_distance_decay_index(distance, 4.8, 1.3) * 0.98
    return distance_index

def get_distance_decay(distance: Union[float, int], modality: Union[str, int]):
    """
    Get correct distance decay based on modality code.
    
    Args:
        distance: Walking distance in miles to transit station.
        modality: Modality code for transit type being accessed.
    """
    # cast modality for consistency
    if isinstance(modality, int):
        modality = str(modality)
        
    # based on modality apply correct distance decay
    if modality in ['3', '31']:
        idx = get_bus_stop_distance_decay_index(distance)
    else:
        idx = get_light_rail_stop_distance_decay_index(distance)
        
    return idx    

In [17]:
factor_raw_df = pd.concat((pd.read_parquet(prt_pth) for prt_pth in gtfs_factor_dir.glob('*.part'))).drop(columns=['agency_name', 'stop_id', 'trip_id', 'route_id']).drop_duplicates()

factor_raw_df.info()
factor_raw_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 19320162 entries, 0 to 80115
Data columns (total 9 columns):
 #   Column             Dtype  
---  ------             -----  
 0   route_type         object 
 1   route_type_desc    object 
 2   fixed_modality     bool   
 3   modality_factor    float64
 4   late_night         bool   
 5   stop_headway_mean  float64
 6   stop_uid           object 
 7   trip_uid           object 
 8   route_uid          object 
dtypes: bool(2), float64(2), object(5)
memory usage: 1.2+ GB


Unnamed: 0,route_type,route_type_desc,fixed_modality,modality_factor,late_night,stop_headway_mean,stop_uid,trip_uid,route_uid
0,2,rail,True,1.0,False,10.581699,shorelineeast_NHV,shorelineeast_100256,shorelineeast_42948
1,2,rail,True,1.0,True,10.581699,shorelineeast_NHV,shorelineeast_100256,shorelineeast_42948
2,2,rail,True,1.0,False,10.581699,shorelineeast_NHV,shorelineeast_100258,shorelineeast_42948
3,2,rail,True,1.0,True,10.581699,shorelineeast_NHV,shorelineeast_100258,shorelineeast_42948
4,2,rail,True,1.0,False,10.581699,shorelineeast_NHV,shorelineeast_100260,shorelineeast_42948


In [18]:
factor_df = (
    near_df.merge(factor_raw_df, on='stop_uid', how='inner')
    .rename(columns={
        'GRID_ID': 'h3_idx',
        'route_type': 'modality',
        'route_type_desc': 'modality_desc'
    })
    # .drop(columns=['IN_FID', 'NEAR_FID', 'STORE_ID'])
)

factor_df = factor_df[[
    'h3_idx', 'stop_uid', 'trip_uid', 'route_uid', 'modality', 'modality_factor', 'late_night', 'stop_headway_mean', 'dist_miles'
]]

factor_df['distance_decay_coeff'] = factor_df[['dist_miles', 'modality']].apply(lambda r: get_distance_decay(*r), axis=1)
factor_df['headway_factor'] = 1 - minmax_scale(factor_df['stop_headway_mean']) 

factor_df.info()
factor_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86419195 entries, 0 to 86419194
Data columns (total 11 columns):
 #   Column                Dtype  
---  ------                -----  
 0   h3_idx                string 
 1   stop_uid              object 
 2   trip_uid              object 
 3   route_uid             object 
 4   modality              object 
 5   modality_factor       float64
 6   late_night            bool   
 7   stop_headway_mean     float64
 8   dist_miles            float64
 9   distance_decay_coeff  float64
 10  headway_factor        float64
dtypes: bool(1), float64(5), object(4), string(1)
memory usage: 6.5+ GB


Unnamed: 0,h3_idx,stop_uid,trip_uid,route_uid,modality,modality_factor,late_night,stop_headway_mean,dist_miles,distance_decay_coeff,headway_factor
0,8a2830100007fff,soltrans_253,soltrans_1035,soltrans_17,3,0.333333,False,119.15,1.859308,0.000898,0.812575
1,8a2830100007fff,soltrans_253,soltrans_963,soltrans_17,3,0.333333,False,119.15,1.859308,0.000898,0.812575
2,8a2830100007fff,soltrans_253,soltrans_969,soltrans_17,3,0.333333,False,119.15,1.859308,0.000898,0.812575
3,8a2830100007fff,soltrans_253,soltrans_979,soltrans_17,3,0.333333,False,119.15,1.859308,0.000898,0.812575
4,8a2830100007fff,soltrans_253,soltrans_986,soltrans_17,3,0.333333,False,119.15,1.859308,0.000898,0.812575


### Bus Factors (less than 1.5 miles)

In [19]:
factor_bus_df = (
    factor_df[
        (
            (factor_df['modality'] == '3') 
            | (factor_df['modality'] == '31')
        )
        & (factor_df['dist_miles'] <= 1.5)
    ].sort_values(['h3_idx', 'trip_uid', 'dist_miles'], ascending=False)
)

factor_bus_df.info()
factor_bus_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 62318185 entries, 81072643 to 1420
Data columns (total 11 columns):
 #   Column                Dtype  
---  ------                -----  
 0   h3_idx                string 
 1   stop_uid              object 
 2   trip_uid              object 
 3   route_uid             object 
 4   modality              object 
 5   modality_factor       float64
 6   late_night            bool   
 7   stop_headway_mean     float64
 8   dist_miles            float64
 9   distance_decay_coeff  float64
 10  headway_factor        float64
dtypes: bool(1), float64(5), object(4), string(1)
memory usage: 5.2+ GB


Unnamed: 0,h3_idx,stop_uid,trip_uid,route_uid,modality,modality_factor,late_night,stop_headway_mean,dist_miles,distance_decay_coeff,headway_factor
81072643,8a2836a6eadffff,livermoreamadorvalleytransitauthority_4-100,livermoreamadorvalleytransitauthority_865048,livermoreamadorvalleytransitauthority_20X,3,0.333333,False,8.074627,1.436511,0.010336,0.987543
81073721,8a2836a6eadffff,livermoreamadorvalleytransitauthority_4-100,livermoreamadorvalleytransitauthority_865048,livermoreamadorvalleytransitauthority_30R,3,0.333333,False,8.074627,1.436511,0.010336,0.987543
81072642,8a2836a6eadffff,livermoreamadorvalleytransitauthority_4-100,livermoreamadorvalleytransitauthority_865047,livermoreamadorvalleytransitauthority_20X,3,0.333333,False,8.074627,1.436511,0.010336,0.987543
81073720,8a2836a6eadffff,livermoreamadorvalleytransitauthority_4-100,livermoreamadorvalleytransitauthority_865047,livermoreamadorvalleytransitauthority_30R,3,0.333333,False,8.074627,1.436511,0.010336,0.987543
81072641,8a2836a6eadffff,livermoreamadorvalleytransitauthority_4-100,livermoreamadorvalleytransitauthority_865046,livermoreamadorvalleytransitauthority_20X,3,0.333333,False,8.074627,1.436511,0.010336,0.987543


### Fixed (non-bus) Factors (less than 2.5 miles)

In [20]:
factor_fixed_df = (
    factor_df[
        ~(
            (factor_df['modality'] == '3') 
            | (factor_df['modality'] == '31')
        )
        & (factor_df['dist_miles'] <= 2.5)
    ].sort_values(['h3_idx', 'trip_uid', 'dist_miles'], ascending=False)
)

factor_fixed_df.info()
factor_fixed_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 3160020 entries, 86400146 to 1697670
Data columns (total 11 columns):
 #   Column                Dtype  
---  ------                -----  
 0   h3_idx                string 
 1   stop_uid              object 
 2   trip_uid              object 
 3   route_uid             object 
 4   modality              object 
 5   modality_factor       float64
 6   late_night            bool   
 7   stop_headway_mean     float64
 8   dist_miles            float64
 9   distance_decay_coeff  float64
 10  headway_factor        float64
dtypes: bool(1), float64(5), object(4), string(1)
memory usage: 268.2+ MB


Unnamed: 0,h3_idx,stop_uid,trip_uid,route_uid,modality,modality_factor,late_night,stop_headway_mean,dist_miles,distance_decay_coeff,headway_factor
86400146,8a2836a6b7affff,altamontcorridorexpress_74827,altamontcorridorexpress_ACE10,altamontcorridorexpress_ACETrain,2,1.0,False,125.0,0.085897,0.977123,0.80336
86400145,8a2836a6b7affff,altamontcorridorexpress_74827,altamontcorridorexpress_ACE08,altamontcorridorexpress_ACETrain,2,1.0,False,125.0,0.085897,0.977123,0.80336
86400144,8a2836a6b7affff,altamontcorridorexpress_74827,altamontcorridorexpress_ACE07,altamontcorridorexpress_ACETrain,2,1.0,False,125.0,0.085897,0.977123,0.80336
86400143,8a2836a6b7affff,altamontcorridorexpress_74827,altamontcorridorexpress_ACE06,altamontcorridorexpress_ACETrain,2,1.0,False,125.0,0.085897,0.977123,0.80336
86400142,8a2836a6b7affff,altamontcorridorexpress_74827,altamontcorridorexpress_ACE05,altamontcorridorexpress_ACETrain,2,1.0,False,125.0,0.085897,0.977123,0.80336


### Reassemble Factor Table

In [21]:
factor_df = pd.concat((factor_bus_df, factor_fixed_df))

factor_df.info()
factor_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 65478205 entries, 81072643 to 1697670
Data columns (total 11 columns):
 #   Column                Dtype  
---  ------                -----  
 0   h3_idx                string 
 1   stop_uid              object 
 2   trip_uid              object 
 3   route_uid             object 
 4   modality              object 
 5   modality_factor       float64
 6   late_night            bool   
 7   stop_headway_mean     float64
 8   dist_miles            float64
 9   distance_decay_coeff  float64
 10  headway_factor        float64
dtypes: bool(1), float64(5), object(4), string(1)
memory usage: 5.4+ GB


Unnamed: 0,h3_idx,stop_uid,trip_uid,route_uid,modality,modality_factor,late_night,stop_headway_mean,dist_miles,distance_decay_coeff,headway_factor
81072643,8a2836a6eadffff,livermoreamadorvalleytransitauthority_4-100,livermoreamadorvalleytransitauthority_865048,livermoreamadorvalleytransitauthority_20X,3,0.333333,False,8.074627,1.436511,0.010336,0.987543
81073721,8a2836a6eadffff,livermoreamadorvalleytransitauthority_4-100,livermoreamadorvalleytransitauthority_865048,livermoreamadorvalleytransitauthority_30R,3,0.333333,False,8.074627,1.436511,0.010336,0.987543
81072642,8a2836a6eadffff,livermoreamadorvalleytransitauthority_4-100,livermoreamadorvalleytransitauthority_865047,livermoreamadorvalleytransitauthority_20X,3,0.333333,False,8.074627,1.436511,0.010336,0.987543
81073720,8a2836a6eadffff,livermoreamadorvalleytransitauthority_4-100,livermoreamadorvalleytransitauthority_865047,livermoreamadorvalleytransitauthority_30R,3,0.333333,False,8.074627,1.436511,0.010336,0.987543
81072641,8a2836a6eadffff,livermoreamadorvalleytransitauthority_4-100,livermoreamadorvalleytransitauthority_865046,livermoreamadorvalleytransitauthority_20X,3,0.333333,False,8.074627,1.436511,0.010336,0.987543


### Consolidate into Stops

In [33]:
stops_factor_df = factor_df.drop_duplicates('stop_uid')

### Get Trip Count by Point of Interest (H3 index)

In [22]:
trip_idx_df = factor_df[['h3_idx', 'trip_uid']].groupby(['h3_idx']).nunique().rename(columns={'trip_uid': 'trip_count'})

trip_idx_df['trip_count_factor'] = minmax_scale(trip_idx_df['trip_count'])

trip_idx_df.info()
trip_idx_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 181896 entries, 8a283010020ffff to 8a2836a6eadffff
Data columns (total 2 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   trip_count         181896 non-null  int64  
 1   trip_count_factor  181896 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 4.2 MB


Unnamed: 0_level_0,trip_count,trip_count_factor
h3_idx,Unnamed: 1_level_1,Unnamed: 2_level_1
8a283010020ffff,70,0.015554
8a283010022ffff,70,0.015554
8a2830100247fff,70,0.015554
8a283010024ffff,70,0.015554
8a2830100257fff,70,0.015554


In [23]:
factor_df = factor_df.merge(trip_idx_df, on='h3_idx', how='outer')

factor_df.info()
factor_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65478205 entries, 0 to 65478204
Data columns (total 13 columns):
 #   Column                Dtype  
---  ------                -----  
 0   h3_idx                string 
 1   stop_uid              object 
 2   trip_uid              object 
 3   route_uid             object 
 4   modality              object 
 5   modality_factor       float64
 6   late_night            bool   
 7   stop_headway_mean     float64
 8   dist_miles            float64
 9   distance_decay_coeff  float64
 10  headway_factor        float64
 11  trip_count            int64  
 12  trip_count_factor     float64
dtypes: bool(1), float64(6), int64(1), object(4), string(1)
memory usage: 5.9+ GB


Unnamed: 0,h3_idx,stop_uid,trip_uid,route_uid,modality,modality_factor,late_night,stop_headway_mean,dist_miles,distance_decay_coeff,headway_factor,trip_count,trip_count_factor
0,8a2836a6eadffff,livermoreamadorvalleytransitauthority_4-100,livermoreamadorvalleytransitauthority_865048,livermoreamadorvalleytransitauthority_20X,3,0.333333,False,8.074627,1.436511,0.010336,0.987543,1078,0.246112
1,8a2836a6eadffff,livermoreamadorvalleytransitauthority_4-100,livermoreamadorvalleytransitauthority_865048,livermoreamadorvalleytransitauthority_30R,3,0.333333,False,8.074627,1.436511,0.010336,0.987543,1078,0.246112
2,8a2836a6eadffff,livermoreamadorvalleytransitauthority_4-100,livermoreamadorvalleytransitauthority_865047,livermoreamadorvalleytransitauthority_20X,3,0.333333,False,8.074627,1.436511,0.010336,0.987543,1078,0.246112
3,8a2836a6eadffff,livermoreamadorvalleytransitauthority_4-100,livermoreamadorvalleytransitauthority_865047,livermoreamadorvalleytransitauthority_30R,3,0.333333,False,8.074627,1.436511,0.010336,0.987543,1078,0.246112
4,8a2836a6eadffff,livermoreamadorvalleytransitauthority_4-100,livermoreamadorvalleytransitauthority_865046,livermoreamadorvalleytransitauthority_20X,3,0.333333,False,8.074627,1.436511,0.010336,0.987543,1078,0.246112


In [24]:
factor_df = factor_df[['h3_idx', 'stop_uid', 'headway_factor', 'trip_count_factor', 'modality_factor', 'late_night', 'distance_decay_coeff']]

factor_df.info()
factor_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65478205 entries, 0 to 65478204
Data columns (total 7 columns):
 #   Column                Dtype  
---  ------                -----  
 0   h3_idx                string 
 1   stop_uid              object 
 2   headway_factor        float64
 3   trip_count_factor     float64
 4   modality_factor       float64
 5   late_night            bool   
 6   distance_decay_coeff  float64
dtypes: bool(1), float64(4), object(1), string(1)
memory usage: 3.0+ GB


Unnamed: 0,h3_idx,stop_uid,headway_factor,trip_count_factor,modality_factor,late_night,distance_decay_coeff
0,8a2836a6eadffff,livermoreamadorvalleytransitauthority_4-100,0.987543,0.246112,0.333333,False,0.010336
1,8a2836a6eadffff,livermoreamadorvalleytransitauthority_4-100,0.987543,0.246112,0.333333,False,0.010336
2,8a2836a6eadffff,livermoreamadorvalleytransitauthority_4-100,0.987543,0.246112,0.333333,False,0.010336
3,8a2836a6eadffff,livermoreamadorvalleytransitauthority_4-100,0.987543,0.246112,0.333333,False,0.010336
4,8a2836a6eadffff,livermoreamadorvalleytransitauthority_4-100,0.987543,0.246112,0.333333,False,0.010336


In [25]:
dist_coeff_df = factor_df[['h3_idx', 'stop_uid', 'distance_decay_coeff']].drop_duplicates().set_index(['h3_idx', 'stop_uid'])

dist_coeff_df

Unnamed: 0_level_0,Unnamed: 1_level_0,distance_decay_coeff
h3_idx,stop_uid,Unnamed: 2_level_1
8a2836a6eadffff,livermoreamadorvalleytransitauthority_4-100,0.010336
8a2836a6eacffff,livermoreamadorvalleytransitauthority_4-100,0.010734
8a2836a6e76ffff,livermoreamadorvalleytransitauthority_4-100,0.009627
8a2836a6e75ffff,livermoreamadorvalleytransitauthority_4-100,0.009393
8a2836a6e74ffff,livermoreamadorvalleytransitauthority_4-100,0.012158
...,...,...
8a28301330effff,capitolcorridorjointpowersauthority_MTZ,0.975655
8a28301330e7fff,capitolcorridorjointpowersauthority_MTZ,0.976343
8a28301330cffff,capitolcorridorjointpowersauthority_MTZ,0.973528
8a2830133057fff,capitolcorridorjointpowersauthority_MTZ,0.977059


In [27]:
modality_fctr_df = factor_df[['h3_idx', 'stop_uid', 'modality_factor']].groupby(['h3_idx', 'stop_uid']).sum()
headway_fctr_df = factor_df[['h3_idx', 'stop_uid', 'headway_factor']].groupby(['h3_idx', 'stop_uid']).mean()

h3_df = headway_fctr_df.join(modality_fctr_df).join(dist_coeff_df).reset_index().drop(columns='stop_uid')

h3_df.head()

Unnamed: 0,h3_idx,headway_factor,modality_factor,distance_decay_coeff
0,8a283010020ffff,0.956109,23.333333,0.007851
1,8a283010022ffff,0.956109,23.333333,0.007317
2,8a2830100247fff,0.956109,23.333333,0.017606
3,8a283010024ffff,0.956109,23.333333,0.027047
4,8a2830100257fff,0.956109,23.333333,0.01209


In [28]:
h3_df['transit_index'] = (h3_df['headway_factor'] + h3_df['modality_factor']) / 2 * h3_df['distance_decay_coeff']

h3_df = h3_df[['h3_idx', 'transit_index']].groupby('h3_idx').sum().reset_index()

h3_df

Unnamed: 0,h3_idx,transit_index
0,8a283010020ffff,0.095346
1,8a283010022ffff,0.088869
2,8a2830100247fff,0.213821
3,8a283010024ffff,0.328480
4,8a2830100257fff,0.146833
...,...,...
181891,8a2836a6e74ffff,4.374733
181892,8a2836a6e75ffff,3.379678
181893,8a2836a6e76ffff,3.464148
181894,8a2836a6eacffff,3.862436


In [29]:
h3_poly_df = GeoAccessor.from_featureclass(poi_poly_fc, fields=['GRID_ID']).rename(columns={'GRID_ID': 'h3_idx'})

h3_poly_df.info()
h3_poly_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 419510 entries, 0 to 419509
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype   
---  ------  --------------   -----   
 0   h3_idx  419510 non-null  string  
 1   SHAPE   419510 non-null  geometry
dtypes: geometry(1), string(1)
memory usage: 6.4 MB


Unnamed: 0,h3_idx,SHAPE
0,8a2830100007fff,"{""rings"": [[[-122.18706267999994, 38.036663945..."
1,8a283010000ffff,"{""rings"": [[[-122.18811780199997, 38.037498319..."
2,8a2830100017fff,"{""rings"": [[[-122.18557095499995, 38.037003980..."
3,8a283010001ffff,"{""rings"": [[[-122.18662607899995, 38.037838368..."
4,8a2830100027fff,"{""rings"": [[[-122.18749926999999, 38.035489514..."


In [30]:
h3_poly_df = h3_poly_df.merge(h3_df, on='h3_idx', how='outer')
h3_poly_df.spatial.set_geometry('SHAPE')
h3_poly_df['transit_index'] = h3_poly_df['transit_index'].fillna(0)

assert h3_poly_df.spatial.validate()
h3_poly_df.info()
h3_poly_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 419510 entries, 0 to 419509
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   h3_idx         419510 non-null  string  
 1   SHAPE          419510 non-null  geometry
 2   transit_index  419510 non-null  float64 
dtypes: float64(1), geometry(1), string(1)
memory usage: 9.6 MB


Unnamed: 0,h3_idx,SHAPE,transit_index
0,8a2830100007fff,"{""rings"": [[[-122.18706267999994, 38.036663945...",0.0
1,8a283010000ffff,"{""rings"": [[[-122.18811780199997, 38.037498319...",0.0
2,8a2830100017fff,"{""rings"": [[[-122.18557095499995, 38.037003980...",0.0
3,8a283010001ffff,"{""rings"": [[[-122.18662607899995, 38.037838368...",0.0
4,8a2830100027fff,"{""rings"": [[[-122.18749926999999, 38.035489514...",0.0


In [31]:
h3_poly_df.spatial.to_featureclass(gdb_int / f'cbsa_sf_h3_v01_idx')

'D:\\projects\\gtfs-tools\\data\\interim\\interim.gdb\\sf_h3_v01_idx'