# Initialize things

In [30]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import geopandas as gpd
import importlib as imp
import fastparquet
import pyarrow

from Transit_Quality_Study import transit_quality_study

# Reload the config module BEFORE star-importing from it. importlib.reload()
# does not rebind names that were already copied into this namespace, so a
# reload issued after `import *` would leave the old config values in place.
imp.reload(imp.import_module('Transit_Quality_Study.transit_quality_study.config'))
from Transit_Quality_Study.transit_quality_study.config import *

import Transit_Quality_Study.transit_quality_study.custom_funcs as tqs
from gtfs_functions import Feed

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
# Build the GTFS feed.
# busiest_date has to stay False; the single [0, 24] time window groups
# everything over the whole day.
feed = Feed(
    gtfs_feed_path,
    busiest_date=False,
    time_windows=[0, 24],
)

# Wrap the feed in the project's Gtfs helper to get the GTFS dataframes
gtfs = tqs.Gtfs(feed)

INFO:root:Reading "routes.txt".
INFO:root:accessing trips
INFO:root:Start date is None. You should either specify a start date or set busiest_date to True.
INFO:root:Reading "trips.txt".
INFO:root:Reading "stop_times.txt".
INFO:root:_trips is defined in stop_times
INFO:root:Reading "stops.txt".
INFO:root:computing patterns
INFO:root:Reading "shapes.txt".


In [32]:
# Read the two census inputs: the map (via geopandas) and the CSV table
census_map = gpd.read_file(census_map_path)
census_data = pd.read_csv(census_data_path)

# Combine both inputs into a single census dataset
census = tqs.merge_census(census_data, census_map)

# In what DA is this stop?


In [33]:
# Locate every stop inside the census geography, one row per stop_code.
stop_location = (
    gpd.sjoin(
        # Stops without the columns that are not needed for the location lookup
        gtfs.stops.drop(
            columns=['stop_id', 'stop_name', 'stop_url', 'wheelchair_boarding'],
        ),
        census,
        how='left',          # keep stops that fall in no census polygon
        predicate='within',  # match a stop to the polygon it lies within
    )
    # Some stations have multiple instances; keep the first row per stop_code
    .drop_duplicates(subset=['stop_code'], keep='first')
    .set_index('stop_code')
)
stop_location

Unnamed: 0_level_0,stop_lat,stop_lon,location_type,parent_station,geometry,id,a,t,dw,hh,pop,Type,Area (sq km),Population,Dwellings,Households,v_CA21_6: Population density per square kilometre,v_CA21_906: Median total income of household in 2020 ($)
stop_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
10118,45.446466,-73.603118,1,,POINT (-73.60312 45.44647),24663382.0,0.9907,DA,0,0,0,DA,0.9907,0.0,0.0,0.0,0.0,
10120,45.451158,-73.593242,1,,POINT (-73.59324 45.45116),24661003.0,0.0526,DA,276,258,564,DA,0.0526,564.0,276.0,258.0,10722.4,65000.0
10122,45.457010,-73.581691,1,,POINT (-73.58169 45.45701),24661023.0,0.0772,DA,256,244,474,DA,0.0772,474.0,256.0,244.0,6139.9,54800.0
10124,45.459441,-73.572021,1,,POINT (-73.57202 45.45944),24661179.0,0.067,DA,572,551,1003,DA,0.0670,1003.0,572.0,551.0,14970.1,64000.0
10126,45.461894,-73.567074,1,,POINT (-73.56707 45.46189),24661222.0,0.1247,DA,648,619,1066,DA,0.1247,1066.0,648.0,619.0,8548.5,70000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60987,45.433289,-73.907005,0,,POINT (-73.907 45.43329),24663118.0,9.2531,DA,438,393,1095,DA,9.2531,1095.0,438.0,393.0,118.3,149000.0
61121,45.411347,-73.949240,0,,POINT (-73.94924 45.41135),24663119.0,0.2103,DA,7,6,365,DA,0.2103,365.0,7.0,6.0,1735.6,
61253,45.403753,-73.940191,0,,POINT (-73.94019 45.40375),24663118.0,9.2531,DA,438,393,1095,DA,9.2531,1095.0,438.0,393.0,118.3,149000.0
61274,45.610343,-73.662089,0,,POINT (-73.66209 45.61034),24650240.0,0.625,DA,253,245,510,DA,0.6250,510.0,253.0,245.0,816.0,56000.0


# What routes serve this stop?
We will use `stop_times`, in which every row records a bus or train calling at a given stop at a given time. We do not need every individual stop event, so we keep a single row for each stop–route pair and then group the routes serving each stop into lists.

# Group stop_routes


In [34]:
# Collect, for every stop, the list of routes that serve it.
# NOTE(review): groupby is called with its defaults here; if any of the three
# key columns can be null, confirm that those stops are not silently left out.
stop_routes = (
    gtfs.stop_times
    # One row per (stop_code, route_id) pair is enough
    .drop_duplicates(subset=['stop_code', 'route_id'], keep='first')
    # Gather the remaining columns into per-stop lists
    .groupby(['stop_code', 'stop_name', 'stop_url'])
    .agg(list)
    # Drop the fluff: columns that are not needed in the per-stop table
    .drop(
        columns=[
            'arrival_time', 'trip_id', 'service_id', 'direction_id',
            'shape_id', 'wheelchair_boarding', 'stop_id', 'stop_sequence',
            'departure_time', 'geometry', 'stop_lat', 'stop_lon',
            'location_type',
        ],
    )
)

stop_routes

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,route_id,route_name,parent_station
stop_code,stop_name,stop_url,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10118,Station Angrignon,http://www.stm.info/fr/infos/reseaux/metro/angrignon,[1],[1 Ligne 1 - Verte],[STATION_M118]
10120,Station Monk,http://www.stm.info/fr/infos/reseaux/metro/monk,[1],[1 Ligne 1 - Verte],[STATION_M120]
10122,Station Jolicoeur,http://www.stm.info/fr/infos/reseaux/metro/jolicoeur,[1],[1 Ligne 1 - Verte],[STATION_M122]
10124,Station Verdun,http://www.stm.info/fr/infos/reseaux/metro/verdun,[1],[1 Ligne 1 - Verte],[STATION_M124]
10126,Station De l'Église,http://www.stm.info/fr/infos/reseaux/metro/de-l-eglise,[1],[1 Ligne 1 - Verte],[STATION_M126]
...,...,...,...,...,...
62373,Henri-Bourassa / de l'Esplanade,https://www.stm.info/fr/recherche#stq=62373,"[380, 171, 164, 135]","[380 Henri-Bourassa, 171 Henri-Bourassa, 164 D...","[nan, nan, nan, nan]"
62374,Henri-Bourassa / du Bois-de-Boulogne,https://www.stm.info/fr/recherche#stq=62374,"[380, 171, 164, 135, 180]","[380 Henri-Bourassa, 171 Henri-Bourassa, 164 D...","[nan, nan, nan, nan, nan]"
62375,Saint-Laurent / Lighthall,https://www.stm.info/fr/recherche#stq=62375,[69],[69 Gouin],[nan]
62376,Hickmore / Mega,https://www.stm.info/fr/recherche#stq=62376,[100],[100 Crémazie],[nan]


In [35]:
# Attach the location / census attributes to the per-stop route lists.
# NOTE(review): both frames carry a parent_station column, so the merge
# emits parent_station_x (route side) and parent_station_y (location side).
stop_data = (
    stop_routes
    .merge(stop_location, how='left', on=['stop_code'])
    # Remove census columns that are not wanted in the final table
    .drop(columns=['Type', 't', 'hh', 'dw', 'a', 'pop'])
)
stop_data

Unnamed: 0_level_0,route_id,route_name,parent_station_x,stop_lat,stop_lon,location_type,parent_station_y,geometry,id,Area (sq km),Population,Dwellings,Households,v_CA21_6: Population density per square kilometre,v_CA21_906: Median total income of household in 2020 ($)
stop_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
10118,[1],[1 Ligne 1 - Verte],[STATION_M118],45.446466,-73.603118,1,,POINT (-73.60312 45.44647),24663382.0,0.9907,0.0,0.0,0.0,0.0,
10120,[1],[1 Ligne 1 - Verte],[STATION_M120],45.451158,-73.593242,1,,POINT (-73.59324 45.45116),24661003.0,0.0526,564.0,276.0,258.0,10722.4,65000.0
10122,[1],[1 Ligne 1 - Verte],[STATION_M122],45.457010,-73.581691,1,,POINT (-73.58169 45.45701),24661023.0,0.0772,474.0,256.0,244.0,6139.9,54800.0
10124,[1],[1 Ligne 1 - Verte],[STATION_M124],45.459441,-73.572021,1,,POINT (-73.57202 45.45944),24661179.0,0.0670,1003.0,572.0,551.0,14970.1,64000.0
10126,[1],[1 Ligne 1 - Verte],[STATION_M126],45.461894,-73.567074,1,,POINT (-73.56707 45.46189),24661222.0,0.1247,1066.0,648.0,619.0,8548.5,70000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62373,"[380, 171, 164, 135]","[380 Henri-Bourassa, 171 Henri-Bourassa, 164 D...","[nan, nan, nan, nan]",45.547323,-73.672934,0,,POINT (-73.67293 45.54732),24662522.0,0.0712,540.0,303.0,285.0,7584.3,68500.0
62374,"[380, 171, 164, 135, 180]","[380 Henri-Bourassa, 171 Henri-Bourassa, 164 D...","[nan, nan, nan, nan, nan]",45.537554,-73.679091,0,,POINT (-73.67909 45.53755),24662544.0,0.0599,656.0,300.0,288.0,10951.6,53600.0
62375,[69],[69 Gouin],[nan],45.550630,-73.671607,0,,POINT (-73.67161 45.55063),24662525.0,0.0952,583.0,276.0,256.0,6123.9,82000.0
62376,[100],[100 Crémazie],[nan],45.476435,-73.693871,0,,POINT (-73.69387 45.47644),24663419.0,15.9573,467.0,203.0,194.0,29.3,94000.0


In [36]:
# Turn the merged table into a GeoDataFrame built on its geometry column
stop_data = gpd.GeoDataFrame(
    stop_data,
    geometry=stop_data.geometry,
)

# Persist the result as parquet at the configured path
stop_data.to_parquet(stop_data_path)

In [45]:
# Call the Gtfs helper's parquet export (implemented in custom_funcs).
# NOTE(review): presumably writes the GTFS tables to disk — confirm where.
gtfs.to_parquet()

In [46]:
# Ask the Gtfs helper whether its parquet output is present; this returned
# True in the recorded run.
gtfs.has_parquet()

True