# Experimenting how to get schedule data from HSL gtfs package

In [118]:
import numpy as np
import pandas as pd
import requests
from io import BytesIO
from zipfile import ZipFile

Fetching data

In [119]:
r = requests.get('http://dev.hsl.fi/gtfs/hsl.zip')
r

<Response [200]>

In [120]:
gtfs_zip = ZipFile(BytesIO(r.content))
gtfs_zip.namelist()

['agency.txt',
 'calendar.txt',
 'calendar_dates.txt',
 'call_line_phone_numbers.txt',
 'fare_attributes.txt',
 'fare_rules.txt',
 'feed_info.txt',
 'routes.txt',
 'shapes.txt',
 'stops.txt',
 'stop_times.txt',
 'transfers.txt',
 'translations.txt',
 'trips.txt']

In [121]:
df = pd.read_csv(gtfs_zip.open('trips.txt'))
calendar = pd.read_csv(gtfs_zip.open('calendar.txt'))
df.head()

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,direction_id,shape_id,wheelchair_accessible,bikes_allowed,max_delay
0,1001,1001_20200508_20200614_Ke,1001_20200508_Ke_1_0551,Käpylä,0,1001_20200504_1,1,2,5
1,1001,1001_20200508_20200614_Ke,1001_20200508_Ke_1_0606,Käpylä,0,1001_20200504_1,1,2,5
2,1001,1001_20200508_20200614_Ke,1001_20200508_Ke_1_0621,Käpylä,0,1001_20200504_1,1,2,4
3,1001,1001_20200508_20200614_Ke,1001_20200508_Ke_1_0636,Käpylä,0,1001_20200504_1,1,2,4
4,1001,1001_20200508_20200614_Ke,1001_20200508_Ke_1_0651,Käpylä,0,1001_20200504_1,1,2,4


Joining trip data with calendar so date can be specified (removing duplicate schedules).

In [122]:
df_wdates = df.set_index('service_id').join(calendar.set_index('service_id'))
df_wdates.head()

Unnamed: 0_level_0,route_id,trip_id,trip_headsign,direction_id,shape_id,wheelchair_accessible,bikes_allowed,max_delay,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
service_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1001 5_20200508_20200614_Ke,1001,1001 5_20200508_Ke_1_0531,Käpylä,0,1001 5_20200504_1,1,2,3,0,0,1,0,0,0,0,20200508,20200614
1001 5_20200508_20200614_Ke,1001,1001 5_20200508_Ke_1_0546,Käpylä,0,1001 5_20200504_1,1,2,3,0,0,1,0,0,0,0,20200508,20200614
1001 5_20200508_20200614_Ke,1001,1001 5_20200508_Ke_1_0600,Käpylä,0,1001 5_20200504_1,1,2,4,0,0,1,0,0,0,0,20200508,20200614
1001 5_20200508_20200614_Ke,1001,1001 5_20200508_Ke_1_0843,Käpylä,0,1001 5_20200504_1,1,2,0,0,0,1,0,0,0,0,20200508,20200614
1001 5_20200508_20200614_Ke,1001,1001 5_20200508_Ke_1_1105,Käpylä,0,1001 5_20200504_1,1,2,3,0,0,1,0,0,0,0,20200508,20200614


In [123]:
date_select = pd.to_datetime('today')
df_wdates.start_date = pd.to_datetime(df_wdates.start_date, format='%Y%m%d')
df_wdates.end_date = pd.to_datetime(df_wdates.end_date, format='%Y%m%d')
df_wdates.head()

Unnamed: 0_level_0,route_id,trip_id,trip_headsign,direction_id,shape_id,wheelchair_accessible,bikes_allowed,max_delay,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
service_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1001 5_20200508_20200614_Ke,1001,1001 5_20200508_Ke_1_0531,Käpylä,0,1001 5_20200504_1,1,2,3,0,0,1,0,0,0,0,2020-05-08,2020-06-14
1001 5_20200508_20200614_Ke,1001,1001 5_20200508_Ke_1_0546,Käpylä,0,1001 5_20200504_1,1,2,3,0,0,1,0,0,0,0,2020-05-08,2020-06-14
1001 5_20200508_20200614_Ke,1001,1001 5_20200508_Ke_1_0600,Käpylä,0,1001 5_20200504_1,1,2,4,0,0,1,0,0,0,0,2020-05-08,2020-06-14
1001 5_20200508_20200614_Ke,1001,1001 5_20200508_Ke_1_0843,Käpylä,0,1001 5_20200504_1,1,2,0,0,0,1,0,0,0,0,2020-05-08,2020-06-14
1001 5_20200508_20200614_Ke,1001,1001 5_20200508_Ke_1_1105,Käpylä,0,1001 5_20200504_1,1,2,3,0,0,1,0,0,0,0,2020-05-08,2020-06-14


Filtering the dataframe to have data from a single day.

In [124]:
df_wdates.loc[df_wdates.monday==1, 'weekday'] = 0; df_wdates.loc[df_wdates.tuesday==1, 'weekday'] = 1; df_wdates.loc[df_wdates.wednesday==1, 'weekday'] = 2; df_wdates.loc[df_wdates.thursday==1, 'weekday'] = 3; df_wdates.loc[df_wdates.friday==1, 'weekday'] = 4; df_wdates.loc[df_wdates.saturday==1, 'weekday'] = 5; df_wdates.loc[df_wdates.sunday==1, 'weekday'] = 6
df_oneday = df_wdates[(df_wdates.weekday==date_select.weekday()) & (df_wdates.start_date<=date_select) & (df_wdates.end_date>=date_select)]
df_oneday.head()

Unnamed: 0_level_0,route_id,trip_id,trip_headsign,direction_id,shape_id,wheelchair_accessible,bikes_allowed,max_delay,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date,weekday
service_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1001 5_20200508_20200614_Ma,1001,1001 5_20200508_Ma_1_0531,Käpylä,0,1001 5_20200504_1,1,2,3,1,0,0,0,0,0,0,2020-05-08,2020-06-14,0.0
1001 5_20200508_20200614_Ma,1001,1001 5_20200508_Ma_1_0546,Käpylä,0,1001 5_20200504_1,1,2,3,1,0,0,0,0,0,0,2020-05-08,2020-06-14,0.0
1001 5_20200508_20200614_Ma,1001,1001 5_20200508_Ma_1_0600,Käpylä,0,1001 5_20200504_1,1,2,4,1,0,0,0,0,0,0,2020-05-08,2020-06-14,0.0
1001 5_20200508_20200614_Ma,1001,1001 5_20200508_Ma_1_0843,Käpylä,0,1001 5_20200504_1,1,2,0,1,0,0,0,0,0,0,2020-05-08,2020-06-14,0.0
1001 5_20200508_20200614_Ma,1001,1001 5_20200508_Ma_1_1105,Käpylä,0,1001 5_20200504_1,1,2,3,1,0,0,0,0,0,0,2020-05-08,2020-06-14,0.0


Exctracting the departure times for each record.

In [125]:
df_oneday['d_time'] = df_oneday.trip_id.str[-4:]
df_oneday['d_time'] = pd.to_datetime(df_oneday['d_time'], errors='coerce', format='%H%M').dt.time
print(df_oneday.head())

                            route_id                    trip_id trip_headsign  \
service_id                                                                      
1001 5_20200508_20200614_Ma     1001  1001 5_20200508_Ma_1_0531        Käpylä   
1001 5_20200508_20200614_Ma     1001  1001 5_20200508_Ma_1_0546        Käpylä   
1001 5_20200508_20200614_Ma     1001  1001 5_20200508_Ma_1_0600        Käpylä   
1001 5_20200508_20200614_Ma     1001  1001 5_20200508_Ma_1_0843        Käpylä   
1001 5_20200508_20200614_Ma     1001  1001 5_20200508_Ma_1_1105        Käpylä   

                             direction_id           shape_id  \
service_id                                                     
1001 5_20200508_20200614_Ma             0  1001 5_20200504_1   
1001 5_20200508_20200614_Ma             0  1001 5_20200504_1   
1001 5_20200508_20200614_Ma             0  1001 5_20200504_1   
1001 5_20200508_20200614_Ma             0  1001 5_20200504_1   
1001 5_20200508_20200614_Ma             0  1001 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_oneday['d_time'] = df_oneday.trip_id.str[-4:]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_oneday['d_time'] = pd.to_datetime(df_oneday['d_time'], errors='coerce', format='%H%M').dt.time


Adding vehicle types to the data. Type specification: https://developers.google.com/transit/gtfs/reference/extended-route-types

In [126]:
route_data = pd.read_csv(gtfs_zip.open('routes.txt'))
route_data.head()

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url
0,1001,HSL,1,Eira - Töölö - Sörnäinen (M) - Käpylä,,0,http://aikataulut.hsl.fi/linjat/fi/h1_1a.html
1,1001H,HSL,1H,Käpylä - Koskelan halli,,0,http://aikataulut.hsl.fi/linjat/fi/h1_1a.html
2,1002,HSL,2,Olympiaterminaali - Töölö - Pasila as.,,0,http://aikataulut.hsl.fi/linjat/fi/h2.html
3,1002H,HSL,2H,Pasila as. - Töölön halli,,0,http://aikataulut.hsl.fi/linjat/fi/h2.html
4,1002X,HSL,2X,Lasipalatsi - Töölö - Pasila as. - Messukeskus,,701,http://aikataulut.hsl.fi/linjat/fi/b2X.html


In [127]:
df_1dwtypes = pd.merge(df_oneday, route_data[['route_id', 'route_type']], on='route_id', how='left')
df_1dwtypes.head()

Unnamed: 0,route_id,trip_id,trip_headsign,direction_id,shape_id,wheelchair_accessible,bikes_allowed,max_delay,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date,weekday,d_time,route_type
0,1001,1001 5_20200508_Ma_1_0531,Käpylä,0,1001 5_20200504_1,1,2,3,1,0,0,0,0,0,0,2020-05-08,2020-06-14,0.0,05:31:00,0
1,1001,1001 5_20200508_Ma_1_0546,Käpylä,0,1001 5_20200504_1,1,2,3,1,0,0,0,0,0,0,2020-05-08,2020-06-14,0.0,05:46:00,0
2,1001,1001 5_20200508_Ma_1_0600,Käpylä,0,1001 5_20200504_1,1,2,4,1,0,0,0,0,0,0,2020-05-08,2020-06-14,0.0,06:00:00,0
3,1001,1001 5_20200508_Ma_1_0843,Käpylä,0,1001 5_20200504_1,1,2,0,1,0,0,0,0,0,0,2020-05-08,2020-06-14,0.0,08:43:00,0
4,1001,1001 5_20200508_Ma_1_1105,Käpylä,0,1001 5_20200504_1,1,2,3,1,0,0,0,0,0,0,2020-05-08,2020-06-14,0.0,11:05:00,0


Slicing the final timetable from the result.

In [128]:
timetable = df_1dwtypes.reset_index().loc[:,['shape_id', 'd_time', 'route_type']]
timetable.head()

Unnamed: 0,shape_id,d_time,route_type
0,1001 5_20200504_1,05:31:00,0
1,1001 5_20200504_1,05:46:00,0
2,1001 5_20200504_1,06:00:00,0
3,1001 5_20200504_1,08:43:00,0
4,1001 5_20200504_1,11:05:00,0


Done with processing!