## **Exploratory analysis of the Buenos Aires GTFS for buses**

In [55]:
import polars as pl
import glob
import os

In [56]:
# Read every GTFS table used in this notebook from the 'source' directory.
# The generator is unpacked in the same order as the file names listed below.
calendar_dates, trips, routes, shapes, stop_times = (
    pl.read_csv(os.path.join('source', filename))
    for filename in (
        'calendar_dates.txt',
        'trips.txt',
        'routes.txt',
        'shapes.txt',
        'stop_times.txt',
    )
)

Search for null values in each DataFrame

In [57]:
path = os.path.join('source', '*.txt')
# glob.glob() returns paths in arbitrary, OS-dependent order, so sort the table
# names to get a reproducible order instead of relying on 'agency' coming first.
files = sorted(os.path.splitext(os.path.basename(f))[0] for f in glob.glob(path))
# This file will not be used, although it could be useful.
# It is removed by name (never by position); the removed name is displayed as the cell output.
files.pop(files.index('agency')) if 'agency' in files else None

'agency'

In [58]:
# Report the total number of null cells found in each table listed in `files`
for table in files:
    # Each DataFrame is looked up through the module-level name that matches its file name
    frame = globals()[table]
    nulls = sum(column.null_count() for column in frame.get_columns())
    print(f'{nulls} null values in the table {table.upper()}')

0 null values in the table CALENDAR_DATES
0 null values in the table ROUTES
0 null values in the table SHAPES
0 null values in the table STOP_TIMES
0 null values in the table TRIPS


---
### Structure of `calendar_dates`

In [59]:
# Preview the first five rows (five is also the default of head())
calendar_dates.head(n=5)

service_id,date,exception_type
i64,i64,i64
1,20190928,1
2,20190929,1
3,20190930,1
3,20191001,1
3,20191002,1


Each trip is attached to one of **4 service types**, with service 3 being the one with the most days associated

In [60]:
# Number of calendar days attached to each service, largest count first
(
    calendar_dates
    .group_by('service_id')
    .len('total_days')
    .sort('total_days', descending=True)
)

service_id,total_days
i64,u32
3,63
1,15
2,13
4,4


In [61]:
# Cast the 'date' column to str; the other columns keep their dtypes
calendar_dates = calendar_dates.cast({'date': pl.Utf8})

In [62]:
# Split the 'date' column into separate year, month and day columns.
# The parsing expression is written once and reused for the three components.
parsed_date = pl.col('date').str.strptime(pl.Date, '%Y%m%d')
calendar_dates = calendar_dates.with_columns(
    year=parsed_date.dt.year(),
    month=parsed_date.dt.month(),
    day=parsed_date.dt.day(),
)

Comparing the list of days for each month in each service (collected in the `services_calendar` DataFrame built in the next cell) with the Argentinian national calendar, the following attributes of each service type can be inferred:

* Service 1 is for Saturdays and the eve of holidays
* Service 2 is for Sundays
* Service 3 is for weekdays
* Service 4 is for holidays

In [63]:
# For every service and calendar month, collect the list of days on which the service runs
calendar_keys = ['service_id', 'year', 'month']
services_calendar = calendar_dates.group_by(calendar_keys).agg(days=pl.col('day'))
services_calendar = services_calendar.sort(calendar_keys)

Let's take for example service 4:

In [64]:
# Every day listed for each month is a national holiday in Argentina for the given year
is_service_4 = pl.col('service_id') == 4
services_calendar.filter(is_service_4)

service_id,year,month,days
i64,i32,i8,list[i8]
4,2019,10,[12]
4,2019,11,[18]
4,2019,12,"[8, 25]"


---
### Structure of `trips`

In [65]:
# Preview the first five rows (five is also the default of head())
trips.head(n=5)

route_id,service_id,trip_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id,exceptional
i64,i64,str,str,str,i64,str,i64,i64
100,1,"""1-1""","""a Pque. Avellaneda""","""100SI0001""",0,"""100SI0001""",1,0
100,1,"""2-1""","""a Pque. Avellaneda""","""100SI0002""",0,"""100SI0002""",1,0
100,1,"""3-1""","""a Pque. Avellaneda""","""100SI0003""",0,"""100SI0003""",1,0
100,1,"""4-1""","""a Pque. Avellaneda""","""100SI0004""",0,"""100SI0004""",1,0
100,1,"""5-1""","""a Pque. Avellaneda""","""100SI0005""",0,"""100SI0005""",1,0


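The number of trips for each route can be obtained by counting the rows of `trips` (the count spans every service type, so it is the total number of scheduled trips rather than the trips of a single day), to later be joined with the `routes` file holding each bus's information. Inbound and outbound trips must both be taken into account, as seen in the next example using route 100: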

In [66]:
# The inbound and outbound directions of route 100 do not have the same number of trips
(
    trips
    .group_by('route_id', 'direction_id')
    .len('total_trips')
    .filter(pl.col('route_id') == 100)
)

route_id,direction_id,total_trips
i64,i64,u32
100,0,366
100,1,364


In [67]:
# Count the trips of every (route, direction) pair, then average the per-direction
# counts so that each route gets a single, rounded integer figure
trips_per_direction = trips.group_by('route_id', 'direction_id').len('total_trips')
mean_trips_per_route = (
    trips_per_direction
    .group_by('route_id')
    .agg(route_trips=pl.col('total_trips').mean())
    .with_columns(pl.col('route_trips').round().cast(pl.Int32))
)

---
### Structure of `routes`

In [68]:
# Preview the first five rows (five is also the default of head())
routes.head(n=5)

route_id,agency_id,route_short_name,route_long_name,route_desc,route_type
i64,i64,str,str,str,i64
4167,110,"""505R3""","""JMALBR505""","""Ramal 3 - San Francisco Solano…",3
4168,110,"""505R4""","""JMALBR505""","""Ramal 4 - San Francisco Solano…",3
4169,110,"""505R5""","""JMALBR505""","""Ramal 5 - San Francisco Solano…",3
4170,110,"""505R6""","""JMALBR505""","""Ramal 6 - San Francisco Solano…",3
4171,110,"""506R1""","""JMALBR506""","""Ramal 1 - Est. Glew - Tapin y …",3


To make aggregations over the bus lines without taking into account each route branch ("ramal"), the column route_short_name should be used to create a new column with only the bus line number or name, using conditional statements to separate both cases, as seen in the next example:

In [69]:
# Derive 'bus_line' from 'route_short_name': the leading digits when the name starts
# with a digit, otherwise the leading run of ASCII letters
short_name = pl.col('route_short_name')
leading_digits = short_name.str.extract(r'^(\d+)', 1)  # Digits before letters
leading_letters = short_name.str.extract(r'^([A-Za-z]+)', 1)  # Letters at the beginning
routes = routes.with_columns(
    pl.when(short_name.str.contains(r'^\d+'))
    .then(leading_digits)
    .otherwise(leading_letters)
    .alias('bus_line')
)

Then, for example, the count of route branches for each bus line can be calculated and sorted

In [70]:
# Bus lines with the largest number of route branches
(
    routes
    .group_by('bus_line')
    .len('branches')
    .sort('branches', descending=True)
    .head(n=5)
)

bus_line,branches
str,u32
"""96""",35
"""60""",20
"""57""",19
"""620""",19
"""129""",18


---

_The following cell will be useful to measure attributes of routes and bus lines taking into account only one representative trip for each route in both directions_

In [71]:
# Keep the first trip_id found for every (route, direction) pair as its representative trip
trip_of_routes = (
    trips
    .group_by('route_id', 'direction_id')
    .agg(pl.col('trip_id').first())
)

---
### Structure of `shapes`

In [72]:
# Preview the first five rows (five is also the default of head())
shapes.head(n=5)

shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
i64,f64,f64,i64,f64
418,-34.601438,-58.454088,53,3628.46
418,-34.602127,-58.455055,54,3745.43
418,-34.602792,-58.455998,55,3859.09
418,-34.603757,-58.457437,56,4028.91
418,-34.604258,-58.458182,57,4117.0


The total distance completed by each route can be obtained by getting the largest shape_dist_traveled value of the shape_id associated with each trip of a route, as seen in the following example:

In [73]:
# The largest shape_dist_traveled value of a shape is taken as its total distance
total_dist_per_shape = (
    shapes
    .group_by('shape_id')
    .agg(total_dist=pl.col('shape_dist_traveled').max())
)

total_dist_per_shape.sort('shape_id').head(n=5)

shape_id,total_dist
i64,f64
1,17012.0
2,19238.0
3,9038.0
4,11164.0
5,12816.0


This measurement can be later joined with individual representative trips for each route in both directions (`trip_of_routes`), calculating the average from both, and finally be joined with the `routes` table

In [74]:
# Attach each shape's distance to its trips, keep only the representative trips
# and average the distances of every route
total_dist_per_route = (
    total_dist_per_shape
    .join(trips, on='shape_id')
    .join(trip_of_routes, on='trip_id')
    .group_by('route_id')
    .agg(total_route_distance=pl.col('total_dist').mean())
)

---
### Structure of `stop_times`

In [75]:
# Preview the first five rows (five is also the default of head())
stop_times.head(n=5)

trip_id,arrival_time,departure_time,stop_id,stop_sequence,timepoint,shape_dist_traveled
str,str,str,i64,i64,i64,i64
"""1-1""","""00:26:00""","""00:26:00""",205696,1,1,0
"""1-1""","""00:28:00""","""00:28:00""",204229,2,0,514
"""1-1""","""00:29:08""","""00:29:08""",204191,3,0,803
"""1-1""","""00:31:06""","""00:31:06""",205517,4,0,1306
"""1-1""","""00:34:12""","""00:34:12""",205528,5,0,2093


It can be seen from the next cell that the number of stops is equal for some trips. This can be due to coincidence or due to the trips belonging to one specific route

In [76]:
# Count the stop_times rows of every trip and show the trips with the fewest rows
stops_per_trip = (
    stop_times
    .group_by('trip_id')
    .len('trip_stops')
)
stops_per_trip.sort('trip_stops').head(n=5)

trip_id,trip_stops
str,u32
"""24876-1""",2
"""134099-1""",2
"""316683-1""",2
"""316714-1""",2
"""24856-1""",2


To tell both cases apart, the trips are grouped by route_id and direction_id, and only one trip per group is taken into account (`trip_of_routes`) to perform aggregation operations over the stops for each route branch and bus line

In [77]:
# Note that grouping by direction in trip_of_routes is essential because the stops are different
stops_per_trip_by_route = trip_of_routes.join(stops_per_trip, on='trip_id', how='inner')

In [78]:
# Stops of a route: sum of the stops of its representative trips
stops_per_route = (
    stops_per_trip_by_route
    .group_by('route_id')
    .agg(route_stops=pl.col('trip_stops').sum())
)

# Stops of a bus line: total and rounded mean of 'route_stops' over the routes of the line
stops_per_line = (
    stops_per_route
    .join(routes, on='route_id')
    .group_by('bus_line')
    .agg(
        line_stops_sum=pl.col('route_stops').sum(),
        line_stops_mean=pl.col('route_stops').mean(),
    )
    .with_columns(pl.col('line_stops_mean').round().cast(pl.Int32))
)

---
### **Taking most parameters into account**

In [79]:
# Join every per-route and per-line measurement onto the routes table, then count
# how many of the joined routes belong to each bus line
df = (
    routes
    .join(total_dist_per_route, on='route_id')
    .join(stops_per_route, on='route_id')
    .join(stops_per_line, on='bus_line')
    .join(mean_trips_per_route, on='route_id')
    .with_columns(bus_routes=pl.col('route_id').count().over('bus_line'))
)

In [80]:
# Keep only the columns of interest, in presentation order
output_columns = [
    'route_id',
    'bus_line',
    'bus_routes',
    'route_short_name',
    'route_trips',
    'route_stops',
    'total_route_distance',
    'line_stops_sum',
    'line_stops_mean',
    'agency_id',
]
df = df.select(output_columns)

In [81]:
# Example of the resulting DataFrame: only the rows whose bus_line is '130'
df.filter(pl.col('bus_line').eq('130'))

route_id,bus_line,bus_routes,route_short_name,route_trips,route_stops,total_route_distance,line_stops_sum,line_stops_mean,agency_id
i64,str,u32,str,i32,u32,f64,u32,i32,i64
143,"""130""",2,"""130A""",588,139,28579.5,280,140,32
144,"""130""",2,"""130B""",420,141,29636.5,280,140,32
