# Filter gtfs-feed
A notebook that reduces a GTFS feed to the timetables of a selected set of routes.

In [1]:
import os
import csv
import codecs
from zipfile import ZipFile
import tempfile
import shutil
import requests
import io
import glob
from xlsxwriter.workbook import Workbook

In [2]:
# Show the current working directory; the output directory configured below is a relative path.
os.getcwd()

'D:\\dev\\learning-pt-routing\\notebooks'

In [3]:
# Download location of the input GTFS feed (the literal is split only to keep lines short).
url_to_gtfs_feed = (
    "https://opentransportdata.swiss"
    "/dataset/6f55f96d-7644-4901-b927-e9cf05a8c7f0"
    "/resource/a81c59c2-6fd7-47c8-b7b6-90a045a90aae"
    "/download/gtfsfp20202020-01-22.zip"
)

# Target folder and base file name of the filtered GTFS feed.
# The file name carries no extension: ".zip" is appended automatically.
output_gtfs_small_file_name = "small_gtfs_feed"
output_gtfs_small_dir = os.path.join("..", "resources")

## Filter gtfs-feed
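Given a list of route ids, we keep only the information in the input GTFS feed that is connected to one of these routes: trips, stop times, stops (including their parent stations), calendars, routes, agencies and transfers. Everything else is dropped.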

In [4]:
# Download the feed once and keep it in memory.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails right here on an HTTP error instead of wrapping an
# error page that would only blow up later when it is opened as a zip archive.
gtfs_feed_response = requests.get(url_to_gtfs_feed, timeout=60)
gtfs_feed_response.raise_for_status()
io_gtfs_feed = io.BytesIO(gtfs_feed_response.content)

In [5]:
# Routes to keep: one named constant per route id, so the selection list below stays readable.
route_id_bus_10_bern = "6-10-j20-1"
route_id_ic_geneve_stgallen = "26-1-A-j20-1"
route_id_chur_stmoritz = "59-9-Y-j20-1"

# The list that is handed to the filter.
route_ids = [
    route_id_bus_10_bern,
    route_id_ic_geneve_stgallen,
    route_id_chur_stmoritz,
]

In [6]:
def filter_gtfs_feed(input_gtfs_feed_io, output_gtfs_feed_dir, output_gtfs_feed_filename, routes_to_filter):
    """Write a reduced copy of a GTFS feed that only contains data connected to the given routes.

    Starting from the route ids in ``routes_to_filter``, the matching rows of trips.txt,
    stop_times.txt, stops.txt (including parent stations), calendar.txt, calendar_dates.txt,
    routes.txt, agency.txt and transfers.txt are copied; all other rows are dropped.

    Args:
        input_gtfs_feed_io: zipped input feed, as a path or binary file-like object that
            ``ZipFile`` can open for reading.
        output_gtfs_feed_dir: directory the result files are written to (created if missing).
        output_gtfs_feed_filename: base name of the result files, without extension.
        routes_to_filter: iterable of route_id values to keep.

    Returns:
        None. Two files are written into ``output_gtfs_feed_dir``:
        ``<output_gtfs_feed_filename>.zip`` (the filtered feed) and
        ``<output_gtfs_feed_filename>.xlsx`` (one worksheet per feed file).
    """

    def write_rows(out_folder, feed_file, fieldnames, rows):
        """writes a header line followed by rows (dicts) to feed_file in out_folder, quoting every field"""
        with open(os.path.join(out_folder, feed_file), "w", newline='', encoding="utf8") as g:
            writer = csv.DictWriter(g, fieldnames=fieldnames, quotechar='"', quoting=csv.QUOTE_ALL)
            writer.writeheader()
            writer.writerows(rows)

    def filter_by_id(key, values, zip_file, feed_file, new_keys, out_folder):
        """keeps the rows of feed_file whose value of the field key is contained in values (all other rows are skipped).

        returns one set per entry of new_keys with the values of that field in the kept rows.
        The sets are empty (but present) when no row matches, so callers can always index or unpack the result.
        """
        print("start processing {}".format(feed_file))
        kept_rows = []
        collected = [set() for _ in new_keys]
        with zip_file.open(feed_file, "r") as gtfs_file:
            # newline="" lets the csv module handle line breaks itself (also inside quoted fields)
            reader = csv.DictReader(io.TextIOWrapper(gtfs_file, encoding="utf-8-sig", newline=""))
            for row in reader:
                if row[key] in values:
                    kept_rows.append(row)
                    for bucket, new_key in zip(collected, new_keys):
                        bucket.add(row[new_key])
            fieldnames = reader.fieldnames
        write_rows(out_folder, feed_file, fieldnames, kept_rows)
        print("end processing {}".format(feed_file))
        return collected

    def filter_transfers(stop_ids, zip_file, out_folder):
        """keeps the rows of transfers.txt whose from_stop_id and to_stop_id are both contained in stop_ids (all other rows are skipped)"""
        print("start processing transfers.txt")
        with zip_file.open("transfers.txt", "r") as gtfs_file:
            reader = csv.DictReader(io.TextIOWrapper(gtfs_file, encoding="utf-8-sig", newline=""))
            kept_rows = [
                row for row in reader
                if row["from_stop_id"] in stop_ids and row["to_stop_id"] in stop_ids
            ]
            fieldnames = reader.fieldnames
        write_rows(out_folder, "transfers.txt", fieldnames, kept_rows)
        print("end processing transfers.txt")

    def add_parent_stations_to_stop_ids(stop_ids, zip_file):
        """returns stop_ids extended by the ids of the parent stations of these stops"""
        print("start processing parent stops")
        parent_stations_to_add = set()
        with zip_file.open("stops.txt", "r") as gtfs_file:
            reader = csv.DictReader(io.TextIOWrapper(gtfs_file, encoding="utf-8-sig", newline=""))
            for row in reader:
                # the parent_station column may be missing or empty; both cases are skipped
                if row.get("stop_id") in stop_ids and row.get("parent_station"):
                    parent_stations_to_add.add(row["parent_station"])
        print("end processing parent stops")
        return stop_ids.union(parent_stations_to_add)

    # a set gives constant-time membership tests while scanning the feed files
    wanted_route_ids = set(routes_to_filter)

    if output_gtfs_feed_dir:
        os.makedirs(output_gtfs_feed_dir, exist_ok=True)
    output_base_path = os.path.join(output_gtfs_feed_dir, output_gtfs_feed_filename)

    # The intermediate text files live in a private temporary directory. It is removed
    # even if filtering fails, and no folder inside the output directory is touched.
    with tempfile.TemporaryDirectory() as path_tmp_dict:
        # process step by step: each step yields the ids the next step filters by
        with ZipFile(input_gtfs_feed_io, "r") as zip_file:
            trip_ids, service_ids = filter_by_id("route_id", wanted_route_ids, zip_file, "trips.txt", ["trip_id", "service_id"], path_tmp_dict)
            stop_ids = filter_by_id("trip_id", trip_ids, zip_file, "stop_times.txt", ["stop_id"], path_tmp_dict)[0]
            stop_ids = add_parent_stations_to_stop_ids(stop_ids, zip_file)
            filter_by_id("stop_id", stop_ids, zip_file, "stops.txt", [], path_tmp_dict)
            filter_by_id("service_id", service_ids, zip_file, "calendar.txt", [], path_tmp_dict)
            filter_by_id("service_id", service_ids, zip_file, "calendar_dates.txt", [], path_tmp_dict)
            agency_ids = filter_by_id("route_id", wanted_route_ids, zip_file, "routes.txt", ["agency_id"], path_tmp_dict)[0]
            filter_by_id("agency_id", agency_ids, zip_file, "agency.txt", [], path_tmp_dict)
            filter_transfers(stop_ids, zip_file, path_tmp_dict)

        # zip the text-files to a zip-file (make_archive appends the ".zip" extension)
        shutil.make_archive(output_base_path, "zip", path_tmp_dict)

        # write the text-files to an excel-file, one worksheet per text-file
        workbook = Workbook("{}.xlsx".format(output_base_path))
        txt_pattern = os.path.join(glob.escape(path_tmp_dict), '*.txt')
        for csvfile in sorted(glob.glob(txt_pattern)):
            sheet_name = os.path.splitext(os.path.basename(csvfile))[0]
            worksheet = workbook.add_worksheet(sheet_name)
            with open(csvfile, 'rt', newline='', encoding='utf8') as f:
                for r, row in enumerate(csv.reader(f)):
                    for c, col in enumerate(row):
                        worksheet.write(r, c, col)
        workbook.close()

In [7]:
# Run the filter on the downloaded feed with the configuration defined above.
filter_gtfs_feed(
    input_gtfs_feed_io=io_gtfs_feed,
    output_gtfs_feed_dir=output_gtfs_small_dir,
    output_gtfs_feed_filename=output_gtfs_small_file_name,
    routes_to_filter=route_ids,
)

start processing trips.txt
end processing trips.txt
start processing stop_times.txt
end processing stop_times.txt
start processing parent stops
end processing parent stops
start processing stops.txt
end processing stops.txt
start processing calendar.txt
end processing calendar.txt
start processing calendar_dates.txt
end processing calendar_dates.txt
start processing routes.txt
end processing routes.txt
start processing agency.txt
end processing agency.txt
start processing transfers.txt
end processing transfers.txt
