# GTFS to parquet
Download a GTFS feed from the internet and convert each of its tables to Parquet with DuckDB.

In [None]:
import urllib.request

from io import BytesIO
from pathlib import Path
from zipfile import ZipFile

import duckdb


In [None]:
# Do a fresh download of GTFS data from Entur.
# When True, the zip archive is downloaded again and the Parquet files are
# regenerated even if local copies already exist.
fresh_download: bool = False


In [None]:
# Remote GTFS archive and the local layout used for it: the zip is stored in
# "<parent of the working directory>/data/gtfs/", and the converted tables go
# into a sibling directory named after the archive.
gtfs_url = "https://storage.googleapis.com/marduk-production/outbound/gtfs/rb_norway-aggregated-gtfs.zip"
gtfs_root = Path().resolve().parent / "data/gtfs/rb_norway-aggregated-gtfs"
gtfs_root.mkdir(exist_ok=True, parents=True)
local_gtfs_zip = gtfs_root.parent / "rb_norway-aggregated-gtfs.zip"

if fresh_download or not local_gtfs_zip.exists():
    # Download into a temporary ".part" file and move it into place only after
    # the transfer has completed. Writing directly to the final path would
    # leave a truncated zip behind if the download is interrupted, and that
    # broken file would then be reused on the next run because it "exists".
    partial_zip = local_gtfs_zip.with_name(local_gtfs_zip.name + ".part")
    try:
        _, headers = urllib.request.urlretrieve(gtfs_url, partial_zip)
        partial_zip.replace(local_gtfs_zip)
    finally:
        # Only present here if the download or the move failed.
        if partial_zip.exists():
            partial_zip.unlink()
    # Same (filename, headers) pair that urlretrieve would have reported for
    # the final destination.
    resp = (local_gtfs_zip, headers)
    print(resp)

In [None]:
# Open the downloaded archive; it is kept open because later cells extract
# members from it.
myzip = ZipFile(local_gtfs_zip)

# Show the member names, then the compressed and uncompressed size of each.
member_names = myzip.namelist()
print(member_names)
for member in member_names:
    info = myzip.getinfo(member)
    print(f"{member}: {info.compress_size:_} {info.file_size:_}")

In [None]:
# Names of the GTFS tables to convert; each one is looked up in the archive
# as "<name>.txt".
table_names = [
    "feed_info",
    "stop_times",
    "calendar",
    "shapes",
    "agency",
    "transfers",
    "stops",
    "trips",
    "calendar_dates",
    "routes",
]

## CSV (txt) to Parquet

In [None]:
# Convert every GTFS table in the archive to a Parquet file in gtfs_root.
# Existing Parquet files are kept unless a fresh download was requested.
for table_name in table_names:
    parquet_file = gtfs_root / f"{table_name}.parquet"
    if parquet_file.exists() and not fresh_download:
        continue

    # Extract next to the output instead of into the current working
    # directory, so the (potentially very large) text files do not pile up
    # wherever the notebook happens to be run from.
    extracted_txt = Path(myzip.extract(f"{table_name}.txt", path=gtfs_root))
    try:
        # `csv` is referenced by name inside the SQL statement below.
        csv = duckdb.read_csv(str(extracted_txt))
        # Double any single quote so the path is a valid SQL string literal.
        target = str(parquet_file).replace("'", "''")
        duckdb.sql(f"""COPY(SELECT * FROM csv) TO '{target}' (FORMAT 'parquet'); """)
    finally:
        # The text file is only an intermediate artifact; remove it whether
        # or not the conversion succeeded.
        extracted_txt.unlink()