# Read GTFS with duckdb 
Because the `gtfs_functions` package crashes on Entur data.

In [None]:
from pathlib import Path

import duckdb

In [None]:
# Folder holding the GTFS feed, located relative to the parent of the current
# working directory.
gtfs_root = Path().resolve().parent / "data/gtfs/rb_norway-aggregated-gtfs"
if not gtfs_root.exists():
    # Explicit raise instead of `assert`: asserts are stripped under
    # `python -O` and do not say which path was expected.
    raise FileNotFoundError(f"GTFS directory not found: {gtfs_root}")

# GTFS tables handled by this notebook; each is stored as `<name>.txt`.
table_names = [
    "calendar",
    "calendar_dates",
    "stops",
    "stop_times",
    "routes",
    "transfers",
    "trips",
    "shapes",
]
# When True, tables that already have a Parquet file are not converted again.
skip_exists = True

## CSV to Parquet

In [None]:
# Convert every GTFS text file to a Parquet file next to it. An existing
# Parquet file is kept as-is while `skip_exists` is set.
for table in table_names:
    source_csv = gtfs_root / f"{table}.txt"
    target_parquet = gtfs_root / f"{table}.parquet"
    if skip_exists and target_parquet.exists():
        continue
    copy_statement = (
        f"COPY(SELECT * FROM read_csv('{source_csv}')) "
        f"TO '{target_parquet}' (FORMAT 'parquet');"
    )
    duckdb.sql(copy_statement)

## Creating duckdb tables from CSV
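1. Copies the data into the database, so it takes up additional memory (in-memory database) or disk space (on-disk database).
1. With an in-memory database, the CSV read time is added at every startup.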

In [None]:
# Materialise CSV files as DuckDB tables. Only "shapes" is loaded here;
# iterate over `table_names` instead to load every table.
for table in ["shapes"]:
    # Build the path outside the SQL f-string: reusing double quotes inside a
    # double-quoted f-string is a SyntaxError before Python 3.12.
    csv_file = gtfs_root / f"{table}.txt"
    duckdb.sql(f"CREATE TABLE IF NOT EXISTS {table} AS FROM read_csv('{csv_file}');")

In [None]:
# Row count of the `shapes` DuckDB table.
shapes_count_sql = "SELECT COUNT() FROM shapes;"
duckdb.sql(shapes_count_sql)

In [None]:
# Row count read directly from the shapes CSV file.
# The path is built outside the f-string because nesting double quotes inside
# a double-quoted f-string is a SyntaxError before Python 3.12.
shapes_csv = gtfs_root / "shapes.txt"
duckdb.sql(f"SELECT COUNT() FROM read_csv('{shapes_csv}');")

In [None]:
# Row count read directly from the shapes Parquet file.
# The path is built outside the f-string because nesting double quotes inside
# a double-quoted f-string is a SyntaxError before Python 3.12.
shapes_parquet = gtfs_root / "shapes.parquet"
duckdb.sql(f"SELECT COUNT() FROM '{shapes_parquet}';")

### Stops

In [None]:
# Read the stops CSV file and print a wide preview.
stops_csv = gtfs_root.joinpath("stops.txt")
stops = duckdb.read_csv(stops_csv)
stops.show(max_width=250)

### Stop times

In [None]:
# Read the stop_times CSV file and print a wide preview.
stop_times_csv = gtfs_root.joinpath("stop_times.txt")
stop_times = duckdb.read_csv(stop_times_csv)
stop_times.show(max_width=250)

### routes

In [None]:
# Read the routes CSV file and print a wide preview.
routes_csv = gtfs_root.joinpath("routes.txt")
routes = duckdb.read_csv(routes_csv)
routes.show(max_width=250)

### transfers

In [None]:
# Read the transfers CSV file and print a wide preview.
transfers_csv = gtfs_root.joinpath("transfers.txt")
transfers = duckdb.read_csv(transfers_csv)
transfers.show(max_width=250)

### trips

In [None]:
# Read the trips CSV file and print a wide preview.
trips_csv = gtfs_root.joinpath("trips.txt")
trips = duckdb.read_csv(trips_csv)
trips.show(max_width=250)

## Shapes

In [None]:
# Open the shapes Parquet file and print a preview.
# The path is passed to `read_parquet` as a string.
shapes_parquet = gtfs_root.joinpath("shapes.parquet")
shapes = duckdb.read_parquet(f"{shapes_parquet}")
shapes.show()

In [None]:
# Total number of rows in the shapes Parquet file.
count_query = f"SELECT COUNT(*) from '{shapes_parquet}';"
res = duckdb.sql(count_query)
res.show()

In [None]:
# Query the Parquet metadata of the shapes file via `parquet_metadata`.
metadata_query = f"SELECT * FROM parquet_metadata('{shapes_parquet}')"
shapes_parquet_meta = duckdb.sql(metadata_query)
shapes_parquet_meta.show(max_width=500)

In [None]:
# Show ten rows of the shapes Parquet file.
preview_query = f"SELECT * from '{shapes_parquet}' LIMIT 10;"
res = duckdb.sql(preview_query)
res.show()

In [None]:
# All rows belonging to one specific shape, returned as a DataFrame.
single_shape_query = (
    f"SELECT * from '{shapes_parquet}' "
    "WHERE shape_id == 'AKT:JourneyPattern:58013_1';"
)
res = duckdb.sql(single_shape_query)
res.to_df()

### Longest shape

In [None]:
# shape_id of the row with the largest shape_dist_traveled value.
longest_shape_query = f"SELECT max_by(shape_id, shape_dist_traveled) from '{shapes_parquet}';"
res = duckdb.sql(longest_shape_query)
res.to_df()

In [None]:
# All rows of the shape 'HAV:JourneyPattern:2024-11-02', returned as a DataFrame.
hav_shape_query = (
    f"SELECT * from '{shapes_parquet}' "
    "WHERE shape_id == 'HAV:JourneyPattern:2024-11-02';"
)
res = duckdb.sql(hav_shape_query)
res.to_df()

### Longest shapes

In [None]:
# The 40 shapes with the largest maximum shape_dist_traveled, longest first.
longest_shapes_query = (
    f"SELECT shape_id, max(shape_dist_traveled) AS total_dist from '{shapes_parquet}' "
    "GROUP BY shape_id ORDER BY total_dist DESC LIMIT 40;"
)
res = duckdb.sql(longest_shapes_query)
res.to_df()