# Simplify shapes with duckdb spatial

In [None]:
from pathlib import Path

import duckdb
import geopandas
import pandas as pd

In [None]:
# Folder holding the GTFS files, resolved relative to the parent of the
# current working directory.
gtfs_root = Path().resolve().parent / "data/gtfs/rb_norway-aggregated-gtfs"
# NOTE(review): the connection returned here is discarded, so the module-level
# duckdb.* calls in this notebook do not run against this database file.
# The ".ducksb" suffix also looks like a typo for ".duckdb" - confirm intent.
duckdb.connect(str(gtfs_root / "gtfs.ducksb"))
# An extension must be installed before it can be loaded; loading first fails
# on a machine where SPATIAL has never been installed.
duckdb.install_extension("SPATIAL")
duckdb.load_extension("SPATIAL")

In [None]:
# Locate the Parquet export of the GTFS shapes table and fail early, with the
# path in the message, when it is missing.
shapes_parquet = gtfs_root / "shapes.parquet"
print(shapes_parquet)
if not shapes_parquet.exists():
    # Explicit raise instead of `assert`: asserts are stripped under `python -O`.
    raise FileNotFoundError(f"GTFS shapes file not found: {shapes_parquet}")
# The SQL in the following cells refers to this relation by its variable name,
# so the name `shapes` must be kept.
shapes = duckdb.read_parquet(str(shapes_parquet))

In [None]:
# Count the distinct shape ids in the `shapes` relation. The relation is the
# last expression of the cell so that the notebook displays it.
distinct_shape_count_sql = """SELECT COUNT(DISTINCT shape_id) FROM shapes;"""
num_shapes = duckdb.sql(distinct_shape_count_sql)
num_shapes

In [None]:
# Print a preview of the raw shape point rows.
shapes.show()

In [None]:
# Build one point geometry per shape row from its longitude/latitude columns,
# keeping the shape id, point sequence and travelled distance alongside it.
points_sql = """
        SELECT
            shape_id, shape_pt_sequence, shape_dist_traveled, ST_Point(shape_pt_lon, shape_pt_lat) AS geometry
        FROM shapes;
    """
shapes_as_point = duckdb.sql(points_sql)
shapes_as_point.show()

## Simplify shape geometries
### Experiments

In [None]:
# Build one simplified linestring per shape, processing the points in batches
# to keep memory use bounded.
#
# Batches are cut on shape boundaries. Paging the point rows with a plain
# LIMIT/OFFSET splits any shape that straddles a page boundary into partial
# lines (and duplicate shape_id rows), and without an ORDER BY the paging is
# not guaranteed to be stable between queries.

# Target number of shape points per query. A batch can exceed this by at most
# the size of the shape that crosses the threshold.
per_iter = 2_000_000

# Number of points in every shape, in a fixed order.
shape_sizes = duckdb.sql(
    """
        SELECT shape_id, COUNT(*) AS num_points
        FROM shapes_as_point
        GROUP BY shape_id
        ORDER BY shape_id
    """
).to_df()
# A shape belongs to the batch given by how many points precede it, so all of
# its points always end up in the same batch.
points_before = shape_sizes["num_points"].cumsum() - shape_sizes["num_points"]
shape_sizes["batch"] = points_before // per_iter

dfs = []
for batch, batch_shapes in shape_sizes.groupby("batch"):
    # `batch_ids` is picked up by name from the SQL below.
    batch_ids = batch_shapes[["shape_id"]]
    res = duckdb.sql(
        """
                SELECT
                    shape_id, MAX(shape_pt_sequence), MAX(shape_dist_traveled ), ST_AsText(ST_SimplifyPreserveTopology(ST_MakeLine(list(geometry ORDER BY shape_pt_sequence ASC)), 0.001)) as Geom
                FROM shapes_as_point
                WHERE shape_id IN (SELECT shape_id FROM batch_ids)
                GROUP BY shape_id
        """
    )
    # Materialise once; calling len() on the lazy relation would run the
    # aggregation a second time.
    batch_df = res.to_df()
    dfs.append(batch_df)
    print(f"batch={int(batch)}, num_shapes={len(batch_df)}")

In [None]:
# Stack the per-query result frames into a single frame with a fresh
# 0..n-1 index, and display it.
df = pd.concat(dfs, ignore_index=True)
df

In [None]:
# Parse the WKT strings in the "Geom" column into geometries and attach them
# to the frame as its geometry column, dropping the now redundant text column.
#
# The CRS is set here, at creation. The points were built from the
# shape_pt_lon/shape_pt_lat columns and the map preview uses EPSG:4326, but
# that preview's set_crs() result is never stored, so without this the
# GeoDataFrame written to Parquet would carry no CRS.
geometries = geopandas.GeoSeries.from_wkt(df["Geom"].to_list(), crs="EPSG:4326")
print(len(geometries))
gdf = geopandas.GeoDataFrame(df, geometry=geometries).drop(columns="Geom")

In [None]:
# Interactive map preview of the simplified lines. The set_crs() result is
# only used for this call and is not assigned back to `gdf`.
gdf.set_crs("epsg:4326").explore()

In [None]:
# Write the simplified line geometries to a Parquet file in the GTFS folder.
gdf.to_parquet(gtfs_root / "shapes_linestring_simple.parquet")

In [None]:
# Round-trip check: read the simplified shapes file back in and display it.
simplified_path = gtfs_root / "shapes_linestring_simple.parquet"
gdf_read = geopandas.read_parquet(simplified_path)
gdf_read

In [None]:
# Static plot of the geometries read back from the Parquet file.
gdf_read.plot()

### Simplify shapes to lines and save
Fails on a laptop running WSL with 8 GB of RAM.

In [None]:
# res = duckdb.sql(
#     f"""
#         COPY (
#             SELECT
#                 shape_id, MAX(shape_pt_sequence), MAX(shape_dist_traveled ), ST_SimplifyPreserveTopology(ST_MakeLine(list(geometry ORDER BY shape_pt_sequence ASC)), 0.001) as Geom
#             FROM (SELECT * FROM shapes_as_point)
#             GROUP BY shape_id
#         ) TO '{gtfs_root / "shapes_linestring_simple.parquet"}';
#     """
# )
