<a href="https://colab.research.google.com/github/kavyajeetbora/modern_geospatial_stack/blob/master/notebooks/DuckDB_in_Jupyter_Notebooks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DuckDB in Jupyter Notebooks
A streamlined workflow for SQL analysis with DuckDB and Jupyter

## Library Import and Configuration

In [9]:
!pip install --quiet duckdb
!pip install --quiet jupysql
!pip install --quiet duckdb-engine
!pip install --quiet pandas
!pip install --quiet matplotlib
!pip install -q osmnx
!pip install -q pydeck

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.2/107.2 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [10]:
import json

import duckdb
import geopandas as gpd
import osmnx as ox
import pandas as pd
import pydeck as pdk
import shapely
# No need to import sqlalchemy or duckdb_engine
#  JupySQL will use SQLAlchemy to auto-detect the driver needed based on your connection string!

# Import jupysql Jupyter extension to create SQL cells
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


We configure JupySQL to return query results as pandas DataFrames and to produce less verbose output.

In [3]:
# Return query results as pandas DataFrames.
%config SqlMagic.autopandas = True
# Keep cell output terse: turn off the feedback messages ...
%config SqlMagic.feedback = False
# ... and do not display the connection with each query.
%config SqlMagic.displaycon = False

## Connecting to DuckDB
Connect JupySQL to DuckDB using a SQLAlchemy-style connection string. You can connect either to an in-memory DuckDB database or to a file-backed one.

In [4]:
# Connect JupySQL to an in-memory DuckDB database.
%sql duckdb:///:memory:
# Alternative: uncomment to connect to a file-backed database instead.
# %sql duckdb:///path/to/file.db

In [5]:
%%sql
-- Install the DuckDB extensions that the download query LOADs later on
INSTALL httpfs;
INSTALL spatial;

Unnamed: 0,Success


## Downloading Buildings in a Small Area

In [23]:
# Area of interest as a lon/lat bounding box: west, south, east, north edges.
W, S, E, N = (72.824548, 19.19574, 72.869386, 19.231531)

In [None]:
%%time
%%sql
-- Extensions needed below: spatial for ST_GeomFromWkb and the GDAL writer,
-- httpfs for reading the remote parquet files
LOAD spatial;
LOAD httpfs;

-- Export the Overture buildings whose bounding box lies strictly inside the
-- area of interest. The four bounds are double-brace placeholders that JupySQL
-- fills in from the notebook variables W, S, E and N, so the query always
-- follows the bounding box defined earlier instead of a hard-coded copy of it.
COPY (
    SELECT
        id,
        level,
        height,
        ST_GeomFromWkb(geometry) AS geometry
    FROM read_parquet('s3://overturemaps-us-west-2/release/2024-06-13-beta.0/theme=buildings/type=*/*', filename=true, hive_partitioning=1)
    WHERE
        bbox.xmin > {{W}}
        AND bbox.xmax < {{E}}
        AND bbox.ymin > {{S}}
        AND bbox.ymax < {{N}}
) TO 'buildings_mumbai.geojson'
WITH (FORMAT GDAL, DRIVER 'GeoJSON', SRS 'EPSG:4326');

In [17]:
def create_map(W,S,E,N, geojson_file=None):
    """Build a pydeck map of a bounding box with an optional GeoJSON overlay.

    Parameters
    ----------
    W, S, E, N : float
        West, south, east and north edges of the bounding box, passed to
        ``shapely.geometry.box`` in that order.
    geojson_file : str, optional
        GeoJSON source for a second, filled layer. It is handed to
        ``pdk.Layer`` unchanged as the layer's data. When ``None`` only the
        bounding-box outline is drawn.
        NOTE(review): pydeck documents string data as a URL to load, so
        confirm that a local path resolves in the hosting frontend.

    Returns
    -------
    pdk.Deck
        Deck centred on the bounding-box centroid (zoom 11, pitch 45) with
        tooltips enabled.
    """
    bbox_geom = shapely.geometry.box(W, S, E, N)
    # Parse the GeoJSON text with a JSON parser. eval() would execute the text
    # as Python code and breaks on JSON literals such as null/true/false.
    boundary_json = json.loads(gpd.GeoSeries(bbox_geom).to_json())

    # Outline of the bounding box, drawn unfilled.
    boundary_layer = pdk.Layer(
        "GeoJsonLayer",
        boundary_json,
        opacity=1,
        stroked=True,
        filled=False,
        get_line_color=[100, 0, 0]
    )
    layers = [boundary_layer]

    if geojson_file is not None:
        # Filled, pickable overlay drawn on top of the outline.
        layers.append(
            pdk.Layer(
                "GeoJsonLayer",
                geojson_file,
                opacity=1,
                stroked=True,
                filled=True,
                get_fill_color=[100, 200, 0],
                get_line_color=[0, 100, 0],
                pickable=True
            )
        )

    # Centre the camera on the middle of the bounding box.
    center = bbox_geom.centroid
    view_state = pdk.ViewState(
        latitude=center.y, longitude=center.x, zoom=11, bearing=0, pitch=45
    )

    return pdk.Deck(layers=layers, initial_view_state=view_state, tooltip=True)

In [19]:
# Draw the exported buildings file on top of the bounding-box outline; the
# bare name on the last line makes the notebook display the deck.
Map = create_map(
    W, S, E, N,
    geojson_file="buildings_mumbai.geojson",
)
Map

<IPython.core.display.Javascript object>