<a href="https://colab.research.google.com/github/kavyajeetbora/foursquare_ai/blob/master/notebooks/09_Overturemaps_Divisions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exploring Overture Maps Divisions Data


1. [Foursquare's 104M Points of Interest](https://tech.marksblogg.com/foursquare-open-global-poi-dataset.html)

In [None]:
!pip install --quiet duckdb
!pip install --quiet jupysql
!pip install --quiet duckdb-engine

import duckdb
import os
import pandas as pd
import geopandas as gpd
import numpy as np

## Setup
# Load the jupysql Jupyter extension, which provides the %sql magic used for SQL cells.
%load_ext sql

# SqlMagic options. NOTE(review): meanings below are taken from the option names and
# the jupysql docs, not from anything in this notebook; confirm against the installed version.
#   autopandas  - presumably returns query results as pandas DataFrames
#   feedback    - presumably toggles the status message printed after a statement
#   displaycon  - presumably toggles echoing the connection string with each result
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

# Connect the %sql magic to an in-memory DuckDB database.
# The export cell later in this notebook opens its own duckdb.connect() connection
# instead of using this one.
%sql duckdb:///:memory:

## Download India Boundary

In [None]:
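# Disabled one-time step: download the India boundary GeoJSON, dissolve it and cache it as india.parquet.
# !wget https://raw.githubusercontent.com/udit-001/india-maps-data/main/geojson/india.geojson -O india.geojson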
# gdf = gpd.read_file('india.geojson')
# india_gdf = gdf.dissolve()
# india_gdf.to_parquet('india.parquet')
# india_gdf.plot()

## Division Schema


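Prominence zoom levels for places (reference ranking). Note that the export query in this notebook does not use this 0–12 scale: it maps `cartography.prominence` ranges between 1 and 100 to a minimum zoom of 5–13, with 14 as the fallback.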

| Prominence | Priority    | Typical place type             | Approx min_zoom |
|------------|-------------|--------------------------------|-----------------|
| 0          | Highest     | National capital, megacity     | 3               |
| 1          | Very high   | Large capital/regional capital | 4               |
| 2          | High        | City > 1M                      | 4–5             |
| 3          | High        | City 500k–1M                   | 5               |
| 4          | Medium-high | City 100k–500k                 | 5–6             |
| 5          | Medium      | City/Town 50k–100k             | 6               |
| 6          | Medium      | Town 20k–50k                   | 7               |
| 7          | Low         | Town 10k–20k                   | 8               |
| 8          | Low         | Village 5k–10k                 | 9               |
| 9          | Low         | Village/Suburb                 | 10              |
| 10         | Very low    | Hamlet/Neighbourhood           | 11              |
| 11         | Very low    | Minor locality                 | 12              |
| 12         | Lowest      | Micro/local name               | 13+             |

In [None]:
# Overture Maps release tag; interpolated into the S3 path read by the export query below.
latest_release = "2025-11-19.0"

In [None]:
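# Disabled exploratory query (kept for reference): loads India divisions into a DataFrame.
# The active export cell further down writes the same fields to GeoJSON instead.
# # Initialize DuckDB connection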
# con = duckdb.connect()

# # Load the spatial extension if not already loaded
# con.execute("INSTALL spatial;")
# con.execute("LOAD spatial;")

# # Define prominence→zoom mapping as a dictionary of ranges
# prominence_to_zoom = {
#     (80, 100): 5,
#     (70, 79): 6,
#     (60, 69): 7,
#     (50, 59): 8,
#     (40, 49): 9,
#     (30, 39): 10,
#     (20, 29): 11,
#     (10, 19): 12,
#     (1, 9): 13,
# }

# # Register mapping as an in-memory table for the query
# prom_map_df = pd.DataFrame(
#     [{'min_prom': lo, 'max_prom': hi, 'min_zoom': z} for (lo, hi), z in prominence_to_zoom.items()]
# )
# con.register('prom_map', prom_map_df)

# # Define the output parquet file name
# output_parquet_file = 'poi_india.parquet'

# # Execute the query and directly copy the results to a parquet file
# # total_count = con.execute(f"""
# #     SELECT
# #         COUNT(*)
# #  FROM read_parquet('s3://overturemaps-us-west-2/release/{latest_release}/theme=divisions/*/*', union_by_name=True) WHERE country = 'IN'
# # """).df()

# sample_data = con.execute(f"""
#   SELECT
#     d.names.primary as place_name,
#     d.class,
#     d.subtype,
#     d.country,
#     struct_extract(list_extract(list_extract(d.hierarchies,1),2), 'name') AS region,
#     struct_extract(list_extract(list_extract(d.hierarchies,1),3), 'name') AS district,
#     struct_extract(list_extract(list_extract(d.hierarchies,1),4), 'name') AS area_name,
#     d.population,
#     COALESCE(pm.min_zoom, 14) AS min_zoom,
#     ST_Y(CASE
#         WHEN ST_GeometryType(d.geometry) = 'POINT' THEN d.geometry
#         ELSE ST_PointOnSurface(d.geometry)
#     END) AS lat,
#     ST_X(CASE
#         WHEN ST_GeometryType(d.geometry) = 'POINT' THEN d.geometry
#         ELSE ST_PointOnSurface(d.geometry)
#     END) AS lon
#   FROM parquet_scan('s3://overturemaps-us-west-2/release/{latest_release}/theme=divisions/*/*', union_by_name=true) d
#   LEFT JOIN prom_map pm
#     ON d.cartography.prominence BETWEEN pm.min_prom AND pm.max_prom
#   WHERE d.country = 'IN'
# """).df()
# sample_data = sample_data[sample_data['place_name'].notna()].copy()

# # Close the DuckDB connection
# con.close()

In [None]:
# sample_data.to_parquet("places_india.parquet")

## Export Places Data to GeoJSON

In [32]:
# Export named Indian divisions from Overture Maps as point features in a GeoJSON file.
# Assumes latest_release is defined by an earlier cell, e.g. latest_release = '2025-11-19.0'

OUTPUT_GEOJSON = "places.geojson"
ROW_LIMIT = 10_000  # Sample size for testing; set to None to export every matching row.

# Inclusive cartography.prominence range -> minimum zoom exported as "tippecanoe:minzoom".
# A prominence that is missing or outside every range falls back to 14 in the query.
prominence_to_zoom = {
    (80, 100): 5,
    (70, 79): 6,
    (60, 69): 7,
    (50, 59): 8,
    (40, 49): 9,
    (30, 39): 10,
    (20, 29): 11,
    (10, 19): 12,
    (1, 9): 13,
}
prom_map_df = pd.DataFrame(
    [{'min_prom': lo, 'max_prom': hi, 'min_zoom': z} for (lo, hi), z in prominence_to_zoom.items()]
)

# Empty string means "no LIMIT clause", i.e. a full export.
limit_clause = "" if ROW_LIMIT is None else f"LIMIT {int(ROW_LIMIT)}"

# GDAL's GeoJSON driver can refuse to overwrite an existing file, so remove any
# stale export first to keep this cell re-runnable.
if os.path.exists(OUTPUT_GEOJSON):
    os.remove(OUTPUT_GEOJSON)

con = duckdb.connect()
try:
    con.execute("INSTALL spatial;")
    con.execute("LOAD spatial;")

    con.execute("SET s3_region='us-west-2';")

    # Expose the prominence -> zoom mapping to SQL as the table `prom_map`.
    con.register('prom_map', prom_map_df)

    con.execute(f"""
COPY (
  WITH src AS (
    SELECT
      CAST(d.names.primary AS VARCHAR) AS place_name,
      CAST(d.class AS VARCHAR) AS class,
      CAST(d.subtype AS VARCHAR) AS subtype,
      TRY_CAST(d.country AS VARCHAR) AS country,
      CAST(d.population AS BIGINT) AS population,
      d.hierarchies AS hierarchies,
      d.cartography AS cartography,
      -- Points are kept as-is; any other geometry is reduced to a point on its surface
      CASE
        WHEN ST_GeometryType(d.geometry) = 'POINT' THEN d.geometry
        ELSE ST_PointOnSurface(d.geometry)
      END AS geom
    FROM parquet_scan('s3://overturemaps-us-west-2/release/{latest_release}/theme=divisions/*/*', union_by_name=true) d
    -- Filter here so non-matching rows are dropped before the projection above is evaluated
    WHERE TRY_CAST(d.country AS VARCHAR) = 'IN'
      AND CAST(d.names.primary AS VARCHAR) IS NOT NULL
  )
  SELECT
    place_name,
    class,
    subtype,
    country,
    CAST(struct_extract(list_extract(list_extract(hierarchies, 1), 2), 'name') AS VARCHAR) AS region,
    CAST(struct_extract(list_extract(list_extract(hierarchies, 1), 3), 'name') AS VARCHAR) AS district,
    CAST(struct_extract(list_extract(list_extract(hierarchies, 1), 4), 'name') AS VARCHAR) AS area_name,
    population,
    -- Alias to tippecanoe:minzoom for native handling (fallback to 14 for no prominence)
    COALESCE(
      (SELECT pm.min_zoom
       FROM prom_map pm
       WHERE TRY_CAST(cartography.prominence AS INTEGER) BETWEEN pm.min_prom AND pm.max_prom
       LIMIT 1),
      14
    ) AS "tippecanoe:minzoom",  -- GDAL will write this as a GeoJSON property
    geom
  FROM src
  {limit_clause}
) TO '{OUTPUT_GEOJSON}' WITH (FORMAT GDAL, DRIVER 'GeoJSON');
""")
finally:
    # Always release the connection, even if the remote scan or the export fails.
    con.close()

# Converting to PMTiles Using Tippecanoe

## Installing Tippecanoe

In [None]:
%%shell
# Stop on the first failing step instead of reporting success unconditionally.
set -euo pipefail

## Install duckdb CLI quietly (-f fails on HTTP errors, -sS hides the progress meter).
## A failure here only warns, because the tippecanoe build below does not depend on it.
curl -fsSL https://install.duckdb.org | sh > /dev/null 2>&1 \
  || echo "Warning: DuckDB CLI install failed; continuing." >&2

## Install tippecanoe quietly

# Clone the tippecanoe repository from GitHub quietly.
# Skip the clone when the checkout already exists so the cell can be re-run.
if [[ ! -d tippecanoe ]]; then
  git clone --quiet https://github.com/mapbox/tippecanoe.git
fi

# Build and install inside a subshell so the working directory is restored afterwards.
# -j"$(nproc)" runs one job per available core (a bare -j spawns unlimited jobs);
# -s keeps make silent. `make install` installs tippecanoe in the system.
(
  cd tippecanoe &&
  make -j"$(nproc)" -s > /dev/null 2>&1 &&
  make install -s > /dev/null 2>&1
) || { echo "Error: tippecanoe build/install failed. Remove the /dev/null redirects to see the build log." >&2; exit 1; }

## Check that tippecanoe is reachable the same way later cells call it (via PATH)
command -v tippecanoe > /dev/null || { echo "Error: tippecanoe not found in PATH after install." >&2; exit 1; }

echo "Installation complete."
# Capture both streams: the version banner may be written to stderr rather than stdout.
echo "Tippecanoe version: $(tippecanoe --version 2>&1 || true)"

## Install pmtiles package

Refer to this page: [go-pmtiles/releases](https://github.com/protomaps/go-pmtiles/releases)

In [None]:
%%shell
# Abort on any failing step so an unverified or missing archive is never installed.
set -euo pipefail

# Pinned go-pmtiles release for Linux x86_64 and its expected SHA256 digest.
ARCHIVE="go-pmtiles_1.28.3_Linux_x86_64.tar.gz"
URL="https://github.com/protomaps/go-pmtiles/releases/download/v1.28.3/go-pmtiles_1.28.3_Linux_x86_64.tar.gz"
SHA256="06cf492adc2c7fcd23c4f11a98a5292f4cbe04d3afc3a6b38a07bb47452daca2"

# Download quietly; -O overwrites a leftover archive from an earlier run.
wget -q -O "$ARCHIVE" "$URL"

# Verify SHA256 checksum and stop before extracting anything if it does not match.
if ! echo "$SHA256  $ARCHIVE" | sha256sum --check --quiet; then
  echo "Error: SHA256 mismatch for $ARCHIVE; aborting install." >&2
  rm -f "$ARCHIVE"
  exit 1
fi

# Extract quietly
tar -xzf "$ARCHIVE" -C /tmp/ > /dev/null 2>&1

# Install binary to /usr/local/bin (assuming binary is named 'pmtiles')
sudo mv /tmp/pmtiles /usr/local/bin/pmtiles > /dev/null 2>&1

# Clean up
rm "$ARCHIVE"

# Check installation
echo "Installation complete."
pmtiles version

## Convert GeoJSON to PMTiles

In [None]:
%%shell
set -euo pipefail

# Quick check: verify both CLIs used below are in PATH before doing any work.
for tool in tippecanoe pmtiles; do
  command -v "$tool" > /dev/null || { echo "Error: $tool not found in PATH. Re-run installation if needed." >&2; exit 1; }
done

IN="places.geojson"        # Input GeoJSON
MB_OUT="india_places.mbtiles"  # Temporary MBTiles
PM_OUT="india_places.pmtiles"  # Final PMTiles
LAYER="places"
MAX_Z="15"
MIN_Z="0"

# Check if input exists
[[ -f "$IN" ]] || { echo "Error: $IN not found." >&2; exit 1; }

# Generate MBTiles with tippecanoe.
# --quiet suppresses progress output; stderr is left connected so that a real
# failure is visible instead of the cell aborting silently under `set -e`.
tippecanoe \
  -o "$MB_OUT" \
  -l "$LAYER" \
  -Z "$MIN_Z" -z "$MAX_Z" \
  --force \
  --quiet \
  --read-parallel \
  --drop-densest-as-needed \
  --extend-zooms-if-still-dropping \
  "$IN" > /dev/null

# Convert MBTiles to PMTiles (quietly), reporting which step failed if it does.
pmtiles convert "$MB_OUT" "$PM_OUT" > /dev/null 2>&1 \
  || { echo "Error: pmtiles convert failed for $MB_OUT." >&2; exit 1; }

# # Clean up temporary MBTiles
# rm "$MB_OUT"

echo "Conversion complete: $PM_OUT"
pmtiles show "$PM_OUT"