<a href="https://colab.research.google.com/github/kavyajeetbora/foursquare_ai/blob/master/notebooks/09_Overturemaps_Divisions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exploring Overture Maps Divisions Data


1. [Foursquare's 104M Points of Interest](https://tech.marksblogg.com/foursquare-open-global-poi-dataset.html)

# Setup Environment

In [None]:
!pip install --quiet duckdb
!pip install --quiet jupysql
!pip install --quiet duckdb-engine

import json
import os

import duckdb
import geopandas as gpd
import numpy as np
import pandas as pd

## Setup
# Load the jupysql extension, which provides the %sql / %%sql magics for SQL cells
%load_ext sql

# SqlMagic options (see the jupysql documentation for the exact semantics):
#   autopandas = True  -> query results are returned as pandas DataFrames
#   feedback = False   -> suppress per-statement feedback messages
#   displaycon = False -> do not echo the connection string with each result
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

# Point the %sql magic at an in-memory DuckDB database
%sql duckdb:///:memory:

## Installing Tippecanoe

In [None]:
%%shell
# Install the DuckDB CLI and build + install tippecanoe from source.
# Fail fast: without this a failed clone/build was silently ignored and the
# cell still reported "Installation complete.".
set -euo pipefail

## Install duckdb CLI quietly
# -f makes curl fail on HTTP errors instead of piping an error page into sh.
# The CLI is a convenience only (the notebook queries through the Python
# package), so a failure here is reported but does not abort the cell.
curl -fsSL https://install.duckdb.org | sh > /dev/null 2>&1 \
  || echo "Warning: DuckDB CLI installation failed (continuing)." >&2

## Install tippecanoe quietly

# Clone the tippecanoe repository from GitHub quietly.
# Skip the clone when the checkout already exists so the cell can be re-run.
if [ ! -d tippecanoe/.git ]; then
  git clone --quiet https://github.com/mapbox/tippecanoe.git
fi
# Build tippecanoe with one job per core and silently (-s).
# A bare `-j` places no limit on parallel jobs and can exhaust memory.
make -C tippecanoe -j"$(nproc)" -s > /dev/null 2>&1
# Install tippecanoe in the system silently
make -C tippecanoe install -s > /dev/null 2>&1

## Check if duckdb & tippecanoe are installed (minimal output)
echo "Installation complete."
# Query the *installed* binary and capture stderr as well, since tippecanoe
# reports its version there; discarding stderr would print an empty version.
echo "Tippecanoe version: $(tippecanoe --version 2>&1 || echo 'Not found')"

## Installing the PMTiles CLI
Refer to this page: [go-pmtiles/releases](https://github.com/protomaps/go-pmtiles/releases)

In [None]:
%%shell
# Download, verify and install the pmtiles CLI (go-pmtiles).
# Abort on the first error so that a failed download or a checksum mismatch
# can never fall through to installing an unverified binary.
set -euo pipefail

VERSION="1.28.3"
ARCHIVE="go-pmtiles_${VERSION}_Linux_x86_64.tar.gz"
SHA256="06cf492adc2c7fcd23c4f11a98a5292f4cbe04d3afc3a6b38a07bb47452daca2"

# Clean up the archive however the script exits
trap 'rm -f "$ARCHIVE"' EXIT

# Download go-pmtiles for Linux x86_64 quietly.
# -O overwrites a stale archive instead of saving a ".1" duplicate on re-runs.
wget -q -O "$ARCHIVE" \
  "https://github.com/protomaps/go-pmtiles/releases/download/v${VERSION}/${ARCHIVE}"

# Verify SHA256 checksum ("<hash>  <file>" is the standard checksum-file format).
# With `set -e` a mismatch stops the script here.
echo "${SHA256}  ${ARCHIVE}" | sha256sum --check --quiet

# Extract (errors are left visible so a corrupt archive is reported)
tar -xzf "$ARCHIVE" -C /tmp/

# Install binary to /usr/local/bin (assuming binary is named 'pmtiles')
sudo mv /tmp/pmtiles /usr/local/bin/pmtiles

# Check installation
echo "Installation complete."
pmtiles version

##

## Download India Boundary

In [None]:
# Disabled one-off step, kept for reference: download the India boundary
# GeoJSON, dissolve it into a single geometry with GeoPandas, save the result
# as `india.parquet`, and plot it. Uncomment all lines below to regenerate it.
# !wget https://raw.githubusercontent.com/udit-001/india-maps-data/main/geojson/india.geojson -O india.geojson
# gdf = gpd.read_file('india.geojson')
# india_gdf = gdf.dissolve()
# india_gdf.to_parquet('india.parquet')
# india_gdf.plot()

## Division Schema


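Prominence zoom levels for places (an illustrative 0–12 ranking; note that the export query below instead buckets `cartography.prominence` using the 1–100 ranges defined in `prominence_to_zoom`):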

| Prominence | Priority    | Typical place type             | Approx min_zoom |
|------------|-------------|--------------------------------|-----------------|
| 0          | Highest     | National capital, megacity     | 3               |
| 1          | Very high   | Large capital/regional capital | 4               |
| 2          | High        | City > 1M                      | 4–5             |
| 3          | High        | City 500k–1M                   | 5               |
| 4          | Medium-high | City 100k–500k                 | 5–6             |
| 5          | Medium      | City/Town 50k–100k             | 6               |
| 6          | Medium      | Town 20k–50k                   | 7               |
| 7          | Low         | Town 10k–20k                   | 8               |
| 8          | Low         | Village 5k–10k                 | 9               |
| 9          | Low         | Village/Suburb                 | 10              |
| 10         | Very low    | Hamlet/Neighbourhood           | 11              |
| 11         | Very low    | Minor locality                 | 12              |
| 12         | Lowest      | Micro/local name               | 13+             |

In [None]:
# NOTE: this whole cell is disabled (every line is commented out). It is an
# earlier draft that loads India's divisions into a pandas DataFrame
# (`sample_data`) and derives `min_zoom` from the prominence buckets only,
# defaulting to 14 when no bucket matches.
# Before re-enabling it, define `latest_release`: the queries interpolate it,
# but this cell never assigns it.

# # Initialize DuckDB connection
# con = duckdb.connect()

# # Load the spatial extension if not already loaded
# con.execute("INSTALL spatial;")
# con.execute("LOAD spatial;")

# # Define prominence→zoom mapping as a dictionary of ranges
# prominence_to_zoom = {
#     (80, 100): 5,
#     (70, 79): 6,
#     (60, 69): 7,
#     (50, 59): 8,
#     (40, 49): 9,
#     (30, 39): 10,
#     (20, 29): 11,
#     (10, 19): 12,
#     (1, 9): 13,
# }

# # Register mapping as an in-memory table for the query
# prom_map_df = pd.DataFrame(
#     [{'min_prom': lo, 'max_prom': hi, 'min_zoom': z} for (lo, hi), z in prominence_to_zoom.items()]
# )
# con.register('prom_map', prom_map_df)

# # Define the output parquet file name
# output_parquet_file = 'poi_india.parquet'

# # Execute the query and directly copy the results to a parquet file
# # total_count = con.execute(f"""
# #     SELECT
# #         COUNT(*)
# #  FROM read_parquet('s3://overturemaps-us-west-2/release/{latest_release}/theme=divisions/*/*', union_by_name=True) WHERE country = 'IN'
# # """).df()

# sample_data = con.execute(f"""
#   SELECT
#     d.names.primary as place_name,
#     d.class,
#     d.subtype,
#     d.country,
#     struct_extract(list_extract(list_extract(d.hierarchies,1),2), 'name') AS region,
#     struct_extract(list_extract(list_extract(d.hierarchies,1),3), 'name') AS district,
#     struct_extract(list_extract(list_extract(d.hierarchies,1),4), 'name') AS area_name,
#     d.population,
#     COALESCE(pm.min_zoom, 14) AS min_zoom,
#     ST_Y(CASE
#         WHEN ST_GeometryType(d.geometry) = 'POINT' THEN d.geometry
#         ELSE ST_PointOnSurface(d.geometry)
#     END) AS lat,
#     ST_X(CASE
#         WHEN ST_GeometryType(d.geometry) = 'POINT' THEN d.geometry
#         ELSE ST_PointOnSurface(d.geometry)
#     END) AS lon
#   FROM parquet_scan('s3://overturemaps-us-west-2/release/{latest_release}/theme=divisions/*/*', union_by_name=true) d
#   LEFT JOIN prom_map pm
#     ON d.cartography.prominence BETWEEN pm.min_prom AND pm.max_prom
#   WHERE d.country = 'IN'
# """).df()
# sample_data = sample_data[sample_data['place_name'].notna()].copy()

# # Close the DuckDB connection
# con.close()

In [None]:
# Disabled: would persist the `sample_data` DataFrame as a Parquet file.
# sample_data.to_parquet("places_india.parquet")

## Export Places Data to GeoJSON

In [None]:
# Export India's Overture Maps "divisions" features to GeoJSON as label points,
# each carrying a per-feature zoom range for tippecanoe.
#
# Steps:
#   1. Scan the divisions Parquet files on S3, keeping named rows for India.
#   2. Reduce every geometry to a point (the point itself, or a point on the
#      surface of non-point geometries).
#   3. Derive a min zoom from class/subtype and population, falling back to the
#      prominence buckets below, then to 14.
#   4. Write GeoJSON via DuckDB's GDAL writer.
#   5. Move the zoom columns into tippecanoe's Feature-level extension object.
#
# NOTE(review): the class/subtype literals used for `hierarchy_level`
# ('admin', 'populated', 'state', ...) should be checked against the Overture
# divisions schema; every row they do not match lands in hierarchy level 5.

latest_release = "2025-11-19.0"  # Overture Maps release to read from S3
OUTPUT_GEOJSON = "places.geojson"

# Inclusive (lo, hi) prominence range -> min zoom at which the place appears.
prominence_to_zoom = {
    (80, 100): 5,
    (70, 79): 6,
    (60, 69): 7,
    (50, 59): 8,
    (40, 49): 9,
    (30, 39): 10,
    (20, 29): 11,
    (10, 19): 12,
    (1, 9): 13,
}
# Same mapping as a table so the SQL below can look up a bucket.
prom_map_df = pd.DataFrame(
    [{'min_prom': lo, 'max_prom': hi, 'min_zoom': z} for (lo, hi), z in prominence_to_zoom.items()]
)


def _promote_zoom_hints(geojson_path):
    """Move the zoom columns into tippecanoe's per-feature GeoJSON extension.

    tippecanoe reads per-feature zoom limits from a Feature-level
    ``"tippecanoe": {"minzoom": ..., "maxzoom": ...}`` member. Properties named
    ``tippecanoe:minzoom`` / ``tippecanoe:maxzoom`` are plain attributes and do
    not restrict the zoom range, so they are removed from ``properties`` and
    re-emitted in the extension object.

    The file is rewritten in place with one feature per line, the layout
    ``tippecanoe --read-parallel`` needs in order to split the input.
    Note: the whole collection is held in memory while it is rewritten.
    """
    with open(geojson_path, encoding="utf-8") as src:
        collection = json.load(src)

    features = collection.pop("features", [])
    for feature in features:
        props = feature.get("properties") or {}
        hints = {}
        for key in ("minzoom", "maxzoom"):
            value = props.pop(f"tippecanoe:{key}", None)
            if value is not None:
                hints[key] = int(value)
        if hints:
            feature["tippecanoe"] = hints

    with open(geojson_path, "w", encoding="utf-8") as dst:
        # Re-emit the remaining top-level members, then reopen the object to
        # append the "features" array line by line.
        header = json.dumps(collection, ensure_ascii=False)
        dst.write(header[:-1] + ', "features": [\n')
        last = len(features) - 1
        for idx, feature in enumerate(features):
            dst.write(json.dumps(feature, ensure_ascii=False))
            dst.write(",\n" if idx < last else "\n")
        dst.write("]}\n")


con = duckdb.connect()
try:
    con.execute("INSTALL spatial;")
    con.execute("LOAD spatial;")

    con.execute("SET s3_region='us-west-2';")

    con.register('prom_map', prom_map_df)

    con.execute(f"""
COPY (
  WITH src AS (
    SELECT
      CAST(d.names.primary AS VARCHAR) AS place_name,
      CAST(d.class AS VARCHAR) AS class,
      CAST(d.subtype AS VARCHAR) AS subtype,
      TRY_CAST(d.country AS VARCHAR) AS country,
      CAST(d.population AS BIGINT) AS population,
      d.hierarchies AS hierarchies,
      d.cartography AS cartography,
      CASE
        WHEN ST_GeometryType(d.geometry) = 'POINT' THEN d.geometry
        ELSE ST_PointOnSurface(d.geometry)
      END AS geom
    FROM parquet_scan('s3://overturemaps-us-west-2/release/{latest_release}/theme=divisions/*/*', union_by_name=true) d
    -- Filter at the scan so only named rows for India reach the geometry and zoom logic
    WHERE d.country = 'IN' AND d.names.primary IS NOT NULL
  ),
  ranked_places AS (
    SELECT
      *,
      -- Determine hierarchy level (1=admin, 2=state, 3=district, 4=local)
      CASE
        WHEN class = 'admin' AND subtype IN ('country', 'national') THEN 1
        WHEN class = 'admin' AND subtype IN ('state', 'province', 'region') THEN 2
        WHEN class = 'admin' AND subtype IN ('county', 'district') THEN 3
        WHEN class IN ('locality', 'populated') OR subtype IN ('village', 'town', 'city') THEN 4
        ELSE 5  -- Other/fallback
      END AS hierarchy_level,
      -- Base minzoom by hierarchy/class (realistic: Delhi as admin capital ~z3-4)
      CASE
        WHEN hierarchy_level = 1 THEN 2  -- National capitals
        WHEN hierarchy_level = 2 THEN 4  -- States/provinces
        WHEN hierarchy_level = 3 THEN 7  -- Districts/counties
        WHEN hierarchy_level = 4 AND population > 100000 THEN 6  -- Large cities/towns
        WHEN hierarchy_level = 4 AND population > 10000 THEN 8   -- Medium towns
        WHEN hierarchy_level = 4 THEN 10  -- Small villages/localities
        ELSE 12  -- Default for unknowns
      END AS base_minzoom,
      -- Adjust minzoom by population if available (lower for bigger)
      CASE
        -- No usable hierarchy and no population: stay NULL so that final_minzoom
        -- can fall back to prominence (previously this fallback was unreachable)
        WHEN hierarchy_level = 5 AND population IS NULL THEN NULL
        WHEN population > 10000000 THEN LEAST(base_minzoom, 3)  -- Megacities like Delhi
        WHEN population > 1000000 THEN LEAST(base_minzoom, 4)
        WHEN population > 100000 THEN LEAST(base_minzoom, 6)
        WHEN population > 10000 THEN LEAST(base_minzoom, 8)
        WHEN population > 1000 THEN LEAST(base_minzoom, 10)
        WHEN population IS NULL OR population < 1000 THEN base_minzoom + 2  -- Penalize small/unknown
        ELSE base_minzoom
      END AS adjusted_minzoom,
      -- Fallback to prominence if no good hierarchy/pop (e.g., natural features);
      -- rows without a prominence bucket keep the previous value of 14
      COALESCE(
        adjusted_minzoom,
        (SELECT pm.min_zoom
         FROM prom_map pm
         WHERE TRY_CAST(cartography.prominence AS INTEGER) BETWEEN pm.min_prom AND pm.max_prom
         LIMIT 1),
        14
      ) AS final_minzoom
    FROM src
  )
  SELECT
    place_name,
    class,
    subtype,
    country,
    CAST(struct_extract(list_extract(list_extract(hierarchies, 1), 2), 'name') AS VARCHAR) AS region,
    CAST(struct_extract(list_extract(list_extract(hierarchies, 1), 3), 'name') AS VARCHAR) AS district,
    CAST(struct_extract(list_extract(list_extract(hierarchies, 1), 4), 'name') AS VARCHAR) AS area_name,
    population,
    final_minzoom AS "tippecanoe:minzoom",
    -- Maxzoom: Higher for small places (stay visible deep zoom); lower for majors (deprioritize after mid-zoom)
    -- E.g., Delhi max ~9 (disappears at z10+ to show locals); villages max 18
    CASE
      WHEN final_minzoom <= 4 THEN final_minzoom + 5  -- Majors: short range (e.g., z2-7)
      WHEN final_minzoom <= 7 THEN final_minzoom + 6  -- Mid: z4-10
      WHEN final_minzoom <= 10 THEN final_minzoom + 7 -- Towns: z6-13
      ELSE final_minzoom + 8  -- Small: long range (z10-18)
    END AS "tippecanoe:maxzoom",
    geom
  FROM ranked_places
  -- LIMIT 10000  -- Uncomment for testing; remove for full export
) TO '{OUTPUT_GEOJSON}' WITH (FORMAT GDAL, DRIVER 'GeoJSON');
""")
finally:
    # Always release the connection, even when the export query fails
    con.close()

# The two "tippecanoe:*" columns are only carriers; convert them into the
# extension object tippecanoe actually honours.
_promote_zoom_hints(OUTPUT_GEOJSON)

# Converting to PMTiles Using Tippecanoe

In [None]:
%%shell
# Tile places.geojson into MBTiles with tippecanoe, then convert to PMTiles.
set -euo pipefail

# Verify tippecanoe
tippecanoe --version || { echo "Error: tippecanoe not found."; exit 1; }

IN="places.geojson"
MB_OUT="india_places.mbtiles"
PM_OUT="india_places.pmtiles"
LAYER="places"
MAX_Z="18"
MIN_Z="5"
LOG="tippecanoe.log"

# Fail early with a clear message when the export cell has not produced input
[[ -s "$IN" ]] || { echo "Error: input $IN is missing or empty."; exit 1; }

# Tile the points without dropping any of them.
# --drop-rate=1 disables tippecanoe's default thinning of points below the
# base zoom; the feature/tile-size limits alone do not prevent that thinning.
# Output goes to a log file and is shown on failure: under `set -e` a failing
# command that is not tested in an `if` would end the script with no message.
if ! tippecanoe \
  -o "$MB_OUT" \
  -l "$LAYER" \
  -Z "$MIN_Z" -z "$MAX_Z" \
  --force \
  --read-parallel \
  --no-feature-limit \
  --no-tile-size-limit \
  --drop-rate=1 \
  "$IN" > "$LOG" 2>&1; then
  echo "Error: MBTiles not created - tippecanoe failed."
  echo "Last lines of $LOG:"
  tail -n 20 "$LOG"
  exit 1
fi

# Remove a stale archive from a previous run so the conversion starts clean
rm -f "$PM_OUT"

# Convert to PMTiles (output captured to the log, no pipe to avoid SIGPIPE)
if ! pmtiles convert "$MB_OUT" "$PM_OUT" >> "$LOG" 2>&1; then
  echo "Error: pmtiles convert failed."
  echo "Last lines of $LOG:"
  tail -n 20 "$LOG"
  exit 1
fi

# Cleanup
# rm "$MB_OUT"

echo "PMTiles ready: $PM_OUT (all features preserved with zoom thresholds)"
pmtiles show "$PM_OUT"