<a href="https://colab.research.google.com/github/kavyajeetbora/foursquare_ai/blob/master/notebooks/09_Overturemaps_Divisions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exploring Overture Maps Divisions Data


1. [Foursquare's 104M Points of Interest](https://tech.marksblogg.com/foursquare-open-global-poi-dataset.html)

In [None]:
!pip install --quiet duckdb
!pip install --quiet jupysql
!pip install --quiet duckdb-engine

import duckdb
import os
import pandas as pd
import geopandas as gpd
import numpy as np

## Setup
# Import jupysql Jupyter extension to create SQL cells
%load_ext sql

%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

%sql duckdb:///:memory:

## Download India Boundary

In [None]:
!wget https://raw.githubusercontent.com/udit-001/india-maps-data/main/geojson/india.geojson -O india.geojson

In [None]:
# Load the downloaded GeoJSON into a GeoDataFrame.
gdf = gpd.read_file(filename='india.geojson')

# Dissolve without a grouping key: all features are merged into a single row.
india_gdf = gdf.dissolve()

# Persist the dissolved boundary, then render it as the cell output.
india_gdf.to_parquet(path='india.parquet')
india_gdf.plot()

## Division Schema


Prominence zoom levels for places:

| Prominence | Priority    | Typical place type             | Approx min_zoom |
|------------|-------------|--------------------------------|-----------------|
| 0          | Highest     | National capital, megacity     | 3               |
| 1          | Very high   | Large capital/regional capital | 4               |
| 2          | High        | City > 1M                      | 4–5             |
| 3          | High        | City 500k–1M                   | 5               |
| 4          | Medium-high | City 100k–500k                 | 5–6             |
| 5          | Medium      | City/Town 50k–100k             | 6               |
| 6          | Medium      | Town 20k–50k                   | 7               |
| 7          | Low         | Town 10k–20k                   | 8               |
| 8          | Low         | Village 5k–10k                 | 9               |
| 9          | Low         | Village/Suburb                 | 10              |
| 10         | Very low    | Hamlet/Neighbourhood           | 11              |
| 11         | Very low    | Minor locality                 | 12              |
| 12         | Lowest      | Micro/local name               | 13+             |

In [None]:
# Overture Maps release tag; it is interpolated into the S3 path
# (.../release/{latest_release}/theme=divisions/...) by the divisions query.
latest_release = "2025-11-19.0"

In [None]:
import duckdb
import pandas as pd

# Prominence -> minimum zoom lookup. Keys are inclusive (low, high) prominence
# ranges; the value is the min_zoom assigned to rows whose prominence falls in
# that range.
# NOTE(review): these ranges assume prominence is on a 1-100 scale, while the
# markdown table in the "Division Schema" section lists values 0-12 - confirm
# which scale the dataset actually uses.
prominence_to_zoom = {
    (80, 100): 5,
    (70, 79): 6,
    (60, 69): 7,
    (50, 59): 8,
    (40, 49): 9,
    (30, 39): 10,
    (20, 29): 11,
    (10, 19): 12,
    (1, 9): 13,
}

# Flatten the mapping into one row per range so it can be joined in SQL.
prom_map_df = pd.DataFrame(
    [{'min_prom': lo, 'max_prom': hi, 'min_zoom': z} for (lo, hi), z in prominence_to_zoom.items()]
)

# Not used by this cell (the result is written to "places_india.parquet" in a
# later cell); kept so the notebook's globals are unchanged.
output_parquet_file = 'poi_india.parquet'

# Open a dedicated in-process DuckDB connection for the remote query.
con = duckdb.connect()
try:
    # Spatial functions (ST_X, ST_Y, ST_PointOnSurface, ST_GeometryType).
    con.execute("INSTALL spatial;")
    con.execute("LOAD spatial;")

    # S3 access: load httpfs explicitly and point it at the region of the
    # bucket named in the path below (overturemaps-us-west-2).
    con.execute("INSTALL httpfs;")
    con.execute("LOAD httpfs;")
    con.execute("SET s3_region='us-west-2';")

    # Expose the lookup DataFrame to SQL under the name `prom_map`.
    con.register('prom_map', prom_map_df)

    # Query notes:
    #  * region / district / area_name are the names of the 2nd, 3rd and 4th
    #    entries of the first hierarchy (list_extract is 1-based).
    #  * rows whose prominence matches no range in prom_map (including NULL
    #    prominence) fall back to min_zoom 14.
    #  * lat / lon come from the geometry itself when it is a POINT, otherwise
    #    from ST_PointOnSurface of the geometry.
    sample_data = con.execute(f"""
  SELECT
    d.names.primary as place_name,
    d.class,
    d.subtype,
    d.country,
    struct_extract(list_extract(list_extract(d.hierarchies,1),2), 'name') AS region,
    struct_extract(list_extract(list_extract(d.hierarchies,1),3), 'name') AS district,
    struct_extract(list_extract(list_extract(d.hierarchies,1),4), 'name') AS area_name,
    d.population,
    COALESCE(pm.min_zoom, 14) AS min_zoom,
    ST_Y(CASE
        WHEN ST_GeometryType(d.geometry) = 'POINT' THEN d.geometry
        ELSE ST_PointOnSurface(d.geometry)
    END) AS lat,
    ST_X(CASE
        WHEN ST_GeometryType(d.geometry) = 'POINT' THEN d.geometry
        ELSE ST_PointOnSurface(d.geometry)
    END) AS lon
  FROM parquet_scan('s3://overturemaps-us-west-2/release/{latest_release}/theme=divisions/*/*', union_by_name=true) d
  LEFT JOIN prom_map pm
    ON d.cartography.prominence BETWEEN pm.min_prom AND pm.max_prom
  WHERE d.country = 'IN'
""").df()
finally:
    # Always release the connection, even if the remote query fails.
    con.close()

# Keep only rows that have a primary name (original row index is preserved).
sample_data = sample_data[sample_data['place_name'].notna()].copy()

In [None]:
# Preview the first five rows of the query result.
sample_data.head(n=5)

In [None]:
# Summarise the result: column dtypes, non-null counts and memory usage.
sample_data.info()

In [None]:
# Write the named places to a Parquet file in the working directory.
sample_data.to_parquet(path="places_india.parquet")