# Exploring Foursquare POI Data


1. [Foursquare's 104M Points of Interest](https://tech.marksblogg.com/foursquare-open-global-poi-dataset.html)

In [None]:
!pip install --quiet duckdb
!pip install --quiet jupysql
!pip install --quiet duckdb-engine

In [None]:
import duckdb
import os
import pandas as pd
import geopandas as gpd

Foursquare POI count in India
1. Feb 2025 Release = `12.87` Lakhs
2. Aug 2025 Release = `13.67` Lakhs

In [None]:
!wget https://raw.githubusercontent.com/udit-001/india-maps-data/main/geojson/india.geojson -O india.geojson

In [None]:
# Read every feature from the downloaded GeoJSON file.
gdf = gpd.read_file('india.geojson')
# dissolve() is called without a grouping key, so all features are merged
# into a single row.
india_gdf = gdf.dissolve()
# Persist the merged frame as Parquet; the DuckDB queries further down join
# against 'india.parquet'.
india_gdf.to_parquet('india.parquet')
# Quick visual check of the merged geometry.
india_gdf.plot()

In [None]:
# Import jupysql Jupyter extension to create SQL cells
%load_ext sql

# SqlMagic options: enable `autopandas` (presumably returns query results as
# pandas DataFrames -- confirm in the jupysql docs) and switch off the
# `feedback` and `displaycon` settings.
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

# Point the %sql / %%sql magics at an in-memory DuckDB database.
# NOTE(review): the cells below open their own connections with
# duckdb.connect() / duckdb.sql() rather than using this one.
%sql duckdb:///:memory:

## Get a Sample of POIs in India

In [None]:
%%time

# Initialize DuckDB connection
con = duckdb.connect()

# Load the spatial extension if not already loaded
con.execute("INSTALL spatial;")
con.execute("LOAD spatial;")

# Define the output parquet file name
output_parquet_file = 'poi_india.parquet'

# Execute the query and directly copy the results to a parquet file
query = f"""
    COPY (
        SELECT * FROM read_parquet('s3://overturemaps-us-west-2/release/2025-07-23.0/theme=places/*/*') AS POI
        JOIN read_parquet('india.parquet') as INDIA
        ON ST_Within(POI.geometry, INDIA.geometry) LIMIT 100
    ) TO '{output_parquet_file}' (FORMAT PARQUET);
"""
con.execute(query)

print(f"Query results saved to {output_parquet_file}")

# Close the DuckDB connection
con.close()

In [None]:
# Load the exported sample and show the `categories` value of its first row.
df = pd.read_parquet("poi_india.parquet")
df['categories'].iat[0]

## Category-Wise Count

In [None]:
%%time

# Initialize DuckDB connection
con = duckdb.connect()

s3_path = ''
df = duckdb.sql(
    f"""
        INSTALL spatial;
        LOAD spatial;

        SELECT
            POI.categories.primary as Category,
            COUNT(POI.id) as Total_Count
        FROM read_parquet('s3://overturemaps-us-west-2/release/2025-07-23.0/theme=places/*/*') AS POI
        JOIN read_parquet('india.parquet') as INDIA
        ON ST_Within(POI.geometry, INDIA.geometry)
        GROUP BY POI.categories.primary;
    """
)

print(df)

# Close the DuckDB connection
con.close()

In [None]:
# Convert the DuckDB relation from the previous cell into a pandas DataFrame
# for rich display.
df.to_df()

## Categories Master Table

In [None]:
# Copy the categories table of the 2025-07-08 release from the
# fsq-os-places S3 bucket into a local 'categories.parquet' file.
s3_path = 's3://fsq-os-places-us-east-1/release/dt=2025-07-08/categories/parquet/categories.zstd.parquet'

# The `with` block closes the connection even if the download fails.
with duckdb.connect() as con:
    # Load the httpfs extension to enable S3 access
    con.execute("INSTALL httpfs;")
    con.execute("LOAD httpfs;")

    # Execute the COPY command to read from S3 and write to a Parquet file
    query = f"COPY (SELECT * FROM read_parquet('{s3_path}')) TO 'categories.parquet' (FORMAT PARQUET);"
    con.execute(query)

In [None]:
# Load the locally saved categories table; note that this rebinds `df`,
# which earlier cells used for other results.
df = pd.read_parquet('categories.parquet')
# (rows, columns) of the categories table.
df.shape

In [None]:
# Print the `category_label` of the row at position 10 as a format example.
print(df['category_label'].iat[10])

In [None]:
# Keep only the rows whose `category_level` equals 3.
df.loc[df['category_level'].eq(3)]

In [None]:
# Split each `category_label` on ">" into its individual parts.
categories = df['category_label'].str.split(">").tolist()
max_cols = max(map(len, categories))

# One column per part; labels with fewer parts than `max_cols` are padded
# with missing values by the DataFrame constructor.
level_names = [f'category_level_{level}' for level in range(1, max_cols + 1)]
df_cat = pd.DataFrame(categories, columns=level_names)

# Trim the whitespace left around each part by the split.
df_cat = df_cat.apply(lambda levels: levels.str.strip())
df_cat.head()

In [None]:
# Number of categories under each first-level value.
df_cat.loc[:, 'category_level_1'].value_counts()

In [None]:
# Distinct second-level values under 'Travel and Transportation'.
df_cat.loc[df_cat['category_level_1'].eq('Travel and Transportation'), 'category_level_2'].unique()