<a href="https://colab.research.google.com/github/kavyajeetbora/foursquare_ai/blob/master/12_market_index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import shutil
import urllib.request
import zipfile
from glob import glob

import duckdb
import geopandas as gpd
import pandas as pd  # if you need it later
from tqdm import tqdm

## Download India Boundary

In [None]:
%%shell
wget "https://raw.githubusercontent.com/kavyajeetbora/foursquare_ai/22c6962e1a6d1f95239e9c896e21d60824282dbd/india.geojson" -O india.geojson

In [None]:
# Open a DuckDB connection, then install and load the spatial extension.
# Both statements are idempotent, so re-running this cell is harmless.
con = duckdb.connect()
con.execute("INSTALL spatial;").execute("LOAD spatial;")



## Download Brand-Specific POIs

In [None]:
# Download the All-The-Places run archive and unpack it into `dir_path`.
url = "https://alltheplaces-data.openaddresses.io/runs/2025-12-06-13-33-06/output.zip"
zip_path = "output.zip"
dir_path = "atp_data"

# Remove old directory if it exists (to ensure clean extract).
# Pure-Python removal instead of a `!rm -rf` shell escape, so the cell also
# runs outside IPython and on platforms without `rm`.
if os.path.exists(dir_path):
    shutil.rmtree(dir_path)

# urlretrieve blocks until the download finishes and reports no progress,
# so announce up front that this step is slow.
print("Downloading the large zip file (~1-2 GB compressed)... This may take several minutes.")
urllib.request.urlretrieve(url, zip_path)

# Report the size of the downloaded archive in GiB.
size_gb = os.path.getsize(zip_path) / (1024**3)
print(f"Download completed: {size_gb:.2f} GB")

# Unpack the whole archive under dir_path.
print("Extracting files quietly... This will also take a few minutes.")
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(dir_path)

# Count extracted GeoJSON files. Walk the whole tree: later cells read the
# files from `atp_data/output/`, so a top-level os.listdir(dir_path) would
# miss them and report a misleading count.
geojson_files = [
    name
    for _root, _dirs, names in os.walk(dir_path)
    for name in names
    if name.endswith('.geojson')
]
print(f"Extraction complete: {len(geojson_files)} GeoJSON files extracted to '{dir_path}/'")

## Filter POIs within India


- Using the [`ST_Read`](https://duckdb.org/docs/stable/core_extensions/spatial/functions#st_read) table function from DuckDB's spatial extension to read the GeoJSON files

- Using the `spatial_filter` parameter of `ST_Read` so that only features intersecting a given WKB geometry (here, the India boundary) are returned

In [None]:
# Collect every per-brand GeoJSON file produced by the extraction step
# and report how many were found.
geojson_pattern = '/content/atp_data/output/*.geojson'
files = glob(geojson_pattern)
print(len(files))

In [None]:
# Smoke test: read one brand file (no spatial filter) into a DataFrame and
# show its shape.
file_path = r"/content/atp_data/output/accor.geojson"

create_india_sql = """
    CREATE OR REPLACE TABLE india_features AS
    SELECT * FROM ST_Read('india.geojson')
"""

query = f"""
            SELECT *
            FROM ST_Read('{file_path}')
        """

# The context manager closes the connection even when a query raises, so a
# failed read no longer leaves an open connection behind.
with duckdb.connect() as con:
    # Install and load the spatial extension (idempotent - safe to run multiple times)
    con.execute("INSTALL spatial;")
    con.execute("LOAD spatial;")

    # NOTE(review): india_features is created on this connection only and the
    # connection is closed at the end of this cell; nothing here queries the
    # table, so it does not reach later cells. Confirm whether it is needed.
    con.execute(create_india_sql)

    df = con.execute(query).df()

df.shape

In [None]:
# Read every file in `files`, keeping only features that intersect the India
# boundary, and collect the per-file DataFrames in `dfs`.

# Establish connection
con = duckdb.connect()

# List to store individual DataFrames, plus one message per failed file
dfs = []
errors = []

try:
    # Install and load the spatial extension (idempotent - safe to run multiple times)
    con.execute("INSTALL spatial;")
    con.execute("LOAD spatial;")
    # Silence DuckDB's own progress output so only the tqdm bar is shown.
    con.execute("SET enable_progress_bar = false;")
    con.execute("SET enable_progress_bar_print = false;")

    # Build the filter geometry inside this cell. Previously `india_wkb` was
    # never defined, so every iteration raised NameError, the broad `except`
    # swallowed it, and the cell reported zero files read.
    # The boundary features are merged into one geometry and carried as
    # hex-encoded WKB text so it can be passed back as a bound parameter.
    india_wkb = con.execute("""
        SELECT ST_AsHEXWKB(ST_Union_Agg(geom))
        FROM ST_Read('india.geojson')
    """).fetchone()[0]
    if india_wkb is None:
        raise ValueError("india.geojson contains no geometry to filter by")

    # Path and filter are bound parameters rather than f-string interpolation,
    # so file names containing quotes cannot break the SQL. ST_AsWKB yields
    # the WKB blob that `spatial_filter` expects.
    read_within_india = """
        SELECT *
        FROM ST_Read(?, spatial_filter = ST_AsWKB(ST_GeomFromHEXWKB(?)))
    """

    # Modern tqdm style with real-time updates
    for file_path in tqdm(files,
                          desc="Reading GeoJSON files",
                          unit="file",
                          colour="#00ff00",          # Bright green
                          bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]",
                          dynamic_ncols=True,
                          smoothing=0.1):

        try:
            df = con.execute(read_within_india, [file_path, india_wkb]).df()
            dfs.append(df)

        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run,
            # but the failure is recorded and reported below.
            errors.append(f"\nFailed to read {file_path}: {e}")
finally:
    # Close connection when done, including when setup above fails
    con.close()

print(f"\nSuccessfully read {len(dfs)} GeoJSON files into list 'dfs'")

# Surface collected failures instead of leaving them hidden in `errors`.
if errors:
    print(f"{len(errors)} file(s) failed; first few messages:")
    for message in errors[:5]:
        print(message)

In [None]:
# Display the current value of `file_path`, i.e. whatever the preceding cells
# assigned to it last (the read loop rebinds it on every iteration).
file_path