<a href="https://colab.research.google.com/github/kavyajeetbora/foursquare_ai/blob/master/12_market_index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import geopandas as gpd

# Pinned (commit-addressed) GeoJSON holding the outline of India.
link = "https://raw.githubusercontent.com/kavyajeetbora/foursquare_ai/22c6962e1a6d1f95239e9c896e21d60824282dbd/india.geojson"

# Read the remote file straight into a GeoDataFrame.
india_boundary = gpd.read_file(link)

# Serialise the first feature's geometry to WKB bytes.
# NOTE(review): only row 0 is used -- presumably the file holds a single
# feature covering the whole country; confirm against the data.
india_wkb = india_boundary.geometry.iloc[0].wkb

In [None]:
import os
import shutil
import urllib.request
import zipfile

# Source archive (one All The Places run), local zip name, and extraction target.
url = "https://alltheplaces-data.openaddresses.io/runs/2025-12-06-13-33-06/output.zip"
zip_path = "output.zip"
dir_path = "atp_data"

# Remove any previous extraction so stale files never mix with the new run.
# Pure-Python removal replaces the IPython-only `!rm -rf` shell escape, so the
# cell also runs as a plain script and on platforms without `rm`.
if os.path.isdir(dir_path) and not os.path.islink(dir_path):
    shutil.rmtree(dir_path)
elif os.path.lexists(dir_path):
    # A stray file or symlink with the same name would block extraction.
    os.remove(dir_path)

# Download the archive. urlretrieve is called without a report hook, so no
# progress is displayed while the transfer runs.
print("Downloading the large zip file (~1-2 GB compressed)... This may take several minutes.")
urllib.request.urlretrieve(url, zip_path)

# Report the size of the downloaded archive in GiB.
size_gb = os.path.getsize(zip_path) / (1024**3)
print(f"Download completed: {size_gb:.2f} GB")

# Unpack everything under dir_path.
print("Extracting files quietly... This will also take a few minutes.")
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(dir_path)

# Count extracted GeoJSON files. The walk is recursive because the archive
# unpacks into a sub-folder (the files are later globbed from
# 'atp_data/output/'), so listing only the top level would report 0.
geojson_files = [
    name
    for _root, _dirs, names in os.walk(dir_path)
    for name in names
    if name.endswith('.geojson')
]
print(f"Extraction complete: {len(geojson_files)} GeoJSON files extracted to '{dir_path}/'")

In [None]:
from glob import glob

# Gather the path of every GeoJSON file in the extracted 'output' folder
# and show how many were found.
geojson_pattern = 'atp_data/output/*.geojson'
files = glob(geojson_pattern)
print(len(files))

In [None]:
import duckdb
from tqdm import tqdm
import pandas as pd  # if you need it later

# Read every GeoJSON path in 'files' into its own pandas DataFrame using
# DuckDB's spatial extension. 'files' is expected to be a list of file paths
# (e.g. the result of glob) defined before this cell runs.

# One DataFrame per successfully-read file.
dfs = []

# In-memory DuckDB connection used only for this batch of reads.
con = duckdb.connect()
try:
    # Install and load the spatial extension, which provides ST_Read.
    con.execute("INSTALL spatial;")
    con.execute("LOAD spatial;")

    # Progress bar over the input files.
    for file_path in tqdm(files,
                          desc="Reading GeoJSON files",
                          unit="file",
                          colour="#00ff00",          # Bright green
                          bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]",
                          dynamic_ncols=True,
                          smoothing=0.1):

        # The path is embedded in a SQL string literal, so double any single
        # quotes; otherwise a path containing "'" would break the statement.
        sql_path = str(file_path).replace("'", "''")

        try:
            df = con.execute(f"""
                SELECT *
                FROM ST_Read('{sql_path}')
            """).df()
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest.
            print(f"\nFailed to read {file_path}: {e}")
        else:
            dfs.append(df)
finally:
    # Always release the connection, even if the loop is interrupted or the
    # extension fails to install/load.
    con.close()

print(f"\nSuccessfully read {len(dfs)} GeoJSON files into list 'dfs'")