<a href="https://colab.research.google.com/github/kavyajeetbora/foursquare_ai/blob/master/notebooks/07_duckdb_ai_bot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --quiet duckdb jupysql duckdb-engine

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.7/49.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.8/192.8 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.3/137.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import duckdb
import os
import pandas as pd

In [3]:
# Build a local extract of the Foursquare OS Places release (country = 'IN'),
# joined to its category names, and write it to 'output.geoparquet'.
con = duckdb.connect()

s3_places_path = 's3://fsq-os-places-us-east-1/release/dt=2025-09-09/places/parquet/places-*.zstd.parquet'
s3_categories_path = 's3://fsq-os-places-us-east-1/release/dt=2025-09-09/categories/parquet/categories.zstd.parquet'

# View definition:
#   * `places` explodes fsq_category_ids into one row per category id (DISTINCT UNNEST)
#     and keeps only rows with coordinates and country = 'IN'.
#   * The inner CTE `places_with_categories` attaches level-1/level-2 category names;
#     being an inner JOIN, rows whose category id has no match are dropped.
#   * The final SELECT only fixes the column order of the view.
create_view_sql = f"""
CREATE OR REPLACE VIEW places_with_categories AS
WITH places AS (
    SELECT
        DISTINCT UNNEST(P.fsq_category_ids) as fsq_category_id,
        name,
        postcode,
        address,
        region,
        ST_Point(longitude, latitude) AS geom
    FROM read_parquet('{s3_places_path}') AS P
    WHERE latitude IS NOT NULL AND longitude IS NOT NULL AND country='IN'
),
places_with_categories AS (
    SELECT
        P.name AS name,
        C.level1_category_name AS category_level_1,
        C.level2_category_name AS category_level_2,
        postcode,
        address,
        region,
        P.geom
    FROM places AS P
    JOIN read_parquet('{s3_categories_path}') AS C
    ON P.fsq_category_id = C.category_id
)
SELECT
    name,
    category_level_1,
    category_level_2,
    address,
    region,
    postcode,
    geom
FROM places_with_categories;
"""

try:
    # httpfs is needed for the s3:// paths; spatial provides ST_Point.
    con.execute("INSTALL httpfs; LOAD httpfs; INSTALL spatial; LOAD spatial;")

    # Register the view; the S3 data is actually scanned by the COPY below.
    con.execute(create_view_sql)

    # Export the view to a ZSTD-compressed Parquet file in the working directory.
    con.execute("COPY (SELECT * FROM places_with_categories) TO 'output.geoparquet' WITH (FORMAT PARQUET, CODEC ZSTD);")

    # Note from an earlier run: SELECT COUNT(*) FROM places_with_categories
    # reported around 1358392 rows. Not re-checked here because it rescans S3.
finally:
    # Release the connection even if the S3 read or the export fails.
    con.close()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

# AI ChatBot

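The helper functions below run SQL through DuckDB and build a text summary of the exported dataset's columns (data type and sample values for each).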

In [41]:
def execute_sql(sql_query: str, con):
    """Run ``sql_query`` on ``con`` and return every row.

    Args:
        sql_query: SQL text passed unchanged to ``con.execute``.
        con: Connection-like object whose ``execute(...)`` result has ``fetchall()``.

    Returns:
        The value of ``fetchall()`` on success. On any exception, the string
        ``"Error executing SQL: <message>"`` is returned instead of raising, so
        callers must check for a ``str`` result before treating it as rows.
    """
    try:
        rows = con.execute(sql_query).fetchall()
    except Exception as exc:
        # Errors are reported as text rather than propagated.
        return f"Error executing SQL: {exc}"
    return rows

def get_duckdb_connection():
    """Open an in-memory DuckDB connection with the httpfs extension ready.

    Returns:
        A new connection created with ``database=':memory:'`` (nothing is
        persisted; pass a file path to ``duckdb.connect`` if persistence is
        needed), after ``INSTALL httpfs`` and ``LOAD httpfs`` have been run on it.
    """
    connection = duckdb.connect(database=':memory:')
    for statement in ("INSTALL httpfs;", "LOAD httpfs;"):
        connection.execute(statement)
    # The S3 region is left unset; per the original note it is usually not
    # required for a public bucket. To set it explicitly:
    # connection.execute("SET s3_region='us-east-1';")
    return connection

def get_db_schema(DATA_PATH, columns, duckdb_conn, limit=5):
    """Build a text description of selected columns of a Parquet file.

    Two queries are issued through ``execute_sql``: a sample query returning up
    to ``limit`` rows in which every requested column is non-NULL, and a
    ``DESCRIBE`` of that same query to obtain each column's data type.

    Args:
        DATA_PATH: Path interpolated into ``read_parquet('...')``.
        columns: Column names to describe, in output order.
        duckdb_conn: Connection passed through to ``execute_sql``.
        limit: Maximum number of sample rows (default 5).

    Returns:
        A string starting with ``"Columns:\\n"`` followed by one line per column:
        ``"<n>. Name: <col> | Data Type: <type> | Sample values: <v1>,<v2>,..."``.
        With no columns, only the header is returned.

    Raises:
        RuntimeError: If either query fails (``execute_sql`` reports failures as
            an error string, which is surfaced here instead of being indexed
            as if it were rows).
    """
    data_schema = "Columns:\n"
    if not columns:
        # Nothing to describe; an empty SELECT list would only produce a SQL error.
        return data_schema

    # Joined outside the f-string: reusing double quotes inside a double-quoted
    # f-string is a SyntaxError on Python versions before 3.12.
    column_list = ",".join(columns)
    not_null_filter = "".join(f" AND {column} IS NOT NULL" for column in columns)
    sql_query = (
        f"SELECT {column_list} FROM read_parquet('{DATA_PATH}') "
        f"WHERE 1=1{not_null_filter} LIMIT {limit};"
    )

    sample_result = execute_sql(sql_query, duckdb_conn)
    schema_details = execute_sql(f"DESCRIBE {sql_query}", duckdb_conn)

    # execute_sql returns a str only when the query failed.
    for result in (sample_result, schema_details):
        if isinstance(result, str):
            raise RuntimeError(result)

    for i, column in enumerate(columns):
        data_type = schema_details[i][1]  # second field of each DESCRIBE row
        sample_values = ",".join(str(row[i]) for row in sample_result)
        data_schema += (
            f"{i+1}. Name: {column} | Data Type: {data_type} | Sample values: {sample_values}\n"
        )
    return data_schema

In [42]:
# Connection used for the schema summary below.
duckdb_conn = get_duckdb_connection()

# Relative path, matching the 'output.geoparquet' target of the COPY export
# earlier in the notebook. The previous hard-coded "/content/..." path only
# resolved when the working directory was /content (the Colab default).
DATA_PATH = "output.geoparquet"
# geom is deliberately not listed; only the text columns are summarised.
columns = ['name', 'category_level_1', 'category_level_2', 'address', 'region', 'postcode']

# print() so the newline-separated summary renders one column per line.
print(get_db_schema(DATA_PATH, columns, duckdb_conn, limit=10))

Columns:
1. Name: name | Data Type: VARCHAR | Sample values: Indane - Boham Gramin Vitrak,Gulf Carstop - Gogoi Automobile,Indane - Wangcha Gas Agency,Axis Bank,Axis Bank ATM,HDFC ERGO Insurance Agent: Partha P Bhattacharjee,HDFC ERGO Insurance Agent: Mintu Roy,HDFC Bank ATM,HDFC Bank,Punjab National Bank
2. Name: category_level_1 | Data Type: VARCHAR | Sample values: Travel and Transportation,Retail,Travel and Transportation,Business and Professional Services,Business and Professional Services,Business and Professional Services,Business and Professional Services,Business and Professional Services,Business and Professional Services,Business and Professional Services
3. Name: category_level_2 | Data Type: VARCHAR | Sample values: Fuel Station,Automotive Retail,Fuel Station,Financial Service,Financial Service,Financial Service,Financial Service,Financial Service,Financial Service,Financial Service
4. Name: address | Data Type: VARCHAR | Sample values: Ground Floor,Ground Floor,Bagh Moria 