<a href="https://colab.research.google.com/github/kavyajeetbora/foursquare_ai/blob/master/notebooks/08_semantic_search_for_categories.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Set Up the Python Environment

In [None]:
!pip install --quiet duckdb jupysql duckdb-engine
## Langchain Framework
!pip install --quiet langchain langchain-community langchain-openai langgraph "langchain[openai]" sentence-transformers randomname chromadb

In [None]:
from sentence_transformers import SentenceTransformer

# Instantiate the pretrained MiniLM sentence-embedding model, selected by name.
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings.base import Embeddings
import duckdb
import os, glob,shutil
import randomname

## Distinct Categories

In [None]:
# Foursquare OS Places release (dt=2025-09-09), read straight from S3 as Parquet.
s3_places_path = 's3://fsq-os-places-us-east-1/release/dt=2025-09-09/places/parquet/places-*.zstd.parquet'

# Open a DuckDB connection. The try/finally guarantees the connection is closed
# even when the remote read fails part-way through.
con = duckdb.connect()
try:
    # httpfs is needed for the s3:// path; spatial is loaded as in the original
    # setup (NOTE(review): the query below does not call any spatial function).
    con.execute("INSTALL httpfs; LOAD httpfs; INSTALL spatial; LOAD spatial;")

    # UNNEST expands each place's fsq_category_labels into one row per label.
    # The outer SELECT DISTINCT collapses them to the unique labels this section
    # is about; without it every place in the country contributes its own rows.
    # NULL labels are dropped and the result is sorted so re-runs are stable.
    result = con.execute(f"""
      SELECT DISTINCT category
      FROM (
          SELECT
              UNNEST(fsq_category_labels) AS category
          FROM read_parquet('{s3_places_path}')
          WHERE country = 'IN'
      )
      WHERE category IS NOT NULL
      ORDER BY category;
    """).df()
finally:
    con.close()

display(result)

In [None]:
class SentenceTransformerAdapter(Embeddings):
    """Expose a SentenceTransformer model through LangChain's Embeddings interface.

    The Chroma vector store expects an object with ``embed_documents`` and
    ``embed_query`` methods; a raw array of precomputed vectors does not
    satisfy that contract.
    """

    def __init__(self, st_model):
        # st_model: an already-loaded SentenceTransformer instance.
        self._model = st_model

    def embed_documents(self, texts):
        """Embed a batch of texts and return one list of floats per text."""
        return self._model.encode(list(texts), convert_to_tensor=False).tolist()

    def embed_query(self, text):
        """Embed a single query string and return its vector as a list of floats."""
        return self._model.encode(text, convert_to_tensor=False).tolist()


# Using the MiniLM model from Sentence Transformers
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# One document per unique category label. Nulls are dropped and duplicates
# removed (first occurrence kept) so each label is stored in the index once.
documents = result['category'].dropna().drop_duplicates().tolist()
if not documents:
    raise ValueError("No category labels found in `result`; nothing to index.")

# Raw embedding matrix (one vector per document), kept for inspection and any
# later use. Chroma computes its own vectors through the adapter below.
embeddings = embedding_model.encode(documents, convert_to_tensor=False)

## Now store the embeddings in the vector store
os.makedirs("db", exist_ok=True)

random_suffix = randomname.get_name()
persistent_directory = f"db/chroma-({random_suffix})"

## Remove any store left over from a previous run, then create a fresh one.
# `glob` is imported as a module, so the function is `glob.glob`.
for folder in glob.glob("db/chroma*"):
    if os.path.isdir(folder):
        shutil.rmtree(folder)

os.makedirs(persistent_directory, exist_ok=True)

# Create the ChromaDB vector store, persisted on disk under `persistent_directory`.
# `from_texts` is used because `documents` holds plain strings, not Document objects.
vector_db = Chroma.from_texts(
    texts=documents,
    embedding=SentenceTransformerAdapter(embedding_model),
    collection_name="poi_category_embeddings",
    persist_directory=persistent_directory,
)