<a href="https://colab.research.google.com/github/kavyajeetbora/foursquare_ai/blob/master/notebooks/08_semantic_search_for_categories.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Set Up the Python Environment

In [None]:
!pip install --quiet duckdb jupysql duckdb-engine
## Langchain Framework
!pip install --quiet langchain langchain-community langchain-openai langgraph "langchain[openai]" randomname chromadb langchain-huggingface

In [None]:
# Standard library
import os
import shutil
from glob import glob

# Third-party
import duckdb
import randomname

# Chroma is imported from langchain-community (installed above); the old
# `langchain.vectorstores` path is a deprecated shim that newer LangChain
# releases no longer ship.
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document

from dotenv import load_dotenv
# Load environment variables from the .env file (if present)
load_dotenv()

## Distinct Categories

In [None]:
# S3 location of the Foursquare OS Places release (glob over the parquet shards).
s3_places_path = 's3://fsq-os-places-us-east-1/release/dt=2025-09-09/places/parquet/places-*.zstd.parquet'

# Initialize DuckDB connection. The try/finally guarantees the connection is
# closed even when the extension install or the remote query raises.
con = duckdb.connect()
try:
    # httpfs lets DuckDB read from S3; spatial is loaded as well, although the
    # query below does not call any spatial function.
    con.execute("INSTALL httpfs; LOAD httpfs; INSTALL spatial; LOAD spatial;")

    # Flatten the per-place list of category labels and keep each distinct
    # label once, restricted to rows with country='IN'. The result is
    # materialised as a pandas DataFrame with a single `category` column.
    result = con.execute(f"""
  SELECT
     DISTINCT UNNEST(fsq_category_labels) as category
  FROM read_parquet('{s3_places_path}') WHERE country='IN';
""").df()
finally:
    con.close()

display(result)

## Embedding the Documents

### Choose the embedding model

In [None]:
# Model id handed to the Hugging Face embedding wrapper; the resulting
# `embeddings` object is what the vector store uses to embed the documents.
embedding_model_name = "Qwen/Qwen3-Embedding-0.6B"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

### Create Document Objects

In [None]:
## Build one Document per distinct category label

def make_category_document(category_id, label):
    """Wrap a category label in a Document tagged with its positional id."""
    return Document(
        page_content=label,
        metadata={"category_id": category_id, "source": "foursquare poi"},
    )


# De-duplicated labels as a plain Python list; the list index doubles as the id.
unique_labels = result['category'].drop_duplicates()
categories = unique_labels.tolist()

documents = []
for position, label in enumerate(categories):
    documents.append(make_category_document(position, label))

### Create the Vector Store (Local)

In [None]:
## Persist the embeddings in a local Chroma vector store

# Parent folder that holds every on-disk store created by this notebook.
os.makedirs("db", exist_ok=True)

# Remove any Chroma store left behind under db/ before creating a new one.
for stale_store in glob("db/chroma*"):
    if not os.path.exists(stale_store):
        continue
    shutil.rmtree(stale_store)

# Each run writes to a freshly created, randomly named directory.
random_suffix = randomname.get_name()
persistent_directory = f"db/chroma-({random_suffix})"
os.mkdir(persistent_directory)

# Embed every document with `embeddings` and write the collection to disk.
vector_db = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    collection_name="poi_category_embeddings",
    persist_directory=persistent_directory,
)

## Retrieve Categories by Keyword

In [None]:
# Display the vector store object as a quick check that it was created.
vector_db

In [None]:
# Return at most the top 3 categories whose relevance score is >= 0.6.
# The threshold has to be passed via `search_type` + `search_kwargs`;
# `similarity_score_threshold=0.6` is not an option `as_retriever` understands,
# so the threshold was never applied and the default number of results was used.
# If a query returns nothing, lower `score_threshold`.
retriever = vector_db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.6, "k": 3},
)

In [None]:
# Test semantic search
query = "Find restaurants where I can find delicious Ice Cream?"  # User might say this for POIs

# `invoke` is the current retriever entry point; `get_relevant_documents`
# is deprecated and absent from newer LangChain releases.
relevant_docs = retriever.invoke(query)

# Print each matched category as "<category_id> | <label>".
for doc in relevant_docs:
    print(doc.metadata['category_id'], "|", doc.page_content)