<a href="https://colab.research.google.com/github/kavyajeetbora/foursquare_ai/blob/master/notebooks/08_semantic_search_for_categories.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup the Python Environment

In [3]:
!pip install --quiet duckdb jupysql duckdb-engine
## Langchain Framework
!pip install --quiet langchain langchain-community langchain-openai langgraph "langchain[openai]" randomname chromadb langchain-huggingface

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/95.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/49.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.7/49.7 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.8/192.8 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.3/137.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m5.2 MB/s[0m eta [36m0:00:

In [4]:
import duckdb
import os,shutil
import randomname
from glob import glob

from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document

## Distinct Categories

In [5]:
# Collect the distinct Foursquare category labels attached to places in India
# (country = 'IN') from the public places parquet release on S3.
s3_places_path = 's3://fsq-os-places-us-east-1/release/dt=2025-09-09/places/parquet/places-*.zstd.parquet'

# Each place carries a list of category labels: UNNEST emits one row per label
# and DISTINCT collapses the repeats. No view or table is created; the rows
# are returned directly to the caller.
distinct_categories_sql = f"""
  SELECT
     DISTINCT UNNEST(fsq_category_labels) as category
  FROM read_parquet('{s3_places_path}') WHERE country='IN';
"""

# The connection is only needed for this one query, so it is closed in
# `finally` even when installing an extension or reading from S3 fails.
con = duckdb.connect()
try:
    # httpfs lets DuckDB read the parquet files over S3. spatial is loaded as
    # well, although this query does not call any spatial function.
    con.execute("INSTALL httpfs; LOAD httpfs; INSTALL spatial; LOAD spatial;")

    # Materialise the result as a pandas DataFrame with a single `category` column.
    result = con.execute(distinct_categories_sql).df()
finally:
    con.close()

display(result)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,category
0,Travel and Transportation > Lodging > Hotel
1,Retail > Warehouse or Wholesale Store
2,Travel and Transportation > Transport Hub > He...
3,Landmarks and Outdoors > States and Municipali...
4,Landmarks and Outdoors > Forest
...,...
1037,Sports and Recreation > Racquet Sports > Tenni...
1038,Sports and Recreation > Soccer > Soccer Club
1039,Dining and Drinking > Restaurant > Asian Resta...
1040,Retail > Sporting Goods Retail > Tennis Store


## Embedding the Documents

### Choose the embedding model

In [6]:
# Sentence-embedding model used to vectorise both the category labels and the
# search queries; the model files are fetched from the Hugging Face Hub.
embedding_model_name = "all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Create Document Objects

In [7]:
# Unique category labels, in the order they appear in the query result.
categories = result['category'].drop_duplicates().tolist()

# Wrap each label in a LangChain Document. The label itself is the text that
# gets embedded; its position in `categories` is kept as `category_id` so a
# search hit can be traced back to the label list.
documents = []
for category_id, label in enumerate(categories):
    documents.append(
        Document(
            page_content=label,
            metadata={"category_id": category_id, "source": "foursquare poi"},
        )
    )

### Create the Vector Store (Local)

In [8]:
# Build a fresh on-disk Chroma store with one embedding per category document.
os.makedirs("db", exist_ok=True)

# Remove every store left behind by an earlier run before creating a new one.
for stale_store in glob("db/chroma*"):
    if not os.path.exists(stale_store):
        continue
    shutil.rmtree(stale_store)

# Each run writes to a new, randomly named directory under db/.
random_suffix = randomname.get_name()
persistent_directory = f"db/chroma-({random_suffix})"
os.mkdir(persistent_directory)

# Embed all documents with the chosen model; `persist_directory` points Chroma
# at the directory created above.
vector_db = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    collection_name="poi_category_embeddings",
    persist_directory=persistent_directory,
)

## Retrieve Categories with a Natural-Language Query

In [16]:
# Number of most-similar category labels returned for each query.
top_k = 10
retriever = vector_db.as_retriever(search_kwargs={"k": top_k})

In [17]:
# Smoke-test the semantic search with the kind of free-text question a user
# might ask when looking for points of interest.
query = "Find restaurants where I can find Biryani?"

# `invoke` is the supported retriever entry point; the older
# `get_relevant_documents` method is deprecated.
relevant_docs = retriever.invoke(query)

# Print each hit as "<category_id> | <category label>".
for doc in relevant_docs:
    print(doc.metadata['category_id'], "|", doc.page_content)

421 | Dining and Drinking > Restaurant > Indian Restaurant > Irani Cafe
525 | Dining and Drinking > Restaurant > Eastern European Restaurant > Romanian Restaurant
917 | Dining and Drinking > Restaurant > Indian Restaurant > Chettinad Restaurant
760 | Dining and Drinking > Restaurant > Middle Eastern Restaurant > Shawarma Restaurant
519 | Dining and Drinking > Restaurant > Ukrainian Restaurant
511 | Dining and Drinking > Restaurant > Turkish Restaurant > Manti Place
734 | Dining and Drinking > Restaurant > Eastern European Restaurant
28 | Dining and Drinking > Restaurant > Indian Restaurant > Chaat Place
408 | Dining and Drinking > Dessert Shop > Pastry Shop
793 | Dining and Drinking > Restaurant > Indian Restaurant > Awadhi Restaurant
