In [1]:
import os
from pathlib import Path
import json

from dotenv import load_dotenv
import chromadb
from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE, Settings
from chromadb.utils import embedding_functions


# Load variables from .env into environment
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_API_BASE = os.getenv("OPENAI_API_BASE")

In [2]:
# Get the current working directory
current_dir = Path.cwd()

# Path to the JSON file
json_path = current_dir / "real-estate-listings.json"

# Open and load the JSON file
with json_path.open("r", encoding="utf-8") as f:
    real_estate_listings = json.load(f)

# Check the type and first item
print(type(real_estate_listings))  # <class 'list'>
print("No. of data:", len(real_estate_listings))
display(real_estate_listings[0])      # prints the first listing

<class 'list'>
No. of data: 30


{'id': 1,
 'title': 'Cozy 2-Bedroom Condo in Downtown',
 'description': 'This modern 2-bedroom, 1.5-bathroom condo offers an open floor plan with abundant natural light, hardwood floors throughout, and a stylish kitchen with stainless steel appliances and granite countertops. The spacious living area leads to a private balcony, perfect for enjoying city views. The master bedroom includes a walk-in closet, while the second bedroom is ideal for a home office or guest space. The building features secured entry, an on-site gym, and a rooftop lounge.',
 'neighborhood_description': 'Located in the heart of downtown San Francisco, this condo is within walking distance of trendy cafes, upscale restaurants, and boutique shops. The neighborhood boasts excellent public transportation options, making commuting a breeze. Nearby parks and cultural attractions add to the vibrant urban lifestyle.',
 'location': 'San Francisco, CA',
 'neighborhood': 'Downtown',
 'property_type': 'Condo',
 'year_built':

In [3]:
collection_name = "real_estate_listings_v2"

persist_directory = current_dir / "chromadb_data"
client = chromadb.PersistentClient(
    path=persist_directory,
    settings=Settings(),
    tenant=DEFAULT_TENANT,
    database=DEFAULT_DATABASE,
)

In [4]:
# Delete a collection
client.delete_collection(collection_name)

In [5]:
# Create a collection or get an existing one
collection_names = [c.name for c in client.list_collections()]
print("collection_names:", collection_names)
if collection_name in collection_names:
    collection = client.get_collection(collection_name)
else:
    collection = client.create_collection(collection_name)

collection_names: ['real_estate_listings']


In [6]:
# Get all data from the collection
all_data = collection.get()

# `all_data` is a dictionary with keys like 'ids', 'embeddings', 'documents', 'uris', 'included', 'data', 'metadatas'
print(all_data.keys()) 
metadatas = all_data['metadatas']
print("No. of data:", len(metadatas), '\n\n')

# Iterate through the collection metadata
for i, listing in enumerate(metadatas):
    print(f"Listing {i+1}:")
    for key, value in listing.items():
        print(f"  {key}: {value}")
    print("-" * 40)

dict_keys(['ids', 'embeddings', 'documents', 'uris', 'included', 'data', 'metadatas'])
No. of data: 0 




In [7]:
# Setup embedding function
# https://platform.openai.com/docs/guides/embeddings/embedding-models
embedding_function = embedding_functions.OpenAIEmbeddingFunction(
    api_base=OPENAI_API_BASE,
    api_key=OPENAI_API_KEY,
    model_name="text-embedding-3-small"
)

collection.embedding_function = embedding_function

In [8]:
ids = [str(listing["id"]) for listing in real_estate_listings]

# Construct the document string
documents = [
    f"{listing['title']}. {listing['description']} "
    f"Neighborhood info: {listing['neighborhood_description']}. "
    f"Location: {listing['location']}, Neighborhood: {listing['neighborhood']}. "
    f"Type: {listing['property_type']}, Bedrooms: {listing['bedrooms']}, "
    f"Bathrooms: {listing['bathrooms']}, Area: {listing['area_sqft']} sqft, "
    f"Year built: {listing['year_built']}, Price: ${listing['price']}."
    for listing in real_estate_listings
]

metadatas = [
    {
        "id": listing["id"],
        "location": listing["location"],
        "neighborhood": listing["neighborhood"],
        "property_type": listing["property_type"],
        "year_built": listing["year_built"],
        "price": listing["price"],
        "bedrooms": listing["bedrooms"],
        "bathrooms": listing["bathrooms"],
        "area_sqft": listing["area_sqft"],
        "listed_date": listing["listed_date"]
    } for listing in real_estate_listings
]


# Add to the collection
collection.add(
    documents=documents,
    metadatas=metadatas,
    ids=ids
)

print(f"Inserted {len(documents)} listings into Chroma collection '{collection_name}'")


Inserted 30 listings into Chroma collection 'real_estate_listings_v2'


In [9]:
# 1. Check collection size
print("Total items in collection:", collection.count())

# retrieve by ID
print("\nGet item with ID=2:")
# print(collection.get(ids=["2"]))
item = collection.get(ids=["2"], include=["embeddings"])
print("Embedding Shape:", item["embeddings"].shape)

# 2. Run a sample query (search for something relevant to your documents)
query_text = "historic apartment in Boston"
results = collection.query(
    query_texts=[query_text],
    n_results=2  # top 2 matches
)

print("\n--- Query Results ---")
for i in range(len(results["documents"][0])):
    print(f"Match {i+1}:")
    print("  Document:", results["documents"][0][i])
    print("  Metadata:", results["metadatas"][0][i])
    print("  Distance:", results["distances"][0][i])
    print()

# # 3. Get a specific item back by ID
# doc_id = "9"  # or whichever you used
# retrieved = collection.get(ids=[doc_id])
# print("\n--- Retrieved by ID ---")
# print(retrieved)

Total items in collection: 30

Get item with ID=2:
Embedding Shape: (1, 384)

--- Query Results ---
Match 1:
  Document: Compact 1-Bedroom Apartment in Historic Building. This quaint 1-bedroom, 1-bathroom apartment offers vintage charm combined with modern upgrades. The apartment features exposed brick walls, hardwood floors, and a fully renovated kitchen with stainless steel appliances. It is perfect for those looking for a cozy living space with character. Neighborhood info: Situated in a historic district of Boston, MA, this apartment is surrounded by charming architecture, local cafes, and shops. The neighborhood is vibrant, with easy access to public transportation and close proximity to cultural attractions like museums and theaters.. Location: Boston, MA, Neighborhood: Historic District. Type: Apartment, Bedrooms: 1, Bathrooms: 1.0, Area: 600 sqft, Year built: 1900, Price: $210000.
  Metadata: {'bathrooms': 1.0, 'area_sqft': 600, 'price': 210000, 'id': 9, 'location': 'Boston, MA