In [1]:
import os
from pathlib import Path
import json

from dotenv import load_dotenv
import chromadb
from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE, Settings
from chromadb.utils import embedding_functions


# Load variables from .env into environment
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_API_BASE = os.getenv("OPENAI_API_BASE")

In [2]:
# Get the current working directory
current_dir = Path.cwd()

# Path to the JSON file
json_path = current_dir / "real-estate-listings.json"

# Open and load the JSON file
with json_path.open("r", encoding="utf-8") as f:
    real_estate_listings = json.load(f)

# Check the type and first item
print(type(real_estate_listings))  # <class 'list'>
print("No. of data:", len(real_estate_listings))
display(real_estate_listings[0])      # prints the first listing

<class 'list'>
No. of data: 30


{'id': 1,
 'title': 'Cozy 2-Bedroom Condo in Downtown',
 'description': 'This modern 2-bedroom, 1.5-bathroom condo offers an open floor plan with abundant natural light, hardwood floors throughout, and a stylish kitchen with stainless steel appliances and granite countertops. The spacious living area leads to a private balcony, perfect for enjoying city views. The master bedroom includes a walk-in closet, while the second bedroom is ideal for a home office or guest space. The building features secured entry, an on-site gym, and a rooftop lounge.',
 'neighborhood_description': 'Located in the heart of downtown San Francisco, this condo is within walking distance of trendy cafes, upscale restaurants, and boutique shops. The neighborhood boasts excellent public transportation options, making commuting a breeze. Nearby parks and cultural attractions add to the vibrant urban lifestyle.',
 'location': 'San Francisco, CA',
 'neighborhood': 'Downtown',
 'property_type': 'Condo',
 'year_built':

In [3]:
collection_name = "real_estate_listings_v2"

persist_directory = current_dir / "chromadb_data"
client = chromadb.PersistentClient(
    path=persist_directory,
    settings=Settings(),
    tenant=DEFAULT_TENANT,
    database=DEFAULT_DATABASE,
)

In [4]:
# Delete a collection
client.delete_collection(collection_name)

In [5]:
# Create a collection or get an existing one
collection_names = [c.name for c in client.list_collections()]
print("collection_names:", collection_names)
if collection_name in collection_names:
    collection = client.get_collection(collection_name)
else:
    collection = client.create_collection(collection_name)

collection_names: ['real_estate_listings']


In [6]:
# Get all data from the collection
all_data = collection.get()

# `all_data` is a dictionary with keys like 'ids', 'embeddings', 'documents', 'uris', 'included', 'data', 'metadatas'
print(all_data.keys()) 
metadatas = all_data['metadatas']
print("No. of data:", len(metadatas), '\n\n')

# Iterate through the collection metadata
for i, listing in enumerate(metadatas):
    print(f"Listing {i+1}:")
    for key, value in listing.items():
        print(f"  {key}: {value}")
    print("-" * 40)

dict_keys(['ids', 'embeddings', 'documents', 'uris', 'included', 'data', 'metadatas'])
No. of data: 0 




In [7]:
# Setup embedding function
# https://platform.openai.com/docs/guides/embeddings/embedding-models
embedding_function = embedding_functions.OpenAIEmbeddingFunction(
    api_base=OPENAI_API_BASE,
    api_key=OPENAI_API_KEY,
    model_name="text-embedding-3-small"
)

collection.embedding_function = embedding_function

In [None]:
DOCUMENT_FORMAT = (
    "{title}. {description} "
    "Neighborhood info: {neighborhood_description}. "
    "Location: {location}, Neighborhood: {neighborhood}. "
    "Type: {property_type}, Bedrooms: {bedrooms}, "
    "Bathrooms: {bathrooms}, Area: {area_sqft} sqft, "
    "Year built: {year_built}, Price: ${price}."
)

ids = [str(listing["id"]) for listing in real_estate_listings]

documents = [
    DOCUMENT_FORMAT.format(**listing) 
    for listing in real_estate_listings
]

import copy

metadatas = copy.deepcopy(real_estate_listings)

# # Construct the document string
# documents = [
#     f"{listing['title']}. {listing['description']} "
#     f"Neighborhood info: {listing['neighborhood_description']}. "
#     f"Location: {listing['location']}, Neighborhood: {listing['neighborhood']}. "
#     f"Type: {listing['property_type']}, Bedrooms: {listing['bedrooms']}, "
#     f"Bathrooms: {listing['bathrooms']}, Area: {listing['area_sqft']} sqft, "
#     f"Year built: {listing['year_built']}, Price: ${listing['price']}."
#     for listing in real_estate_listings
# ]



# metadatas = [
#     {
#         "id": listing["id"],
#         "location": listing["location"],
#         "neighborhood": listing["neighborhood"],
#         "property_type": listing["property_type"],
#         "year_built": listing["year_built"],
#         "price": listing["price"],
#         "bedrooms": listing["bedrooms"],
#         "bathrooms": listing["bathrooms"],
#         "area_sqft": listing["area_sqft"],
#         "listed_date": listing["listed_date"]
#     } for listing in real_estate_listings
# ]


# Add to the collection
collection.add(
    ids=ids,
    documents=documents,
    metadatas=metadatas,
)

print(f"Inserted {len(documents)} listings into Chroma collection '{collection_name}'")


Inserted 30 listings into Chroma collection 'real_estate_listings_v2'


In [9]:
# 1. Check collection size
print("Total items in collection:", collection.count())

# retrieve by ID
print("\nGet item with ID=2:")
# print(collection.get(ids=["2"]))
item = collection.get(ids=["2"], include=["embeddings"])
print("Embedding Shape:", item["embeddings"].shape)

# 2. Run a sample query (search for something relevant to your documents)
query_text = "historic apartment in Boston"
results = collection.query(
    query_texts=[query_text],
    n_results=2  # top 2 matches
)

print("\n--- Query Results ---")
for i in range(len(results["documents"][0])):
    print(f"Match {i+1}:")
    print("  Document:", results["documents"][0][i])
    print("  Metadata:", results["metadatas"][0][i])
    print("  Distance:", results["distances"][0][i])
    print()

# # 3. Get a specific item back by ID
# doc_id = "9"  # or whichever you used
# retrieved = collection.get(ids=[doc_id])
# print("\n--- Retrieved by ID ---")
# print(retrieved)

Total items in collection: 30

Get item with ID=2:
Embedding Shape: (1, 384)

--- Query Results ---
Match 1:
  Document: Compact 1-Bedroom Apartment in Historic Building. This quaint 1-bedroom, 1-bathroom apartment offers vintage charm combined with modern upgrades. The apartment features exposed brick walls, hardwood floors, and a fully renovated kitchen with stainless steel appliances. It is perfect for those looking for a cozy living space with character. Neighborhood info: Situated in a historic district of Boston, MA, this apartment is surrounded by charming architecture, local cafes, and shops. The neighborhood is vibrant, with easy access to public transportation and close proximity to cultural attractions like museums and theaters.. Location: Boston, MA, Neighborhood: Historic District. Type: Apartment, Bedrooms: 1, Bathrooms: 1.0, Area: 600 sqft, Year built: 1900, Price: $210000.
  Metadata: {'bathrooms': 1.0, 'area_sqft': 600, 'price': 210000, 'id': 9, 'location': 'Boston, MA

## Part II: Preference Extraction

In [10]:
# path = Path("conversations.json")

conv_json_path = current_dir / "conversations.json"
with conv_json_path.open("r", encoding="utf-8") as f:
    conversations = json.load(f)

for conv in conversations:
    print(f"Conversation {conv['conversation_id']}:")
    for msg in conv["messages"]:
        print(f"{msg['role'].capitalize()}: {msg['text']}")
    print("---")

Conversation 1:
Agent: How big do you want your house to be?
Buyer: A comfortable three-bedroom house with a spacious kitchen and a cozy living room.
Agent: What are 3 most important things for you in choosing this property?
Buyer: A quiet neighborhood, good local schools, and convenient shopping options.
Agent: Which amenities would you like?
Buyer: A backyard for gardening, a two-car garage, and a modern, energy-efficient heating system.
Agent: Which transportation options are important to you?
Buyer: Easy access to a reliable bus line, proximity to a major highway, and bike-friendly roads.
Agent: How urban do you want your neighborhood to be?
Buyer: A balance between suburban tranquility and access to urban amenities like restaurants and theaters.
---
Conversation 2:
Agent: Hello! What kind of property are you looking for today?
Buyer: I'm looking for a spacious suburban home for my family.
Agent: Great! How many bedrooms and bathrooms do you need?
Buyer: At least 4 bedrooms and 3 b

In [None]:
from pydantic import BaseModel
from typing import List, Optional, Tuple

from openai import OpenAI


def load_conversations():
    conv_json_path = current_dir / "conversations.json"
    with conv_json_path.open("r", encoding="utf-8") as f:
        conversations = json.load(f)
    
    conv_ids = [conv['conversation_id'] for conv in conversations]
    print("Conversation IDs:", conv_ids, '\n')
    return conv_ids, conversations

In [None]:
id = 2

conv_ids, conversations = load_conversations()
assert id in conv_ids, "invalid conversation id"
conv = conversations[conv_ids.index(id)]


conversation_text = ""
for msg in conv["messages"]:
    role = msg['role'].capitalize()
    conversation_text += f"{role}: {msg['text']}\n"

print(f"========== Conversation {id} ==========")
print(conversation_text)


Conversation IDs: [1, 2, 3, 99, 100] 

Agent: Hello! What kind of property are you looking for today?
Buyer: I'm looking for a spacious suburban home for my family.
Agent: Great! How many bedrooms and bathrooms do you need?
Buyer: At least 4 bedrooms and 3 bathrooms.
Agent: Are there any specific amenities you want in the house?
Buyer: A large backyard, a garage for two cars, and a modern kitchen would be ideal.
Agent: What about the neighborhood? Any preferences?
Buyer: A safe, family-friendly area with good schools nearby.
Agent: Do you have a budget range in mind?
Buyer: Somewhere between $500,000 and $700,000.



In [97]:
class BuyerPreferences(BaseModel):
    bedrooms: Optional[int]
    bathrooms: Optional[float]
    property_type: Optional[str]

    # Additional physical features
    area_min_sqft: Optional[int]   # minimum area (sq. ft)
    area_max_sqft: Optional[int]   # maximum area (sq. ft)
    building_max_age: Optional[int]  # max age in years
    building_min_year: Optional[int] # alternatively, built after year X
    
    # Lifestyle & location
    amenities: List[str] = []
    furnished: bool = False
    location: Optional[str]
    neighborhood_features: List[str] = []
    transportation: List[str] = []
    parking_required: bool = False
    pet_friendly_required: bool = False

    # Financial
    min_budget: Optional[int]
    max_budget: Optional[int]

    
    

client = OpenAI(base_url=OPENAI_API_BASE, api_key=OPENAI_API_KEY)

system_prompt = "You are a real estate preference parser."
user_prompt = f"Extract buyer preferences from the conversation below and fill the JSON fields:\n{conversation_text}"

response = client.chat.completions.parse(
    model="gpt-4o-2024-08-06",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ],
    response_format=BuyerPreferences
)

In [98]:
content = response.choices[0].message.content
content_dict = json.loads(content)
display(content_dict)


{'bedrooms': 4,
 'bathrooms': 3,
 'property_type': 'Single Family Home',
 'area_min_sqft': None,
 'area_max_sqft': None,
 'building_max_age': None,
 'building_min_year': None,
 'amenities': ['Large backyard', 'Garage for two cars', 'Modern kitchen'],
 'furnished': False,
 'location': 'Suburb',
 'neighborhood_features': ['Safe', 'Family-friendly', 'Good schools'],
 'transportation': [],
 'parking_required': True,
 'pet_friendly_required': False,
 'min_budget': 500000,
 'max_budget': 700000}

## Part III: Filtering

- "$eq" – equal (default if you just write "bathrooms": 1.0)
- "$ne" – not equal
- "$gt" – greater than
- "$gte" – greater than or equal
- "$lt" – less than
- "$lte" – less than or equal
- "$in" – in list of values
- "$nin" – not in list of values

In [99]:
from datetime import datetime

# Get the current date and time, and then extract the year attribute
current_year = datetime.now().year
print(current_year)

2025


In [None]:
property_type = content_dict['property_type']
amenities = content_dict['amenities']
furnished = content_dict['furnished']
location = content_dict['location']
neighborhood_features = content_dict['neighborhood_features']
transportation = content_dict['transportation']
parking_required = content_dict['parking_required']
pet_friendly = content_dict['pet_friendly_required']


# Build bullet points
query_text = f"""
- Property type: {property_type}
- Amenities: {', '.join(amenities) if amenities else 'Any'}
- Furnished: {"Yes" if furnished else "No"}
- Location: {location or "Any"}
- Neighborhood features: {', '.join(neighborhood_features) if neighborhood_features else "Any"}
- Transportation preferences: {', '.join(transportation) if transportation else "Any"}
- Parking required: {"Yes" if parking_required else "No"}
- Pet friendly required: {"Yes" if pet_friendly else "No"}
"""

### TODO: Handle the case of no value!
n_bedroom = content_dict['bedrooms']
n_bathroom = content_dict['bathrooms']
min_budget, max_budget = content_dict['min_budget'], content_dict['max_budget']
min_area, max_area = content_dict['area_min_sqft'], content_dict['area_max_sqft']
building_max_age = content_dict['building_max_age']
building_min_year = content_dict['building_min_year']

conditions = []
if n_bedroom:
    conditions += [{"bedrooms": n_bedroom}]
if n_bathroom:
    conditions += [{"bathrooms": {"$gte": n_bathroom - 1}}, {"bathrooms": {"$lte": n_bathroom + 1}}]
if min_budget:
    conditions += [{"price": {"$gte": min_budget}}]
if max_budget:
    conditions += [{"price": {"$lte": max_budget}}]
if min_area:
    conditions += [{"area_sqft": {"$gte": min_area}}]
if max_area:
    conditions += [{"area_sqft": {"$lte": max_area}}]
if building_min_year:
    conditions += [{"year_built": {"$gte": building_min_year}}]
if building_max_age:
    conditions += [{"year_built": {"$gte": current_year - building_max_age}}]



# {'bedrooms': 4,
#  'bathrooms': 3,
#  'property_type': 'suburban home',
#  'area_min_sqft': None,
#  'area_max_sqft': None,
#  'building_max_age': None,
#  'building_min_year': None,
#  'amenities': ['large backyard', 'garage for two cars', 'modern kitchen'],
#  'furnished': False,
#  'location': None,
#  'neighborhood_features': ['safe', 'family-friendly', 'good schools nearby'],
#  'transportation': [],
#  'parking_required': True,
#  'pet_friendly': False,
#  'min_budget': 500000,
#  'max_budget': 700000}


In [101]:
outputs = collection.query(
    query_texts=[query_text],
    where={"$and": conditions},
    n_results=3,
)
display(outputs)

{'ids': [['2', '15', '29']],
 'embeddings': None,
 'documents': [['Spacious 4-Bedroom House with Garden. This spacious 4-bedroom, 3-bathroom house offers a large backyard perfect for outdoor activities, a renovated kitchen with modern finishes, and a 2-car garage. The open-concept living and dining area flows seamlessly into the backyard patio, ideal for entertaining. The master suite includes a walk-in closet and en-suite bathroom with a soaking tub and separate shower. Neighborhood info: Located in a quiet suburban neighborhood in Austin, TX, this house is close to excellent schools, parks, and shopping centers. The area is family-friendly with a tight-knit community and is just a short drive to downtown Austin for easy access to work and entertainment.. Location: Austin, TX, Neighborhood: Suburban. Type: House, Bedrooms: 4, Bathrooms: 3.0, Area: 2100 sqft, Year built: 2015, Price: $715000.',
   'Luxury 4-Bedroom Villa with Pool. This stunning 4-bedroom, 4-bathroom villa offers a spa

In [93]:




outputs = collection.get(
    where={
        "$and": [
            {"bathrooms": 1.0},
            {"bedrooms": {"$gte": 1}},
            {"year_built": {"$gte": 1900}},
        ]
    },
)

display(outputs)

{'ids': ['6', '9', '13', '14', '16', '18', '20'],
 'embeddings': None,
 'documents': ['Charming 1-Bedroom Bungalow. This classic 1-bedroom, 1-bathroom bungalow features an updated kitchen with modern appliances, hardwood floors throughout, and a cozy living room with plenty of natural light. The private patio offers an ideal space for outdoor relaxation and entertaining. This home blends traditional charm with contemporary updates, making it a perfect retreat. Neighborhood info: Located in a peaceful residential area of Portland, OR, this bungalow is close to local cafes, parks, and public transportation. The quiet neighborhood offers a suburban vibe while still being within reach of the city’s vibrant downtown.. Location: Portland, OR, Neighborhood: Residential. Type: House, Bedrooms: 1, Bathrooms: 1.0, Area: 750 sqft, Year built: 1940, Price: $275000.',
  'Compact 1-Bedroom Apartment in Historic Building. This quaint 1-bedroom, 1-bathroom apartment offers vintage charm combined with 

In [None]:
outputs = collection.query(
    query_texts=["modern apartment in Boston"],
    where={
        "$and": [
            {"bathrooms": 1.0},
            {"bedrooms": {"$gte": 1}},
            {"year_built": {"$gte": 1900}},
        ]
    },
    n_results=3,
)

print(outputs)

{'ids': [[]], 'embeddings': None, 'documents': [[]], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[]], 'distances': [[]]}


        "id": 9,
        "title": "Compact 1-Bedroom Apartment in Historic Building",
        "description": "This quaint 1-bedroom, 1-bathroom apartment offers vintage charm combined with modern upgrades. The apartment features exposed brick walls, hardwood floors, and a fully renovated kitchen with stainless steel appliances. It is perfect for those looking for a cozy living space with character.",
        "neighborhood_description": "Situated in a historic district of Boston, MA, this apartment is surrounded by charming architecture, local cafes, and shops. The neighborhood is vibrant, with easy access to public transportation and close proximity to cultural attractions like museums and theaters.",
        "location": "Boston, MA",
        "neighborhood": "Historic District",
        "property_type": "Apartment",
        "year_built": 1900,
        "price": 210000,
        "bedrooms": 1,
        "bathrooms": 1.0,
        "area_sqft": 600,
        "listed_date": "2025-03-31"

In [None]:
TEMPLATE = """
Property ID: {id}
Title: {title}
Description: {description}
Neighborhood Info: {neigh_desc}
Location: {location}
Neighborhood: {neighborhood}
Property Type: {property_type}
Year Built: {year_built} CE
Price: ${price:,}
Bedrooms: {bedrooms}
Bathroom: {bathrooms}
Area: {area_sqft} sq.ft.
Listed Date: {listed_date}
"""


def display_real_estate_info(real_estate_id):

    id_str = str(real_estate_id)
    item = collection.get(ids=[id_str])

    metadata = item['metadatas'][0]

    real_estate_info = TEMPLATE.format(
        id=metadata['id'],
        title="PH",
        description="PH",
        neigh_desc="PH",
        location=metadata['location'],
        neighborhood=metadata['neighborhood'],
        property_type=metadata['property_type'],
        year_built=metadata['year_built'],
        price=metadata['price'],
        bedrooms=metadata['bedrooms'],
        bathrooms=metadata['bathrooms'],
        area_sqft=metadata['area_sqft'],
        listed_date=metadata['listed_date'],
    )

    print(real_estate_info)



{'ids': ['9'], 'embeddings': None, 'documents': ['Compact 1-Bedroom Apartment in Historic Building. This quaint 1-bedroom, 1-bathroom apartment offers vintage charm combined with modern upgrades. The apartment features exposed brick walls, hardwood floors, and a fully renovated kitchen with stainless steel appliances. It is perfect for those looking for a cozy living space with character. Neighborhood info: Situated in a historic district of Boston, MA, this apartment is surrounded by charming architecture, local cafes, and shops. The neighborhood is vibrant, with easy access to public transportation and close proximity to cultural attractions like museums and theaters.. Location: Boston, MA, Neighborhood: Historic District. Type: Apartment, Bedrooms: 1, Bathrooms: 1.0, Area: 600 sqft, Year built: 1900, Price: $210000.'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'year_built': 1900, 'location': 'Boston, MA', 'price': 210000, 'bathrooms': 1.0, 'n

In [75]:
outputs = collection.get(
    #query_texts=["apartment"],  # or embeddings if you're doing vector search
    where={
        "bathrooms": 1.0,
        "bedrooms": {"$gte": 1, "$lt": 2},
        "year_built": {"$gte": 1900}
    },
    #n_results=10
)

print(outputs)

ValueError: Expected where to have exactly one operator, got {'bathrooms': 1.0, 'bedrooms': {'$gte': 1, '$lt': 2}, 'year_built': {'$gte': 1900}} in get.