## Install libraries

In [33]:
!pip install sentence-transformers


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [8]:
!pip install openai
!pip install langchain
!pip install langchain_core
!pip install langchain_openai
!pip install lancedb
!pip install ipywidgets
!pip install tantivy
!pip install markdown
!pip install bs4
!pip install inflect
!pip install langchain-community


Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 k

## Cleanup

In [40]:
!rm -rf ./real-estate-listings.json
!rm -rf ./real-estate-embeddings-db

## Imports

In [9]:
from google.colab import userdata
import pandas as pd
import numpy as np
import openai
import os
import json
import re

from langchain.llms import OpenAI
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.vectorstores import LanceDB
from lancedb.rerankers import LinearCombinationReranker

import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector

from bs4 import BeautifulSoup
from markdown import markdown

import ipywidgets as widgets
from IPython.display import display
from ipywidgets import Layout, Button, Box, FloatText, Textarea, Dropdown, Label, IntSlider, FloatSlider

import inflect

## Keys and Constants

In [17]:
DATA_FILE = "homematch.json"
os.environ["OPENAI_API_KEY"] = "voc-1391079853126677396547067e6e4e730ddd2.28982453"
os.environ["OPENAI_API_BASE"] = "https://openai.vocareum.com/v1"

## Generate and save (or load) listings using LangChain

In [41]:
import os
import json
from bs4 import BeautifulSoup
from markdown import markdown
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI

DATA_FILE = "real_estate_listings.json"

system_prompt = """
You are an expert real estate agent in Delhi in India.
"""

human_prompt = """
Generate at least 15 real estate listings. Use your imagination to generate listings.
      Be sure to include real listings as well.
      Distribute listings across the 5 Delhi boroughs.
      The listings must be in the txt array of dictionaries with each item in the format as shown below:

        {
          "location": "Upper West Side",
          "list_price": 1000000,
          "bedrooms": 3,
          "bathrooms": 2,
          "square_feet": 1000,
          "monthly_hoa": 1000,
          "school_rating": 4.5,
          "description": "A beautiful pre-war building completely restored to modern living. Modern kitchen, new appliances, interior decorated by a famous architect. Ready to move in. Internet connection & Satellite TV dish can be installed on the roof. A serene neighborhood with access to shopping, dining, and entertainment. A real gem for starting families. Great schools. Close to subway."
        }
"""

# Step 1: If file already exists, read it
if os.path.isfile(DATA_FILE):
    with open(DATA_FILE, "r") as f:
        real_estate_listings_json = f.read()
else:
    chat = ChatOpenAI(temperature=1)  # Will fail unless valid key
    messages = [SystemMessage(content=system_prompt), HumanMessage(content=human_prompt)]
    aimessage = chat.invoke(messages)

    # Step 2: Extract text content from markdown
    raw_response = json.loads(aimessage.json())["content"]
    md_text = markdown(raw_response)
    plain_text = ''.join(BeautifulSoup(md_text, "html.parser").find_all(string=True))

    # Step 3: Extract the list only
    start = plain_text.find("[")
    end = plain_text.rfind("]") + 1
    json_content = plain_text[start:end]

    # Step 4: Save cleaned JSON
    with open(DATA_FILE, "w") as f:
        f.write(json_content)

    real_estate_listings_json = json_content

import pandas as pd
df = pd.read_json(DATA_FILE)
print(df.head())


                          location  list_price  bedrooms  bathrooms  \
0        South Delhi - Vasant Kunj     1500000         4          3   
1  Central Delhi - Connaught Place     2200000         2          2   
2         East Delhi - Mayur Vihar      900000         3          2   
3         North Delhi - Model Town     1800000         5          4   
4      West Delhi - Rajouri Garden     1200000         2          2   

   square_feet  monthly_hoa  school_rating  \
0         2000          500            4.8   
1         1200          800            4.3   
2         1500          300            4.7   
3         2500          600            4.6   
4         1000          400            4.4   

                                         description  
0  Luxurious villa with a spacious living area, m...  
1  Modern apartment in the heart of the city with...  
2  Well-maintained home in a family-friendly neig...  
3  Stunning modern house with a backyard oasis. O...  
4  Cozy apartment wit

/tmp/ipython-input-41-3880378248.py:42: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  raw_response = json.loads(aimessage.json())["content"]


## Convert listings to embeddings and save in LanceDb

In [42]:
from lancedb.pydantic import LanceModel, Vector
from sentence_transformers import SentenceTransformer
import pandas as pd

# Load local embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Embedding function
def embed(text: str) -> list:
    return model.encode(text).tolist()

# Load listings JSON
df = pd.read_json("real_estate_listings.json")

# Define schema with 384-dim vector
class RealEstateListings(LanceModel):
    location: str
    list_price: float
    bedrooms: float
    bathrooms: float
    square_feet: float
    monthly_hoa: float
    school_rating: float
    description: str
    description_vector: Vector(384)

# Create table
table = db.create_table("listings", schema=RealEstateListings)

# Insert data with local embeddings
records = df.apply(lambda row: {
    "location": row["location"],
    "list_price": row["list_price"],
    "bedrooms": row["bedrooms"],
    "bathrooms": row["bathrooms"],
    "square_feet": row["square_feet"],
    "monthly_hoa": row["monthly_hoa"],
    "school_rating": row["school_rating"],
    "description": row["description"],
    "description_vector": embed(row["description"])
}, axis=1).tolist()

table.add(records)


AddResult(version=2)

## Inline User Interface to capture Buyer preferences

In [43]:
form_item_layout = Layout(
    display='flex',
    flex_flow='row',
    justify_content='space-between'
)

form_items = [
    Box([Label(value='Max Price'), FloatSlider(min=1000000, max=5000000, step=10000, value=5000000)], layout=form_item_layout),
    Box([Label(value='Bed Rooms minimum'), FloatSlider(min=1, max=10, step=1)], layout=form_item_layout),
    Box([Label(value='Bath Rooms minimum'), FloatSlider(min=1, max=10, step=1)], layout=form_item_layout),
    Box([Label(value='School Ratings'), FloatSlider(min=1, max=5, step=1)], layout=form_item_layout),
    Box([Label(value='Square Footage'), FloatSlider(min=1000, max=5000, step=500)], layout=form_item_layout),
    Box([Label(value='Preferences'),
         Textarea(value="North Delhi")], layout=form_item_layout)
]

form = Box(form_items, layout=Layout(
    display='flex',
    flex_flow='column',
    border='solid 2px',
    align_items='stretch',
    width='50%'
))
form


Box(children=(Box(children=(Label(value='Max Price'), FloatSlider(value=5000000.0, max=5000000.0, min=1000000.…

In [44]:
max_price = form_items[0].children[1].value
bedrooms = form_items[1].children[1].value
bathrooms = form_items[2].children[1].value
school_rating = form_items[3].children[1].value
square_feet = form_items[4].children[1].value
preferences = form_items[5].children[1].value
print(max_price)
print(bedrooms)
print(bathrooms)
print(school_rating)
print(square_feet)
print(preferences)

5000000.0
1.0
1.0
1.0
1000.0
North Delhi


## Prefilter based on numeric preferences and vector search on textual preferences.

In [45]:
# User search preferences
preferences = "spacious 3 bedroom home near schools and parks"
preferences_vector = embed(preferences)

# Filter values
max_price = 1000000
bedrooms = 2
bathrooms = 1
school_rating = 4.0
square_feet = 800

# Filter expression
filterExpr = f"""
    list_price < {max_price} and
    bedrooms > {bedrooms} and
    bathrooms > {bathrooms} and
    school_rating > {school_rating} and
    square_feet > {square_feet}
"""

# Vector + filter search
filteredDf = (
    table.search(preferences_vector, vector_column_name="description_vector")
         .where(filterExpr, prefilter=True)
         .limit(5)
         .to_pandas()
)

print("Filtered matching listings:")
print(filteredDf.head())


Filtered matching listings:
                   location  list_price  bedrooms  bathrooms  square_feet  \
0  East Delhi - Mayur Vihar    900000.0       3.0        2.0       1500.0   
1  East Delhi - Preet Vihar    950000.0       3.0        2.0       1600.0   

   monthly_hoa  school_rating  \
0        300.0            4.7   
1        300.0            4.4   

                                         description  \
0  Well-maintained home in a family-friendly neig...   
1  Charming home in a peaceful locale, updated in...   

                                  description_vector  _distance  
0  [0.08323986, 0.0473399, 0.038634855, 0.0642556...   0.741692  
1  [0.08744095, 0.027376797, 0.050016794, 0.03924...   1.065288  


## Personalize Listings

In [50]:
from transformers import pipeline
import pandas as pd

# Load summarization pipeline
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")  # Or another like "facebook/bart-large-cnn"

def generate_output_local(prompt: str, df: pd.DataFrame):
    print(f"🔹 {prompt}\n")

    for idx, row in df.iterrows():
        listing_text = (
            f"Location: {row['location']}, Price: ₹{row['list_price']}, Bedrooms: {row['bedrooms']}, "
            f"Bathrooms: {row['bathrooms']}, Area: {row['square_feet']} sqft, "
            f"Monthly HOA: ₹{row['monthly_hoa']}, School Rating: {row['school_rating']}\n"
            f"Description: {row['description']}"
        )

        summary = summarizer(
            listing_text,
            max_length=100,
            min_length=40,
            do_sample=False
        )[0]["summary_text"]

        print(f"➡️ Listing {idx + 1} Summary:\n{summary}\n")


Device set to use cpu


In [51]:
generate_output_local("Give me a factual summary of top 2 listings", filteredDf)

🔹 Give me a factual summary of top 3 listings

➡️ Listing 1 Summary:
 Mayur Vihar is located in a family-friendly neighborhood in East Delhi . Spacious rooms, ample natural light, and close proximity to parks and schools . Ideal for growing families looking for a peaceful environment .

➡️ Listing 2 Summary:
 East Delhi - Preet Vihar, Price: ₹950000.0, Bedrooms: 3.0 - Bathrooms: 2.0 . Area: 1600.0 sqft, Monthly HOA: ⁹300.0.0 , School Rating: 4.4 .



In [53]:
generate_output_local("Recommend a listing that is close to subways with top school rating", filteredDf)

🔹 Recommend a listing that is close to subways with top school rating

➡️ Listing 1 Summary:
 Mayur Vihar is located in a family-friendly neighborhood in East Delhi . Spacious rooms, ample natural light, and close proximity to parks and schools . Ideal for growing families looking for a peaceful environment .

➡️ Listing 2 Summary:
 East Delhi - Preet Vihar, Price: ₹950000.0, Bedrooms: 3.0 - Bathrooms: 2.0 . Area: 1600.0 sqft, Monthly HOA: ⁹300.0.0 , School Rating: 4.4 .



In [54]:
generate_output_local("Recommend at least two listings that are close to subways with top school rating", filteredDf)

🔹 Recommend at least two listings that are close to subways with top school rating

➡️ Listing 1 Summary:
 Mayur Vihar is located in a family-friendly neighborhood in East Delhi . Spacious rooms, ample natural light, and close proximity to parks and schools . Ideal for growing families looking for a peaceful environment .

➡️ Listing 2 Summary:
 East Delhi - Preet Vihar, Price: ₹950000.0, Bedrooms: 3.0 - Bathrooms: 2.0 . Area: 1600.0 sqft, Monthly HOA: ⁹300.0.0 , School Rating: 4.4 .

