This is a starter notebook for the project. You will need to import the libraries you use; the ones available in this workspace are listed in the `requirements.txt` file.

In [17]:
import json
import os

import chromadb
import lancedb
import openai
import pandas as pd
from dotenv import load_dotenv
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from langchain.llms import OpenAI
from langchain.schema import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

## User Defined Parameters

In [140]:
# Chat model used when writing the personalised recommendations.
MODEL_NAME = "gpt-3.5-turbo"

# LanceDB location and the name of the table that stores the listings.
DB_CONNECTION, DB_TABLE_NAME = "real-estate-embeddings-db", "listings"

# On-disk caches: the raw listings, and the listings with their embeddings.
DATA_FILE = "listings.json"
VECTOR_FILE = "listings-with-vector.json"

In [20]:
# Load environment variables from a local .env file (python-dotenv);
# presumably this is where OPENAI_API_KEY is defined — confirm.
load_dotenv()

True

### Load OpenAI models

In [21]:
# Hand the key from the environment to the openai client (None when unset).
openai.api_key = os.environ.get('OPENAI_API_KEY')

In [25]:
# Embedding client used by get_embedding() to vectorise listing descriptions.
# NOTE(review): the table schema declares 3072-dimensional vectors; confirm
# that matches this model's output size if the model is ever changed.
embedding = OpenAIEmbeddings(model="text-embedding-3-large")

## Generate Real Estate Listings Using OpenAI Model

In [7]:
# System message: the persona the chat model adopts when generating listings.
system_prompt = """
You are an expert real estate agent in New York City in the USA.
"""

# Human message: the generation request plus an example of one listing record.
# NOTE(review): this asks for a bare JSON array, while a later cell indexes the
# parsed result with ['listings'] — confirm the expected top-level shape.
human_prompt= """

With your imagination, generate at least 15 real estate listings. 
Make sure that you are including real listings as well from zillow.com.

Distribute listings across the 5 NYC boroughs.

The listings must be in the JSON array of dictionaries with each item in the format as shown below:

{
  "location": "Upper West Side",
  "list_price": 5899000,
  "bedrooms": 4,
  "bathrooms": 3,
  "square_feet": 2000,
  "monthly_hoa": 1000,
  "school_rating": 4.5,
  "description": "A close-knit, environmentally-conscious community with access to organic grocery stores, community gardens, and bike paths. Take a stroll through the nearby Green Oaks Park or grab a cup of coffee at the cozy Green Bean Cafe. With easy access to public transportation and bike lanes, commuting is a breeze."
}
"""

In [11]:
# Load the listings from the on-disk cache when it exists; otherwise ask the
# chat model to generate them and cache the result for later runs.
if os.path.exists(DATA_FILE):
    # The context manager closes the file; no explicit close() is needed.
    with open(DATA_FILE, "r") as f:
        generated_real_estate_listings = json.load(f)
else:
    # temperature=1 keeps the generated listings varied.
    chat = ChatOpenAI(temperature=1)

    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=human_prompt),
    ]

    generated_message = chat.invoke(messages)

    # The reply text is itself a JSON document: parse it instead of keeping
    # the raw string, which could not be indexed by key afterwards.
    parsed_listings = json.loads(generated_message.content)

    # The prompt requests a bare array, but the following cell reads the
    # 'listings' key, so wrap an array reply in the expected envelope.
    if isinstance(parsed_listings, list):
        parsed_listings = {"listings": parsed_listings}
    generated_real_estate_listings = parsed_listings

    # Persist the generated listings so the cache check above succeeds on the
    # next run and the (non-deterministic) generation is not repeated.
    with open(DATA_FILE, "w") as f:
        json.dump(generated_real_estate_listings, f, indent=2)

In [12]:
# Unwrap the {"listings": [...]} envelope only while it is still present, so
# re-running this cell does not fail on the already-unwrapped list.
if isinstance(generated_real_estate_listings, dict):
    generated_real_estate_listings = generated_real_estate_listings['listings']
generated_real_estate_listings

[{'location': 'Upper West Side',
  'list_price': 5899000,
  'bedrooms': 4,
  'bathrooms': 3,
  'square_feet': 2000,
  'monthly_hoa': 1000,
  'school_rating': 4.5,
  'description': 'A close-knit, environmentally-conscious community with access to organic grocery stores, community gardens, and bike paths. Take a stroll through the nearby Green Oaks Park or grab a cup of coffee at the cozy Green Bean Cafe. With easy access to public transportation and bike lanes, commuting is a breeze.'},
 {'location': 'Williamsburg, Brooklyn',
  'list_price': 2499000,
  'bedrooms': 3,
  'bathrooms': 2,
  'square_feet': 1500,
  'monthly_hoa': 800,
  'school_rating': 4.2,
  'description': "Experience the vibrant energy of Williamsburg in this stunning 3-bedroom, 2-bathroom condo. Enjoy breathtaking views of the Manhattan skyline from the private balcony and take advantage of the building's rooftop pool and fitness center."},
 {'location': 'Astoria, Queens',
  'list_price': 1799000,
  'bedrooms': 2,
  'bath

## Storing Listings in a Vector Database (LanceDB)

### Setup LanceDB

In [107]:
# Dimensionality declared for the description vectors stored in the table.
vector_size = 3072

# Connect to the LanceDB database at the configured location.
lance_db = lancedb.connect(DB_CONNECTION)

# OpenAI embedding function obtained from LanceDB's registry.
# NOTE(review): not referenced by the other cells visible here — confirm it is still needed.
lance_db_embeddings = get_registry().get("openai").create()

In [108]:
def get_embedding(desc: str):
    """Embed one listing description with the notebook's embedding client.

    Args:
        desc: The description text to embed.

    Returns:
        Whatever `embedding.embed_query` returns for `desc`.
    """
    description_vector = embedding.embed_query(desc)
    return description_vector

class REListings(LanceModel):
    """Schema of one real-estate listing row in the LanceDB table.

    The scalar fields carry the same names as the keys of the listing records
    requested in the generation prompt; every numeric field is declared as a
    float.
    """
    # Neighbourhood / borough text.
    location: str
    # Asking price.
    list_price: float
    bedrooms: float
    bathrooms: float
    square_feet: float
    # Monthly HOA fee.
    monthly_hoa: float
    school_rating: float
    # Free-text description of the listing.
    description: str
    # Embedding of `description`; its length is fixed by `vector_size`.
    description_vector: Vector(vector_size)

In [109]:
# Embedding each description calls the embedding API once per listing, so the
# enriched frame is cached on disk and reloaded on later runs.
if os.path.exists(VECTOR_FILE):
    listing_df = pd.read_json(VECTOR_FILE)
else:
    listing_df = pd.DataFrame(generated_real_estate_listings)
    listing_df["description_vector"] = listing_df["description"].apply(get_embedding)
    # Write to the same path the cache check reads, so changing VECTOR_FILE
    # in the configuration cell keeps both sides in sync.
    listing_df.to_json(VECTOR_FILE)

In [119]:
# (Re)create the listings table from the embedded DataFrame; "overwrite"
# replaces any table left over from an earlier run.
table = lance_db.create_table(DB_TABLE_NAME, data=listing_df,
                              mode="overwrite", schema=REListings)

# Build the full-text index over the descriptions. replace=True keeps this
# cell re-runnable when an index from a previous run already exists.
table.create_fts_index(['description'], replace=True)

## Building the User Preference Interface

In [127]:
def is_float(string):
    """Report whether `string` can be converted with `float()`.

    Args:
        string: The text to check.

    Returns:
        True when `float(string)` succeeds, False when it raises ValueError.
    """
    try:
        float(string)
    except ValueError:
        return False
    else:
        return True

def collect_user_preference():
    """Interactively collect the buyer's home preferences from standard input.

    Each question is asked once via `input()`. Answering 'None' (any casing,
    surrounding whitespace ignored) skips the question, both on the first
    prompt and on any retry prompt. Numeric questions are re-asked until the
    answer passes `is_float`; the free-text questions ('location' and
    'description') are re-asked until the answer is non-empty.

    Returns:
        dict: Maps each answered question key to its value — a float for the
        numeric questions, the stripped text for the free-text ones. Skipped
        questions are absent from the dict.
    """
    questions = {
        "location": "Where would you prefer to live in?",
        "list_price": "What is your budget?", 
        "bedrooms": "What is the ideal number of bedrooms for your dream home?",
        "bathrooms": "How many bathrooms do you need in the home?",
        "square_feet": "How big would you like your dream home to be (in square feet)?",
        "monthly_hoa": "How much are you comfortable with for the monthly HOA fees?",
        "school_rating": "On a scale of 1 to 5, what is your expectation of school quality in your area?",
        "description": "You can now provide textual description of how you would like your home to be."
    }

    desc_key_list = ["location", "description"]
    choice_for_none = "Enter 'None' if you do not have a preference"

    user_preference = {}

    for key, question in questions.items():
        # Strip so stray whitespace cannot defeat the 'None' check or make a
        # blank-looking answer pass the non-empty test.
        response = input(f"\n{question} {choice_for_none}\n").strip()

        if key in desc_key_list:
            # Free-text answers only need to be non-empty.
            while not response:
                response = input(f"You have not entered a {key} for your home. Please try again.\n\n{question}").strip()
        else:
            # Numeric answers must parse as a float; 'None' stays available as
            # a way out of the retry loop.
            while response.lower() != "none" and not is_float(response):
                response = input(f"You have entered a non-numeric answer to the ({key}) question. Please try again.\n\n{question}").strip()

        # Skip the question when the user has no preference.
        if response.lower() == "none":
            continue

        user_preference[key] = response if key in desc_key_list else float(response)

    return user_preference

In [128]:
# Interactive step: prompts on standard input, so the captured answers are
# whatever the user typed during this run.
user_preference = collect_user_preference()


Where would you prefer to live in? Enter 'None' if you do not have a preference
 None

What is your budget? Enter 'None' if you do not have a preference
 20000000

What is the ideal number of bedrooms for your dream home? Enter 'None' if you do not have a preference
 2

How many bathrooms do you need in the home? Enter 'None' if you do not have a preference
 1

How big would you like your dream home to be (in square feet)? Enter 'None' if you do not have a preference
 None

How much are you comfortable with for the monthly HOA fees? Enter 'None' if you do not have a preference
 None

On a scale of 1 to 5, what is your expectation of school quality in your area? Enter 'None' if you do not have a preference
 None

You can now provide textual description of how you would like your home to be. Enter 'None' if you do not have a preference
 Near the ocean


## Searching Based on Preferences

In [129]:
# Open the listings table by name from the configured LanceDB location.
# Note: this rebinds `table`, which the table-creation cell also assigns.
db = lancedb.connect(DB_CONNECTION)
table = db.open_table(DB_TABLE_NAME)

In [130]:
def build_filter(user_preference):
    """Translate the numeric preferences into a SQL-style filter expression.

    Columns where more is better (bedrooms, bathrooms, square_feet,
    school_rating) become lower bounds (`>=`); cost columns (list_price,
    monthly_hoa) become upper bounds (`<=`). Any other key is ignored.

    Args:
        user_preference: Mapping of preference name to value.

    Returns:
        str: The clauses joined with ' and ', in the mapping's iteration
        order, or an empty string when no numeric preference is present.
    """
    comparison_by_column = {
        "bedrooms": ">=",
        "bathrooms": ">=",
        "square_feet": ">=",
        "school_rating": ">=",
        "list_price": "<=",
        "monthly_hoa": "<=",
    }

    clauses = [
        f"{column} {comparison_by_column[column]} {value}"
        for column, value in user_preference.items()
        if column in comparison_by_column
    ]

    return " and ".join(clauses).strip()

In [131]:
filter_expr = build_filter(user_preference)

# The free-text description is the search query. It is absent when the user
# answered 'None', so fail with an explicit message instead of a bare KeyError.
desc_preference = user_preference.get("description")
if not desc_preference:
    raise ValueError(
        "A textual description is required to search the listings; "
        "re-run the preference cell and provide one."
    )

# NOTE(review): the result carries a `score` column, which suggests this runs
# as a full-text search over `description` rather than a vector search over
# `description_vector` — confirm that is the intended search mode.
search_query = table.search(desc_preference, vector_column_name="description")

# Apply the numeric constraints only when there are any: build_filter returns
# an empty string when the user gave no numeric preference.
if filter_expr:
    search_query = search_query.where(filter_expr, prefilter=True)

filtered_df = search_query.limit(5).to_pandas()

In [133]:
# Display the matching listings (at most 5, per the search limit).
filtered_df

Unnamed: 0,location,list_price,bedrooms,bathrooms,square_feet,monthly_hoa,school_rating,description,description_vector,score
0,"Williamsburg, Brooklyn",2499000.0,3.0,2.0,1500.0,800.0,4.2,Experience the vibrant energy of Williamsburg ...,"[-0.021497991, 0.030489406, -0.01510903, -0.00...",0.047892
1,Greenwich Village,4299000.0,2.0,2.0,1200.0,1200.0,4.6,Live in the heart of Greenwich Village in this...,"[-0.029927915, 0.023214633, -0.0181028, 0.0099...",0.045998
2,Harlem,1499000.0,2.0,1.0,1000.0,500.0,3.9,Experience the rich history and culture of Har...,"[1.15922e-05, 0.02230404, -0.010679532, -0.009...",0.045597
3,"Astoria, Queens",1799000.0,2.0,2.0,1200.0,600.0,4.1,Discover the charm of Astoria in this beautifu...,"[-0.01804709, 0.030670973, -0.009497103, -0.00...",0.04282


## Personalize Listings To User Preference

In [141]:
def generate_output(query, df):
    """Ask the chat model to answer `query` using only the listings in `df`.

    Every row of `df` is rendered as a short paragraph (location, price,
    bedrooms, bathrooms, square feet, school rating, description); the
    paragraphs are concatenated into the CONTEXT part of the prompt.

    Args:
        query: The user's question.
        df: DataFrame of listings with the columns location, list_price,
            bedrooms, bathrooms, square_feet, school_rating and description.

    Returns:
        str: The model's first reply, stripped of surrounding whitespace.
    """
    system_instructions = "You are an expert real estate agent. Write a personalised real estate listing description that matches the user's preference. Use only the context provided.\nDo not make up an answer if you do not know it, stay within the bounds of the context provided, if you don't know the answer, say that you don't have enough information on the topic!"

    # One paragraph per listing, each terminated by a blank line.
    listing_paragraphs = [
        f'Located in {listing["location"]} with a list price of {int(listing["list_price"])} that has {int(listing["bedrooms"])} bedrooms, {int(listing["bathrooms"])} bathrooms {str(int(listing["square_feet"]))} square feet, and a school rating of {str(listing["school_rating"])}. {listing["description"]} \n\n'
        for _, listing in df.iterrows()
    ]
    context = "".join(listing_paragraphs)

    chat_messages = [
        {"role": "system", "content": system_instructions},
        {"role": "user", "content": f"CONTEXT: {context}\nQUERY: {query}"},
        {"role": "user", "content": "ANSWER:"},
    ]
    completion = openai.chat.completions.create(
        model=MODEL_NAME,
        messages=chat_messages,
    )

    return completion.choices[0].message.content.strip()

In [142]:
# Ask for an overall recommendation among the retrieved listings.
query = "Based on my preferences, can you recommend the most suitable listing for me?"

generate_output(query, filtered_df)

'Based on your preferences for a vibrant energy, stunning views, and a high school rating, I would highly recommend the listing in Williamsburg, Brooklyn. This 3-bedroom, 2-bathroom condo offers breathtaking views of the Manhattan skyline from the private balcony and includes access to a rooftop pool and fitness center, perfect for enjoying the vibrant energy of Williamsburg. With a school rating of 4.2, this property combines modern amenities with a great location.'

In [143]:
# Ask for a family-oriented recommendation, with the reasoning spelled out.
query = "If I'm planning to have kids in the future, which home would you recommend? Explain your reasoning to me too."

generate_output(query, filtered_df)

'Based on your future plans of having kids, I would recommend the condo in Williamsburg, Brooklyn with a list price of $2,499,000. This property offers 3 bedrooms, which provides more space for a growing family compared to the other options. With 2 bathrooms and 1500 square feet, there is ample room for you and your future children. Additionally, the school rating of 4.2 is quite good and indicates that there are quality educational options nearby. The building amenities, such as the rooftop pool and fitness center, would be great for both you and your family to enjoy. Furthermore, the vibrant energy of Williamsburg offers a variety of activities and entertainment options suitable for families.'

In [144]:
# Ask for the most budget-friendly choice among the retrieved listings.
query = "If I am looking to optimize my budget, which one do you think is a good choice for me?"

generate_output(query, filtered_df)

'Based on your budget optimization goal, the property in Harlem priced at 1499000 would be a great choice for you. This charming 2-bedroom, 1-bathroom brownstone offers a cozy and inviting space that captures the rich history and culture of Harlem. With features like exposed brick, hardwood floors, and a backyard for entertaining, this property provides a comfortable living experience at a more affordable price point compared to the other listings. Additionally, the proximity to public transportation and the best soul food in the city adds to the value and desirability of this home.'