This is a starter notebook for the project. You'll have to import the libraries you need; the ones available in this workspace are listed in the `requirements.txt` file.

In [5]:
import io
import json
import math
import os

import chromadb
import pandas as pd
from langchain.llms import OpenAI

ModuleNotFoundError: No module named 'chromadb'

## User Defined Parameters

In [None]:
# File the listings text is read from when it already exists on disk.
DATA_FILE = "listings.json"

# Value passed to lancedb.connect() and the name of the table that is
# opened/created to hold the listings.
DB_CONNECTION = "real-estate-embeddings-db"
DB_TABLE_NAME = "listings"

### Load OpenAI models

In [None]:
# Create the langchain-backed OpenAI clients: a completion model, a chat model
# and an embeddings model. Each one receives the key stored in the
# OPENAI_API_KEY environment variable.
instruct_llm = OpenAI(
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    model="gpt-3.5-turbo-instruct",
    max_tokens=3500,
    temperature=0,
)

chat_llm = ChatOpenAI(
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    model="gpt-3.5-turbo",
    max_tokens=3500,
    temperature=0,
)

embeddings_model = OpenAIEmbeddings(
    openai_api_key=os.getenv("OPENAI_API_KEY"),
)

## Generate Real Estate Listings Using OpenAI Model

In [None]:
# System message: the persona the chat model is asked to adopt.
system_prompt = """
You are an expert real estate agent in New York City in the USA.
"""

# Human message: asks for at least 15 listings spread over the 5 NYC boroughs,
# returned as a JSON array whose items follow the example dictionary below
# (keys: location, list_price, bedrooms, bathrooms, square_feet, monthly_hoa,
# school_rating, description).
human_prompt= """

With your imagination, generate at least 15 real estate listings. 
Make sure that you are including real listings as well from zillow.com.

Distribute listings across the 5 NYC boroughs.

The listings must be in the JSON array of dictionaries with each item in the format as shown below:

{
  "location": "Upper West Side",
  "list_price": 5899000,
  "bedrooms": 4,
  "bathrooms": 3,
  "square_feet": 2000,
  "monthly_hoa": 1000,
  "school_rating": 4.5,
  "description": "A close-knit, environmentally-conscious community with access to organic grocery stores, community gardens, and bike paths. Take a stroll through the nearby Green Oaks Park or grab a cup of coffee at the cozy Green Bean Cafe. With easy access to public transportation and bike lanes, commuting is a breeze."
}
"""

In [None]:
# Reuse previously generated listings when DATA_FILE exists; otherwise ask the
# chat model for a fresh batch and cache it so later runs skip the API call.
if os.path.exists(DATA_FILE):
    # The context manager closes the file; no explicit close() is needed.
    with open(DATA_FILE, "r", encoding="utf-8") as f:
        generated_real_estate_listings = f.read()
else:
    chat = ChatOpenAI(temperature=1)

    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=human_prompt),
    ]

    generated_message = chat.invoke(messages)
    # The reply text is available directly on the returned message.
    generated_real_estate_listings = generated_message.content

    # Cache only output that parses as JSON, so a malformed reply is not
    # re-read on every subsequent run.
    try:
        json.loads(generated_real_estate_listings)
    except json.JSONDecodeError:
        print(f"Model output is not valid JSON; not caching it to {DATA_FILE}.")
    else:
        with open(DATA_FILE, "w", encoding="utf-8") as f:
            f.write(generated_real_estate_listings)

In [None]:
# Show the raw listings text (read from DATA_FILE or returned by the chat model).
generated_real_estate_listings

## Storing Listings in a Vector Database (LanceDB)

### Setup LanceDB

In [None]:
# Connect to the LanceDB database identified by DB_CONNECTION.
lance_db = lancedb.connect(DB_CONNECTION)
# Look up the "openai" entry in the registry and create an instance of it; the
# instance supplies the SourceField/VectorField/ndims used by the table schema.
# NOTE(review): presumably this is LanceDB's embedding-function registry and
# the instance reads OPENAI_API_KEY from the environment — confirm.
embeddings = get_registry().get("openai").create()

In [None]:
class REListings(LanceModel):
    """Schema of one real estate listing row stored in the LanceDB table.

    The field names mirror the keys of the generated listing dictionaries, so
    rows can be inserted and filtered by the same names. A table that was
    already created with a different column set must be recreated to pick up
    this schema.
    """

    location: str
    list_price: float
    bedrooms: float
    bathrooms: float
    # Named ``square_feet`` (not ``size``) to match the listing keys, the rows
    # inserted into the table and the columns used by the search filter.
    square_feet: float
    monthly_hoa: float
    school_rating: float
    # Source text for the embedding stored in ``description_vector``.
    description: str = embeddings.SourceField()
    description_vector: Vector(embeddings.ndims()) = embeddings.VectorField()

In [None]:
# Parse the listings JSON text into a DataFrame. The text is wrapped in
# StringIO because passing a literal JSON string to read_json is deprecated.
listing_df = pd.read_json(io.StringIO(generated_real_estate_listings))

In [None]:
# Create and populate the listings table on first use; on later runs simply
# open the table that is already there.
if DB_TABLE_NAME not in lance_db.table_names():
    table = lance_db.create_table(DB_TABLE_NAME, schema=REListings)

    # One dictionary per listing, restricted to the columns that are stored.
    data = listing_df[
        [
            "location",
            "list_price",
            "bedrooms",
            "bathrooms",
            "square_feet",
            "monthly_hoa",
            "school_rating",
            "description",
        ]
    ].to_dict(orient="records")

    table.add(data)
else:
    table = lance_db.open_table(DB_TABLE_NAME)

## Building the User Preference Interface

In [None]:
def is_float(string):
    """Report whether ``string`` can be converted with ``float()``.

    Args:
        string: Value to check, normally the raw text typed by the user.

    Returns:
        bool: True when ``float(string)`` succeeds, False otherwise. Inputs
        that ``float()`` rejects with a ``TypeError`` (e.g. ``None``) or an
        ``OverflowError`` (an oversized integer) also yield False instead of
        raising.
    """
    try:
        float(string)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

def collect_user_preference():
    """Interactively collect the buyer's preferences from standard input.

    One question is asked per listing attribute. Answering ``None`` (any
    casing, surrounding whitespace ignored) skips the question, both on the
    first prompt and on any retry prompt. Numeric questions are asked again
    until the answer parses as a finite number; ``location`` and
    ``description`` are asked again until the answer is non-blank.

    Returns:
        dict: Answered question keys mapped to a ``float`` (numeric questions)
        or to the stripped answer text (``location`` / ``description``).
        Skipped questions are omitted.
    """
    questions = {
        "location": "Where would you prefer to live in?",
        "list_price": "What is your budget?", 
        "bedrooms": "What is the ideal number of bedrooms for your dream home?",
        "bathrooms": "How many bathrooms do you need in the home?",
        "square_feet": "How big would you like your dream home to be (in square feet)?",
        "monthly_hoa": "How much are you comfortable with for the monthly HOA fees?",
        "school_rating": "On a scale of 1 to 5, what is your expectation of school quality in your area?",
        "description": "You can now provide textual description of how you would like your home to be."
    }

    # Keys answered with free text; every other key expects a number.
    desc_key_list = ["location", "description"]
    choice_for_none = "Enter 'None' if you do not have a preference"

    user_preference = {}

    for key, question in questions.items():
        is_text_answer = key in desc_key_list
        response = input(question + f" {choice_for_none}").strip()

        while True:
            # Checked on every pass so the user can still skip the question
            # after an invalid first answer.
            if response.lower() == "none":
                break

            if is_text_answer:
                if response:
                    user_preference[key] = response
                    break
                retry_prompt = f"You have not entered a {key} for your home. Please try again.\n\n{question}"
            else:
                # float() accepts "nan" and "inf"; those cannot be used as
                # thresholds in a filter, so they are treated as invalid.
                if is_float(response) and math.isfinite(float(response)):
                    user_preference[key] = float(response)
                    break
                retry_prompt = f"You have entered a non-numeric answer to the ({key}) question. Please try again.\n\n{question}"

            response = input(retry_prompt).strip()

    return user_preference

In [None]:
# Prompt the user for each preference via input() and keep the answers for the search below.
user_preference = collect_user_preference()

## Searching Based on Preferences

In [None]:
# Connect to the database at DB_CONNECTION and bind `table` to the
# DB_TABLE_NAME table that the search below queries.
db = lancedb.connect(DB_CONNECTION)
table = db.open_table(DB_TABLE_NAME)

In [None]:
def build_filter(user_preference):
    """Build the numeric filter expression for a listing search.

    Args:
        user_preference (dict): Preference keys mapped to the user's answers.
            Keys that are not numeric listing columns (such as ``location``
            and ``description``) are ignored.

    Returns:
        str: Conditions joined with ``" and "``, in the iteration order of
        ``user_preference``; an empty string when no numeric preference is
        present. ``bedrooms``, ``bathrooms``, ``square_feet`` and
        ``school_rating`` are compared with ``>=``; ``list_price`` and
        ``monthly_hoa`` with ``<=``.
    """
    # Comparison applied to each numeric column.
    operator_by_column = {
        "bedrooms": ">=",
        "bathrooms": ">=",
        "square_feet": ">=",
        "school_rating": ">=",
        "list_price": "<=",
        "monthly_hoa": "<=",
    }

    conditions = [
        f"{k} {operator_by_column[k]} {v}"
        for k, v in user_preference.items()
        if k in operator_by_column
    ]

    return " and ".join(conditions)

In [None]:
# Numeric preferences become the filter expression; the free-text description
# is the query text searched against the `description_vector` column.
filter_expr = build_filter(user_preference)
desc_preference = user_preference.get("description")
if not desc_preference:
    # The description question can be skipped, but the search needs a query text.
    raise ValueError(
        "A textual description is required to search the listings; "
        "re-run collect_user_preference() and answer the description question."
    )

search_query = table.search(desc_preference,
                            vector_column_name="description_vector")

# Only attach a filter when the user gave at least one numeric preference.
if filter_expr:
    search_query = search_query.where(filter_expr, prefilter=True)

filtered_df = search_query.limit(5).to_pandas()


## Personalize Listings To User Preference

In [None]:
def _quantity_in_words(engine, value):
    """Spell out a count with ``engine``, keeping fractional values such as 2.5."""
    number = float(value)
    # Whole numbers are passed as plain ``int``; only fractional values are
    # passed as ``float`` so they are not truncated.
    return engine.number_to_words(int(number) if number.is_integer() else number)


def generate_output(query, df, model="gpt-3.5-turbo"):
    """Answer ``query`` with a chat model, using the listings in ``df`` as context.

    Args:
        query (str): The user's question.
        df: DataFrame of listings; each row must provide ``location``,
            ``list_price``, ``bedrooms``, ``bathrooms``, ``square_feet``,
            ``school_rating`` and ``description``.
        model (str): Chat model name passed to the OpenAI client.

    Returns:
        str: The model's reply with surrounding whitespace removed, or an
        empty string when the reply has no content.
    """
    p = inflect.engine()

    # One paragraph per listing; together they form the context for the model.
    listing_summaries = []
    for _, row in df.iterrows():
        listing_summaries.append(
            f'Located in {row["location"]} with a list price of '
            f'{p.number_to_words(int(row["list_price"]))} that has '
            f'{_quantity_in_words(p, row["bedrooms"])} bedrooms, '
            f'{_quantity_in_words(p, row["bathrooms"])} bathrooms, '
            f'{int(row["square_feet"])} square feet, and a school rating of '
            f'{row["school_rating"]}. {row["description"]} \n\n'
        )
    context = "".join(listing_summaries)

    response = openai.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are an expert real estate agent that answers user's questions based on the context provided.\nDo not make up an answer if you do not know it, stay within the bounds of the context provided, if you don't know the answer, say that you don't have enough information on the topic!",
            },
            {"role": "user", "content": f"CONTEXT: {context}\nQUERY: {query}"},
            {"role": "user", "content": "ANSWER:"},
        ],
    )

    # Guard against a reply without text content before stripping it.
    answer = response.choices[0].message.content
    return (answer or "").strip()

In [None]:
# Question sent to the model together with the context built from the matched listings.
query = "Based on my preferences, can you recommend the most suitable listing for me?"

# Last expression of the cell, so the returned answer text is displayed.
generate_output(query, filtered_df)