# Step 1: Setting Up the Python Environment

# Install Depenencies

In [1]:
!pip install pandas
!pip install chromadb
!pip install langchain
!pip install numpy
!pip install -U langchain-openai
!pip install pydantic
!pip install shutil
# !pip install openai==0.28
! pip install langchain_community
!pip install openai==1.55.3 httpx==0.27.2 --force-reinstall --quiet

Collecting chromadb
  Downloading chromadb-0.5.23-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.33.0-py3-none-any.whl.metadata (6.6 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.7.4-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.29.0-py3

# Import the required dependencies

In [2]:
# import os
import pandas as pd
import shutil
from dataclasses import dataclass

from typing import List
from langchain.output_parsers import PydanticOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field, NonNegativeInt
from langchain.prompts import PromptTemplate
from fastapi.encoders import jsonable_encoder

# from langchain.llms.openai import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.evaluation import load_evaluator
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.vectorstores.chroma import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


# Step 2: Generation of the Real Estate Listings

## Define OpenAI model, API Key and Base URL

In [3]:
# Environment variables
OPENAI_API_KEY = 'voc-9001790381266773650678673243c3eb54d3.20882087'
MODEL_NAME = 'gpt-3.5-turbo'
OPEN_AI_BASE_URL = 'https://openai.vocareum.com/v1'

## Load LLM

In [10]:
# load the model
from openai import OpenAI
openAILLM = ChatOpenAI(model=MODEL_NAME, api_key=OPENAI_API_KEY, base_url= 'https://openai.vocareum.com/v1')


INSTRUCTION = "Generate a CSV file with at least 10 real estate listings. Use pipe '|' as delimiter in CSV."
SAMPLE_LISTING = """
Here's a sample listing:

Neighborhood: Green Oaks
Price: $800,000
Bedrooms: 3
Bathrooms: 2
House Size: 2,000 sqft
Description: Welcome to this eco-friendly oasis nestled in the heart of Green Oaks. This charming 3-bedroom, 2-bathroom home boasts energy-efficient features such as solar panels and a well-insulated structure. Natural light floods the living spaces, highlighting the beautiful hardwood floors and eco-conscious finishes. The open-concept kitchen and dining area lead to a spacious backyard with a vegetable garden, perfect for the eco-conscious family. Embrace sustainable living without compromising on style in this Green Oaks gem.
Neighborhood Description: Green Oaks is a close-knit, environmentally-conscious community with access to organic grocery stores, community gardens, and bike paths. Take a stroll through the nearby Green Oaks Park or grab a cup of coffee at the cozy Green Bean Cafe. With easy access to public transportation and bike lanes, commuting is a breeze.

Generate atleast 10 listings.
"""

In [5]:
class RealEstateListing(BaseModel):
    """
    A real estate listing.

    Attributes:
    - neighborhood: str
    - price: NonNegativeInt
    - bedrooms: NonNegativeInt
    - bathrooms: NonNegativeInt
    - house_size: NonNegativeInt
    - description: str
    - neighborhood_description: str
    """
    neighborhood: str = Field(description="The neighborhood where the property is located")
    price: NonNegativeInt = Field(description="The price of the property in USD")
    bedrooms: NonNegativeInt = Field(description="The number of bedrooms in the property")
    bathrooms: NonNegativeInt = Field(description="The number of bathrooms in the property")
    house_size: NonNegativeInt = Field(description="The size of the house in square feet")
    description: str = Field(description="A description of the property")
    neighborhood_description: str = Field(description="A description of the neighborhood.")

class ListingCollection(BaseModel):
    """
    A collection of real estate listings.

    Attributes:
    - listings: List[RealEstateListing]
    """
    listings: List[RealEstateListing] = Field(description="A list of real estate listings")

In [6]:
# For generating parsed output
output_parser = PydanticOutputParser(pydantic_object=ListingCollection)

In [11]:
# Generating the prompt and query
prompt = PromptTemplate(
    template="{instruction}\n{sample}\n{format_instructions}\n",
    input_variables=["instruction", "sample"],
    partial_variables={"format_instructions": lambda: output_parser.get_format_instructions},
)

query = prompt.format(
    instruction=INSTRUCTION,
    sample=SAMPLE_LISTING,
)

query

"Generate a CSV file with at least 10 real estate listings. Use pipe '|' as delimiter in CSV.\n\nHere's a sample listing:\n\nNeighborhood: Green Oaks\nPrice: $800,000\nBedrooms: 3\nBathrooms: 2\nHouse Size: 2,000 sqft\nDescription: Welcome to this eco-friendly oasis nestled in the heart of Green Oaks. This charming 3-bedroom, 2-bathroom home boasts energy-efficient features such as solar panels and a well-insulated structure. Natural light floods the living spaces, highlighting the beautiful hardwood floors and eco-conscious finishes. The open-concept kitchen and dining area lead to a spacious backyard with a vegetable garden, perfect for the eco-conscious family. Embrace sustainable living without compromising on style in this Green Oaks gem.\nNeighborhood Description: Green Oaks is a close-knit, environmentally-conscious community with access to organic grocery stores, community gardens, and bike paths. Take a stroll through the nearby Green Oaks Park or grab a cup of coffee at the c

In [12]:
# Get the response from LLM
response = openAILLM.invoke(query)

In [13]:
response.content

"Neighborhood|Price|Bedrooms|Bathrooms|House Size|Description|Neighborhood Description\nGreen Oaks|$800,000|3|2|2,000 sqft|Welcome to this eco-friendly oasis nestled in the heart of Green Oaks. This charming 3-bedroom, 2-bathroom home boasts energy-efficient features such as solar panels and a well-insulated structure. Natural light floods the living spaces, highlighting the beautiful hardwood floors and eco-conscious finishes. The open-concept kitchen and dining area lead to a spacious backyard with a vegetable garden, perfect for the eco-conscious family. Embrace sustainable living without compromising on style in this Green Oaks gem.|Green Oaks is a close-knit, environmentally-conscious community with access to organic grocery stores, community gardens, and bike paths. Take a stroll through the nearby Green Oaks Park or grab a cup of coffee at the cozy Green Bean Cafe. With easy access to public transportation and bike lanes, commuting is a breeze.\nLakeview|$1,200,000|4|3|2,500 sqf

In [14]:
# Load the response into a Pandas DataFrame
from io import StringIO
df = pd.read_csv(StringIO(response.content), delimiter="|")
df.head()

Unnamed: 0,Neighborhood,Price,Bedrooms,Bathrooms,House Size,Description,Neighborhood Description
0,Green Oaks,"$800,000",3,2.0,"2,000 sqft",Welcome to this eco-friendly oasis nestled in ...,"Green Oaks is a close-knit, environmentally-co..."
1,Lakeview,"$1,200,000",4,3.0,"2,500 sqft",Luxurious lakefront living awaits in this stun...,Lakeview is a vibrant neighborhood with a bust...
2,Downtown,"$900,000",2,2.0,"1,800 sqft",Experience urban living at its finest in this ...,Downtown is a bustling neighborhood with a vib...
3,Old Town,"$950,000",3,2.5,"2,300 sqft","Step back in time in this historic 3-bedroom, ...",Old Town is a picturesque neighborhood with tr...
4,Lincoln Park,"$1,500,000",5,4.0,"3,000 sqft","Live in luxury in this spacious 5-bedroom, 4-b...",Lincoln Park is a vibrant neighborhood with a ...


In [15]:
# Save the dataframe to a csv file
df.to_csv('ai_generated_real_estate_listings.csv', index_label = 'id')

# Step 3: Storing Listings in a Vector Database

* Vector Database Setup: Initialize and configure ChromaDB or a similar vector database to store real estate listings.

* Generating and Storing Embeddings: Convert the LLM-generated listings into suitable embeddings that capture the semantic content of each listing, and store these embeddings in the vector database.

In [16]:
import os

# Define paths for Chroma database and CSV file containing real estate listings
CHROMA_DB_PATH = "chroma"  # Path where Chroma will store the processed data
CSV_PATH = "ai_generated_real_estate_listings.csv"  # CSV file containing real estate listings with descriptions

# Initialize OpenAI Embeddings with API details
embedding_function = OpenAIEmbeddings(
    openai_api_base=OPEN_AI_BASE_URL,  # Base URL for OpenAI API (use custom endpoint if needed)
    api_key=OPENAI_API_KEY  # API key to authenticate with OpenAI
)

# Read CSV file into a pandas DataFrame
df = pd.read_csv(CSV_PATH)

# Create a list to store Document objects (each representing one real estate listing)
documents = []
for index, row in df.iterrows():
    # Create a Document for each row's description and assign a unique ID (using the row index)
    documents.append(Document(page_content=row['Description'], metadata={'id': str(index)}))

# Initialize a text splitter to divide documents into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,         # Maximum number of characters in each chunk
    chunk_overlap=100,      # Number of overlapping characters between consecutive chunks
    length_function=len,    # Function to calculate the length of a chunk (using the standard length function)
    add_start_index=True    # Include start index of the chunk within the document metadata
)

# Split the list of Document objects into smaller chunks of text
chunks = text_splitter.split_documents(documents)

# Output the result of the splitting process
print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

# Display the content and metadata of the second chunk (for demonstration purposes)
if chunks:
    document = chunks[10]  # Retrieve the second chunk
    print(document.page_content)  # Print the text content of the chunk
    print(document.metadata)  # Print the metadata (e.g., document ID, start index, etc.)

# If a Chroma database already exists at the specified path, remove it to start fresh
if os.path.exists(CHROMA_DB_PATH):
    shutil.rmtree(CHROMA_DB_PATH)  # Remove the existing database directory and its contents

# Create and save a Chroma database from the text chunks
chroma_db = Chroma.from_documents(
    chunks,  # The list of text chunks to be stored in the Chroma database
    OpenAIEmbeddings(openai_api_base=OPEN_AI_BASE_URL, api_key=OPENAI_API_KEY),  # Use OpenAI Embeddings for document storage
    persist_directory=CHROMA_DB_PATH  # Directory where the Chroma database will be saved
)

# Persist (save) the Chroma database to disk
chroma_db.persist()

# Output the number of chunks saved and the location of the database
print(f"Saved {len(chunks)} chunks to {CHROMA_DB_PATH}.")


  embedding_function = OpenAIEmbeddings(


Split 10 documents into 21 chunks.
cabinets, granite countertops, and a large island. Retreat to the master suite with a spa-like bathroom and walk-in closet. The backyard oasis offers a patio and lush landscaping, perfect for outdoor entertaining.
{'id': '4', 'start_index': 206}
Saved 21 chunks to chroma.


  chroma_db.persist()


# Step 4: Building the User Preference Interface
 * Gather buyer preferences, including the number of bedrooms, bathrooms, desired location, and other specific requirements, either through a set of predefined questions or by allowing the buyer to input their preferences in natural language.

* Buyer Preference Parsing: Implement logic to interpret and structure these preferences for querying the vector database.

In [20]:
user_query_text = "A spacious two-bedroom house with a large kitchen and a nice living room."

In [19]:
BASIC_PROMPT_TEMPLATE =\
"""
Based on the following context:

{context}

---

Answer the question : {question}
"""

# Step 5: Searching Based on Preferences
*  Semantic Search Implementation: Utilize the structured buyer preferences to conduct a semantic search on the vector database, retrieving listings that best align with the user's requirements.
* Listing Retrieval Logic: Refine the retrieval algorithm to ensure that the most relevant listings are chosen based on their semantic similarity to the buyer's preferences.

In [21]:
def get_response_from_llm_chroma(query_text, PROMPT_TEMPLATE):
    # Initialize the embedding function with OpenAI API credentials
    embedding_function = OpenAIEmbeddings(
        openai_api_base=OPEN_AI_BASE_URL,  # URL of the OpenAI API
        api_key=OPENAI_API_KEY            # API key for OpenAI access
    )

    # Create an instance of Chroma database, specifying the persist directory and embedding function
    db = Chroma(
        persist_directory=CHROMA_DB_PATH,  # Path where the Chroma database is stored
        embedding_function=embedding_function  # Embedding function for search operations
    )

    # Perform a similarity search on the Chroma database to find the top 3 closest matches to the query
    results = db.similarity_search_with_relevance_scores(query_text, k=3)

    # Check if no results were found or if the relevance score of the best result is too low
    if len(results) == 0 or results[0][1] < 0.7:
        # If no relevant results are found, print a message indicating the failure
        print(f"Unable to find matching results.")
    else:
        # Extract the content from the most relevant documents (top 3 results)
        context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])

        # Create a prompt template based on the provided template string
        prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

        # Format the prompt with the retrieved context and the original query
        prompt = prompt_template.format(context=context_text, question=query_text)

        # Print the generated prompt for debugging purposes
        print(f"Generated Prompt:\n{prompt}")

        # Initialize the model (ChatOpenAI) using the specified model name and OpenAI credentials
        model = ChatOpenAI(
            model=MODEL_NAME,  # Name of the model to use (e.g., GPT-4)
            api_key=OPENAI_API_KEY,  # API key for OpenAI access
            base_url=OPEN_AI_BASE_URL  # Base URL for the OpenAI API
        )

        # Get the model's response based on the formatted prompt
        response_text = model.predict(prompt)

        # Retrieve the source document IDs for the top results
        sources = [doc.metadata.get("id", None) for doc, _score in results]

        # Format the final response including the model's answer and the source document IDs
        formatted_response = f"Response: {response_text}\nSources: {sources}"

        # Print the final formatted response
        print(formatted_response)


In [22]:
get_response_from_llm_chroma(user_query_text, BASIC_PROMPT_TEMPLATE)

  db = Chroma(


Generated Prompt:
Human: 
Based on the following context:

Live in luxury in this spacious 5-bedroom, 4-bathroom home in Lincoln Park. The grand foyer welcomes you into the elegant living room with a fireplace and high ceilings. The gourmet kitchen features custom cabinets, granite countertops, and a large island. Retreat to the master suite with a spa-like

---

kitchen features custom cabinets, marble countertops, and a breakfast nook. The master suite offers a spa-like bathroom and walk-in closet. The backyard oasis includes a patio and landscaped garden, perfect for outdoor entertaining.

---

Experience urban living at its finest in this stylish 2-bedroom, 2-bathroom condo in Downtown. The open floor plan features floor-to-ceiling windows, providing panoramic views of the city skyline. The modern kitchen is equipped with stainless steel appliances and quartz countertops. The building

---

Answer the question : A spacious two-bedroom house with a large kitchen and a nice living ro

  response_text = model.predict(prompt)


Response: The stylish 2-bedroom, 2-bathroom condo in Downtown with an open floor plan, floor-to-ceiling windows, modern kitchen with stainless steel appliances and quartz countertops would be a perfect fit for someone looking for a spacious two-bedroom house with a large kitchen and a nice living room.
Sources: ['4', '7', '2']



# Step 6: Personalizing Listing Descriptions

* LLM Augmentation: For each retrieved listing, leverage the LLM to enhance the description, highlighting aspects of the property that align with the buyer’s preferences. This involves subtly emphasizing features that match the buyer's needs.
*Preserving Factual Integrity: Ensure that the enhancement process boosts the appeal of the listing while preserving the accuracy of the information.


In [23]:
ADVANCED_PROMPT_TEMPLATE =\
"""
Based on the following context:

{context}

---

Formulate a response that not only addresses the question {question} but also ensures your explanation is clear, engaging, and tailored to match the specified preferences. This includes subtly highlighting features of the real estate property that align with the buyer's desires.
"""

In [24]:
get_response_from_llm_chroma(user_query_text, ADVANCED_PROMPT_TEMPLATE)

Generated Prompt:
Human: 
Based on the following context:

Live in luxury in this spacious 5-bedroom, 4-bathroom home in Lincoln Park. The grand foyer welcomes you into the elegant living room with a fireplace and high ceilings. The gourmet kitchen features custom cabinets, granite countertops, and a large island. Retreat to the master suite with a spa-like

---

kitchen features custom cabinets, marble countertops, and a breakfast nook. The master suite offers a spa-like bathroom and walk-in closet. The backyard oasis includes a patio and landscaped garden, perfect for outdoor entertaining.

---

Experience urban living at its finest in this stylish 2-bedroom, 2-bathroom condo in Downtown. The open floor plan features floor-to-ceiling windows, providing panoramic views of the city skyline. The modern kitchen is equipped with stainless steel appliances and quartz countertops. The building

---

Formulate a response that not only addresses the question A spacious two-bedroom house with 