In [1]:
import os
import json
from typing import List, Any, Optional
from datetime import date
from pathlib import Path
from pydantic import BaseModel, Field
from dotenv import load_dotenv

from openai import OpenAI
from src.utility import show_section, serialize_dates


# Populate the process environment via dotenv before reading any settings
load_dotenv()

# OpenAI endpoint and credentials; each is None when the variable is not set
OPENAI_API_BASE = os.environ.get("OPENAI_API_BASE")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [None]:
class Property(BaseModel):
    """Schema of a single real-estate listing.

    Every field carries an ``example`` value that is emitted into the model's
    JSON schema (``Property.model_json_schema()``). The examples are attached
    through ``json_schema_extra`` because passing arbitrary extra keyword
    arguments (such as ``example=``) to ``Field`` is deprecated in pydantic v2.
    """

    id: int = Field(..., json_schema_extra={"example": 1})
    title: str = Field(..., json_schema_extra={"example": "Cozy 2-Bedroom Condo in Downtown"})
    description: str = Field(..., json_schema_extra={"example": "This modern 2-bedroom, 1.5-bathroom condo..."})
    neighborhood_description: str = Field(..., json_schema_extra={"example": "Located in the heart of downtown..."})
    city: str = Field(..., json_schema_extra={"example": "San Francisco"})
    state: str = Field(..., json_schema_extra={"example": "CA"})
    # The only optional field: defaults to None when not provided.
    neighborhood: Optional[str] = Field(None, json_schema_extra={"example": "Downtown"})
    property_type: str = Field(..., json_schema_extra={"example": "Condo"})
    year_built: int = Field(..., json_schema_extra={"example": 2010})
    price: float = Field(..., json_schema_extra={"example": 320000})
    bedrooms: int = Field(..., json_schema_extra={"example": 2})
    bathrooms: float = Field(..., json_schema_extra={"example": 1.5})
    area_sqft: int = Field(..., json_schema_extra={"example": 850})
    listed_date: date = Field(..., json_schema_extra={"example": "2025-04-01"})


# TODO:
# - convert "price" into int
# - combine "city" and "state" into "location"
# - new method handling formatting (using "serialize_dates()", ...)
# - put the new method in "add_synthetic_data()"


class SyntheticProperty:
    """Generate synthetic property listings with an LLM and append them to a JSON catalog.

    The catalog is a JSON file whose top-level value is a list of listing
    dictionaries. It is loaded (or created empty) when the object is built and
    kept in ``self.data``; ``add_synthetic_data`` appends to that list and can
    write it back to disk.
    """

    def __init__(
        self, 
        json_path: Path, 
        model: str = "gpt-4o-mini", 
        temperature: float = 0.7, 
        verbose: bool = False
    ):
        """
        Args:
            json_path: Location of the JSON catalog file (created if missing).
            model: Name of the chat model used for generation.
            temperature: Sampling temperature forwarded to the chat API.
            verbose: When True, each generated listing is displayed.
        """

        # Accept plain strings as well as Path objects.
        self.json_path = Path(json_path)

        self.client = OpenAI(base_url=OPENAI_API_BASE, api_key=OPENAI_API_KEY)
        self.model = model
        self.temperature = temperature
        self.verbose = verbose

        self.data = self.load_json_list()


    def load_json_list(self) -> List[Any]:
        """
        Load a JSON file that contains a list. If the file does not exist, it is
        created (together with any missing parent directories) holding an empty list.

        Returns:
            The list stored in the file, or ``[]`` for a newly created file.

        Raises:
            ValueError: If the existing JSON content is not a list
                (``json.JSONDecodeError`` if the file is not valid JSON at all).
        """

        file_path = self.json_path

        # Create the file if it doesn't exist
        if not file_path.exists():
            # The target directory may not exist yet on a fresh checkout.
            file_path.parent.mkdir(parents=True, exist_ok=True)
            file_path.write_text("[]", encoding="utf-8")
            return []

        # Load the existing JSON content
        with file_path.open("r", encoding="utf-8") as f:
            data = json.load(f)

        # Ensure the content is a list
        if not isinstance(data, list):
            raise ValueError(f"JSON content in {file_path} is not a list.")

        return data
    

    def write_json_list(self) -> None:
        """Write ``self.data`` back to the catalog file, overwriting its content."""

        # Same encoding as load_json_list so reads and writes stay symmetric.
        with open(self.json_path, "w", encoding="utf-8") as f:
            json.dump(self.data, f, indent=4)  # indent for readability


    def generate_property_sample(self, new_id: int) -> Optional["Property"]:
        """
        Ask the chat model for one synthetic listing.

        Args:
            new_id: ID the model is instructed to use for the listing.

        Returns:
            The validated ``Property``, or ``None`` when the model returned no
            content or content that is not a valid listing (details are printed).
        """

        # Get the schema example
        example_property = Property.model_json_schema()["properties"]

        # Prompt the LLM
        prompt = f"""
        Generate a new synthetic property listing in JSON format following this exact schema:
        {json.dumps(example_property, indent=4)}

        Make sure:
        - All keys are present.
        - Values are realistic but not copied.
        - Use ID {new_id}.
        - listed_date should be in YYYY-MM-DD format.
        """

        response = self.client.chat.completions.parse(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=self.temperature,
            response_format=Property,
        )

        message = response.choices[0].message

        # content can be None (e.g. when the model refuses), so guard before strip()
        output_text = (message.content or "").strip()
        if not output_text:
            print("Model returned no content:")
            print(getattr(message, "refusal", None))
            return None

        # Try parsing as JSON, then validating against the schema
        try:
            output_dict = json.loads(output_text)
            return Property(**output_dict)
        except json.JSONDecodeError:
            print("Model did not return valid JSON, printing raw response:")
            print(output_text)
        except (ValueError, TypeError) as exc:
            # pydantic's ValidationError derives from ValueError; TypeError covers
            # JSON that is valid but not an object (e.g. a list).
            print(f"Model output does not match the Property schema: {exc}")
            print(output_text)

        return None
    

    def add_synthetic_data(self, n_samples: int = 1, write_flag: bool = True) -> None:
        """
        Generate listings and append them to ``self.data``.

        IDs continue after the largest ID already in the catalog (starting at 1
        for an empty catalog). Failed generations are skipped.

        Args:
            n_samples: Number of generation attempts.
            write_flag: When True, the catalog file is rewritten afterwards.
        """

        ids = [item['id'] for item in self.data]
        # default=0 keeps a freshly created (empty) catalog from crashing max()
        next_id = max(ids, default=0) + 1

        for _ in range(n_samples):
            # Generate new property info
            property_obj = self.generate_property_sample(next_id)
            if property_obj is None:
                # Generation failed (already reported); the ID stays available.
                continue

            property_info = property_obj.model_dump()

            # Enforce the requested ID: the model is only asked to use it.
            property_info['id'] = next_id

            # Convert date object into ISO-formatted string
            property_info = serialize_dates(property_info)

            if self.verbose:
                show_section(f"Property ID {next_id}", property_info, use_display=True)

            # Add new property
            self.data.append(property_info)
            next_id += 1

        if write_flag:
            self.write_json_list()


        

In [3]:
# Path to the JSON catalog file, resolved relative to the current working directory
project_directory = Path.cwd()
catalog_json_path = project_directory / "data" / "real-estate-listings copy.json"

# Generation settings
n_samples = 3  # number of listings to request from the model
model_name = "gpt-4o-mini"  # alternative: "gpt-4.1"
write_flag = True  # when True, the catalog file is rewritten after generation

verbose = True  # display each generated listing


# Constructing the object loads the catalog (creating an empty one if the file is missing);
# add_synthetic_data() then calls the chat API once per requested sample.
synthetic_prop = SyntheticProperty(catalog_json_path, model=model_name, verbose=verbose)
synthetic_prop.add_synthetic_data(n_samples=n_samples, write_flag=write_flag)




{'id': 32,
 'title': 'Charming 3-Bedroom Family Home with Garden',
 'description': 'This beautifully maintained 3-bedroom, 2-bathroom home offers a spacious layout and a serene garden perfect for family gatherings.',
 'neighborhood_description': 'Nestled in a quiet suburban area, this home is just a short drive from local schools, parks, and shopping centers.',
 'city': 'Austin',
 'state': 'TX',
 'neighborhood': 'Southwest Austin',
 'property_type': 'Single Family Home',
 'year_built': 2015,
 'price': 425000.0,
 'bedrooms': 3,
 'bathrooms': 2.0,
 'area_sqft': 1800,
 'listed_date': '2025-03-15'}




{'id': 33,
 'title': 'Charming 3-Bedroom House with Garden',
 'description': 'This delightful 3-bedroom, 2-bathroom house features a spacious layout with modern finishes and a lovely garden for outdoor enjoyment.',
 'neighborhood_description': 'Nestled in a quiet suburban area, this home offers easy access to local parks, shopping centers, and top-rated schools.',
 'city': 'Austin',
 'state': 'TX',
 'neighborhood': 'Westlake',
 'property_type': 'House',
 'year_built': 2015,
 'price': 475000.0,
 'bedrooms': 3,
 'bathrooms': 2.0,
 'area_sqft': 1800,
 'listed_date': '2025-05-15'}




{'id': 34,
 'title': 'Spacious 3-Bedroom Family Home with Garden',
 'description': 'This beautiful 3-bedroom, 2-bathroom home features a spacious layout, perfect for families. The open-concept living area flows seamlessly into a modern kitchen equipped with stainless steel appliances.',
 'neighborhood_description': 'Nestled in a quiet residential area with tree-lined streets, this home is just a short walk from local parks and schools. Enjoy the community feel while being close to city amenities.',
 'city': 'Austin',
 'state': 'TX',
 'neighborhood': 'Northwest Hills',
 'property_type': 'Single Family Home',
 'year_built': 2015,
 'price': 450000.0,
 'bedrooms': 3,
 'bathrooms': 2.0,
 'area_sqft': 1800,
 'listed_date': '2025-04-01'}


