**Ted8080 House Prices & Images** Kaggle dataset

- Follow the link to the dataset: https://www.kaggle.com/datasets/ted8080/house-prices-and-images-socal

- Set up Kaggle API keys

- Click 'download' and download via kagglehub

In [2]:
import kagglehub
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import os

try:
    # Download dataset using kagglehub
    print("Downloading:")
    download_path = kagglehub.dataset_download("ted8080/house-prices-and-images-socal")
    print(f"Dataset downloaded to: {download_path}")

    # Find the CSV file in directory
    csv_file = None
    for root, dirs, files in os.walk(download_path):
        for file in files:
            if file.endswith('.csv'):
                csv_file = os.path.join(root, file)
                break
        if csv_file:
            break

    if not csv_file:
        raise FileNotFoundError("No CSV file found")

    print(f"Found CSV file at: {csv_file}")

    # Load data
    df = pd.read_csv(csv_file)

    # Data exploration
    print("\nFirst 5 rows:")
    print(df.head())
    print("\nDataset shape:", df.shape)
    print("\nColumn types:")
    print(df.dtypes)
    print("\nMissing values per column:")
    print(df.isnull().sum())

    # Preprocessing
    target = 'price'
    if target not in df.columns:
        raise ValueError(f"'{target}' column not found. Available columns: {list(df.columns)}")

    X = df.drop(columns=[target])
    y = df[target].values.reshape(-1, 1)

    # Identify feature types
    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object']).columns

    print(f"\nNumeric features ({len(numeric_features)}):", list(numeric_features))
    print(f"Categorical features ({len(categorical_features)}):", list(categorical_features))

    # Create preprocessing pipelines
    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Process data
    print("\nPreprocessing data...")
    X_processed = preprocessor.fit_transform(X)

    print("\nProcessed features shape:", X_processed.shape)
    print("Target shape:", y.shape)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_processed, y, test_size=0.2, random_state=42
    )

    print("\nTraining set:", X_train.shape, y_train.shape)
    print("Test set:", X_test.shape, y_test.shape)

except Exception as e:
    print(f"\nError: {str(e)}")
    print("Ensure you have a proper kagglehub setup")

Downloading:
Downloading from https://www.kaggle.com/api/v1/datasets/download/ted8080/house-prices-and-images-socal?dataset_version_number=1...


100%|██████████| 369M/369M [00:07<00:00, 52.8MB/s]

Extracting files...





Dataset downloaded to: /root/.cache/kagglehub/datasets/ted8080/house-prices-and-images-socal/versions/1
Found CSV file at: /root/.cache/kagglehub/datasets/ted8080/house-prices-and-images-socal/versions/1/socal2.csv

First 5 rows:
   image_id                 street             citi  n_citi  bed  bath  sqft  \
0         0  1317 Van Buren Avenue  Salton City, CA     317    3   2.0  1560   
1         1         124 C Street W      Brawley, CA      48    3   2.0   713   
2         2        2304 Clark Road     Imperial, CA     152    3   1.0   800   
3         3     755 Brawley Avenue      Brawley, CA      48    3   1.0  1082   
4         4  2207 R Carrillo Court     Calexico, CA      55    4   3.0  2547   

    price  
0  201900  
1  228500  
2  273950  
3  350000  
4  385100  

Dataset shape: (15474, 8)

Column types:
image_id      int64
street       object
citi         object
n_citi        int64
bed           int64
bath        float64
sqft          int64
price         int64
dtype: object



In [3]:
# Exterior housing images and pricing dataset containing 8 variables & 15000+ rows in SoCal
! kaggle datasets download robinreni/house-rooms-image-dataset

# Around 3000 collective images of Bathroom, Bedroom, Living Room, Dining, & Kitchen spaces (does not contain price variable)
! kaggle datasets download mikhailma/house-rooms-streets-image-dataset

!wget https://github.com/emanhamed/Houses-dataset/tree/master/Houses%20Dataset

Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 10, in <module>
    sys.exit(main())
             ^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/kaggle/cli.py", line 68, in main
    out = args.func(**command_args)
          ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/kaggle/api/kaggle_api_extended.py", line 1734, in dataset_download_cli
    with self.build_kaggle_client() as kaggle:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/kaggle/api/kaggle_api_extended.py", line 688, in build_kaggle_client
    username=self.config_values['username'],
             ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^
KeyError: 'username'
Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 10, in <module>
    sys.exit(main())
             ^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/kaggle/cli.py", line 68, in main
    out = args.func(**command_args)
          ^^^^^^^^^^^^^^^^^^^^^^^^^
  File 

**Robin Reni Rooms Images** Kaggle dataset (No price)

- Open the link and follow the steps from the previous code block: https://www.kaggle.com/datasets/robinreni/house-rooms-image-dataset

In [4]:
import os
import zipfile
import numpy as np
import cv2
from tensorflow.keras.preprocessing.image import img_to_array
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import kagglehub

# Constants
IMAGE_SIZE = (128, 128)
DATASET_NAME = "robinreni/house-rooms-image-dataset"
PREPROCESSED_FOLDER = "preprocessed_data"
NUM_SAMPLES = 5250  # Set to None

def setup_directories():
    """Create necessary directories if they don't exist"""
    os.makedirs(PREPROCESSED_FOLDER, exist_ok=True)

def load_images_from_folder(folder, num_samples=None):
    """Load and preprocess images from folder structure"""
    images = []
    labels = []
    class_names = sorted([d for d in os.listdir(folder) if os.path.isdir(os.path.join(folder, d))])

    if not class_names:
        raise ValueError(f"No class folders found in {folder}")

    print("Found classes:", class_names)

    for class_idx, class_name in enumerate(class_names):
        class_folder = os.path.join(folder, class_name)
        print(f"\nLoading images from: {class_folder}")

        image_files = [f for f in os.listdir(class_folder)
                      if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

        if not image_files:
            print(f"Warning: No images found in {class_folder}")
            continue

        for filename in tqdm(image_files[:num_samples] if num_samples else tqdm(image_files)):
            img_path = os.path.join(class_folder, filename)

            try:
                img = cv2.imread(img_path)
                if img is None:
                    print(f"Warning: Failed to read {img_path}")
                    continue

                # Preprocess image
                img = cv2.resize(img, IMAGE_SIZE)
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert to RGB
                img = img_to_array(img) / 255.0  # Normalize

                images.append(img)
                labels.append(class_idx)

            except Exception as e:
                print(f"Error processing {img_path}: {str(e)}")

            # Early stopping if we have enough samples
            if num_samples and len(images) >= num_samples:
                break

        if num_samples and len(images) >= num_samples:
            break

    return np.array(images), np.array(labels)

def main():
    setup_directories()

    try:
        # Download dataset using kagglehub
        print(f"Downloading dataset {DATASET_NAME}...")
        dataset_path = kagglehub.dataset_download(DATASET_NAME)
        print(f"Dataset downloaded to: {dataset_path}")

        # Find the main dataset folder (it might be nested)
        dataset_subfolder = None
        for root, dirs, files in os.walk(dataset_path):
            if "House_Room_Dataset" in dirs:
                dataset_subfolder = os.path.join(root, "House_Room_Dataset")
                break

        if not dataset_subfolder:
            raise FileNotFoundError("Could not find folder in downloaded files")

        print("\nDataset contents:", os.listdir(dataset_subfolder))

        # Load and preprocess images
        X_images, Y = load_images_from_folder(dataset_subfolder, NUM_SAMPLES)
        print(f"\nSuccessfully loaded {len(X_images)} images")
        print(f"Image shape: {X_images[0].shape}")
        print(f"Labels shape: {Y.shape}")

        # Split dataset (stratified to maintain class balance)
        X_train, X_test, Y_train, Y_test = train_test_split(
            X_images, Y, test_size=0.2, random_state=42, stratify=Y
        )

        # Save processed data
        np.save(os.path.join(PREPROCESSED_FOLDER, "X_train.npy"), X_train)
        np.save(os.path.join(PREPROCESSED_FOLDER, "X_test.npy"), X_test)
        np.save(os.path.join(PREPROCESSED_FOLDER, "Y_train.npy"), Y_train)
        np.save(os.path.join(PREPROCESSED_FOLDER, "Y_test.npy"), Y_test)

        print(f"\nPreprocessed data saved to '{PREPROCESSED_FOLDER}' folder")

    except Exception as e:
        print(f"\nError: {str(e)}")
        print("Ensure you have a proper kagglehub setup")

if __name__ == "__main__":
    main()

Downloading dataset robinreni/house-rooms-image-dataset...
Downloading from https://www.kaggle.com/api/v1/datasets/download/robinreni/house-rooms-image-dataset?dataset_version_number=1...


100%|██████████| 116M/116M [00:00<00:00, 151MB/s]

Extracting files...





Dataset downloaded to: /root/.cache/kagglehub/datasets/robinreni/house-rooms-image-dataset/versions/1

Dataset contents: ['Bathroom', 'Kitchen', 'Bedroom', 'Livingroom', 'Dinning']
Found classes: ['Bathroom', 'Bedroom', 'Dinning', 'Kitchen', 'Livingroom']

Loading images from: /root/.cache/kagglehub/datasets/robinreni/house-rooms-image-dataset/versions/1/House_Room_Dataset/Bathroom


100%|██████████| 606/606 [00:00<00:00, 1116.29it/s]



Loading images from: /root/.cache/kagglehub/datasets/robinreni/house-rooms-image-dataset/versions/1/House_Room_Dataset/Bedroom


100%|██████████| 1248/1248 [00:01<00:00, 1121.78it/s]



Loading images from: /root/.cache/kagglehub/datasets/robinreni/house-rooms-image-dataset/versions/1/House_Room_Dataset/Dinning


100%|██████████| 1158/1158 [00:01<00:00, 1024.59it/s]



Loading images from: /root/.cache/kagglehub/datasets/robinreni/house-rooms-image-dataset/versions/1/House_Room_Dataset/Kitchen


100%|██████████| 965/965 [00:00<00:00, 974.30it/s]



Loading images from: /root/.cache/kagglehub/datasets/robinreni/house-rooms-image-dataset/versions/1/House_Room_Dataset/Livingroom


100%|█████████▉| 1272/1273 [00:01<00:00, 735.85it/s]



Successfully loaded 5250 images
Image shape: (128, 128, 3)
Labels shape: (5250,)

Preprocessed data saved to 'preprocessed_data' folder


**Mikhail Ma House Rooms & Streets** Kaggle dataset (No Price)

- Open the link and follow the steps from the first code block: https://www.kaggle.com/datasets/mikhailma/house-rooms-streets-image-dataset

- Set a limit with the max_images_per_class configuration based on your RAM usage and storage (this dataset has several images)

In [5]:
import os
import numpy as np
import cv2
from tensorflow.keras.preprocessing.image import img_to_array
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import kagglehub

# Configuration
CONFIG = {
    "dataset_slug": "mikhailma/house-rooms-streets-image-dataset",
    "image_size": (128, 128),
    "preprocessed_folder": "preprocessed_data",
    "max_images_per_class": 2000, # Set a limit here
    "test_size": 0.2,
    "random_state": 42
}

def setup_environment():
    """Create necessary directories and verify dependencies"""
    os.makedirs(CONFIG["preprocessed_folder"], exist_ok=True)

    # Check if OpenCV is installed
    if cv2.__version__ is None:
        raise ImportError("Issue with OpenCV")

def load_and_preprocess_images(dataset_path):
    """Load images with preprocessing and class balancing"""
    # Find main folder
    dataset_subfolder = None
    for root, dirs, files in os.walk(dataset_path):
        if "kaggle_room_street_data" in dirs:
            dataset_subfolder = os.path.join(root, "kaggle_room_street_data")
            break

    if not dataset_subfolder:
        raise FileNotFoundError("Could not find 'kaggle_room_street_data' folder")

    print("\nDataset contents:", os.listdir(dataset_subfolder))

    class_names = sorted([d for d in os.listdir(dataset_subfolder)
                       if os.path.isdir(os.path.join(dataset_subfolder, d))])

    if not class_names:
        raise ValueError("No class folders found in dataset")

    print("Found classes:", class_names)

    images = []
    labels = []
    class_counts = {class_name: 0 for class_name in class_names}

    for class_idx, class_name in enumerate(class_names):
        class_folder = os.path.join(dataset_subfolder, class_name)
        image_files = [f for f in os.listdir(class_folder)
                      if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

        print(f"\nProcessing {len(image_files)} images from {class_name}")

        # Apply limit per class if specified
        if CONFIG["max_images_per_class"]:
            image_files = image_files[:CONFIG["max_images_per_class"]]
            print(f"Limiting to {CONFIG['max_images_per_class']} images per class")

        for filename in tqdm(image_files):
            img_path = os.path.join(class_folder, filename)

            try:
                img = cv2.imread(img_path)
                if img is None:
                    print(f"Warning: Failed to load {img_path}")
                    continue

                # Preprocessing pipeline
                img = cv2.resize(img, CONFIG["image_size"])
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert to RGB
                img = img_to_array(img) / 255.0  # Normalize

                images.append(img)
                labels.append(class_idx)
                class_counts[class_name] += 1

            except Exception as e:
                print(f"Error processing {img_path}: {str(e)}")

    print("\nFinal class distribution:")
    for class_name, count in class_counts.items():
        print(f"{class_name}: {count} images")

    return np.array(images), np.array(labels)

def save_data(X_train, X_test, y_train, y_test):
    """Save processed data with validation"""
    try:
        np.save(os.path.join(CONFIG["preprocessed_folder"], "X_train.npy"), X_train)
        np.save(os.path.join(CONFIG["preprocessed_folder"], "X_test.npy"), X_test)
        np.save(os.path.join(CONFIG["preprocessed_folder"], "y_train.npy"), y_train)
        np.save(os.path.join(CONFIG["preprocessed_folder"], "y_test.npy"), y_test)

        # Verify saved files
        for f in ["X_train.npy", "X_test.npy", "y_train.npy", "y_test.npy"]:
            if not os.path.exists(os.path.join(CONFIG["preprocessed_folder"], f)):
                raise FileNotFoundError(f"Failed to save {f}")

        print("\nData successfully saved to:", CONFIG["preprocessed_folder"])
    except Exception as e:
        print(f"\nError saving data: {str(e)}")

def main():
    setup_environment()

    try:
        # Download dataset
        print(f"Downloading dataset {CONFIG['dataset_slug']}...")
        dataset_path = kagglehub.dataset_download(CONFIG["dataset_slug"])
        print(f"Dataset downloaded to: {dataset_path}")

        X, y = load_and_preprocess_images(dataset_path)

        # Stratified split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=CONFIG["test_size"],
            random_state=CONFIG["random_state"],
            stratify=y
        )

        print("\nDataset split:")
        print(f"Train: {X_train.shape[0]} samples")
        print(f"Test: {X_test.shape[0]} samples")

        save_data(X_train, X_test, y_train, y_test)

    except Exception as e:
        print(f"\nError: {str(e)}")
        print("Ensure you have a proper kagglehub setup")

if __name__ == "__main__":
    main()

Downloading dataset mikhailma/house-rooms-streets-image-dataset...
Downloading from https://www.kaggle.com/api/v1/datasets/download/mikhailma/house-rooms-streets-image-dataset?dataset_version_number=1...


100%|██████████| 295M/295M [00:01<00:00, 164MB/s]

Extracting files...





Dataset downloaded to: /root/.cache/kagglehub/datasets/mikhailma/house-rooms-streets-image-dataset/versions/1

Dataset contents: ['house_data', 'street_data']
Found classes: ['house_data', 'street_data']

Processing 5249 images from house_data
Limiting to 2000 images per class


100%|██████████| 2000/2000 [00:02<00:00, 956.60it/s]



Processing 19658 images from street_data
Limiting to 2000 images per class


100%|██████████| 2000/2000 [00:01<00:00, 1052.05it/s]



Final class distribution:
house_data: 2000 images
street_data: 2000 images

Dataset split:
Train: 3200 samples
Test: 800 samples

Data successfully saved to: preprocessed_data


**GitHub** dataset with Price

Dataset containing 2140 images, 4 images for each house. Also contains a text file that contains the textual metadata of the dataset. More information here: https://github.com/emanhamed/Houses-dataset/blob/master/README.md

- First commented code portion indicates a fix in the case of running with a preexisting dataset



In [6]:
import os
import numpy as np
import cv2
import pandas as pd
from tensorflow.keras.preprocessing.image import img_to_array
from tqdm import tqdm

# First remove existing dataset if needed
if os.path.exists("/content/Houses-dataset"):
    !rm -rf /content/Houses-dataset

# Clone the repository
!git clone https://github.com/emanhamed/Houses-dataset.git /content/Houses-dataset

# Set paths
dataset_path = "/content/Houses-dataset/Houses Dataset"
info_file = os.path.join(dataset_path, "HousesInfo.txt")
output_folder = "/content/processed_houses"

# Configuration
CONFIG = {
    "image_size": (224, 224),
    "max_houses": None,
    "image_types": ["bedroom", "bathroom", "kitchen", "frontal"],
    "normalize_prices": True
}

def load_metadata(info_path):
    """Load and process the HousesInfo.txt file"""
    column_names = ["bedrooms", "bathrooms", "area", "zipcode", "price"]
    metadata = pd.read_csv(info_path, sep=" ", header=None, names=column_names,
                         converters={"bathrooms": lambda x: float(x)})

    metadata["house_id"] = range(1, len(metadata)+1)
    metadata["zipcode"] = metadata["zipcode"].astype(int).astype(str)

    if CONFIG["normalize_prices"]:
        price_mean = metadata["price"].mean()
        price_std = metadata["price"].std()
        metadata["price_normalized"] = (metadata["price"] - price_mean) / price_std

    return metadata

def load_and_process_images(metadata):
    """Load and preprocess images"""
    images = {}
    house_data = {}

    for house_id in metadata["house_id"]:
        images[house_id] = {img_type: None for img_type in CONFIG["image_types"]}
        house_data[house_id] = metadata[metadata["house_id"] == house_id].iloc[0].to_dict()

    image_files = [f for f in os.listdir(dataset_path)
                 if f.lower().endswith(('.jpg', '.jpeg', '.png'))]

    for filename in tqdm(image_files, desc="Processing images"):
        try:
            parts = filename.split("_")
            if len(parts) != 2:
                continue

            house_id = int(parts[0])
            img_type = parts[1].split(".")[0].lower()

            if img_type not in CONFIG["image_types"] or house_id not in images:
                continue

            img_path = os.path.join(dataset_path, filename)
            img = cv2.imread(img_path)
            if img is None:
                continue

            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img = cv2.resize(img, CONFIG["image_size"])
            images[house_id][img_type] = img_to_array(img) / 255.0

        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")
            continue

    return images, house_data

def save_processed_data(images, house_data):
    """Save processed data"""
    os.makedirs(output_folder, exist_ok=True)

    # Save per-house data
    complete_houses = []
    for house_id, img_data in images.items():
        if all(img is not None for img in img_data.values()):
            complete_houses.append({
                "house_id": house_id,
                **house_data[house_id],
                **{f"image_{k}": v for k, v in img_data.items()}
            })

    # Save numpy arrays
    house_images = np.array([[
        h[f"image_{img_type}"] for img_type in CONFIG["image_types"]
    ] for h in complete_houses])

    np.save(os.path.join(output_folder, "house_images.npy"), house_images)

    # Save metadata
    metadata_df = pd.DataFrame(complete_houses)
    metadata_df.to_csv(os.path.join(output_folder, "metadata.csv"), index=False)

    print(f"\nSaved {len(complete_houses)} complete houses")
    print(f"Images shape: {house_images.shape}")
    print(f"Metadata columns: {metadata_df.columns.tolist()}")

# Main processing
try:
    print("Starting dataset processing...")

    metadata = load_metadata(info_file)
    if CONFIG["max_houses"]:
        metadata = metadata.head(CONFIG["max_houses"])

    images, house_data = load_and_process_images(metadata)
    save_processed_data(images, house_data)

except Exception as e:
    print(f"\nError: {str(e)}")

Cloning into '/content/Houses-dataset'...
remote: Enumerating objects: 2166, done.[K
remote: Counting objects: 100% (1/1), done.[K
remote: Total 2166 (delta 0), reused 0 (delta 0), pack-reused 2165 (from 1)[K
Receiving objects: 100% (2166/2166), 176.26 MiB | 37.79 MiB/s, done.
Resolving deltas: 100% (20/20), done.
Updating files: 100% (2144/2144), done.
Starting dataset processing...


Processing images: 100%|██████████| 2140/2140 [00:10<00:00, 210.88it/s]



Saved 535 complete houses
Images shape: (535, 4, 224, 224, 3)
Metadata columns: ['house_id', 'bedrooms', 'bathrooms', 'area', 'zipcode', 'price', 'price_normalized', 'image_bedroom', 'image_bathroom', 'image_kitchen', 'image_frontal']


**Unified data loader**

In [7]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler

class HousePriceDataLoader:
    def __init__(self):
        self.scaler = StandardScaler()
        self.datasets = {
            'ted8080': self._load_ted8080,
            'github': self._load_github,
            'robinreni': self._load_robinreni,
            'mikhailma': self._load_mikhailma
        }

    def load_dataset(self, dataset_name, data_path):
        """Main loader interface"""
        if dataset_name not in self.datasets:
            raise ValueError(f"Unknown dataset: {dataset_name}. Choose from {list(self.datasets.keys())}")
        return self.datasets[dataset_name](data_path)

    def _load_ted8080(self, path):
        """Load preprocessed Ted8080 data"""
        X = np.load(os.path.join(path, "X_processed.npy"))
        y = np.load(os.path.join(path, "y.npy"))

        # Ted8080 specific processing
        X = self.scaler.fit_transform(X)
        return X, y, None  # No images in this dataset

    def _load_github(self, path):
        """Load GitHub houses dataset"""
        images = np.load(os.path.join(path, "house_images.npy"))
        metadata = pd.read_csv(os.path.join(path, "metadata.csv"))

        # Extract features
        X_tabular = metadata[['bedrooms', 'bathrooms', 'area']].values
        X_tabular = self.scaler.fit_transform(X_tabular)
        y = metadata['price'].values

        return X_tabular, y, images

    def _load_robinreni(self, path):
        """Load Robinreni dataset (classification)"""
        X_train = np.load(os.path.join(path, "X_train.npy"))
        X_test = np.load(os.path.join(path, "X_test.npy"))
        y_train = np.load(os.path.join(path, "Y_train.npy"))
        y_test = np.load(os.path.join(path, "Y_test.npy"))

        # Combine train/test
        X = np.concatenate([X_train, X_test])
        y = np.concatenate([y_train, y_test])

        return None, y, X  # Using images only

    def _load_mikhailma(self, path):
        """Load Mikhailma dataset"""
        X_train = np.load(os.path.join(path, "X_train.npy"))
        y_train = np.load(os.path.join(path, "y_train.npy"))

        return None, y_train, X_train  # Using images only

**Baseline Model**

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

class HousePriceBaseline:
    def __init__(self):
        self.model = RandomForestRegressor(n_estimators=100, random_state=42)

    def train(self, X_tabular, X_images, y):
        """Train on combined features"""
        # Extract image features if available
        if X_images is not None:
            img_features = np.array([img.mean(axis=(1,2)) for img in X_images])  # Simple channel means
            X = np.concatenate([X_tabular, img_features], axis=1) if X_tabular is not None else img_features
        else:
            X = X_tabular

        self.model.fit(X, y)
        return self

    def predict(self, X_tabular, X_images):
        if X_images is not None:
            img_features = np.array([img.mean(axis=(1,2)) for img in X_images])
            X = np.concatenate([X_tabular, img_features], axis=1) if X_tabular is not None else img_features
        else:
            X = X_tabular

        return self.model.predict(X)

    def evaluate(self, X_tabular, X_images, y_true):
        y_pred = self.predict(X_tabular, X_images)
        return {
            'mae': mean_absolute_error(y_true, y_pred),
            'r2': self.model.score(X_tabular if X_images is None else
                                 np.concatenate([X_tabular, img_features], axis=1), y_true)
        }

Deep Learning Model (CNN)

In [9]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Flatten, Conv2D, Concatenate
from tensorflow.keras.models import Model

class HousePriceCNN:
    def __init__(self, img_shape=(224, 224, 3), num_tabular_features=3):
        self.img_shape = img_shape
        self.num_tabular_features = num_tabular_features
        self.model = self._build_model()

    def _build_model(self):
        # Image branch
        img_input = Input(shape=self.img_shape)
        x = Conv2D(32, (3,3), activation='relu')(img_input)
        x = tf.keras.layers.MaxPooling2D()(x)
        x = Flatten()(x)

        # Tabular branch
        if self.num_tabular_features > 0:
            tab_input = Input(shape=(self.num_tabular_features,))
            y = Dense(16, activation='relu')(tab_input)
            combined = Concatenate()([x, y])
        else:
            tab_input = None
            combined = x

        # Regression head
        z = Dense(64, activation='relu')(combined)
        output = Dense(1)(z)

        # Create model
        inputs = [img_input, tab_input] if tab_input is not None else img_input
        return Model(inputs=inputs, outputs=output)

    def compile(self, lr=0.001):
        self.model.compile(
            optimizer=tf.keras.optimizers.Adam(lr),
            loss='mae',
            metrics=['mae']
        )

    def train(self, X_images, X_tabular, y, epochs=10, batch_size=32, val_split=0.2):
        inputs = [X_images, X_tabular] if X_tabular is not None else X_images
        history = self.model.fit(
            inputs, y,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=val_split
        )
        return history

In [11]:
data_loader = HousePriceDataLoader()
X_tabular, y, X_images = data_loader.load_dataset('github', '/content/processed_houses')

# Reshape X_images to combine the 4 image types into a single channel
X_images = X_images.reshape(-1, 224, 224, 12)  # 4 images * 3 channels = 12 channels

# Initialize and train the CNN model with the updated input shape
cnn_model = HousePriceCNN(img_shape=(224, 224, 12), num_tabular_features=3) # Adjust image shape
cnn_model.compile(lr=0.0001) # Smaller learning rate
cnn_model.train(X_images, X_tabular, y, epochs=15, batch_size=16) # Adjust parameters

Epoch 1/15
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 1s/step - loss: 585052.0625 - mae: 585052.0625 - val_loss: 531567.3750 - val_mae: 531567.3750
Epoch 2/15
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 1s/step - loss: 568673.0000 - mae: 568673.0000 - val_loss: 528863.6875 - val_mae: 528863.6875
Epoch 3/15
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 1s/step - loss: 604653.8750 - mae: 604653.8750 - val_loss: 522587.8750 - val_mae: 522587.8750
Epoch 4/15
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 1s/step - loss: 585753.5000 - mae: 585753.5000 - val_loss: 511172.9375 - val_mae: 511172.9375
Epoch 5/15
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 1s/step - loss: 545566.5000 - mae: 545566.5000 - val_loss: 493736.0312 - val_mae: 493736.0312
Epoch 6/15
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 1s/step - loss: 539347.0625 - mae: 539347.0625 - val_loss: 470015.1875 - v

<keras.src.callbacks.history.History at 0x7afb21c7e690>

In [12]:
# Evaluate the CNN model on the test set
y_pred_cnn = cnn_model.model.predict([X_test, X_test])
mae_cnn = mean_absolute_error(y_test, y_pred_cnn)
print(f"CNN MAE on test set: {mae_cnn}")

# Evaluate the baseline model on the test set
baseline_model = HousePriceBaseline()
baseline_model.train(X_tabular, X_images, y)
y_pred_baseline = baseline_model.predict(X_test_tabular, X_test_images)
mae_baseline = mean_absolute_error(y_test, y_pred_baseline)
print(f"Baseline MAE on test set: {mae_baseline}")


NameError: name 'X_test_images' is not defined

In [None]:
!pip freeze > requirements.txt
