<a href="https://colab.research.google.com/github/kaitlynchen1/PredictingHousingPrices/blob/main/housingprices.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Ted8080 House Prices & Images** Kaggle dataset

In [None]:
import pandas as pd
import os
import zipfile
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Download dataset using Kaggle API
download_status = os.system('kaggle datasets download -d ted8080/house-prices-and-images-socal')
if download_status != 0:
    # A non-zero status means the CLI failed (missing credentials, no network, ...).
    # Report it here so the cause is visible before the unzip step below fails.
    print("Warning: kaggle download exited with status", download_status)

# Unzip file
with zipfile.ZipFile('house-prices-and-images-socal.zip', 'r') as zip_ref:
    zip_ref.extractall('house-prices-and-images-socal')

# List files, find the CSV file
path = 'house-prices-and-images-socal'
files = os.listdir(path)
print("Files in dataset:", files)

csv_files = [f for f in files if f.endswith('.csv')]
if csv_files:
    dataset_path = os.path.join(path, csv_files[0])  # Choose the first CSV file
    df = pd.read_csv(dataset_path)

    # first few rows
    print(df.head())

    # Summary statistics
    print(df.describe())

    # Data types of columns
    print(df.dtypes)

    # Number of rows and columns
    print(df.shape)

    # Preprocessing
    target_column = 'price'
    if target_column in df.columns:
        Y = df[target_column].values  # Output matrix
        X = df.drop(columns=[target_column])  # Input matrix

        # Numeric columns are imputed + scaled; string columns are imputed + one-hot encoded
        numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
        categorical_features = X.select_dtypes(include=['object']).columns

        # Preprocessing pipeline
        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            # handle_unknown='ignore' encodes categories unseen during fit as all zeros
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_features),
                ('cat', categorical_transformer, categorical_features)
            ])

        # Convert Y to a 2D array
        Y = Y.reshape(-1, 1)

        # Split the raw rows BEFORE fitting the preprocessor so the held-out rows
        # do not influence the imputation values, scaling statistics or one-hot
        # vocabulary (data leakage). The same random_state yields the same row
        # partition as splitting the already-transformed matrix.
        X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.2, random_state=42
        )

        # Fit on the training rows only, then apply the fitted transform everywhere
        X_train = preprocessor.fit_transform(X_train_raw)
        X_test = preprocessor.transform(X_test_raw)
        X_processed = preprocessor.transform(X)  # full matrix, kept for inspection

        # Display shapes of X and Y
        print("Shape X (input matrix):", X_processed.shape)
        print("Shape Y (output matrix):", Y.shape)

        # Shapes of the train and test sets
        print("Shape X_train:", X_train.shape)
        print("Shape X_test:", X_test.shape)
        print("Shape Y_train:", Y_train.shape)
        print("Shape Y_test:", Y_test.shape)
    else:
        # Previously this message was attached to the CSV check, so a missing
        # target column went unreported.
        print("Issue finding Price column")
else:
    print("No CSV file found in:", path)

Files in dataset: ['socal2.csv', 'socal2']
   image_id                 street             citi  n_citi  bed  bath  sqft  \
0         0  1317 Van Buren Avenue  Salton City, CA     317    3   2.0  1560   
1         1         124 C Street W      Brawley, CA      48    3   2.0   713   
2         2        2304 Clark Road     Imperial, CA     152    3   1.0   800   
3         3     755 Brawley Avenue      Brawley, CA      48    3   1.0  1082   
4         4  2207 R Carrillo Court     Calexico, CA      55    4   3.0  2547   

    price  
0  201900  
1  228500  
2  273950  
3  350000  
4  385100  
           image_id        n_citi           bed          bath          sqft  \
count  15474.000000  15474.000000  15474.000000  15474.000000  15474.000000   
mean    7736.500000    216.597518      3.506398      2.453251   2173.913209   
std     4467.103368    112.372985      1.034838      0.958742   1025.339617   
min        0.000000      0.000000      1.000000      0.000000    280.000000   
25%     3

In [None]:
# Exterior housing images and pricing dataset containing 8 variables & 15000+ rows in SoCal
! kaggle datasets download robinreni/house-rooms-image-dataset

# Around 3000 collective images of Bathroom, Bedroom, Living Room, Dining, & Kitchen spaces (does not contain price variable)
! kaggle datasets download mikhailma/house-rooms-streets-image-dataset

!wget https://github.com/emanhamed/Houses-dataset/tree/master/Houses%20Dataset

Dataset URL: https://www.kaggle.com/datasets/robinreni/house-rooms-image-dataset
License(s): CC0-1.0
Downloading house-rooms-image-dataset.zip to /content
 91% 106M/116M [00:00<00:00, 200MB/s] 
100% 116M/116M [00:00<00:00, 202MB/s]
Dataset URL: https://www.kaggle.com/datasets/mikhailma/house-rooms-streets-image-dataset
License(s): CC0-1.0
house-rooms-streets-image-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)
--2025-03-20 14:38:29--  https://github.com/emanhamed/Houses-dataset/tree/master/Houses%20Dataset
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘Houses Dataset’

Houses Dataset          [ <=>                ] 508.19K  --.-KB/s    in 0.06s   

2025-03-20 14:38:30 (7.77 MB/s) - ‘Houses Dataset’ saved [520390]



**Robin Reni Rooms Images** Kaggle dataset (No price)

In [None]:
import os
import zipfile
import numpy as np
import cv2
from tensorflow.keras.preprocessing.image import img_to_array
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import kagglehub

# Download Robin Reni dataset
path = kagglehub.dataset_download("robinreni/house-rooms-image-dataset")
print("Path to dataset files:", path)

# Working folder for the extracted archive (exist_ok makes re-runs safe)
dataset_folder = "/content/datasets/House_Room_Dataset"
os.makedirs(dataset_folder, exist_ok=True)

# Extract the zip if a copy has been placed in the working folder
zip_path = os.path.join(dataset_folder, "house-rooms-image-dataset.zip")
if os.path.exists(zip_path):
    print(f"Extracting {zip_path} to {dataset_folder}")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(dataset_folder)
    print(f"Dataset extracted to {dataset_folder}")
else:
    print(f"Zip file not found at {zip_path}")

# Verify extraction
print("Contents of dataset folder:", os.listdir(dataset_folder))

# Navigate into the correct subfolder
dataset_subfolder = os.path.join(dataset_folder, "House_Room_Dataset")
if not os.path.isdir(dataset_subfolder):
    # Nothing in this cell puts the zip into dataset_folder, so on a fresh runtime
    # the extracted folder is absent. Fall back to the copy kagglehub fetched above.
    # NOTE(review): assumes kagglehub unpacks the archive with the same top-level
    # "House_Room_Dataset" folder -- confirm against the printed path.
    kagglehub_subfolder = os.path.join(path, "House_Room_Dataset")
    if not os.path.isdir(kagglehub_subfolder):
        raise FileNotFoundError(
            f"Image folder not found at {dataset_subfolder} or {kagglehub_subfolder}"
        )
    print(f"Using kagglehub copy at {kagglehub_subfolder}")
    dataset_subfolder = kagglehub_subfolder
print("Contents of dataset subfolder:", os.listdir(dataset_subfolder))

# Target (width, height) passed to cv2.resize by the loader
IMAGE_SIZE = (128, 128)

def load_images_from_folder(folder, num_samples=None):
    """Load labelled images from a folder that holds one subfolder per class.

    Entries of ``folder`` are sorted by name; an image's label is the position
    of its class folder in that sorted listing (non-folder entries are skipped
    but still occupy a position). Each readable .png/.jpg/.jpeg file is resized
    to ``IMAGE_SIZE`` and divided by 255.0.

    Args:
        folder: Directory containing the class subfolders.
        num_samples: Optional cap on the total number of images. Loading stops
            as soon as the cap is reached, so classes later in the sorted order
            may be partially or entirely missing. ``None`` (or 0) means no cap.

    Returns:
        Tuple ``(images, labels)`` of NumPy arrays with one entry per loaded image.
    """
    class_names = sorted(os.listdir(folder))
    print("Class names:", class_names)

    pixel_arrays, class_ids = [], []

    def quota_reached():
        # True once a cap was requested and enough images have been collected
        return bool(num_samples) and len(pixel_arrays) >= num_samples

    for class_id, class_name in enumerate(class_names):
        class_folder = os.path.join(folder, class_name)
        if not os.path.isdir(class_folder):
            print(f"Skipping non-folder: {class_folder}")
            continue

        print(f"Loading images from class: {class_name}")
        for filename in tqdm(os.listdir(class_folder)):
            img_path = os.path.join(class_folder, filename)

            # Only attempt files with a known image extension
            if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
                print(f"Skip non-image: {img_path}")
                continue

            # cv2.imread returns None when the file cannot be decoded
            raw = cv2.imread(img_path)
            if raw is None:
                print(f"Failed to load image: {img_path}")
                continue

            resized = cv2.resize(raw, IMAGE_SIZE)
            pixel_arrays.append(img_to_array(resized) / 255.0)
            class_ids.append(class_id)

            if quota_reached():
                break

        # Stop visiting further classes once the cap is hit
        if quota_reached():
            break

    return np.array(pixel_arrays), np.array(class_ids)

# Load images from dataset subfolder
X_images, Y = load_images_from_folder(dataset_subfolder, num_samples=5250)
print(f"Loaded {X_images.shape[0]} images from the dataset.")

# Split into train and test sets (fixed seed for a reproducible partition)
X_train_images, X_test_images, Y_train, Y_test = train_test_split(
    X_images, Y, test_size=0.2, random_state=42
)

# Save the preprocessed data in a dataset-specific subfolder. The House Rooms &
# Streets cell further down writes preprocessed_data/X_train_images.npy (and the
# three sibling files); saving under those same names here meant these arrays
# were silently overwritten when that cell ran.
output_dir = os.path.join("preprocessed_data", "house_rooms")
os.makedirs(output_dir, exist_ok=True)
np.save(os.path.join(output_dir, "X_train_images.npy"), X_train_images)
np.save(os.path.join(output_dir, "X_test_images.npy"), X_test_images)
np.save(os.path.join(output_dir, "Y_train.npy"), Y_train)
np.save(os.path.join(output_dir, "Y_test.npy"), Y_test)
print(f"Saved preprocessed arrays to {output_dir}")

Path to dataset files: /root/.cache/kagglehub/datasets/robinreni/house-rooms-image-dataset/versions/1
Extracting /content/datasets/House_Room_Dataset/house-rooms-image-dataset.zip to /content/datasets/House_Room_Dataset
Dataset extracted to /content/datasets/House_Room_Dataset
Contents of dataset folder: ['house-rooms-image-dataset.zip', 'House_Room_Dataset']
Contents of dataset subfolder: ['Kitchen', 'Livingroom', 'Dinning', 'Bedroom', 'Bathroom']
Class names: ['Bathroom', 'Bedroom', 'Dinning', 'Kitchen', 'Livingroom']
Loading images from class: Bathroom


100%|██████████| 606/606 [00:00<00:00, 1007.63it/s]


Loading images from class: Bedroom


100%|██████████| 1248/1248 [00:01<00:00, 939.66it/s]


Loading images from class: Dinning


100%|██████████| 1158/1158 [00:01<00:00, 874.77it/s]


Loading images from class: Kitchen


100%|██████████| 965/965 [00:01<00:00, 946.74it/s]


Loading images from class: Livingroom


100%|██████████| 1273/1273 [00:01<00:00, 920.32it/s]


Loaded 5250 images from the dataset.


**Mikhail Ma House Rooms & Streets** Kaggle dataset (No Price)

In [None]:
import os
import zipfile
import numpy as np
import cv2
from tensorflow.keras.preprocessing.image import img_to_array
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import kagglehub

# Download the dataset
dataset_slug = "mikhailma/house-rooms-streets-image-dataset"
path = kagglehub.dataset_download(dataset_slug)
print("Path to dataset files:", path)

# Working folder for the extracted archive (exist_ok makes re-runs safe)
dataset_folder = "/content/datasets/House_Rooms_Streets_Dataset"
os.makedirs(dataset_folder, exist_ok=True)

# Extract the zip if a copy has been placed in the working folder
zip_path = os.path.join(dataset_folder, "house-rooms-streets-image-dataset.zip")
if os.path.exists(zip_path):
    print(f"Extracting {zip_path} to {dataset_folder}")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(dataset_folder)
    print(f"Dataset extracted to {dataset_folder}")
else:
    print(f"Zip file not found at {zip_path}")

# Confirm extraction
print("Contents of dataset folder:", os.listdir(dataset_folder))

# Navigate to correct subfolder
dataset_subfolder = os.path.join(dataset_folder, "kaggle_room_street_data")
if not os.path.isdir(dataset_subfolder):
    # Nothing in this cell puts the zip into dataset_folder, so on a fresh runtime
    # the extracted folder is absent. Fall back to the copy kagglehub fetched above.
    # NOTE(review): assumes kagglehub unpacks the archive with the same top-level
    # "kaggle_room_street_data" folder -- confirm against the printed path.
    kagglehub_subfolder = os.path.join(path, "kaggle_room_street_data")
    if not os.path.isdir(kagglehub_subfolder):
        raise FileNotFoundError(
            f"Image folder not found at {dataset_subfolder} or {kagglehub_subfolder}"
        )
    print(f"Using kagglehub copy at {kagglehub_subfolder}")
    dataset_subfolder = kagglehub_subfolder
print("Contents of dataset subfolder:", os.listdir(dataset_subfolder))

# Target (width, height) passed to cv2.resize by the loader
IMAGE_SIZE = (128, 128)

def load_images_from_folder(folder, street_data_limit=1000):
    """Load labelled images from class subfolders, capping the "street_data" class.

    Entries of ``folder`` are sorted by name; an image's label is the position
    of its class folder in that sorted listing (non-folder entries are skipped
    but still occupy a position). Each readable .png/.jpg/.jpeg file is resized
    to ``IMAGE_SIZE`` and divided by 255.0.

    Args:
        folder: Directory containing the class subfolders.
        street_data_limit: Maximum number of images kept from the subfolder
            named "street_data"; other classes are loaded in full.

    Returns:
        Tuple ``(images, labels)`` of NumPy arrays with one entry per loaded image.
    """
    class_names = sorted(os.listdir(folder))
    print("Class names:", class_names)

    pixel_arrays, class_ids = [], []
    street_kept = 0  # images accepted so far from the capped class

    for class_id, class_name in enumerate(class_names):
        class_folder = os.path.join(folder, class_name)
        if not os.path.isdir(class_folder):
            print(f"Skipping non-folder: {class_folder}")
            continue

        print(f"Loading images from class: {class_name}")
        capped = class_name == "street_data"
        for filename in tqdm(os.listdir(class_folder)):
            # The cap is checked before touching the next file to bound RAM usage
            if capped and street_kept >= street_data_limit:
                print(f"Reached street_data limit of {street_data_limit} images.")
                break

            img_path = os.path.join(class_folder, filename)

            # Only attempt files with a known image extension
            if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
                print(f"Skip non-image: {img_path}")
                continue

            # cv2.imread returns None when the file cannot be decoded
            raw = cv2.imread(img_path)
            if raw is None:
                print(f"Failed to load image: {img_path}")
                continue

            resized = cv2.resize(raw, IMAGE_SIZE)
            pixel_arrays.append(img_to_array(resized) / 255.0)
            class_ids.append(class_id)

            # Only successfully loaded images count against the cap
            if capped:
                street_kept += 1

    return np.array(pixel_arrays), np.array(class_ids)

# Read the images; street_data is capped at 5000 to keep RAM usage bounded
X_images, Y = load_images_from_folder(dataset_subfolder, street_data_limit=5000)
print(f"Loaded {X_images.shape[0]} images from the dataset.")

# 80/20 train/test partition with a fixed seed
X_train_images, X_test_images, Y_train, Y_test = train_test_split(
    X_images,
    Y,
    test_size=0.2,
    random_state=42,
)

# Persist the four arrays, one .npy file each
os.makedirs("preprocessed_data", exist_ok=True)
for npy_path, array in (
    ("preprocessed_data/X_train_images.npy", X_train_images),
    ("preprocessed_data/X_test_images.npy", X_test_images),
    ("preprocessed_data/Y_train.npy", Y_train),
    ("preprocessed_data/Y_test.npy", Y_test),
):
    np.save(npy_path, array)

Path to dataset files: /root/.cache/kagglehub/datasets/mikhailma/house-rooms-streets-image-dataset/versions/1
Extracting /content/datasets/House_Rooms_Streets_Dataset/house-rooms-streets-image-dataset.zip to /content/datasets/House_Rooms_Streets_Dataset
Dataset extracted to /content/datasets/House_Rooms_Streets_Dataset
Contents of dataset folder: ['kaggle_room_street_data', 'house-rooms-streets-image-dataset.zip']
Contents of dataset subfolder: ['street_data', 'house_data']
Class names: ['house_data', 'street_data']
Loading images from class: house_data


100%|██████████| 5249/5249 [00:04<00:00, 1120.94it/s]


Loading images from class: street_data


 25%|██▌       | 5000/19658 [00:03<00:10, 1351.97it/s]


Reached street_data limit of 5000 images.
Loaded 10249 images from the dataset.


**GitHub** dataset (No price)

We previously processed all of the images in this dataset, but later changes to the directory layout mean the cell below currently fails because its input folder is missing. This should be fixable later.

In [None]:
# !ls -l /content/datasets
# file /content/datasets/Houses_Dataset

import os
import numpy as np
import cv2
from tensorflow.keras.preprocessing.image import img_to_array

# Folder expected to hold the Houses dataset images
drive_path = "/content/datasets/Houses_Dataset"

# Fail fast with a specific error: missing path vs. a path that is not a folder
if not os.path.isdir(drive_path):
    if os.path.exists(drive_path):
        raise NotADirectoryError(f"The path {drive_path} is not a directory.")
    raise FileNotFoundError(f"The path {drive_path} does not exist.")

# Target size passed to cv2.resize by the loader
IMAGE_SIZE = (128, 128)

def load_images_from_folder(folder):
    """Load every .jpg/.png/.jpeg file that sits directly inside ``folder``.

    Each readable image is resized to ``IMAGE_SIZE`` and divided by 255.0.
    Files with other extensions, and files OpenCV cannot read, are reported
    on stdout and skipped.

    Args:
        folder: Directory to scan (subfolders are not descended into).

    Returns:
        Tuple ``(images, filenames)``: a NumPy array of the loaded images and a
        list of their file names in the same order.
    """
    image_suffixes = (".jpg", ".png", ".jpeg")
    pixel_arrays, kept_names = [], []

    for filename in os.listdir(folder):
        # Guard: ignore anything without an image extension
        if not filename.lower().endswith(image_suffixes):
            print(f"Skipping non-image file: {filename}")
            continue

        # cv2.imread returns None when the file cannot be decoded
        raw = cv2.imread(os.path.join(folder, filename))
        if raw is None:
            print(f"Warning: Unable to load image {filename}. Skipping...")
            continue

        resized = cv2.resize(raw, IMAGE_SIZE)
        pixel_arrays.append(img_to_array(resized) / 255.0)
        kept_names.append(filename)

    return np.array(pixel_arrays), kept_names

# Load all images
X_images, image_filenames = load_images_from_folder(drive_path)

# Save data
output_path = os.path.join(drive_path, "X_images.npy")
np.save(output_path, X_images)

# Print confirmation
print(f"Images processed: {len(X_images)}")
print(f"Processed images saved to: {output_path}")

!ls -l /content/datasets

FileNotFoundError: The path /content/datasets/Houses_Dataset does not exist.