In [6]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import warnings
import json
import os
from datetime import datetime
from pathlib import Path
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Install a blanket "ignore" filter so no warnings are shown from here on.
# NOTE(review): this hides every warning category, including deprecation
# notices from the libraries imported above — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Plot appearance: apply the 'seaborn-v0_8' Matplotlib style sheet, then set
# seaborn's default colour palette to "husl".
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [7]:
# Load Dataset
# Read the housing table from the CSV sitting next to the notebook and print
# a short size summary before previewing the first rows.
df = pd.read_csv('houses_dataset.csv')

n_rows, n_cols = len(df), len(df.columns)
print(f"Dataset shape: {df.shape}")
print(f"Rows: {n_rows:,}")
print(f"Columns: {n_cols}")
print("\nFirst few rows:")
df.head()


Dataset shape: (4853, 109)
Rows: 4,853
Columns: 109

First few rows:


Unnamed: 0,basic_info_bathrooms,basic_info_bedrooms,basic_info_description,basic_info_homeType_CONDO,basic_info_homeType_SINGLE_FAMILY,basic_info_house_age,basic_info_livingArea,basic_info_lotSize_sqft,basic_info_zipcode_94005,basic_info_zipcode_94014,...,property_details_standard_appliance_score,schools_elementary_school_distance,schools_elementary_school_rating,schools_high_school_distance,schools_high_school_rating,schools_middle_school_distance,schools_middle_school_rating,scores_bikeScore,scores_transitScore,scores_walkScore
0,2.0,2,Perched at the nexus of two of San Francisco's...,1,0,15.0,835.0,0.0,0,0,...,4.0,0.1,4.0,0.7,8.0,1.1,6.0,64.0,93.0,97.0
1,1.0,1,1 bedroom Below Market Rate (BMR) housing oppo...,1,0,15.0,613.0,0.0,0,0,...,5.0,0.4,5.0,1.2,8.0,1.6,6.0,62.0,100.0,99.0
2,1.0,2,"723 Taylor St #402, San Francisco, CA 94108 is...",1,0,15.0,757.0,0.0,0,0,...,0.0,0.4,5.0,1.2,8.0,1.6,6.0,62.0,100.0,99.0
3,1.0,1,$126k Reduction! This bright and modern upper-...,1,0,15.0,870.0,0.0,0,0,...,6.0,0.5,6.0,1.1,3.0,1.5,3.0,99.0,100.0,100.0
4,1.0,1,Chic and generous 1BR/1BA condo blocks from Ha...,1,0,16.0,920.0,0.0,0,0,...,4.0,0.5,6.0,1.1,3.0,1.4,3.0,99.0,100.0,99.0


In [8]:
# DataFrame prep
# Separate the regression target and the free-text description from the
# remaining columns, which become the feature matrix.
target_col = 'financial_lastSoldPrice'
description_col = 'basic_info_description'
columns_to_exclude = [target_col, description_col]

# Copies keep later edits to y / descriptions from touching df.
y = df.loc[:, target_col].copy()
descriptions = df.loc[:, description_col].copy()

# Everything except the target and the description text is a feature.
X = df.drop(columns=columns_to_exclude)

for label, frame in (("\nFeatures", X), ("Target", y)):
    print(f"{label} shape: {frame.shape}")

# Train on log(1 + price): RMSE on this scale is the RMSLE of the raw target.
y_log = np.log1p(y)


Features shape: (4853, 107)
Target shape: (4853,)


In [9]:
# Data Setup
# Two-stage split: hold out 20% as the test set, then split the remaining 80%
# again 80/20, giving 0.64 train / 0.16 validation / 0.20 test overall.
X_temp, X_test, y_temp_log, y_test_log = train_test_split(
    X, y_log,
    test_size=0.2,
    random_state=42,
    shuffle=True
)
X_train, X_val, y_train_log, y_val_log = train_test_split(
    X_temp, y_temp_log,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

# Combine train + val for cross-validation.
# pd.concat is used instead of np.vstack / np.concatenate so the result stays
# a DataFrame / Series: column names and per-column dtypes are preserved, and
# the training data has the same type as X_test, which is still a DataFrame.
# Row order (train rows first, then validation rows) is the same as before.
X_train_full = pd.concat([X_train, X_val], axis=0)
y_train_full_log = pd.concat([y_train_log, y_val_log], axis=0)
print(f"Training set size: {X_train_full.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

Training set size: 3882 samples
Test set size: 971 samples


In [10]:
# Define hyperparameter grid
# Seven tuned parameters: one with four candidate values and six with three,
# i.e. 4 * 3**6 = 2916 combinations in total.
param_grid = dict(
    max_depth=[3, 4, 5, 6],
    min_child_weight=[1, 5, 10],
    learning_rate=[0.01, 0.05, 0.1],
    subsample=[0.7, 0.8, 0.9],
    colsample_bytree=[0.7, 0.8, 0.9],
    gamma=[0, 0.1, 0.5],
)
# 'lambda' is a reserved word in Python, so it cannot be written as a keyword
# argument above and is added by string key instead.
param_grid['lambda'] = [1, 5, 10]

In [11]:
# Base estimator for the hyperparameter search. It is fit on the
# log1p-transformed target, so its squared-error objective and RMSE metric
# are measured on the log scale.
base_model_settings = dict(
    objective='reg:squarederror',
    eval_metric='rmse',
    n_estimators=500,
    random_state=42,
    n_jobs=1,  # single-threaded model; the search cell sets its own n_jobs
)
base_model = xgb.XGBRegressor(**base_model_settings)

In [12]:
# Exhaustive grid search over param_grid with 5-fold cross-validation.
# Every combination is fit once per fold, so the number of fits is
# 5 x (number of combinations) — budget run time accordingly.
values_per_param = [len(candidates) for candidates in param_grid.values()]

print("Starting hyperparameter search with 5-fold CV...")
print(f"Total combinations to test: {np.prod(values_per_param)}")
print("This may take a while...")

search_options = dict(
    cv=5,                                   # 5-fold cross-validation
    scoring='neg_root_mean_squared_error',  # RMSE on the log-scale target (= RMSLE)
    n_jobs=-1,
    verbose=2,
    return_train_score=True,
)
grid_search = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    **search_options,
)

grid_search.fit(X_train_full, y_train_full_log)

Starting hyperparameter search with 5-fold CV...
Total combinations to test: 2916
This may take a while...
Fitting 5 folds for each of 2916 candidates, totalling 14580 fits


[CV] END colsample_bytree=0.7, gamma=0, lambda=1, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.8; total time=   0.6s
[CV] END colsample_bytree=0.7, gamma=0, lambda=1, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.7; total time=   0.7s
[CV] END colsample_bytree=0.7, gamma=0, lambda=1, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.7; total time=   0.7s
[CV] END colsample_bytree=0.7, gamma=0, lambda=1, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.8; total time=   0.7s
[CV] END colsample_bytree=0.7, gamma=0, lambda=1, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.7; total time=   0.7s
[CV] END colsample_bytree=0.7, gamma=0, lambda=1, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.8; total time=   0.8s
[CV] END colsample_bytree=0.7, gamma=0, lambda=1, learning_rate=0.01, max_depth=3, min_child_weight=1, subsample=0.7; total time=   0.8s
[CV] END colsample_bytree=0.7, gamma=0, l

KeyboardInterrupt: 

In [None]:
# Summarise the search. The scorer is the *negative* RMSE, so the sign is
# flipped to report a positive RMSLE.
best_params = grid_search.best_params_
best_cv_rmsle = -grid_search.best_score_

print("HYPERPARAMETER SEARCH RESULTS")
print(f"Best parameters: {best_params}")
print(f"Best CV RMSLE: {best_cv_rmsle:.6f}")

In [None]:
print("\nRetraining best model with early stopping...")

# Hold out 20% of the combined training data as the early-stopping monitor.
X_train_final, X_val_final, y_train_final, y_val_final = train_test_split(
    X_train_full,
    y_train_full_log,
    test_size=0.2,
    random_state=42,
)

# Same objective/metric as the search estimator, but with a larger tree
# budget (1000) that is cut short once the validation RMSE has not improved
# for 50 consecutive rounds. The tuned hyperparameters come from the search.
final_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    n_estimators=1000,
    early_stopping_rounds=50,
    random_state=42,
    n_jobs=-1,
    **grid_search.best_params_,
)

early_stopping_sets = [(X_val_final, y_val_final)]
final_model.fit(
    X_train_final,
    y_train_final,
    eval_set=early_stopping_sets,
    verbose=False,
)
print(f"Best iteration (early stopping): {final_model.best_iteration}")

In [None]:
def evaluate_model(model, X, y_log, set_name: str) -> dict:
    """Score ``model`` on one data split and print a short report.

    Both the predictions and ``y_log`` are treated as ``log1p``-scale values.
    They are mapped back with ``np.expm1`` so that MAE, RMSE and R^2 are
    computed on the original scale of the target, while RMSLE is computed
    directly on the log-scale values.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict(X)``; its output is interpreted
        as log1p-scale predictions.
    X
        Feature matrix passed unchanged to ``model.predict``.
    y_log
        Array-like of log1p-transformed true target values aligned with ``X``.
    set_name
        Label used in the printed header (e.g. ``"Training"``, ``"Test"``).

    Returns
    -------
    dict
        ``{'mae': ..., 'rmse': ..., 'rmsle': ..., 'r2': ...}``.
    """
    y_pred_log = model.predict(X)

    # Undo the log1p transform so the error metrics are in target units.
    y_pred = np.expm1(y_pred_log)
    y_actual = np.expm1(y_log)

    mae = mean_absolute_error(y_actual, y_pred)
    rmse = np.sqrt(mean_squared_error(y_actual, y_pred))
    # RMSE between log-scale values is the RMSLE of the original target.
    rmsle = np.sqrt(mean_squared_error(y_log, y_pred_log))
    r2 = r2_score(y_actual, y_pred)

    print(f"\n{set_name} Set:")
    print(f"  MAE:   ${mae:,.2f}")
    print(f"  RMSE:  ${rmse:,.2f}")
    print(f"  RMSLE: {rmsle:.6f}")
    # The superscript two is written as an escape so the label cannot be
    # corrupted by a file-encoding round trip (it previously printed as
    # mojibake instead of "R" + superscript 2).
    print(f"  R\u00b2:    {r2:.4f}")

    return {'mae': mae, 'rmse': rmse, 'rmsle': rmsle, 'r2': r2}

In [None]:
print("FINAL MODEL PERFORMANCE (WITH EARLY STOPPING)")

# Score the early-stopped model on each split, in train -> validation -> test
# order, keeping one metrics dict per split.
evaluation_splits = [
    ("Training", X_train_final, y_train_final),
    ("Validation", X_val_final, y_val_final),
    ("Test", X_test, y_test_log),
]
train_metrics, val_metrics, test_metrics = [
    evaluate_model(final_model, split_X, split_y_log, split_name)
    for split_name, split_X, split_y_log in evaluation_splits
]
