# HDB Resale Price Prediction - Model Building

This notebook covers the model building process for predicting HDB resale prices in Singapore using machine learning techniques.

---

## Table of Contents

1. [Setup and Imports](#1-setup-and-imports)
2. [Data Loading and Exploration](#2-data-loading-and-exploration)
3. [Feature Correlation Analysis](#3-feature-correlation-analysis)
4. [Data Preparation](#4-data-preparation)
5. [Model Training and Evaluation](#5-model-training-and-evaluation)
6. [XGBoost Hyperparameter Tuning](#6-xgboost-hyperparameter-tuning)
7. [Final Model Training and Export](#7-final-model-training-and-export)

---
## 1. Setup and Imports <a id="1-setup-and-imports"></a>

Import all necessary libraries for data manipulation, visualization, and machine learning.

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Machine learning - model selection and evaluation
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, r2_score

# Machine learning - regression models
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Model persistence
import joblib

---
## 2. Data Loading and Exploration <a id="2-data-loading-and-exploration"></a>

Load the preprocessed HDB dataset and examine its structure.

In [None]:
# Load the preprocessed HDB resale dataset.
# The path is relative, so the CSV must sit in the notebook's working directory.
hdb_model = pd.read_csv('HDB_model_ready.csv')

In [None]:
# Display dataset structure: column names, data types and non-null counts
hdb_model.info()

In [None]:
# Display summary statistics (on a frame with mixed dtypes, describe()
# summarises only the numeric columns by default)
hdb_model.describe()

---
## 3. Feature Correlation Analysis <a id="3-feature-correlation-analysis"></a>

Visualize correlations between features to understand relationships and identify important predictors.

In [None]:
# Put the target (resale_price) first so it occupies the first row/column
# of the heatmap; every other column keeps its relative order.
cols = ['resale_price', *(c for c in hdb_model.columns if c != 'resale_price')]
hdb_model = hdb_model[cols]

# Pairwise correlations between the numeric columns only
corr = hdb_model.corr(numeric_only=True)

# Draw the annotated heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr,
    ax=ax,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    square=True,
    cbar_kws={'shrink': 0.8},
)
ax.set_title('HDB Features Correlation Heatmap')
fig.tight_layout()
plt.show()

---
## 4. Data Preparation <a id="4-data-preparation"></a>

Create dataset subsets for different modeling scenarios and define the train-test split strategy.

In [None]:
# Build the dataset variants used in the modelling experiments.

# Main dataset: every record (copied so later edits cannot touch hdb_model)
ds_all = hdb_model.copy()

# Common flat types only (3-room, 4-room, 5-room)
common_room_mask = hdb_model['flat_type_int'].isin([3, 4, 5])
ds_imp_rooms = hdb_model.loc[common_room_mask]

# One subset per region code: 0 = Central, 1 = North/East, 2 = West
ds_region_0, ds_region_1, ds_region_2 = (
    hdb_model.loc[hdb_model['region_code'] == code] for code in (0, 1, 2)
)

# Common flat types, again split by the same region codes
ds_imp_rooms_0, ds_imp_rooms_1, ds_imp_rooms_2 = (
    ds_imp_rooms.loc[ds_imp_rooms['region_code'] == code] for code in (0, 1, 2)
)

In [None]:
def year_split(df, *, cutoff_year=2024, target_col='resale_price'):
    """
    Split a dataset into train and test sets based on the 'year' column.

    Training: rows with year before ``cutoff_year``
    Testing: rows with year equal to or after ``cutoff_year``

    Rows whose year is missing (NaN) satisfy neither comparison and are
    therefore excluded from both sets.

    Parameters:
        df: DataFrame with a 'year' column and the target column
        cutoff_year: first year that belongs to the test set (default 2024)
        target_col: name of the column to predict (default 'resale_price')

    Returns:
        X_train, X_test, y_train, y_test

    Raises:
        KeyError: if 'year' or the target column is missing from ``df``
    """
    # Fail early with a message naming the missing column(s) instead of the
    # bare KeyError pandas would raise part-way through the split.
    missing = [col for col in ('year', target_col) if col not in df.columns]
    if missing:
        raise KeyError(f"year_split requires column(s) {missing} in the DataFrame")

    train = df[df['year'] < cutoff_year]
    test = df[df['year'] >= cutoff_year]

    # Features are every column except the target; 'year' stays a feature.
    X_train = train.drop(columns=[target_col])
    y_train = train[target_col]
    X_test = test.drop(columns=[target_col])
    y_test = test[target_col]

    return X_train, X_test, y_train, y_test


# Map every dataset variant's name to its DataFrame; keeping both in one
# mapping means a name can never be paired with the wrong frame.
named_datasets = {
    'imp_rooms': ds_imp_rooms,
    'imp_rooms_0': ds_imp_rooms_0,
    'imp_rooms_1': ds_imp_rooms_1,
    'imp_rooms_2': ds_imp_rooms_2,
    'all': ds_all,
    'region_0': ds_region_0,
    'region_1': ds_region_1,
    'region_2': ds_region_2,
}
dataset_names = list(named_datasets)
dataset_list = list(named_datasets.values())

# splits[name] -> (X_train, X_test, y_train, y_test) for that variant
splits = {}
for name, dataset in named_datasets.items():
    splits[name] = year_split(dataset)
    X_train, X_test, y_train, y_test = splits[name]
    print(f"{name}: Train size = {len(X_train)}, Test size = {len(X_test)}")

---
## 5. Model Training and Evaluation <a id="5-model-training-and-evaluation"></a>

Train and evaluate three regression models:
- **Linear Regression**: Simple baseline model
- **Random Forest**: Ensemble of decision trees
- **XGBoost**: Gradient boosted trees

In [None]:
def _build_models():
    """Return a fresh, unfitted instance of each model, keyed by display name."""
    return {
        'LinearRegression': LinearRegression(),
        'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
        'XGBoost': xgb.XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    }


# results[dataset][model] -> fitted model, MAE, R² and feature importances
results = {}

for name, (X_train, X_test, y_train, y_test) in splits.items():
    # New estimators for every dataset so no fitted state carries over
    models = _build_models()

    model_results = {}
    for model_name, model in models.items():
        # Fit on the training years, then score on the held-out years
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        model_results[model_name] = dict(
            model=model,
            mae=mean_absolute_error(y_test, y_pred),
            r2=r2_score(y_test, y_pred),
            # Estimators without feature_importances_ get None here
            feature_importance=getattr(model, 'feature_importances_', None),
        )

    results[name] = model_results

# Report the metrics, one section per dataset
for ds_name, model_dict in results.items():
    print(f"\n=== Results for dataset: {ds_name} ===")
    for m_name, res in model_dict.items():
        print(f"{m_name}: MAE = ${res['mae']:,.2f}, R² = {res['r2']:.4f}")

---
## 6. XGBoost Hyperparameter Tuning <a id="6-xgboost-hyperparameter-tuning"></a>

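Use RandomizedSearchCV to search for well-performing XGBoost hyperparameters. The search is fitted on the training split (pre-2024 records) of the full dataset, so the 2024+ test set stays unseen until the final evaluation.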

In [None]:
# Get train/test split for the 'all' dataset
X_train_all, X_test_all, y_train_all, y_test_all = splits['all']

# Define base XGBoost model.
# The estimator is kept single-threaded because the search below already
# runs its fits in parallel (n_jobs=-1). Parallelising both layers makes the
# workers compete for the same cores and slows the whole search down.
xgb_model = xgb.XGBRegressor(random_state=42, n_jobs=1)

# Define hyperparameter search space
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 1],        # L1 regularization
    'reg_lambda': [1, 1.5, 2]        # L2 regularization
}

# Perform randomized search with cross-validation
random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_dist,
    n_iter=20,                        # Number of parameter combinations to try
    scoring='neg_mean_absolute_error',
    cv=3,                             # 3-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1                         # Parallelism lives here, not in the estimator
)

# Fit the search on the training split only; the test split stays untouched
random_search.fit(X_train_all, y_train_all)

print("\n" + "="*50)
print("Best parameters found:")
print(random_search.best_params_)
# The scorer is negated MAE, so flip the sign to report a positive error
print(f"\nBest CV MAE: ${-random_search.best_score_:,.2f}")

In [None]:
# Score the best estimator found by the search on the held-out test split
best_xgb = random_search.best_estimator_
y_pred_all = best_xgb.predict(X_test_all)

mae_all = mean_absolute_error(y_test_all, y_pred_all)
r2_all = r2_score(y_test_all, y_pred_all)

# Emit the three report lines in a single print call
report_lines = [
    "Tuned XGBoost Test Performance:",
    f"  MAE: ${mae_all:,.2f}",
    f"  R²:  {r2_all:.4f}",
]
print("\n".join(report_lines))

---
## 7. Final Model Training and Export <a id="7-final-model-training-and-export"></a>

Train the final model on the complete dataset (training and test years combined) using the tuned hyperparameters, then save it for deployment.

In [None]:
# Use every record (train + test years) for the final fit
y_full = ds_all['resale_price']
X_full = ds_all.drop(columns='resale_price')

# Tuned hyperparameters plus the fixed seed / threading settings
best_params = {**random_search.best_params_, 'random_state': 42, 'n_jobs': -1}

print("Training final model with parameters:")
for param, value in best_params.items():
    print(f"  {param}: {value}")

# Fit the deployment model on the complete dataset
final_xgb = xgb.XGBRegressor(**best_params)
final_xgb.fit(X_full, y_full)

print(f"\nFinal model trained on {len(X_full):,} samples")

In [None]:
# Persist the fitted final model to disk so it can be loaded for deployment
model_path = 'xgb_resale_all.joblib'
joblib.dump(final_xgb, filename=model_path)

print(f"Model saved to: {model_path}")

In [None]:
# Pair each feature name with its importance in the final model,
# ordered from most to least important
feature_importance = (
    pd.DataFrame({
        'feature': X_full.columns,
        'importance': final_xgb.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)

print("Feature Importances:")
print(feature_importance.to_string(index=False))

# Horizontal bar chart; the y-axis is inverted so the top feature is on top
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(feature_importance['feature'], feature_importance['importance'])
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('XGBoost Feature Importances')
ax.invert_yaxis()
fig.tight_layout()
plt.show()