In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, r2_score

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
from prophet import Prophet


In [13]:
# Load processed main dataset
hdb_model = pd.read_csv('HDB_model_ready.csv')


In [14]:
# Dataset with flat_type_int in [3,4,5]
ds_imp_rooms = hdb_model[hdb_model['flat_type_int'].isin([3,4,5])]

# Main dataset (ALL rows)
ds_all = hdb_model.copy()

# Datasets split by region_code
ds_region_0 = hdb_model[hdb_model['region_code'] == 0]
ds_region_1 = hdb_model[hdb_model['region_code'] == 1]
ds_region_2 = hdb_model[hdb_model['region_code'] == 2]

# Split ds_imp_rooms by region_code
ds_imp_rooms_0 = ds_imp_rooms[ds_imp_rooms['region_code'] == 0]
ds_imp_rooms_1 = ds_imp_rooms[ds_imp_rooms['region_code'] == 1]
ds_imp_rooms_2 = ds_imp_rooms[ds_imp_rooms['region_code'] == 2]


In [15]:
def year_split(df):
    train = df[df['year'] < 2024]
    test = df[df['year'] >= 2024]
    X_train = train.drop(columns=['resale_price'])
    y_train = train['resale_price']
    X_test = test.drop(columns=['resale_price'])
    y_test = test['resale_price']
    return X_train, X_test, y_train, y_test


# Datasets dictionary keys and corresponding DataFrames
dataset_names = [
    'imp_rooms', 'imp_rooms_0', 'imp_rooms_1', 'imp_rooms_2',
    'all', 'region_0', 'region_1', 'region_2'
]

dataset_list = [
    ds_imp_rooms, ds_imp_rooms_0, ds_imp_rooms_1, ds_imp_rooms_2,
    ds_all, ds_region_0, ds_region_1, ds_region_2
]

splits = {}

for name, dataset in zip(dataset_names, dataset_list):
    X_train, X_test, y_train, y_test = year_split(dataset)
    splits[name] = (X_train, X_test, y_train, y_test)



In [None]:
results = {}

for name, (X_train, X_test, y_train, y_test) in splits.items():
    models = {
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'LinearRegression': LinearRegression(),
    'ElasticNet': ElasticNet(random_state=42),
    'MLPRegressor': MLPRegressor(hidden_layer_sizes=(100,), max_iter=300, random_state=42),
    'XGBoost': xgb.XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'LightGBM': lgb.LGBMRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'CatBoost': CatBoostRegressor(iterations=100, random_seed=42, verbose=0),
    'KNN': KNeighborsRegressor(n_neighbors=5)
}

    model_result = {}
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        model_result[model_name] = {
            'model': model,
            'mae': mae,
            'r2': r2,
            'feature_importance': getattr(model, 'feature_importances_', None)
        }
    results[name] = model_result

# Print accuracy for each dataset and model
for ds_name, model_dict in results.items():
    print(f"\n=== Results for dataset: {ds_name} ===")
    for m_name, res in model_dict.items():
        print(f"{m_name}: MAE = {res['mae']:.2f}, R² = {res['r2']:.4f}")




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001897 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1788
[LightGBM] [Info] Number of data points in the train set: 153448, number of used features: 16
[LightGBM] [Info] Start training from score 478721.163625




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000459 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1761
[LightGBM] [Info] Number of data points in the train set: 30086, number of used features: 15
[LightGBM] [Info] Start training from score 572923.160905


In [None]:
X_train, X_test, y_train, y_test = year_split(ds_all)


In [7]:

models = {
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'LinearRegression': LinearRegression(),
    'ElasticNet': ElasticNet(random_state=42),
    'MLPRegressor': MLPRegressor(hidden_layer_sizes=(100,), max_iter=300, random_state=42),
    'XGBoost': xgb.XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'LightGBM': lgb.LGBMRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'CatBoost': CatBoostRegressor(iterations=100, random_seed=42, verbose=0),
    'KNN': KNeighborsRegressor(n_neighbors=5)
}

results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {'model': model, 'mae': mae, 'r2': r2}
    print(f"{name}: MAE = {mae:.2f}, R² = {r2:.4f}")


RandomForest: MAE = 59073.06, R² = 0.8601
DecisionTree: MAE = 60759.71, R² = 0.8433
LinearRegression: MAE = 69592.18, R² = 0.7467
ElasticNet: MAE = 74796.84, R² = 0.6637




MLPRegressor: MAE = 156624.46, R² = 0.1721
XGBoost: MAE = 57318.73, R² = 0.8703
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001950 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1837
[LightGBM] [Info] Number of data points in the train set: 169150, number of used features: 16
[LightGBM] [Info] Start training from score 490971.586248
LightGBM: MAE = 60388.56, R² = 0.8483
CatBoost: MAE = 58301.29, R² = 0.8642
KNN: MAE = 78288.21, R² = 0.7321


In [10]:
# Use monthly average resale_price for ARIMA — example assumes 'month_num' and 'year' columns exist

df_arima = ds_all.groupby(['year', 'month_num'])['resale_price'].mean().reset_index()

# Rename 'month_num' to 'month' for proper datetime parsing
df_arima['date'] = pd.to_datetime(df_arima.rename(columns={'month_num': 'month'})[['year', 'month']].assign(day=1))

ts = df_arima.set_index('date')['resale_price']


# Train/test split for ARIMA time series
train_ts = ts[ts.index.year < 2024]
test_ts = ts[ts.index.year >= 2024]

# Fit ARIMA model (example parameters, tune as needed)
arima_model = sm.tsa.ARIMA(train_ts, order=(5,1,0))
arima_res = arima_model.fit()

# Predict on test set dates
pred_arima = arima_res.predict(start=test_ts.index[0], end=test_ts.index[-1], typ='levels')

# Evaluate ARIMA
mae_arima = mean_absolute_error(test_ts, pred_arima)
r2_arima = r2_score(test_ts, pred_arima)
print(f"ARIMA: MAE = {mae_arima:.2f}, R² = {r2_arima:.4f}")


ARIMA: MAE = 52149.74, R² = -4.6828


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


In [11]:
# Prepare data for Prophet
df_prophet = df_arima[['date', 'resale_price']].rename(columns={'date': 'ds', 'resale_price': 'y'})

# Split train and test
train_prophet = df_prophet[df_prophet['ds'].dt.year < 2024]
test_prophet = df_prophet[df_prophet['ds'].dt.year >= 2024]

prophet_model = Prophet()
prophet_model.fit(train_prophet)

forecast = prophet_model.predict(test_prophet[['ds']])
pred_prophet = forecast['yhat']

# Evaluate Prophet
mae_prophet = mean_absolute_error(test_prophet['y'], pred_prophet)
r2_prophet = r2_score(test_prophet['y'], pred_prophet)
print(f"Prophet: MAE = {mae_prophet:.2f}, R² = {r2_prophet:.4f}")


19:20:33 - cmdstanpy - INFO - Chain [1] start processing
19:20:34 - cmdstanpy - INFO - Chain [1] done processing


Prophet: MAE = 28965.59, R² = -0.8205


In [None]:

models = {
    'SVR': SVR(kernel='rbf'),
}

results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {'model': model, 'mae': mae, 'r2': r2}
    print(f"{name}: MAE = {mae:.2f}, R² = {r2:.4f}")
