In [None]:
# CHECK IF NULL

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Lift pandas' row display cap so the per-column report below is never truncated.
pd.set_option("display.max_rows", None)

# NOTE(review): the data-prep cell writes this file as "Data/cleaned_data.csv";
# confirm that the bare relative path used here points at the same file.
df = pd.read_csv('cleaned_data.csv')

# One count per column: how many entries in that column are null.
null_counts = df.isnull().sum(axis=0)
print(null_counts)

*** Get all relevant data ***

In [1]:
import pandas as pd

# Load datasets: the base resale records plus two files that carry extra
# amenity columns (MRT stations and parks) for the same key columns.
data = pd.read_csv("Data/data_simple.csv")
data_ltamrt = pd.read_csv("Data/data_complex_LTAMRTStation.csv")
data_nparks = pd.read_csv("Data/data_complex_NParks.csv")

# Columns shared by all three files and used as the join key.
merge_keys = ['month', 'town', 'flat_type', 'block', 'street_name']

# The key does not identify a single sale (e.g. storey_range is not part of it),
# so the same key can appear on several rows. Joining such rows against an
# equally duplicated right-hand table would emit one output row per matching
# pair and silently multiply transactions. Reduce each amenity table to one row
# per key first.
# NOTE(review): this keeps the first row per key, presuming the amenity values
# are identical for rows sharing a key (same block and street) - confirm.
ltamrt_features = data_ltamrt[
    merge_keys + ['LTAMRTStation_within_1km', 'LTAMRTStation_nearest']
].drop_duplicates(subset=merge_keys)
nparks_features = data_nparks[
    merge_keys + ['NParks_within_1km', 'NParks_nearest']
].drop_duplicates(subset=merge_keys)

# Merge datasets on common columns; `validate` makes pandas raise if the
# right-hand side is ever not unique on the key.
data_merged = data.merge(
    ltamrt_features,
    on=merge_keys,
    how='left',
    validate='many_to_one'
).merge(
    nparks_features,
    on=merge_keys,
    how='left',
    validate='many_to_one'
)

# A left join against unique keys must keep exactly the original transactions.
assert len(data_merged) == len(data), "Merge changed the number of rows!"

# Reorder columns
final_columns = [
    "month", "town", "flat_type", "block", "street_name", "storey_range", "floor_area_sqm", "flat_model", 
    "lease_commence_date", "remaining_lease", "resale_price", "Latitude", "Longitude", 
    "LTAMRTStation_within_1km", "LTAMRTStation_nearest", "MallCoordinates_within_1km", "MallCoordinates_nearest", 
    "Hawker_within_1km", "Hawker_nearest", "PreSchool_within_1km", "PreSchool_nearest", 
    "Primary_within_1km", "Primary_nearest", "Secondary_within_1km", "Secondary_nearest", 
    "JuniorCollege_within_1km", "JuniorCollege_nearest", "MixedLevel_within_1km", "MixedLevel_nearest", 
    "NParks_within_1km", "NParks_nearest", "Sports_within_1km", "Sports_nearest"
]
data_merged = data_merged[final_columns]

# Save to CSV
data_merged.to_csv("Data/data.csv", index=False)

# Display first few rows
data_merged.head()


Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,...,Secondary_within_1km,Secondary_nearest,JuniorCollege_within_1km,JuniorCollege_nearest,MixedLevel_within_1km,MixedLevel_nearest,NParks_within_1km,NParks_nearest,Sports_within_1km,Sports_nearest
0,2017-01,ANG MO KIO,2 ROOM,406,ANG MO KIO AVE 10,10 TO 12,44.0,Improved,1979,61 years 04 months,...,4,0.379514,0,1.125769,0,2.220834,3,0.14688,0,1.427928
1,2017-01,ANG MO KIO,3 ROOM,108,ANG MO KIO AVE 4,01 TO 03,67.0,New Generation,1978,60 years 07 months,...,0,1.323435,1,0.781657,1,0.890652,17,0.554452,1,0.707011
2,2017-01,ANG MO KIO,3 ROOM,602,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,...,0,1.338299,0,1.15859,1,0.710017,15,0.132971,0,1.14578
3,2017-01,ANG MO KIO,3 ROOM,465,ANG MO KIO AVE 10,04 TO 06,68.0,New Generation,1980,62 years 01 month,...,1,0.532181,0,1.77062,0,1.799429,4,0.533977,2,0.879483
4,2017-01,ANG MO KIO,3 ROOM,601,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,...,0,1.364263,0,1.204581,1,0.732892,17,0.131067,0,1.177439


*** HDB Resale Data Prep + Normalisation ***

In [None]:
import pandas as pd
import json
import re
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Read the merged dataset written by the previous cell
file_path = "Data/data.csv"  # Adjust if needed
df = pd.read_csv(file_path)

# Step 1: Split 'month' into a numeric year and a numeric month.
# The new 'year' column is placed immediately after 'month'.
month_ts = pd.to_datetime(df['month'])
year_position = df.columns.get_loc('month') + 1
df.insert(year_position, 'year', month_ts.dt.year)
df['month'] = month_ts.dt.month

# Step 2: Derive the lease age from 'lease_commence_date' and place it
# immediately after that column.
# NOTE(review): the reference year 2025 is hard-coded, so lease_age is measured
# against 2025 for every row regardless of the sale year - confirm intent.
lease_age_position = df.columns.get_loc('lease_commence_date') + 1
df.insert(lease_age_position, 'lease_age', 2025 - df['lease_commence_date'])
# Step 3: Convert 'remaining_lease' from text to numeric format
def convert_remaining_lease(lease_str):
    """Convert a remaining-lease description into a number of years.

    Accepted forms (leading/trailing whitespace is ignored):
      * "<Y> years <M> months" / "<Y> years <M> month" -> Y + M / 12 (float)
      * "<Y> years" / "<Y> year"                       -> Y (int)
      * "<Y>" (digits only)                            -> Y (int)

    Both singular and plural unit words are accepted, so a value such as
    "62 years 01 month" keeps its month instead of being truncated to 62.

    Args:
        lease_str: The raw value; it is passed through ``str()`` first, so
            non-string inputs are tolerated.

    Returns:
        The lease length in years, or ``np.nan`` when the value matches none
        of the accepted forms.
    """
    lease_str = str(lease_str).strip()  # Ensure string format and remove spaces

    # Years are mandatory, the month part is optional; "s" is optional on both
    # unit words. Matching is anchored at the start of the string only.
    match = re.match(r"(\d+)\s+years?(?:\s+(\d+)\s+months?)?", lease_str)
    if match:
        years = int(match.group(1))
        months = match.group(2)
        if months is None:
            return years  # Only years provided
        return years + int(months) / 12  # Convert months to a fraction of a year

    if lease_str.isdigit():  # A bare number is taken as whole years
        return int(lease_str)

    return np.nan  # Missing or unexpected value

# Parse every remaining-lease string into a numeric number of years
parsed_lease = df['remaining_lease'].apply(convert_remaining_lease)
df['remaining_lease'] = parsed_lease

# Report how many values could not be parsed
print("Missing values in remaining_lease after fix:", parsed_lease.isna().sum())

# Step 4: Encode categorical variables. The original text columns are kept;
# the encoded columns are inserted directly after them.
categorical_cols = ["town", "flat_type", "flat_model"]  # storey_range gets its own ordinal mapping below
df_encoded = df.copy()

label_encoders = {}    # fitted LabelEncoder per categorical column
encoded_mappings = {}  # class -> integer code per column, later written to JSON

for col in categorical_cols:
    # Label encoding: "<col>_LE" goes immediately after the source column
    le = LabelEncoder()
    label_encoded_col = f"{col}_LE"
    codes = le.fit_transform(df_encoded[col])
    df_encoded.insert(df_encoded.columns.get_loc(col) + 1, label_encoded_col, codes)
    label_encoders[col] = le
    encoded_mappings[col] = dict(zip(le.classes_, le.transform(le.classes_)))

    # One-hot encoding (first level dropped). Every dummy column is inserted at
    # the slot right after "<col>_LE", so the dummies end up in reverse order
    # relative to get_dummies' own column order.
    ohe_cols = pd.get_dummies(df[col], prefix=col, drop_first=True)
    dummy_slot = df_encoded.columns.get_loc(label_encoded_col) + 1
    for ohe_col in ohe_cols.columns:
        df_encoded.insert(dummy_slot, ohe_col, ohe_cols[ohe_col])

# Ordinal code for each three-storey band: "01 TO 03" -> 1 ... "49 TO 51" -> 17.
# Bands not present in this mapping become NaN in storey_range_LE.
storey_mapping = {
    f"{low:02d} TO {low + 2:02d}": rank
    for rank, low in enumerate(range(1, 50, 3), start=1)
}
storey_codes = df_encoded["storey_range"].map(storey_mapping)
df_encoded.insert(df_encoded.columns.get_loc("storey_range") + 1, "storey_range_LE", storey_codes)

# json cannot serialise NumPy integers, so convert keys to str and codes to
# built-in int before writing.
encoded_mappings_fixed = {}
for col, mapping in encoded_mappings.items():
    encoded_mappings_fixed[col] = {str(k): int(v) for k, v in mapping.items()}

# Persist the label-encoder mappings as human-readable JSON
with open("label_encoders.json", "w") as f:
    json.dump(encoded_mappings_fixed, f, indent=4)

# Step 5: Feature engineering - price per square metre, placed right after
# `floor_area_sqm`.
# NOTE(review): this column is computed from resale_price, so using it as a
# predictor of resale_price leaks the target - confirm it is only descriptive.
price_per_sqm_values = df_encoded['resale_price'] / df_encoded['floor_area_sqm']
price_per_sqm_slot = df_encoded.columns.get_loc('floor_area_sqm') + 1
df_encoded.insert(price_per_sqm_slot, 'price_per_sqm', price_per_sqm_values)

# Un-normalised version (for tree-based models)
df_encoded.to_csv("Data/cleaned_data.csv", index=False)

# Columns rescaled to [0, 1] for the linear models
numerical_cols = [
    "floor_area_sqm", "price_per_sqm", "lease_commence_date", "resale_price", "Latitude", "Longitude", 
    "lease_age", "remaining_lease", "LTAMRTStation_nearest", "MallCoordinates_nearest",
    "Hawker_nearest", "PreSchool_nearest", "Primary_nearest", "Secondary_nearest",
    "JuniorCollege_nearest", "MixedLevel_nearest", "NParks_nearest", "Sports_nearest"
]

# NOTE(review): the scaler is fitted on every row, before any train/test split,
# and it also rescales the target `resale_price`; downstream metrics computed on
# this file are therefore in scaled units.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df_encoded[numerical_cols])
df_encoded[numerical_cols] = scaled_values

# Normalised version (for Linear Regression, GWR)
df_encoded.to_csv("Data/cleaned_data_normalisation.csv", index=False)

# Summarise which columns belong to which modelling set.

# Columns flagged as not to be used for model training
non_model_columns = ["month", "block", "street_name"]
print("\n📌 **Columns that should NOT be included in model training:**")
print(non_model_columns)

# Label-encoded columns: one "<col>_LE" per categorical column plus the storey code
label_encoded_columns = [f"{col}_LE" for col in categorical_cols]
label_encoded_columns.append("storey_range_LE")
print("\n📌 **Columns with Label Encoding (for Tree-Based Models):**")
print(label_encoded_columns)

# One-hot columns: everything prefixed "<col>_" for town, flat_type and
# flat_model, except the label-encoded columns collected above
one_hot_encoded_columns = [
    c
    for col in categorical_cols
    for c in df_encoded.columns
    if c.startswith(col + "_") and c not in label_encoded_columns
]

print("\n📌 **Columns with One-Hot Encoding (for Linear Models):**")
print(one_hot_encoded_columns)

# Final confirmation messages
print("\n✅ Data Prep Complete: Both versions saved.")
print("✅ LabelEncoders saved in JSON format.")


Missing values in remaining_lease after fix: 0

📌 **Columns that should NOT be included in model training:**
['month', 'block', 'street_name']

📌 **Columns with Label Encoding (for Tree-Based Models):**
['town_LE', 'flat_type_LE', 'flat_model_LE', 'storey_range_LE']

📌 **Columns with One-Hot Encoding (for Linear Models):**
['town_YISHUN', 'town_WOODLANDS', 'town_TOA PAYOH', 'town_TAMPINES', 'town_SERANGOON', 'town_SENGKANG', 'town_SEMBAWANG', 'town_QUEENSTOWN', 'town_PUNGGOL', 'town_PASIR RIS', 'town_MARINE PARADE', 'town_KALLANG/WHAMPOA', 'town_JURONG WEST', 'town_JURONG EAST', 'town_HOUGANG', 'town_GEYLANG', 'town_CLEMENTI', 'town_CHOA CHU KANG', 'town_CENTRAL AREA', 'town_BUKIT TIMAH', 'town_BUKIT PANJANG', 'town_BUKIT MERAH', 'town_BUKIT BATOK', 'town_BISHAN', 'town_BEDOK', 'flat_type_MULTI-GENERATION', 'flat_type_EXECUTIVE', 'flat_type_5 ROOM', 'flat_type_4 ROOM', 'flat_type_3 ROOM', 'flat_type_2 ROOM', 'flat_model_Type S2', 'flat_model_Type S1', 'flat_model_Terrace', 'flat_model_

*** Split Data into Train & Test ***

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load both datasets
cleaned_df = pd.read_csv("Data/cleaned_data.csv")  # Cleaned data
normalized_df = pd.read_csv("Data/cleaned_data_normalisation.csv")  # Normalized data

# The two files must be row- and column-aligned, otherwise sharing one set of
# split indices between them would be meaningless.
assert cleaned_df.shape == normalized_df.shape, "Datasets have different shapes!"
assert (cleaned_df.columns == normalized_df.columns).all(), "Columns do not match!"
assert (cleaned_df.index == normalized_df.index).all(), "Indexes do not match!"

# Define target variable
target_column = "resale_price"

# Create an index column to ensure consistent splitting
cleaned_df["index"] = cleaned_df.index  # Create index reference
normalized_df["index"] = normalized_df.index  # Keep the same index

# Split only the row labels (80/20, fixed seed) and reuse them for both frames
train_idx, test_idx = train_test_split(cleaned_df["index"], test_size=0.2, random_state=42)

# Use the same indices to split both datasets; the helper column is dropped again
cleaned_train = cleaned_df.loc[train_idx].drop(columns=["index"])
cleaned_test = cleaned_df.loc[test_idx].drop(columns=["index"])
normalized_train = normalized_df.loc[train_idx].drop(columns=["index"])
normalized_test = normalized_df.loc[test_idx].drop(columns=["index"])

# Save the train/test splits for both datasets
cleaned_train.to_csv("Data/cleaned_train.csv", index=False)
cleaned_test.to_csv("Data/cleaned_test.csv", index=False)
normalized_train.to_csv("Data/normalized_train.csv", index=False)
normalized_test.to_csv("Data/normalized_test.csv", index=False)

# Sanity check: these identifying columns must be identical in the cleaned and
# normalized versions of each split.
columns_to_check = ["month", "year", "town", "block", "street_name"]

train_check = cleaned_train[columns_to_check].equals(normalized_train[columns_to_check])
test_check = cleaned_test[columns_to_check].equals(normalized_test[columns_to_check])

if train_check and test_check:
    print("✅ The train and test sets are consistent across cleaned and normalized datasets.")
    # Only claim a consistent split when the check above actually passed; this
    # message used to be printed even right after the failure message.
    print("✅ Training and test datasets are consistently split and saved as CSVs!")
else:
    print("❌ Inconsistency detected in train/test splits. Check data alignment!")


✅ The train and test sets are consistent across cleaned and normalized datasets.
✅ Training and test datasets are consistently split and saved as CSVs!


*** Test Model = Linear Regression ***

In [None]:
# Feature lists used to select predictors of `resale_price`.
#
# `price_per_sqm` is deliberately NOT a feature: the data-prep cell computes it
# as resale_price / floor_area_sqm, so together with `floor_area_sqm` it would
# let any model reconstruct the target exactly (target leakage).
#
# NOTE(review): the data-prep cell lists "month" among the columns that should
# not be used for training, yet it is included in both lists - confirm.
# NOTE(review): `lease_age` is computed as 2025 - lease_commence_date, so those
# two columns carry the same information.

# Predictors for linear models: one-hot columns for town / flat_type / flat_model.
linear_model_columns = [
    "month", "year", 

    "town_YISHUN", "town_WOODLANDS", "town_TOA PAYOH", "town_TAMPINES", "town_SERANGOON", "town_SENGKANG", "town_SEMBAWANG", "town_QUEENSTOWN", "town_PUNGGOL", "town_PASIR RIS", 
    "town_MARINE PARADE", "town_KALLANG/WHAMPOA", "town_JURONG WEST", "town_JURONG EAST", "town_HOUGANG", "town_GEYLANG", "town_CLEMENTI", "town_CHOA CHU KANG", "town_CENTRAL AREA", 
    "town_BUKIT TIMAH", "town_BUKIT PANJANG", "town_BUKIT MERAH", "town_BUKIT BATOK", "town_BISHAN", "town_BEDOK", 

    "flat_type_MULTI-GENERATION", "flat_type_EXECUTIVE", "flat_type_5 ROOM", "flat_type_4 ROOM", "flat_type_3 ROOM", "flat_type_2 ROOM", 

    "storey_range_LE", 
    "floor_area_sqm", 

    "flat_model_Type S2", "flat_model_Type S1", "flat_model_Terrace", "flat_model_Standard", "flat_model_Simplified", "flat_model_Premium Maisonette", "flat_model_Premium Apartment Loft", 
    "flat_model_Premium Apartment", "flat_model_New Generation", "flat_model_Multi Generation", "flat_model_Model A2", "flat_model_Model A-Maisonette", "flat_model_Model A", 
    "flat_model_Maisonette", "flat_model_Improved-Maisonette", "flat_model_Improved", "flat_model_DBSS", "flat_model_Apartment", "flat_model_Adjoined flat", "flat_model_3Gen", 

    "lease_commence_date", "lease_age", "remaining_lease", 

    "Latitude", "Longitude", 

    "LTAMRTStation_within_1km", "LTAMRTStation_nearest", 
    "MallCoordinates_within_1km", "MallCoordinates_nearest", "Hawker_within_1km", "Hawker_nearest", 
    "PreSchool_within_1km", "PreSchool_nearest", "Primary_within_1km", "Primary_nearest", "Secondary_within_1km", "Secondary_nearest", 
    "JuniorCollege_within_1km", "JuniorCollege_nearest", "MixedLevel_within_1km", "MixedLevel_nearest", 
    "NParks_within_1km", "NParks_nearest", "Sports_within_1km", "Sports_nearest"
]

# Predictors for tree-based models: label-encoded (`*_LE`) categorical columns.
tree_model_columns = [
    "month", "year", 

    "town_LE", 

    "flat_type_LE",

    "storey_range_LE", 
    "floor_area_sqm", 

    "flat_model_LE", 
    
    "lease_commence_date", "lease_age", "remaining_lease", 

    "Latitude", "Longitude", 

    "LTAMRTStation_within_1km", "LTAMRTStation_nearest", 
    "MallCoordinates_within_1km", "MallCoordinates_nearest", "Hawker_within_1km", "Hawker_nearest", 
    "PreSchool_within_1km", "PreSchool_nearest", "Primary_within_1km", "Primary_nearest", "Secondary_within_1km", "Secondary_nearest", 
    "JuniorCollege_within_1km", "JuniorCollege_nearest", "MixedLevel_within_1km", "MixedLevel_nearest", 
    "NParks_within_1km", "NParks_nearest", "Sports_within_1km", "Sports_nearest"
]

In [None]:
# Test Model = Linear Regression

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Read the min-max-normalised dataset written by the data-prep cell
df = pd.read_csv("Data/cleaned_data_normalisation.csv")

# Predictors come from the linear-model feature list; the target is resale_price.
# NOTE(review): resale_price in this file was rescaled by the MinMaxScaler in the
# data-prep cell, so the error metrics below are in scaled units, not dollars.
y = df["resale_price"]
X = df[linear_model_columns].copy()

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit ordinary least squares on the training portion
model = LinearRegression()
model.fit(X_train, y_train)

# Predict the held-out portion
y_pred = model.predict(X_test)

# Score the predictions; RMSE is derived from MSE
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the metrics
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R² Score: {r2:.4f}")
