### Initial Cleaning of HDB Resale Dataset -- Before Feature Extraction ###

In [None]:
import pandas as pd
import json
import pickle
from sklearn.preprocessing import LabelEncoder
from category_encoders import TargetEncoder

# Load dataset
file_path = "Data_Raw/ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv"  # Adjust if needed
df = pd.read_csv(file_path)

# Step 1: Convert 'month' to separate year and month columns
df['month'] = pd.to_datetime(df['month'])
df.insert(df.columns.get_loc('month') + 1, 'year', df['month'].dt.year)
df['month'] = df['month'].dt.month

# Step 2: Encode Categorical Variables - Label Encoding (For Tree-Based Models)
categorical_cols = ["town", "flat_type", "flat_model"]
label_encoders = {}
encoded_mappings = {}

df_encoded = df.copy()

for col in categorical_cols:
    le = LabelEncoder()
    label_encoded_col = col + "_LE"
    df_encoded.insert(df_encoded.columns.get_loc(col) + 1, label_encoded_col, le.fit_transform(df_encoded[col]))
    label_encoders[col] = le
    encoded_mappings[col] = dict(zip(le.classes_, le.transform(le.classes_)))

# Step 3: Target Encoding (For Models that Benefit from It)
target_col = 'resale_price'

df_encoded[categorical_cols] = df_encoded[categorical_cols].astype(str)
encoder = TargetEncoder(cols=categorical_cols)
encoder.fit(df_encoded[categorical_cols], df_encoded[target_col])

target_encoded_values = encoder.transform(df_encoded[categorical_cols])
target_encoded_values.columns = [f"{col}_TE" for col in categorical_cols]

# Insert TE columns after corresponding LE columns
for col in categorical_cols:
    le_col = col + "_LE"
    te_col = col + "_TE"
    df_encoded.insert(df_encoded.columns.get_loc(le_col) + 1, te_col, target_encoded_values[te_col])

# Step 4: Map 'storey_range' to Numeric Values
storey_mapping = {
    "01 TO 03": 1, "04 TO 06": 2, "07 TO 09": 3, "10 TO 12": 4, "13 TO 15": 5, 
    "16 TO 18": 6, "19 TO 21": 7, "22 TO 24": 8, "25 TO 27": 9, "28 TO 30": 10, 
    "31 TO 33": 11, "34 TO 36": 12, "37 TO 39": 13, "40 TO 42": 14, "43 TO 45": 15, 
    "46 TO 48": 16, "49 TO 51": 17
}
df_encoded.insert(df_encoded.columns.get_loc("storey_range") + 1, "storey_range_LE", df_encoded["storey_range"].map(storey_mapping))

# Step 5: Calculating price_per_sqm
df_encoded.insert(df_encoded.columns.get_loc('floor_area_sqm') + 1, 'price_per_sqm', df_encoded['resale_price'] / df_encoded['floor_area_sqm'])

# Step 6: Save Cleaned Data
df_encoded.to_csv("Data/HDBResale_cleaned.csv", index=False)

# Print confirmation of saved data
print("\n✅ Data saved at: Data/HDBResale_cleaned.csv")

# Save LabelEncoders in JSON
decoded_mappings = {col: {str(k): int(v) for k, v in mapping.items()} for col, mapping in encoded_mappings.items()}
with open("label_encoders.json", "w") as f:
    json.dump(decoded_mappings, f, indent=4)

# Print confirmation of saved data
print("✅ Label encoders saved at: label_encoders.json")


### Feature Engineering - Geocoding ###

In [None]:
import pandas as pd
import requests
import time

# Load dataset
file_path = "Data/HDBResale_cleaned.csv"
df = pd.read_csv(file_path)

# Ensure the required columns exist
if "block" not in df.columns or "street_name" not in df.columns:
    raise ValueError("Dataset must contain 'block' and 'street_name' columns.")

# Local Nominatim instance URL
NOMINATIM_URL = "http://localhost:8080/search"

# Function to get coordinates from Nominatim
def get_coordinates(address):
    print(f"Geocoding - {address}")
    params = {"q": address, "format": "json"}
    response = requests.get(NOMINATIM_URL, params=params)
    
    if response.status_code == 200:
        data = response.json()
        if data:
            return data[0]["lat"], data[0]["lon"]  # Return latitude, longitude
    return None, None  # Return None if no result

# Apply geocoding to each row
df["Latitude"], df["Longitude"] = zip(*df.apply(lambda row: get_coordinates(f"{row['block']} {row['street_name']}"), axis=1))

# Save the new dataset with coordinates
output_file = "HDBResale_with_coordinates.csv"
df.to_csv(output_file, index=False)

print(f"Geocoded dataset saved as: {output_file}")


In [19]:
import pandas as pd
import requests
import time

# Load dataset
file_path = "HDBResale_with_coordinates.csv"
df = pd.read_csv(file_path)

# Ensure the required columns exist
if "block" not in df.columns or "street_name" not in df.columns:
    raise ValueError("Dataset must contain 'block' and 'street_name' columns.")

# Ensure Latitude and Longitude columns exist
if "Latitude" not in df.columns or "Longitude" not in df.columns:
    df["Latitude"], df["Longitude"] = None, None  # Create empty columns if they don't exist

# Local Nominatim instance URL
NOMINATIM_URL = "http://localhost:8080/search"

# Function to get coordinates from Nominatim
def get_coordinates(address):
    print(f"Geocoding - {address}")
    params = {"q": address, "format": "json"}
    
    try:
        response = requests.get(NOMINATIM_URL, params=params, timeout=5)
        if response.status_code == 200:
            data = response.json()
            if data:
                return data[0]["lat"], data[0]["lon"]  # Return latitude, longitude
    except requests.exceptions.RequestException as e:
        print(f"Error with request: {e}")
    
    return None, None  # Return None if no result

# Process only rows with missing coordinates
for index, row in df.iterrows():
    if pd.isna(row["Latitude"]) or pd.isna(row["Longitude"]):
        address = f"{row['block']} {row['street_name']}"
        lat, lon = get_coordinates(address)
        
        # Save new coordinates if found
        df.at[index, "Latitude"] = lat
        df.at[index, "Longitude"] = lon

        # Optional: Introduce a short delay to prevent excessive requests
        # time.sleep(1)

# Save the updated dataset
output_file = "HDBResale_with_coordinates.csv"
df.to_csv(output_file, index=False)

print(f"Geocoded dataset saved as: {output_file}")


Geocoding - 101 BEDOK NORTH AVE 4
Geocoding - 548 BEDOK NORTH AVE 1
Geocoding - 75 BEDOK NORTH RD
Geocoding - 550 BEDOK NORTH AVE 1
Geocoding - 528 BEDOK NORTH ST 3
Geocoding - 534 BEDOK NORTH ST 3
Geocoding - 550 BEDOK NORTH AVE 1
Geocoding - 77 BEDOK NORTH RD


  df.at[index, "Latitude"] = lat
  df.at[index, "Longitude"] = lon


Geocoding - 57 NEW UPPER CHANGI RD
Geocoding - 57 NEW UPPER CHANGI RD
Geocoding - 105 BEDOK NORTH AVE 4
Geocoding - 75 BEDOK NORTH RD
Geocoding - 425 BEDOK NORTH RD
Geocoding - 117 BEDOK NORTH RD
Geocoding - 423 BEDOK NORTH AVE 1
Geocoding - 533 BEDOK NORTH ST 3
Geocoding - 80 BEDOK NORTH RD
Geocoding - 78 BEDOK NORTH RD
Geocoding - 81 BEDOK NORTH RD
Geocoding - 124 BEDOK NORTH RD
Geocoding - 705 BEDOK NORTH RD
Geocoding - 554 BEDOK NORTH ST 3
Geocoding - 81 BEDOK NORTH RD
Geocoding - 508 BEDOK NORTH AVE 3
Geocoding - 185 BEDOK NORTH RD
Geocoding - 220B BEDOK CENTRAL
Geocoding - 415 BEDOK NORTH AVE 2
Geocoding - 219C BEDOK CENTRAL
Geocoding - 94C BEDOK NORTH AVE 4
Geocoding - 342 BUKIT BATOK ST 34
Geocoding - 530 BUKIT BATOK ST 51
Geocoding - 145 BUKIT BATOK ST 11
Geocoding - 528 BUKIT BATOK ST 51
Geocoding - 537 BUKIT BATOK ST 52
Geocoding - 213 BUKIT BATOK ST 21
Geocoding - 219 BUKIT BATOK ST 21
Geocoding - 235 BUKIT BATOK EAST AVE 5
Geocoding - 130 BUKIT BATOK WEST AVE 6
Geocoding -

In [None]:
import pandas as pd

# Load dataset
file_path = "HDBResale_with_coordinates.csv"  # Adjust if needed
df = pd.read_csv(file_path)

# Ensure Latitude and Longitude columns exist
if "Latitude" not in df.columns or "Longitude" not in df.columns:
    raise ValueError("Dataset must contain 'Latitude' and 'Longitude' columns.")

# Filter rows where either Latitude or Longitude is missing
missing_coords_df = df[df["Latitude"].isna() | df["Longitude"].isna()]

# Save to a new CSV file
output_file = "Missing_Coordinates.csv"
missing_coords_df.to_csv(output_file, index=False)

print(f"Rows with missing coordinates saved as: {output_file}")


Rows with missing coordinates saved as: Missing_Coordinates.csv


### Feature Engineering - Number of location within 1km ###

1. Run files feature_engineering-complex_coordinates.py and feature_engineering.py to computed the number of each type of location within a 1km radius of every property.

2. Merge datasets

In [None]:
import pandas as pd

# Load datasets
data = pd.read_csv("Data/data_simple.csv")
data_ltamrt = pd.read_csv("Data/data_complex_LTAMRTStation.csv")
data_nparks = pd.read_csv("Data/data_complex_NParks.csv")

# Merge datasets on common columns
data_merged = data.merge(
    data_ltamrt[["month", "year", 'town', 'flat_type', 'block', 'street_name', 'LTAMRTStation_within_1km', 'LTAMRTStation_nearest']],
    on=["month", "year", 'town', 'flat_type', 'block', 'street_name'],
    how='left'
).merge(
    data_nparks[["month", "year", 'town', 'flat_type', 'block', 'street_name', 'NParks_within_1km', 'NParks_nearest']],
    on=["month", "year", 'town', 'flat_type', 'block', 'street_name'],
    how='left'
)

# Reorder columns
final_columns = [
    "month", "year", "town", "town_LE", "town_TE","flat_type",  "flat_type_LE", "flat_type_TE", "block", "street_name", 
    "storey_range", "storey_range_LE", "floor_area_sqm", "price_per_sqm", "flat_model",  "flat_model_LE", "flat_model_TE",
    "lease_commence_date", "remaining_lease", "resale_price", "Latitude", "Longitude", 
    "LTAMRTStation_within_1km", "LTAMRTStation_nearest", "MallCoordinates_within_1km", "MallCoordinates_nearest", 
    "Hawker_within_1km", "Hawker_nearest", "PreSchool_within_1km", "PreSchool_nearest", 
    "Primary_within_1km", "Primary_nearest", "Secondary_within_1km", "Secondary_nearest", 
    "JuniorCollege_within_1km", "JuniorCollege_nearest", "MixedLevel_within_1km", "MixedLevel_nearest", 
    "NParks_within_1km", "NParks_nearest", "Sports_within_1km", "Sports_nearest"
]
data_merged = data_merged[final_columns]

# Save to CSV
data_merged.to_csv("Data/data.csv", index=False)

# Display first few rows
data_merged.head()


### Splitting Data ###

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load both datasets
cleaned_df = pd.read_csv("Data/cleaned_data.csv")  # Cleaned data
normalized_df = pd.read_csv("Data/cleaned_data_normalisation.csv")  # Normalized data

# Ensure the same order in both datasets
assert cleaned_df.shape == normalized_df.shape, "Datasets have different shapes!"
assert (cleaned_df.columns == normalized_df.columns).all(), "Columns do not match!"
assert (cleaned_df.index == normalized_df.index).all(), "Indexes do not match!"

# Define target variable
target_column = "resale_price"

# Create an index column to ensure consistent splitting
cleaned_df["index"] = cleaned_df.index  # Create index reference
normalized_df["index"] = normalized_df.index  # Keep the same index

# Split only the cleaned dataset (train-test split based on indices)
train_idx, test_idx = train_test_split(cleaned_df["index"], test_size=0.2, random_state=42)

# Use the same indices to split both datasets
cleaned_train = cleaned_df.loc[train_idx].drop(columns=["index"])
cleaned_test = cleaned_df.loc[test_idx].drop(columns=["index"])
normalized_train = normalized_df.loc[train_idx].drop(columns=["index"])
normalized_test = normalized_df.loc[test_idx].drop(columns=["index"])

# Save the consistent train/test splits for both datasets
cleaned_train.to_csv("Data/cleaned_train.csv", index=False)
cleaned_test.to_csv("Data/cleaned_test.csv", index=False)
normalized_train.to_csv("Data/normalized_train.csv", index=False)
normalized_test.to_csv("Data/normalized_test.csv", index=False)

# ✅ Check if month, year, town, block, and street_name are the same across both datasets
columns_to_check = ["month", "year", "town", "block", "street_name"]

train_check = cleaned_train[columns_to_check].equals(normalized_train[columns_to_check])
test_check = cleaned_test[columns_to_check].equals(normalized_test[columns_to_check])

if train_check and test_check:
    print("✅ The train and test sets are consistent across cleaned and normalized datasets.")
else:
    print("❌ Inconsistency detected in train/test splits. Check data alignment!")

print("✅ Training and test datasets are consistently split and saved as CSVs!")


### Data Analysis ###

In [9]:
import folium
import pandas as pd
from folium.plugins import MarkerCluster

# Load dataset
df = pd.read_csv("Data/cleaned_train.csv")

# Remove missing values
df = df.dropna(subset=["Latitude", "Longitude", "resale_price"])

# Group by location and compute average resale price and count of transactions
grouped = df.groupby(["Latitude", "Longitude"], as_index=False).agg(
    avg_price=("resale_price", "mean"), count=("resale_price", "count")
)

# Create a map centered on Singapore
singapore_map_cluster = folium.Map(location=[1.3521, 103.8198], zoom_start=12)

# Get the correct map variable name assigned by Folium
map_var_name = singapore_map_cluster.get_name()

# Create a marker cluster
marker_cluster = MarkerCluster().add_to(singapore_map_cluster)

# Add markers to the cluster
for _, row in grouped.iterrows():
    folium.Marker(
        location=[row["Latitude"], row["Longitude"]],
        popup=f"Avg Resale Price: ${row['avg_price']:,.2f} ({int(row['count'])} units)",
        tooltip=f"Avg Resale Price: ${row['avg_price']:,.2f} ({int(row['count'])} units)",
        icon=folium.Icon(color="blue", icon="info-sign"),
    ).add_to(marker_cluster)

# JavaScript for adding cluster hover tooltips
custom_js = f'''
<script>
document.addEventListener("DOMContentLoaded", function() {{
    var actualMap = { map_var_name };  // Correct map variable

    actualMap.eachLayer(function(layer) {{
        if (layer instanceof L.MarkerClusterGroup) {{
            console.log("Cluster layer detected!");  // Debugging

            layer.on("clustermouseover", function(e) {{
                console.log("Hover event fired!!!", e.layer);

                var cluster = e.layer;
                var totalPrice = 0;
                var totalCount = 0;

                // Recursively get only clustered markers
                function getMarkersInCluster(clusterLayer) {{
                    var markers = clusterLayer._markers || [];  // Directly inside this cluster

                    if (clusterLayer._childClusters && clusterLayer._childClusters.length > 0) {{
                        clusterLayer._childClusters.forEach(childCluster => {{
                            markers = markers.concat(getMarkersInCluster(childCluster)); // Collect markers from child clusters
                        }});
                    }}

                    return markers;
                }}

                var clusterMarkers = getMarkersInCluster(cluster);
                console.log("Total Cluster Markers Found:", clusterMarkers.length);

                // Separate pin markers (standalone markers outside clusters)
                var pinMarkers = [];
                actualMap.eachLayer(function(layer) {{
                    if (layer instanceof L.Marker && !layer.__parent) {{  // Exclude cluster markers
                        pinMarkers.push(layer);
                    }}
                }});

                // Merge both cluster markers and pin markers
                var allMarkers = clusterMarkers.concat(pinMarkers);
                console.log("Total Markers to Process:", allMarkers.length);

                if (allMarkers.length === 0) {{
                    console.warn("No visible markers found, skipping popup.");
                    return;
                }}

                // Debug: Check marker popups
                allMarkers.forEach(marker => {{
                    console.log("Checking marker:", marker);
                    console.log("Popup content:", marker.getPopup() ? marker.getPopup().getContent() : "No popup found");
                }});

                // Convert popup content to plain text (strip HTML tags if needed)
                function extractTextFromPopup(popupContent) {{
                    if (typeof popupContent === "string") {{
                        return popupContent.trim(); // Already plain text, trim spaces
                    }} else if (popupContent instanceof HTMLElement) {{
                        return (popupContent.textContent || popupContent.innerText || "").trim(); // Extract text from HTML element
                    }} else {{
                        console.warn("Unexpected popup content type:", popupContent);
                        return "";
                    }}
                }}

                // Process each marker to compute the avg resale price
                var validMarkers = allMarkers.filter(marker => marker instanceof L.Marker && marker.getPopup());

                if (validMarkers.length === 0) {{
                    console.warn("No valid resale price markers found, skipping popup.");
                    return;
                }}

                validMarkers.forEach(marker => {{
                    var popup = marker.getPopup();
                    if (popup) {{
                        var popupText = extractTextFromPopup(popup.getContent()); // Fix: Properly extract text

                        if (typeof popupText === "string") {{
                            var match = popupText.match(/\\$([\\d,]+\\.?\d{{0,2}})/);
                            var countMatch = popupText.match(/\\((\\d+\\.?\\d*) units\\)/);

                            if (match && countMatch) {{
                                var price = parseFloat(match[1].replace(/,/g, "")) || 0;
                                var count = parseFloat(countMatch[1]) || 0;

                                if (!isNaN(price) && !isNaN(count) && count > 0) {{
                                    totalPrice += price * count;
                                    totalCount += count;
                                }} else {{
                                    console.warn("Skipping invalid price/count values:", price, count);
                                }}
                            }}
                        }}
                    }}
                }});

                if (totalCount > 0) {{
                    var avgPrice = (totalPrice / totalCount).toFixed(2);
                    
                    // Format totalCount: Remove `.0` if it's a whole number
                    var formattedTotalCount = (totalCount % 1 === 0) ? totalCount.toFixed(0) : totalCount.toFixed(2);

                    var popup = L.popup({{
                        closeButton: false,
                        autoClose: false,
                        closeOnClick: false,
                        className: "custom-cluster-tooltip"
                    }})
                    .setContent(
                        "<b>Avg Resale Price: $" + avgPrice + "</b><br>" +
                        "<b>Total Units: " + formattedTotalCount + "</b>"
                    )
                    .setLatLng(cluster.getLatLng());

                    popup.openOn(actualMap);

                    // **Immediate Popup Disappearance on Mouseout**
                    cluster.on("mouseout", function() {{
                        actualMap.closePopup();
                    }});
                }} else {{
                    console.warn("No valid resale prices found, popup not created.");
                }}
            }});
        }}
    }});
}});
</script>
'''

# Attach the custom JavaScript to the map
singapore_map_cluster.get_root().html.add_child(folium.Element(custom_js))

# Save and view
singapore_map_cluster.save("singapore_clustered_avg_hover.html")
print("Clustered resale price map with hover popups saved as singapore_clustered_avg_hover.html")


  '''


Clustered resale price map with hover popups saved as singapore_clustered_avg_hover.html
