In [2]:
import pandas as pd
import requests
import time

# Load dataset
file_path = "UncleanedData/ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv"
df = pd.read_csv(file_path)

# Ensure the required columns exist
if "block" not in df.columns or "street_name" not in df.columns:
    raise ValueError("Dataset must contain 'block' and 'street_name' columns.")

# Local Nominatim instance URL
NOMINATIM_URL = "http://localhost:8080/search"

# Function to get coordinates from Nominatim
def get_coordinates(address):
    print(f"Geocoding - {address}")
    params = {"q": address, "format": "json"}
    response = requests.get(NOMINATIM_URL, params=params)
    
    if response.status_code == 200:
        data = response.json()
        if data:
            return data[0]["lat"], data[0]["lon"]  # Return latitude, longitude
    return None, None  # Return None if no result

# Apply geocoding to each row
df["Latitude"], df["Longitude"] = zip(*df.apply(lambda row: get_coordinates(f"{row['block']} {row['street_name']}"), axis=1))

# Save the new dataset with coordinates
output_file = "HDBResale_with_coordinates.csv"
df.to_csv(output_file, index=False)

print(f"Geocoded dataset saved as: {output_file}")


Geocoding - 406 ANG MO KIO AVE 10
Geocoding - 108 ANG MO KIO AVE 4
Geocoding - 602 ANG MO KIO AVE 5
Geocoding - 465 ANG MO KIO AVE 10
Geocoding - 601 ANG MO KIO AVE 5
Geocoding - 150 ANG MO KIO AVE 5
Geocoding - 447 ANG MO KIO AVE 10
Geocoding - 218 ANG MO KIO AVE 1
Geocoding - 447 ANG MO KIO AVE 10
Geocoding - 571 ANG MO KIO AVE 3
Geocoding - 534 ANG MO KIO AVE 10
Geocoding - 233 ANG MO KIO AVE 3
Geocoding - 235 ANG MO KIO AVE 3
Geocoding - 219 ANG MO KIO AVE 1
Geocoding - 536 ANG MO KIO AVE 10
Geocoding - 230 ANG MO KIO AVE 3
Geocoding - 570 ANG MO KIO AVE 3
Geocoding - 624 ANG MO KIO AVE 4
Geocoding - 441 ANG MO KIO AVE 10
Geocoding - 625 ANG MO KIO AVE 9
Geocoding - 119 ANG MO KIO AVE 3
Geocoding - 255 ANG MO KIO AVE 4
Geocoding - 432 ANG MO KIO AVE 10
Geocoding - 211 ANG MO KIO AVE 3
Geocoding - 584 ANG MO KIO AVE 3
Geocoding - 118 ANG MO KIO AVE 4
Geocoding - 333 ANG MO KIO AVE 1
Geocoding - 256 ANG MO KIO AVE 4
Geocoding - 330 ANG MO KIO AVE 1
Geocoding - 557 ANG MO KIO AVE 10
G

In [19]:
import pandas as pd
import requests
import time

# Load dataset
file_path = "HDBResale_with_coordinates.csv"
df = pd.read_csv(file_path)

# Ensure the required columns exist
if "block" not in df.columns or "street_name" not in df.columns:
    raise ValueError("Dataset must contain 'block' and 'street_name' columns.")

# Ensure Latitude and Longitude columns exist
if "Latitude" not in df.columns or "Longitude" not in df.columns:
    df["Latitude"], df["Longitude"] = None, None  # Create empty columns if they don't exist

# Local Nominatim instance URL
NOMINATIM_URL = "http://localhost:8080/search"

# Function to get coordinates from Nominatim
def get_coordinates(address):
    print(f"Geocoding - {address}")
    params = {"q": address, "format": "json"}
    
    try:
        response = requests.get(NOMINATIM_URL, params=params, timeout=5)
        if response.status_code == 200:
            data = response.json()
            if data:
                return data[0]["lat"], data[0]["lon"]  # Return latitude, longitude
    except requests.exceptions.RequestException as e:
        print(f"Error with request: {e}")
    
    return None, None  # Return None if no result

# Process only rows with missing coordinates
for index, row in df.iterrows():
    if pd.isna(row["Latitude"]) or pd.isna(row["Longitude"]):
        address = f"{row['block']} {row['street_name']}"
        lat, lon = get_coordinates(address)
        
        # Save new coordinates if found
        df.at[index, "Latitude"] = lat
        df.at[index, "Longitude"] = lon

        # Optional: Introduce a short delay to prevent excessive requests
        # time.sleep(1)

# Save the updated dataset
output_file = "HDBResale_with_coordinates.csv"
df.to_csv(output_file, index=False)

print(f"Geocoded dataset saved as: {output_file}")


Geocoding - 101 BEDOK NORTH AVE 4
Geocoding - 548 BEDOK NORTH AVE 1
Geocoding - 75 BEDOK NORTH RD
Geocoding - 550 BEDOK NORTH AVE 1
Geocoding - 528 BEDOK NORTH ST 3
Geocoding - 534 BEDOK NORTH ST 3
Geocoding - 550 BEDOK NORTH AVE 1
Geocoding - 77 BEDOK NORTH RD


  df.at[index, "Latitude"] = lat
  df.at[index, "Longitude"] = lon


Geocoding - 57 NEW UPPER CHANGI RD
Geocoding - 57 NEW UPPER CHANGI RD
Geocoding - 105 BEDOK NORTH AVE 4
Geocoding - 75 BEDOK NORTH RD
Geocoding - 425 BEDOK NORTH RD
Geocoding - 117 BEDOK NORTH RD
Geocoding - 423 BEDOK NORTH AVE 1
Geocoding - 533 BEDOK NORTH ST 3
Geocoding - 80 BEDOK NORTH RD
Geocoding - 78 BEDOK NORTH RD
Geocoding - 81 BEDOK NORTH RD
Geocoding - 124 BEDOK NORTH RD
Geocoding - 705 BEDOK NORTH RD
Geocoding - 554 BEDOK NORTH ST 3
Geocoding - 81 BEDOK NORTH RD
Geocoding - 508 BEDOK NORTH AVE 3
Geocoding - 185 BEDOK NORTH RD
Geocoding - 220B BEDOK CENTRAL
Geocoding - 415 BEDOK NORTH AVE 2
Geocoding - 219C BEDOK CENTRAL
Geocoding - 94C BEDOK NORTH AVE 4
Geocoding - 342 BUKIT BATOK ST 34
Geocoding - 530 BUKIT BATOK ST 51
Geocoding - 145 BUKIT BATOK ST 11
Geocoding - 528 BUKIT BATOK ST 51
Geocoding - 537 BUKIT BATOK ST 52
Geocoding - 213 BUKIT BATOK ST 21
Geocoding - 219 BUKIT BATOK ST 21
Geocoding - 235 BUKIT BATOK EAST AVE 5
Geocoding - 130 BUKIT BATOK WEST AVE 6
Geocoding -

In [20]:
import pandas as pd

# Load dataset
file_path = "HDBResale_with_coordinates.csv"  # Adjust if needed
df = pd.read_csv(file_path)

# Ensure Latitude and Longitude columns exist
if "Latitude" not in df.columns or "Longitude" not in df.columns:
    raise ValueError("Dataset must contain 'Latitude' and 'Longitude' columns.")

# Filter rows where either Latitude or Longitude is missing
missing_coords_df = df[df["Latitude"].isna() | df["Longitude"].isna()]

# Save to a new CSV file
output_file = "Missing_Coordinates.csv"
missing_coords_df.to_csv(output_file, index=False)

print(f"Rows with missing coordinates saved as: {output_file}")


Rows with missing coordinates saved as: Missing_Coordinates.csv


In [16]:
import pandas as pd
import ast
from geopy.distance import geodesic
import os

# Set variables for easy modification
RADIUS_KM = 1  # Change this value as needed
df_filename = "HDBResale_with_coordinates.csv"  # Main dataset file
locations_filenames = [
    "CleanData/LTAMRTStation.csv",
    "CleanData/MallCoordinates.csv",
    "CleanData/PreSchool.csv",
    "CleanData/Primary.csv",
    "CleanData/Secondary.csv",
    "CleanData/JuniorCollege.csv",
    "CleanData/MixedLevel.csv",
    "CleanData/NParks.csv",
    "CleanData/Sports.csv"
]

# Load main data
df = pd.read_csv(df_filename)

# Take the first 10 rows for testing
df_test = df.head(10).copy()

### **Simple Location Functions (Latitude & Longitude columns) ###
def count_simple_locations_within_radius(row, locations, radius_km):
    return sum(geodesic((row["Latitude"], row["Longitude"]), (loc["Latitude"], loc["Longitude"])).km <= radius_km 
               for _, loc in locations.iterrows())

def nearest_simple_location_distance(row, locations):
    distances = [geodesic((row["Latitude"], row["Longitude"]), (loc["Latitude"], loc["Longitude"])).km 
                 for _, loc in locations.iterrows()]
    return min(distances) if distances else None

### **Complex Location Functions (coordinates column) ###
def is_location_within_radius(row, location_coordinates, radius_km):
    for coord in location_coordinates:
        lon, lat = coord  # Extract longitude and latitude
        if geodesic((row["Latitude"], row["Longitude"]), (lat, lon)).km <= radius_km:
            return True  # Stop checking once one is within range
    return False

def count_complex_locations_within_radius(row, locations, radius_km):
    return sum(
        is_location_within_radius(row, ast.literal_eval(loc["coordinates"]), radius_km)
        for _, loc in locations.iterrows()
    )

def nearest_complex_location_distance(row, locations):
    min_distance = float("inf")
    
    for _, loc in locations.iterrows():
        location_coordinates = ast.literal_eval(loc["coordinates"])  # Convert string to list
        for lon, lat in location_coordinates:
            distance = geodesic((row["Latitude"], row["Longitude"]), (lat, lon)).km
            if distance < min_distance:
                min_distance = distance

    return min_distance if min_distance != float("inf") else None

# Loop through each locations CSV file
for locations_filename in locations_filenames:
    print(f"Processing locations from '{locations_filename}'")

    # Load locations data
    locations_df = pd.read_csv(locations_filename)

    # Extract the base name of the locations CSV file (without extension)
    locations_csv_name = os.path.splitext(os.path.basename(locations_filename))[0]

    # Generate dynamic column names
    within_col_name = f"{locations_csv_name}_within_{RADIUS_KM}km"
    nearest_col_name = f"{locations_csv_name}_nearest"

    # Detect if the dataset uses simple (Latitude/Longitude) or complex (coordinates) format
    if "Latitude" in locations_df.columns and "Longitude" in locations_df.columns:
        print(f"Detected simple location format in '{locations_filename}'")

        # Process data and store results
        for index, row in df_test.iterrows():
            print(f"Processing row {index + 1} for '{locations_filename}'")
            df_test.at[index, within_col_name] = count_simple_locations_within_radius(row, locations_df, RADIUS_KM)
            df_test.at[index, nearest_col_name] = nearest_simple_location_distance(row, locations_df)

    elif "coordinates" in locations_df.columns:
        print(f"Detected complex location format in '{locations_filename}'")

        # Process data and store results
        for index, row in df_test.iterrows():
            print(f"Processing row {index + 1} for '{locations_filename}'")
            df_test.at[index, within_col_name] = count_complex_locations_within_radius(row, locations_df, RADIUS_KM)
            df_test.at[index, nearest_col_name] = nearest_complex_location_distance(row, locations_df)

    else:
        raise ValueError(f"Invalid format in '{locations_filename}': Must contain either 'Latitude' & 'Longitude' OR 'coordinates' column.")

# Save updated test CSV
df_test.to_csv("data_test.csv", index=False)
print(f"Updated test CSV saved as 'data_test.csv' with results from all location files.")


Processing locations from 'CleanData/LTAMRTStation.csv'
Detected complex location format in 'CleanData/LTAMRTStation.csv'
Processing row 1 for 'CleanData/LTAMRTStation.csv'
Processing row 2 for 'CleanData/LTAMRTStation.csv'
Processing row 3 for 'CleanData/LTAMRTStation.csv'
Processing row 4 for 'CleanData/LTAMRTStation.csv'
Processing row 5 for 'CleanData/LTAMRTStation.csv'
Processing row 6 for 'CleanData/LTAMRTStation.csv'
Processing row 7 for 'CleanData/LTAMRTStation.csv'
Processing row 8 for 'CleanData/LTAMRTStation.csv'
Processing row 9 for 'CleanData/LTAMRTStation.csv'
Processing row 10 for 'CleanData/LTAMRTStation.csv'
Processing locations from 'CleanData/MallCoordinates.csv'
Detected simple location format in 'CleanData/MallCoordinates.csv'
Processing row 1 for 'CleanData/MallCoordinates.csv'
Processing row 2 for 'CleanData/MallCoordinates.csv'
Processing row 3 for 'CleanData/MallCoordinates.csv'
Processing row 4 for 'CleanData/MallCoordinates.csv'
Processing row 5 for 'CleanDat