### Functions to check integrity of data ###

In [None]:
# CHECK IF COORDINATES ARE STORED IN A 2D ARRAY

import pandas as pd
import ast  # To safely evaluate string representation of lists

# Load the CSV file
csv_file_path = "Data_Coordinates/LTAMRTStation.csv"
df = pd.read_csv(csv_file_path)

def is_valid_2d_array(coord_str):
    """
    Check if a given string representation of coordinates is a valid 2D array.
    - Must be a list of lists.
    - Each inner list must contain exactly 2 elements (longitude, latitude).
    """
    try:
        coordinates = ast.literal_eval(coord_str)  # Convert string back to list
        if isinstance(coordinates, list) and all(
            isinstance(coord, list) and len(coord) == 2 for coord in coordinates
        ):
            return True
        return False
    except (SyntaxError, ValueError):
        return False

# Apply the function to check each row in the "coordinates" column
df["is_2D_valid"] = df["coordinates"].apply(is_valid_2d_array)

# Print the rows where coordinates are NOT valid 2D arrays
invalid_rows = df[~df["is_2D_valid"]]

# Display results
if invalid_rows.empty:
    print("✅ All values in the 'coordinates' column are valid 2D arrays.")
else:
    print(f"❌ Found {len(invalid_rows)} rows with invalid coordinates.")
    print("Here are the invalid rows:")
    print(invalid_rows[["geometry_type", "coordinates"]])


In [1]:
# CHECK IF NULL

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

df = pd.read_csv('Data_Coordinates/HDBResale_with_coordinates.csv')
pd.set_option("display.max_rows", None)  # Show all rows

print(df.isnull().sum(axis=0))

month                  0
town                   0
flat_type              0
block                  0
street_name            0
storey_range           0
floor_area_sqm         0
flat_model             0
lease_commence_date    0
remaining_lease        0
resale_price           0
Latitude               0
Longitude              0
dtype: int64


### SportCenters ###

In [None]:
import pandas as pd

# Define file paths
csv_file_path = "Data_Raw/SportSGSportFacilities_Full.xlsx"  # Input CSV file
csv_output_path = "Data_Coordinates/Sports.csv"  # Output CSV file

# Load the CSV file
df = pd.read_excel(csv_file_path)

# Print all column names, each on a new line
print("Columns in the dataset:")
for col in df.columns:
    print(f"- {col}")  # Print each column separately

# Columns to keep
columns_to_keep = ["SPORTS_CEN", "Latitude", "Longitude"]

# Keep only the specified columns
df = df[columns_to_keep]

# Save the cleaned DataFrame to a new CSV file
df.to_csv(csv_output_path, index=False)

print(f"✅ Cleaned CSV file saved: {csv_output_path}")

print("Columns in the dataset:")
print(df.columns.tolist())


### LTAMRTStations ###

In [None]:
import pandas as pd

# Define file paths
excel_file_path = "Data_Raw/LTAMRTStationExit_Full.xlsx"  # Input Excel file
csv_output_path = "LTAMRTStation.csv"  # Output CSV file

# Load the Excel file
df = pd.read_excel(excel_file_path)

# Columns to drop
columns_to_drop = ["Name", "EXIT_CODE", "INC_CRC", "FMEL_UPD_D"]

# Drop the specified columns
df = df.drop(columns=columns_to_drop, errors="ignore")  # Ignore if column not found

# Save the cleaned DataFrame to CSV
df.to_csv(csv_output_path, index=False)

print(f"✅ Cleaned CSV file saved: {csv_output_path}")

print("Columns in the dataset:")
print(df.columns.tolist())


In [None]:
import pandas as pd

# Define file paths
csv_file_path = "LTAMRTStation.csv"  # Input CSV file
updated_csv_path = "LTAMRTStation.csv"  # Output CSV file

# Load the CSV file
df = pd.read_csv(csv_file_path)

# Ensure 'Longitude' and 'Latitude' columns exist
if "Longitude" in df.columns and "Latitude" in df.columns:
    # Create a new 'coordinates' column with [[Longitude, Latitude]] format (2D array)
    df["coordinates"] = df.apply(lambda row: [[row["Longitude"], row["Latitude"]]], axis=1)

    # Drop the original longitude and latitude columns
    df = df.drop(columns=["Longitude", "Latitude"], errors="ignore")

    # Save the updated DataFrame to the same CSV file
    df.to_csv(updated_csv_path, index=False)

    print(f"✅ Updated CSV saved as: {updated_csv_path}")
else:
    print("❌ 'Longitude' or 'Latitude' columns not found in the dataset.")


In [None]:
import pandas as pd
import ast  # To safely convert string representations of lists

# Define file paths
csv_file_path = "LTAMRTStation.csv"  # Input CSV file
updated_csv_path = "LTAMRTStation.csv"  # Output CSV file

# Load the CSV file
df = pd.read_csv(csv_file_path)

# Ensure required columns exist
if "STATION_NA" in df.columns and "coordinates" in df.columns:
    # Convert 'coordinates' column from string to actual lists
    df["coordinates"] = df["coordinates"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

    # Group by 'STATION_NA' and merge coordinates into a single 2D array per station
    df_grouped = df.groupby("STATION_NA", as_index=False).agg({
        "coordinates": lambda x: [coord for sublist in x for coord in sublist]  # Flatten to single 2D array
    })

    # Save the updated DataFrame to a new CSV file
    df_grouped.to_csv(updated_csv_path, index=False)

    print(f"✅ Updated CSV saved as: {updated_csv_path}")
else:
    print("❌ Required columns ('STATION_NA', 'coordinates') not found in the dataset.")


### NPARKS ###

In [None]:
import json
import pandas as pd

# Define the file path
file_path_geojson = "UncleanedData/NParksTracks.geojson"  # Input GeoJSON file
csv_output_path = "NParksTracks.csv"  # Output CSV file

# Load the GeoJSON file
with open(file_path_geojson, "r", encoding="utf-8") as file:
    geojson_data = json.load(file)

# Extract relevant data from the GeoJSON
features = geojson_data.get("features", [])

geojson_data_list = []

def flatten_multiline_coordinates(multi_coords):
    """Flatten a 3D MultiLineString coordinates array into 2D."""
    return [coord for line in multi_coords for coord in line]

for feature in features:
    properties = feature.get("properties", {})  # Extract properties
    geometry = feature.get("geometry", {})  # Extract geometry
    coordinates = geometry.get("coordinates", [])
    geom_type = geometry.get("type")

    # Process coordinates based on geometry type
    if geom_type == "MultiLineString":
        coordinates = flatten_multiline_coordinates(coordinates)  # Flatten 3D to 2D
    elif geom_type == "LineString":
        pass  # Already 2D

    # Store the processed data
    properties["geometry_type"] = geom_type
    properties["coordinates"] = json.dumps(coordinates)  # Store as JSON string for CSV compatibility
    geojson_data_list.append(properties)

# Convert to DataFrame
df_geojson = pd.DataFrame(geojson_data_list)

# Save the cleaned data to a CSV file
df_geojson.to_csv(csv_output_path, index=False)

print(f"CSV file saved: {csv_output_path}")


In [None]:
import pandas as pd

# Load the CSV file
csv_file_path = "NParksTracks.csv"
df = pd.read_csv(csv_file_path)

# Print all column names
print("CSV Columns:")
print(df.columns.tolist())

# Columns to drop
columns_to_drop = ["TYPE", "allow_walk", "allow_whee", "allow_cycl", 
                   "allow_pmd", "AMA_CATEGO", "Shape_Leng", "geometry_type"]

# Drop the specified columns
df = df.drop(columns=columns_to_drop, errors="ignore")  # Ignore errors if a column doesn't exist

# Save the updated CSV file
updated_csv_path = "NParksTracks.csv"
df.to_csv(updated_csv_path, index=False)

print(f"✅ Updated CSV saved as: {updated_csv_path}")

print("CSV Columns:")
print(df.columns.tolist())


In [None]:
import pandas as pd
import ast  # To safely evaluate string representation of lists

# Load the CSV file
csv_file_path = "NParksTracks.csv"
df = pd.read_csv(csv_file_path)

# Step 1: Drop the 'FID' column
df = df.drop(columns=['FID'], errors='ignore')

# Step 2: Remove rows where 'PARK' is empty
df = df[df['PARK'].notna() & df['PARK'].str.strip().ne('')]

# Step 3: Combine rows with the same 'PARK'
def combine_lists(series):
    """Convert to list, flattening if needed."""
    combined = []
    for item in series.dropna():  # Drop NaN values
        if isinstance(item, str):
            try:
                parsed_item = ast.literal_eval(item)  # Convert string to list
                if isinstance(parsed_item, list):
                    combined.extend(parsed_item)
                else:
                    combined.append(parsed_item)
            except (SyntaxError, ValueError):
                combined.append(item)  # Keep as string if conversion fails
        else:
            combined.append(item)
    return list(set(combined))  # Remove duplicates

def combine_coordinates(series):
    """Flatten and combine all 2D arrays into a single 2D array."""
    combined = []
    for item in series.dropna():  # Drop NaN values
        if isinstance(item, str):
            try:
                coords = ast.literal_eval(item)  # Convert string to list
                if isinstance(coords, list):
                    combined.extend(coords)  # Flatten into 2D array
            except (SyntaxError, ValueError):
                pass  # Skip invalid coordinates
    return combined

# Group by 'PARK' and apply the transformations
df_grouped = df.groupby('PARK', as_index=False).agg({
    'PARK_TYPE': lambda x: combine_lists(x),
    'PCN_LOOP': lambda x: combine_lists(x),
    'coordinates': lambda x: combine_coordinates(x)
})

# Save the cleaned data to a new CSV file
cleaned_csv_path = "NParksTracks.csv"
df_grouped.to_csv(cleaned_csv_path, index=False)

print(f"✅ Cleaned CSV saved as: {cleaned_csv_path}")


### Preschool ###

In [None]:
import pandas as pd

# Define file paths
excel_file_path = "UncleanedData/PreSchoolsLocation_Full.xlsx"  # Input Excel file
csv_output_path = "PreSchoolsLocation.csv"  # Output CSV file

# Load the Excel file
df = pd.read_excel(excel_file_path)

# Columns to drop
columns_to_drop = ["Name", "INC_CRC", "FMEL_UPD_D", "CENTRE_CODE"]

# Drop the specified columns
df = df.drop(columns=columns_to_drop, errors="ignore")  # Ignore if column not found

# Save the cleaned DataFrame to CSV
df.to_csv(csv_output_path, index=False)

print(f"✅ Cleaned CSV file saved: {csv_output_path}")

print("Columns in the dataset:")
print(df.columns.tolist())


### Primary / Secondary / JC / Mixed ###

In [None]:
import pandas as pd

# Define file paths
csv_file_path = "UncleanedData/Generalinformationofschools.csv"  # Input CSV file
csv_output_path = "Generalinformationofschools.csv"  # Output CSV file

# Load the CSV file
df = pd.read_csv(csv_file_path)

# Print all column names, each on a new line
# print("Columns in the dataset:")
# for col in df.columns:
#     print(f"- {col}")  # Print each column separately

# Columns to keep
columns_to_keep = ["school_name", "address", "postal_code", "dgp_code", "zone_code", "mainlevel_code"]

# Keep only the specified columns
df = df[columns_to_keep]

# Save the cleaned DataFrame to a new CSV file
df.to_csv(csv_output_path, index=False)

print(f"✅ Cleaned CSV file saved: {csv_output_path}")

print("Columns in the dataset:")
print(df.columns.tolist())


In [None]:
import pandas as pd
from Geocoding.geocoding import get_coordinates

# Load your dataset
csv_file_path = "Generalinformationofschools.csv"
df = pd.read_csv(csv_file_path)

# Apply function to get coordinates
df["Longitude"], df["Latitude"] = zip(*df["address"].apply(get_coordinates))

# Save the updated CSV
updated_csv_path = "ALLSCHOOLS.csv"
df.to_csv(updated_csv_path, index=False)

print(f"✅ Updated CSV saved: {updated_csv_path}")


In [None]:
import pandas as pd

# Define file path
csv_file_path = "ALLSCHOOLS.csv"  # Update with the correct file path

# Load the CSV file
df = pd.read_csv(csv_file_path)

# Get unique values in 'mainlevel_code' column
unique_values = df["mainlevel_code"].dropna().unique()  # Drop NaN values if any

# Print unique values (one per line)
print("Unique values in 'mainlevel_code':")
for value in unique_values:
    print(f"- {value}")


In [None]:
import pandas as pd

# Define file paths
csv_file_path = "ALLSCHOOLS.csv"  # Input CSV file
csv_output_path = "ALLSCHOOLS.csv"  # Output CSV file

# Load the CSV file
df = pd.read_csv(csv_file_path)

# Print all column names, each on a new line
# print("Columns in the dataset:")
# for col in df.columns:
#     print(f"- {col}")  # Print each column separately

# Columns to keep
columns_to_keep = ["school_name", "mainlevel_code", "Longitude","Latitude"]

# Keep only the specified columns
df = df[columns_to_keep]

# Save the cleaned DataFrame to a new CSV file
df.to_csv(csv_output_path, index=False)

print(f"✅ Cleaned CSV file saved: {csv_output_path}")

print("Columns in the dataset:")
print(df.columns.tolist())

In [None]:
import pandas as pd

# Define file path
csv_file_path = "ALLSCHOOLS.csv"  # Update with correct file path

# Load the CSV file
df = pd.read_csv(csv_file_path)

# Define categories to filter
categories = ["PRIMARY", "SECONDARY", "JUNIOR COLLEGE", "MIXED LEVELS"]

# Save separate CSV files for each category
for category in categories:
    df_filtered = df[df["mainlevel_code"] == category].drop(columns=["mainlevel_code"], errors="ignore")
    output_path = f"{category.replace(' ', '_')}.csv"  # Replace spaces with underscores for filenames
    df_filtered.to_csv(output_path, index=False)
    print(f"✅ Saved: {output_path}")


In [None]:
import pandas as pd

# Define file paths
primary_secondary_file = "ALLSCHOOLS.csv"  # Update with correct file path
preschool_file = "PreSchoolsLocation.csv"  # Update with correct file path
combined_output_file = "ALLSCHOOLS.csv"  # Output file

# Load the primary/secondary/junior college dataset
df_main = pd.read_csv(primary_secondary_file)

# Load the preschool dataset
df_preschool = pd.read_csv(preschool_file)

# Ensure column names match for merging
expected_columns = df_main.columns.tolist()

# Create a new dataframe for preschools with matching columns
df_preschool_mapped = pd.DataFrame(columns=expected_columns)

# Map preschool data to the main dataset format
df_preschool_mapped["school_name"] = df_preschool["CENTRE_NAME"]
df_preschool_mapped["Latitude"] = df_preschool["Latitude"]
df_preschool_mapped["Longitude"] = df_preschool["Longitude"]
df_preschool_mapped["mainlevel_code"] = "PRESCHOOL"  # Fill mainlevel_code with "PRESCHOOL"

# Concatenate both datasets
df_combined = pd.concat([df_main, df_preschool_mapped], ignore_index=True)

# Save the combined dataset
df_combined.to_csv(combined_output_file, index=False)

print(f"✅ Combined CSV file saved as: {combined_output_file}")


### Hawker ###

Please add the following missing content!

"HDP_Hawker and coffeshop listing_for online use_16.5.2018.xlsx" rename to "HDP_Hawker_and_coffeeshop.xlsx" fromn https://ch-api.healthhub.sg/api/public/content/2be093bf58c948bd8e510df83a80914a?v=ee49b3af


In [None]:
import pandas as pd 

# File paths
EXCEL_FILE = "Data_Raw/HDP_Hawker_and_coffeeshop.xlsx"  # Input Excel file containing raw hawker centre data
HAWKER_COORDS_CSV = "Data_Coordinates/Hawker.csv"  # Output file for cleaned hawker data

print("📌 Cleaning hawker centre data...")  # Status update for cleaning process

# Load dataset from Excel, selecting only necessary columns
df = pd.read_excel(EXCEL_FILE, usecols=["Name of hawker centre/coffee\nshop", "Address"], engine="openpyxl")

# Remove duplicate entries and drop rows with missing values
df = df.drop_duplicates().dropna()

# Trim any extra spaces from the "Address" column to ensure consistency
df["Address"] = df["Address"].str.strip()

# Print the number of cleaned rows after processing
print(f"✅ Completed Cleaning data ({len(df)} rows)")


📌 Cleaning hawker centre data...
✅ Completed Cleaning data (961 rows)


In [None]:
from Geocoding.geocoding import get_coordinates 

# Fetch longitude and latitude for each hawker centre using the address
df["Longitude"], df["Latitude"] = zip(*df["Address"].apply(get_coordinates))

# Save the updated DataFrame with coordinates to a CSV file
df.to_csv(HAWKER_COORDS_CSV, index=False)

# Print success message with the output file location
print(f"✅ Updated CSV saved: {HAWKER_COORDS_CSV}")


Geocoding - 116 Aljunied Ave 2
Geocoding - 117 Aljunied Ave 2
Geocoding - 119 Aljunied Ave 2
Geocoding - 119 Aljunied Ave 2
Geocoding - 91 Alps Avenue
Geocoding - 412 Bedok North Ave 2
Geocoding - 416 Bedok North Ave 2
Geocoding - 418 Bedok North Ave 2
Geocoding - 418 Bedok North Ave 2
Geocoding - 136 Bedok North Ave 3
Geocoding - 204 Bedok North St 1
Geocoding - 204 Bedok North St 1
Geocoding - 216 Bedok North St 1
Geocoding - 217 Bedok North St 1
Geocoding - 122 Bedok North St 2
Geocoding - 123 Bedok North St 2
Geocoding - 123 Bedok North St 2
Geocoding - 128 Bedok North St 2
Geocoding - 129 Bedok North St 2
Geocoding - 85 Bedok North St 4
Geocoding - 86 Bedok North St 4
Geocoding - 87 Bedok North St 4
Geocoding - 88 Bedok North St 4
Geocoding - 760 Bedok Reservoir
View
Geocoding - 1 Bedok Road
Geocoding - 324 Bedok Road
Geocoding - 168 Bedok South Ave 3
Geocoding - 69 Bedok South Ave 3
Geocoding - 69 Bedok South Ave 3
Geocoding - 16 Bedok South Road
Geocoding - 18 Bedok South Road
G

### Mall ###

Please add the following missing content!
"List_of_Malls.txt" copy and pasted from https://en.wikipedia.org/wiki/List_of_shopping_malls_in_Singapore

In [None]:
import pandas as pd
from Geocoding.geocoding import get_coordinates 

# File paths
MALL_LIST_FILE = "Data_Raw/List_of_Malls.txt"  # Input file containing mall names categorized by region
MALL_COORDS_CSV = "Data_Coordinates/MallCoordinates.csv"  # Output file to store mall coordinates

# Dictionary to store mall names grouped by region
regions = {}
current_region = None  # To keep track of the current region while reading the file

# Read the mall list from the text file
with open(MALL_LIST_FILE, "r", encoding="utf-8") as file:
    for line in file:
        line = line.strip()  # Remove leading and trailing spaces
        if line:
            # If the line is a region, update `current_region`
            if line in ["Central", "East", "North", "North East", "North West", "South", "West"]:
                current_region = line
                regions[current_region] = []  # Initialize a new list for this region
            elif current_region:
                # If it's a mall name, add it under the current region
                regions[current_region].append(line)

# Convert the dictionary into a list of (Mall Name, Region) tuples
mall_data = [(mall, region) for region, malls in regions.items() for mall in malls]

# Create a DataFrame with mall names and their respective regions
df = pd.DataFrame(mall_data, columns=["Mall Name", "Region"])

# Fetch longitude and latitude using the `get_coordinates` function for each mall
df["Longitude"], df["Latitude"] = zip(*df["Mall Name"].apply(get_coordinates))

# Save the updated DataFrame with coordinates to a CSV file
df.to_csv(MALL_COORDS_CSV, index=False)

# Print success message with the number of rows processed
print(f"✅ Mall coordinates saved to {MALL_COORDS_CSV} ({len(df)} rows)")


Geocoding - 100 AM
Geocoding - 313@Somerset
Geocoding - Aperia
Geocoding - Balestier Hill Shopping Centre
Geocoding - Bugis Cube
Geocoding - Bugis Junction
Geocoding - Bugis+
Geocoding - Capitol Piazza
Geocoding - Cathay Cineleisure Orchard
Geocoding - Clarke Quay Central
Geocoding - The Centrepoint
Geocoding - City Square Mall
Geocoding - City Gate Mall
Geocoding - CityLink Mall
Geocoding - Duo
Geocoding - Far East Plaza
Geocoding - Funan
Geocoding - Great World City
Geocoding - GRiD (formerly PoMo)[1]
Geocoding - HDB Hub
Geocoding - Holland Village Shopping Mall
Geocoding - ION Orchard
Geocoding - Junction 8
Geocoding - Knightsbridge[2]
Geocoding - Liat Towers
Geocoding - Lucky Plaza
Geocoding - Marina Bay Sands
Geocoding - The Shoppes at Marina Bay Sands
Geocoding - Marina Bay Link Mall
Geocoding - Marina Square
Geocoding - Millenia Walk
Geocoding - Mustafa Shopping Centre
Geocoding - Ngee Ann City
Geocoding - One Holland Village
Geocoding - Orchard Central
Geocoding - Orchard Gatew