In [12]:
# CHECK IF COORDINATES ARE STORED IN A 2D ARRAY

import pandas as pd
import ast  # To safely evaluate string representation of lists

# Load the station CSV whose "coordinates" column is validated below.
# The column is read back as text, which is why the check parses strings.
csv_file_path = "LTAMRTStation.csv"
df = pd.read_csv(csv_file_path)

def is_valid_2d_array(coord_str):
    """
    Check whether a value holds a 2D coordinate array such as "[[103.8, 1.3]]".

    The value is parsed with ``ast.literal_eval`` and accepted only when the
    result is a list whose every element is itself a list of exactly two
    items (longitude, latitude). An empty outer list is accepted because it
    contains no malformed pair.

    Args:
        coord_str: Cell value from the "coordinates" column, normally a string.

    Returns:
        bool: True if the value parses to a list of 2-element lists, otherwise
        False. Input that cannot be parsed (including NaN and other
        non-string values) yields False instead of raising.
    """
    try:
        coordinates = ast.literal_eval(coord_str)
    except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
        # Besides SyntaxError/ValueError, literal_eval raises TypeError for
        # literals such as "{[1]: 2}" (unhashable dict key) and
        # MemoryError/RecursionError for pathologically nested input. All of
        # them simply mean "not a coordinate list".
        return False

    if not isinstance(coordinates, list):
        return False
    return all(isinstance(coord, list) and len(coord) == 2 for coord in coordinates)

# Flag every row whose "coordinates" value fails the 2D-array check.
df["is_2D_valid"] = df["coordinates"].apply(is_valid_2d_array)

# Collect the rows that failed so they can be reported.
invalid_rows = df[~df["is_2D_valid"]]

# Display results
if invalid_rows.empty:
    print("✅ All values in the 'coordinates' column are valid 2D arrays.")
else:
    print(f"❌ Found {len(invalid_rows)} rows with invalid coordinates.")
    print("Here are the invalid rows:")
    # "geometry_type" is not a column of LTAMRTStation.csv, so selecting it
    # unconditionally would raise KeyError exactly when there is something to
    # report. Show only the columns that are actually present.
    report_columns = [
        col for col in ("geometry_type", "coordinates") if col in invalid_rows.columns
    ]
    print(invalid_rows[report_columns])


✅ All values in the 'coordinates' column are valid 2D arrays.


In [28]:
# CHECK IF NULL

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the HDB resale data that already carries Latitude/Longitude columns.
df = pd.read_csv('HDBResale_with_coordinates_GoogleMaps.csv')

# Count of missing values per column; displayed as the cell's output.
df.isna().sum()

month                  0
town                   0
flat_type              0
block                  0
street_name            0
storey_range           0
floor_area_sqm         0
flat_model             0
lease_commence_date    0
remaining_lease        0
resale_price           0
Latitude               0
Longitude              0
dtype: int64

In [None]:
# GET TOTAL NO. OF ROWS

import pandas as pd

# NParksTracks.csv is written by the NParks cleaning cells of this notebook,
# so it must exist before this cell is run.
cleaned_csv_path = "NParksTracks.csv"
df_cleaned = pd.read_csv(cleaned_csv_path)

# The length of a DataFrame is its number of rows.
num_rows = len(df_cleaned)

print(f"Total number of rows: {num_rows}")


FileNotFoundError: [Errno 2] No such file or directory: 'NParksTracks.csv'

**Cleaning for SportCenters**

In [3]:
import pandas as pd

# Source workbook and destination CSV. The input is an Excel file even
# though the variable is called csv_file_path.
csv_file_path = "UncleanedData/SportSGSportFacilities_Full.xlsx"
csv_output_path = "SportSGSportFacilities.csv"

df = pd.read_excel(csv_file_path)

# List every column of the raw sheet, one per line.
print("Columns in the dataset:")
for column_name in df.columns:
    print(f"- {column_name}")

# Keep the SPORTS_CEN column plus the two coordinate columns.
columns_to_keep = ["SPORTS_CEN", "Latitude", "Longitude"]
df = df.loc[:, columns_to_keep]

# Write the reduced table without the index column.
df.to_csv(csv_output_path, index=False)
print(f"✅ Cleaned CSV file saved: {csv_output_path}")

# Confirm which columns survived.
print("Columns in the dataset:")
print(list(df.columns))


Columns in the dataset:
- Name
- Latitude
- Longitude
- SPORTS_CEN
- FACILITIES
- HOUSE_BLOC
- ROAD_NAME
- POSTAL_COD
- INFORMATIO
- CONTACT_NO
- STADIUM_OP
- BOOKING_LI
- ATHLETICS_
- FOOTBALL_F
- FACILITY_I
- STATUS
- TIER
- INC_CRC
- FMEL_UPD_D
- MAINTENANC
- SWIMMING_C
- SPORTS_HAL
- GYM_OPERAT
- COMPETITIO
- TEACHING_P
- WADING_POO
- INDOOR_SPO
- BADMINTON_
- TABLE_TENN
- GYM
- PICKLEBALL
- TENNIS_SQU
- TENNIS_COU
- FOOTBALL_S
- OTHERS_OPE
- NETBALL_CO
- SQUASH_COU
- LAWN_BOWL_
- RUGBY_FIEL
- PETANQUE_C
- GATEBALL_C
- ACTIVE_HEA
- SOCCER_COU
- UPGRADING_
- HOCKEY_PIT
- VOLLEYBALL
- BASKETBALL
✅ Cleaned CSV file saved: SportSGSportFacilities.csv
Columns in the dataset:
['SPORTS_CEN', 'Latitude', 'Longitude']


**Cleaning for LTAMRTStation**

In [9]:
import pandas as pd

# Source workbook and destination CSV.
excel_file_path = "UncleanedData/LTAMRTStationExit_Full.xlsx"
csv_output_path = "LTAMRTStation.csv"

# Columns that are not carried into the cleaned file.
columns_to_drop = ["Name", "EXIT_CODE", "INC_CRC", "FMEL_UPD_D"]

# Read the sheet and discard the unwanted columns in one step;
# errors="ignore" tolerates columns that are absent from the sheet.
df = pd.read_excel(excel_file_path).drop(columns=columns_to_drop, errors="ignore")

# Write the reduced table without the index column.
df.to_csv(csv_output_path, index=False)
print(f"✅ Cleaned CSV file saved: {csv_output_path}")

# Confirm which columns remain.
print("Columns in the dataset:")
print(list(df.columns))


✅ Cleaned CSV file saved: LTAMRTStation.csv
Columns in the dataset:
['Latitude', 'Longitude', 'STATION_NA']


In [10]:
import pandas as pd

# Input and output are the same file: the CSV is rewritten in place.
# NOTE: after a successful run the Longitude/Latitude columns are gone, so
# re-running this cell takes the "not found" branch below.
csv_file_path = "LTAMRTStation.csv"  # Input CSV file
updated_csv_path = "LTAMRTStation.csv"  # Output CSV file

# Load the CSV file
df = pd.read_csv(csv_file_path)

# Ensure 'Longitude' and 'Latitude' columns exist
if "Longitude" in df.columns and "Latitude" in df.columns:
    # Build one [[Longitude, Latitude]] entry (a 2D array holding one point)
    # per row by walking the two columns together. This replaces a row-wise
    # df.apply(axis=1), which returns a DataFrame rather than a Series when
    # the frame is empty (making the column assignment fail) and is slow on
    # large frames. dtype=object keeps each nested list as a single cell.
    df["coordinates"] = pd.Series(
        [
            [[longitude, latitude]]
            for longitude, latitude in zip(df["Longitude"], df["Latitude"])
        ],
        index=df.index,
        dtype=object,
    )

    # Drop the original longitude and latitude columns
    df = df.drop(columns=["Longitude", "Latitude"], errors="ignore")

    # Save the updated DataFrame to the same CSV file
    df.to_csv(updated_csv_path, index=False)

    print(f"✅ Updated CSV saved as: {updated_csv_path}")
else:
    print("❌ 'Longitude' or 'Latitude' columns not found in the dataset.")


✅ Updated CSV saved as: LTAMRTStation.csv


In [15]:
import pandas as pd
import ast  # To safely convert string representations of lists

# The station file is read and then overwritten in place.
csv_file_path = "LTAMRTStation.csv"
updated_csv_path = "LTAMRTStation.csv"


def _merge_station_coordinates(coordinate_lists):
    """Concatenate the coordinate lists of one station into a single list."""
    merged = []
    for coordinate_list in coordinate_lists:
        merged.extend(coordinate_list)
    return merged


df = pd.read_csv(csv_file_path)

# Both the grouping key and the coordinates column are required.
required_columns = {"STATION_NA", "coordinates"}
if required_columns.issubset(df.columns):
    # CSV cells come back as text; turn each one into a real list again.
    # Non-string cells are passed through unchanged.
    df["coordinates"] = df["coordinates"].map(
        lambda value: ast.literal_eval(value) if isinstance(value, str) else value
    )

    # One output row per station, holding all of that station's points.
    df_grouped = df.groupby("STATION_NA", as_index=False).agg(
        {"coordinates": _merge_station_coordinates}
    )

    # Overwrite the station CSV with the grouped result.
    df_grouped.to_csv(updated_csv_path, index=False)

    print(f"✅ Updated CSV saved as: {updated_csv_path}")
else:
    print("❌ Required columns ('STATION_NA', 'coordinates') not found in the dataset.")


✅ Updated CSV saved as: LTAMRTStation.csv


**Cleaning for NPARKS**

In [None]:
import json
import pandas as pd

# Raw GeoJSON input and the CSV that the extracted records are written to.
file_path_geojson = "UncleanedData/NParksTracks.geojson"
csv_output_path = "NParksTracks.csv"

# Parse the whole GeoJSON document into Python objects.
with open(file_path_geojson, "r", encoding="utf-8") as geojson_file:
    geojson_data = json.load(geojson_file)

# The features to convert; an absent "features" key is treated as no features.
features = geojson_data.get("features", [])

# Accumulates one dict per feature.
geojson_data_list = []

def flatten_multiline_coordinates(multi_coords):
    """Concatenate the lines of a MultiLineString into one list of points.

    Args:
        multi_coords: Iterable of lines, each an iterable of coordinates.

    Returns:
        list: Every coordinate of every line, in the original order.
    """
    flattened = []
    for line in multi_coords:
        flattened.extend(line)
    return flattened

for feature in features:
    # GeoJSON permits "properties" and "geometry" to be null. Fall back to an
    # empty dict in that case so the lookups below cannot fail on None.
    # Copying the properties keeps the loaded GeoJSON data unmodified.
    properties = dict(feature.get("properties") or {})
    geometry = feature.get("geometry") or {}
    coordinates = geometry.get("coordinates", [])
    geom_type = geometry.get("type")

    # A MultiLineString is a list of lines (3D); merge them into one list of
    # points (2D). Every other geometry type, including LineString, is stored
    # exactly as found.
    if geom_type == "MultiLineString":
        coordinates = flatten_multiline_coordinates(coordinates)

    # Store the processed data
    properties["geometry_type"] = geom_type
    properties["coordinates"] = json.dumps(coordinates)  # JSON text so the list fits in one CSV cell
    geojson_data_list.append(properties)

# Convert to DataFrame
df_geojson = pd.DataFrame(geojson_data_list)

# Save the extracted data to a CSV file
df_geojson.to_csv(csv_output_path, index=False)

print(f"CSV file saved: {csv_output_path}")


In [None]:
import pandas as pd

# Read the CSV produced from the GeoJSON extraction.
csv_file_path = "NParksTracks.csv"
df = pd.read_csv(csv_file_path)

# Columns before the clean-up.
print("CSV Columns:")
print(list(df.columns))

# Columns that are not carried forward.
columns_to_drop = [
    "TYPE",
    "allow_walk",
    "allow_whee",
    "allow_cycl",
    "allow_pmd",
    "AMA_CATEGO",
    "Shape_Leng",
    "geometry_type",
]

# errors="ignore" tolerates columns that are already absent.
df = df.drop(columns=columns_to_drop, errors="ignore")

# Overwrite the same CSV with the reduced table.
updated_csv_path = "NParksTracks.csv"
df.to_csv(updated_csv_path, index=False)
print(f"✅ Updated CSV saved as: {updated_csv_path}")

# Columns after the clean-up.
print("CSV Columns:")
print(list(df.columns))


In [None]:
import pandas as pd
import ast  # To safely evaluate string representation of lists

# Step 1: read the CSV and discard the 'FID' column if it is present.
csv_file_path = "NParksTracks.csv"
df = pd.read_csv(csv_file_path).drop(columns=['FID'], errors='ignore')

# Step 2: keep only rows whose 'PARK' value is present and not blank.
has_park_name = df['PARK'].notna() & (df['PARK'].str.strip() != '')
df = df[has_park_name]

# Step 3: Combine rows with the same 'PARK'
def combine_lists(series):
    """Merge the values of a Series into one de-duplicated list.

    String values are parsed with ``ast.literal_eval``. A parsed list
    contributes its elements, any other parsed literal contributes itself,
    and a string that is not a Python literal is kept verbatim. Non-string
    values are kept as they are. NaN entries are ignored.

    Args:
        series: pandas Series of raw cell values for one group.

    Returns:
        list: Unique values in order of first appearance. Unlike
        ``list(set(...))`` the order is deterministic, and unhashable values
        such as nested lists are supported.
    """
    combined = []
    for item in series.dropna():  # Drop NaN values
        if not isinstance(item, str):
            combined.append(item)
            continue
        try:
            parsed_item = ast.literal_eval(item)
        except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
            # Not a Python literal (e.g. a plain name like "Nature Park").
            combined.append(item)
            continue
        if isinstance(parsed_item, list):
            combined.extend(parsed_item)
        else:
            combined.append(parsed_item)

    # Order-preserving de-duplication. Hashable values use a set for speed;
    # unhashable ones (such as lists) fall back to a linear membership test.
    unique_values = []
    seen_hashable = set()
    for value in combined:
        try:
            if value in seen_hashable:
                continue
            seen_hashable.add(value)
        except TypeError:
            if value in unique_values:
                continue
        unique_values.append(value)
    return unique_values

def combine_coordinates(series):
    """Concatenate the coordinate lists stored as text in a Series.

    Each non-NaN string is parsed with ``ast.literal_eval``. When the result
    is a list, its elements are appended to the output. Non-string entries,
    strings that fail to parse, and parsed values that are not lists are
    skipped.

    Args:
        series: pandas Series of coordinate strings for one group.

    Returns:
        list: All parsed coordinates, in the order encountered.
    """
    merged = []
    for raw_value in series.dropna():
        if not isinstance(raw_value, str):
            continue
        try:
            parsed = ast.literal_eval(raw_value)
        except (SyntaxError, ValueError):
            continue  # Skip invalid coordinates
        if isinstance(parsed, list):
            merged.extend(parsed)
    return merged

# One output row per park: list-like columns are merged and de-duplicated,
# coordinate columns are concatenated.
aggregations = {
    'PARK_TYPE': combine_lists,
    'PCN_LOOP': combine_lists,
    'coordinates': combine_coordinates,
}
df_grouped = df.groupby('PARK', as_index=False).agg(aggregations)

# Overwrite NParksTracks.csv with the grouped result.
cleaned_csv_path = "NParksTracks.csv"
df_grouped.to_csv(cleaned_csv_path, index=False)

print(f"✅ Cleaned CSV saved as: {cleaned_csv_path}")


**Cleaning for PreSchools**

In [32]:
import pandas as pd

# Source workbook and destination CSV.
excel_file_path = "UncleanedData/PreSchoolsLocation_Full.xlsx"
csv_output_path = "PreSchoolsLocation.csv"

# Columns that are not carried into the cleaned file.
columns_to_drop = ["Name", "INC_CRC", "FMEL_UPD_D", "CENTRE_CODE"]

# Read the sheet and discard the unwanted columns in one step;
# errors="ignore" tolerates columns that are absent from the sheet.
df = pd.read_excel(excel_file_path).drop(columns=columns_to_drop, errors="ignore")

# Write the reduced table without the index column.
df.to_csv(csv_output_path, index=False)
print(f"✅ Cleaned CSV file saved: {csv_output_path}")

# Confirm which columns remain.
print("Columns in the dataset:")
print(list(df.columns))


✅ Cleaned CSV file saved: PreSchoolsLocation.csv
Columns in the dataset:
['Latitude', 'Longitude', 'CENTRE_NAME']


**Cleaning for Primary / Secondary**

In [25]:
import pandas as pd

# Raw input and cleaned output share a file name but live in different folders.
csv_file_path = "UncleanedData/Generalinformationofschools.csv"
csv_output_path = "Generalinformationofschools.csv"

df = pd.read_csv(csv_file_path)

# Restrict the table to the columns listed here.
columns_to_keep = [
    "school_name",
    "address",
    "postal_code",
    "dgp_code",
    "zone_code",
    "mainlevel_code",
]
df = df.loc[:, columns_to_keep]

# Write the reduced table without the index column.
df.to_csv(csv_output_path, index=False)
print(f"✅ Cleaned CSV file saved: {csv_output_path}")

# Confirm which columns remain.
print("Columns in the dataset:")
print(list(df.columns))


✅ Cleaned CSV file saved: Generalinformationofschools.csv
Columns in the dataset:
['school_name', 'address', 'postal_code', 'dgp_code', 'zone_code', 'mainlevel_code']


In [None]:
import pandas as pd
from Geocoding.geocoding import get_coordinates

# Read the cleaned school list.
csv_file_path = "Generalinformationofschools.csv"
df = pd.read_csv(csv_file_path)

# Look up one coordinate pair per address.
# NOTE(review): get_coordinates is a project helper; this assumes it returns a
# (longitude, latitude) pair for every address — confirm order and failure value.
coordinate_pairs = df["address"].apply(get_coordinates)

# Split the pairs into two parallel columns.
df["Longitude"], df["Latitude"] = zip(*coordinate_pairs)

# Write the geocoded table to a new file.
updated_csv_path = "ALLSCHOOLS.csv"
df.to_csv(updated_csv_path, index=False)

print(f"✅ Updated CSV saved: {updated_csv_path}")


✅ Updated CSV saved: Schools_with_Geolocation.csv


In [None]:
import pandas as pd

# File holding the geocoded schools.
csv_file_path = "ALLSCHOOLS.csv"

df = pd.read_csv(csv_file_path)

# Distinct school levels, ignoring missing values.
mainlevel_codes = df["mainlevel_code"]
unique_values = mainlevel_codes.dropna().unique()

# Print them one per line.
print("Unique values in 'mainlevel_code':")
for value in unique_values:
    print(f"- {value}")


Unique values in 'mainlevel_code':
- PRIMARY
- SECONDARY
- JUNIOR COLLEGE
- MIXED LEVELS
- CENTRALISED INSTITUTE


In [None]:
import pandas as pd

# ALLSCHOOLS.csv is read and then overwritten in place.
csv_file_path = "ALLSCHOOLS.csv"
csv_output_path = "ALLSCHOOLS.csv"

df = pd.read_csv(csv_file_path)

# Restrict the table to the name, level and coordinate columns.
columns_to_keep = ["school_name", "mainlevel_code", "Longitude", "Latitude"]
df = df.loc[:, columns_to_keep]

# Write the reduced table without the index column.
df.to_csv(csv_output_path, index=False)
print(f"✅ Cleaned CSV file saved: {csv_output_path}")

# Confirm which columns remain.
print("Columns in the dataset:")
print(list(df.columns))

✅ Cleaned CSV file saved: PrimarySecondaryJC.csv
Columns in the dataset:
['school_name', 'mainlevel_code', 'Longitude', 'Latitude']


In [None]:
import pandas as pd

# File holding all schools with their level code.
csv_file_path = "ALLSCHOOLS.csv"

df = pd.read_csv(csv_file_path)

# Levels that each get their own CSV file.
categories = ["PRIMARY", "SECONDARY", "JUNIOR COLLEGE", "MIXED LEVELS"]

for category in categories:
    # Rows of this level, without the now-redundant level column.
    is_category = df["mainlevel_code"] == category
    df_filtered = df.loc[is_category].drop(columns=["mainlevel_code"], errors="ignore")

    # File name is the level with spaces turned into underscores.
    output_path = category.replace(" ", "_") + ".csv"
    df_filtered.to_csv(output_path, index=False)
    print(f"✅ Saved: {output_path}")


✅ Saved: PRIMARY.csv
✅ Saved: SECONDARY.csv
✅ Saved: JUNIOR_COLLEGE.csv
✅ Saved: MIXED_LEVELS.csv


In [None]:
import pandas as pd

# Define file paths. The main school file is both the input and the output.
primary_secondary_file = "ALLSCHOOLS.csv"
preschool_file = "PreSchoolsLocation.csv"
combined_output_file = "ALLSCHOOLS.csv"  # Output file

# Load the primary/secondary/junior college dataset
df_main = pd.read_csv(primary_secondary_file)

# Load the preschool dataset
df_preschool = pd.read_csv(preschool_file)

# Because this cell overwrites its own input, a previous run has already
# appended the preschool rows. Remove them first so that re-running the cell
# does not duplicate every preschool.
if "mainlevel_code" in df_main.columns:
    df_main = df_main[df_main["mainlevel_code"] != "PRESCHOOL"]

# Ensure column names match for merging
expected_columns = df_main.columns.tolist()

# Create a new dataframe for preschools with matching columns
df_preschool_mapped = pd.DataFrame(columns=expected_columns)

# Map preschool data to the main dataset format. The Series assignments come
# first because they give the empty frame its rows; the scalar level code is
# then broadcast to every one of those rows.
df_preschool_mapped["school_name"] = df_preschool["CENTRE_NAME"]
df_preschool_mapped["Latitude"] = df_preschool["Latitude"]
df_preschool_mapped["Longitude"] = df_preschool["Longitude"]
df_preschool_mapped["mainlevel_code"] = "PRESCHOOL"

# Concatenate both datasets
df_combined = pd.concat([df_main, df_preschool_mapped], ignore_index=True)

# Save the combined dataset
df_combined.to_csv(combined_output_file, index=False)

print(f"✅ Combined CSV file saved as: {combined_output_file}")


✅ Combined CSV file saved as: ALLSCHOOLS.csv


**Cleaning for HDB Resale**