# Healthcare Facilities Locations Dataset

In [107]:
!pip install geopandas pyarrow 




In [108]:
import os # for reading the API token from the environment
import re # for postal code extraction 

import geopandas as gpd
import pandas as pd
import requests # for the API 

## Load datasets

In [109]:
# Location of the healthcare facilities point layer (GeoJSON)
geojson_file = "hotosm_sgp_health_facilities_points_geojson.geojson"

# Read the layer into a GeoDataFrame
gdf = gpd.read_file(geojson_file)

# Preview the first five records to inspect the available columns
display(gdf.head(n=5))

Unnamed: 0,name,name:en,amenity,building,healthcare,healthcare:speciality,operator:type,capacity:persons,addr:full,addr:city,source,name:ms,name:zh,name:ta,osm_id,osm_type,geometry
0,Family Doctors at 365,,doctors,,,,,,,Singapore,,,,,4584767744,nodes,POINT (103.72029 1.33773)
1,Bukit Merah Polyclinic,,clinic,,clinic,,,,,Singapore,,Poliklinik Bukit Merah,,,3038816607,nodes,POINT (103.81702 1.28369)
2,Geylang Polyclinic,,clinic,,clinic,,,,,Singapore,Kaart Ground Survey 2017,Poliklinik Geylang,,,4583783511,nodes,POINT (103.88731 1.31957)
3,Woodlands Polyclinic,,clinic,,,,,,,Singapore,,Poliklinik Woodlands,,,4583841487,nodes,POINT (103.77523 1.43079)
4,Jurong Polyclinic,,clinic,,clinic,,,,,Singapore,,Poliklinik Jurong,,,4583842089,nodes,POINT (103.73073 1.34962)


## Dataset statistics

In [110]:
# Size of the raw dataset: number of records and number of attributes
num_rows, num_columns = len(gdf.index), len(gdf.columns)

# Report how many rows were loaded
print(f"Number of rows in the dataset: {num_rows}")

Number of rows in the dataset: 1028


In [111]:
# Tally the missing (None/NaN) entries in every column
none_values_count = gdf.isnull().sum()

# Print the heading followed by the per-column tally
print("Count of None/NaN values per column:", none_values_count, sep="\n")

Count of None/NaN values per column:
name                       24
name:en                  1000
amenity                   142
building                 1028
healthcare                104
healthcare:speciality     866
operator:type            1012
capacity:persons         1028
addr:full                1028
addr:city                 780
source                    935
name:ms                  1013
name:zh                   951
name:ta                  1028
osm_id                      0
osm_type                    0
geometry                    0
dtype: int64


## Clean the dataset

In [112]:
# Columns that are not needed for the facility-location dataset
columns_to_drop = ['name:en', 'building', 'healthcare', 'healthcare:speciality', 'operator:type', 'capacity:persons', 'addr:full',
                    'addr:city', 'source', 'name:ms', 'name:zh', 'name:ta', 'osm_id', 'osm_type'] 

# errors="ignore" keeps this cell re-runnable: once the columns are gone,
# running it again is a no-op instead of raising KeyError.
gdf = gdf.drop(columns=columns_to_drop, errors="ignore")

# Verify that the columns are dropped
print(gdf.head())

                     name  amenity                   geometry
0   Family Doctors at 365  doctors  POINT (103.72029 1.33773)
1  Bukit Merah Polyclinic   clinic  POINT (103.81702 1.28369)
2      Geylang Polyclinic   clinic  POINT (103.88731 1.31957)
3    Woodlands Polyclinic   clinic  POINT (103.77523 1.43079)
4       Jurong Polyclinic   clinic  POINT (103.73073 1.34962)


Convert the geometry column into longitude and latitude columns.

In [113]:
# Extract longitude and latitude from the point geometries.
# The guard keeps this cell re-runnable: after the first run 'geometry' no
# longer exists, so a second run skips the extraction instead of raising KeyError.
if 'geometry' in gdf.columns:
    gdf['longitude'] = gdf['geometry'].x  # Longitude is the x-coordinate
    gdf['latitude'] = gdf['geometry'].y   # Latitude is the y-coordinate

    # Drop the 'geometry' column now that its coordinates are stored as plain columns
    gdf = gdf.drop(columns=['geometry'])

# Verify the new columns 
print(gdf.head())


                     name  amenity   longitude  latitude
0   Family Doctors at 365  doctors  103.720294  1.337734
1  Bukit Merah Polyclinic   clinic  103.817018  1.283691
2      Geylang Polyclinic   clinic  103.887310  1.319575
3    Woodlands Polyclinic   clinic  103.775225  1.430788
4       Jurong Polyclinic   clinic  103.730729  1.349623


Use the OneMap API to fill in the missing facility names.

In [114]:
# Function to perform reverse geocoding with OneMap API
def fetch_onemap_facility(lat, lon, token=None, timeout=10):
    """Reverse-geocode a coordinate with OneMap and return a building name.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude of the point to look up.
    token : str, optional
        Value sent in the ``Authorization`` header. When omitted it is read
        from the ``ONEMAP_API_TOKEN`` environment variable, so that no
        credential is stored in the notebook.
    timeout : float, optional
        Seconds to wait for the HTTP response before ``requests`` raises.

    Returns
    -------
    str or None
        ``BUILDINGNAME`` of the first ``GeocodeInfo`` entry, or ``None`` when
        the response status is not 200 or no entry is returned.

    Raises
    ------
    RuntimeError
        If no token is available, or if the API rejects the token
        (HTTP 401/403).
    """
    if token is None:
        token = os.environ.get("ONEMAP_API_TOKEN")
    if not token:
        raise RuntimeError(
            "OneMap API token not found: set the ONEMAP_API_TOKEN environment "
            "variable or pass token=..."
        )

    url = f"https://www.onemap.gov.sg/api/public/revgeocode?location={lat},{lon}&buffer=40&addressType=All&otherFeatures=N"
    headers = {"Authorization": token}

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, headers=headers, timeout=timeout)

    # A rejected token affects every lookup, so surface it instead of
    # returning None, which would look the same as "no building found".
    if response.status_code in (401, 403):
        raise RuntimeError(
            f"OneMap API rejected the token (HTTP {response.status_code}); "
            "it may be invalid or expired"
        )
    if response.status_code != 200:
        return None

    results = response.json().get('GeocodeInfo') or []
    if results:
        return results[0].get('BUILDINGNAME', None)
    return None  # If no address found

In [115]:
# Look up a name for every facility whose 'name' is missing.
# Only the coordinates of the unnamed rows are iterated; the row label is
# kept so the result can be written back to the matching row of gdf.
for missing in gdf.loc[gdf['name'].isna(), ['latitude', 'longitude']].itertuples():
    facility_name = fetch_onemap_facility(missing.latitude, missing.longitude)

    # Write the name back only when the lookup returned a non-empty value
    if facility_name:
        gdf.at[missing.Index, 'name'] = facility_name

# Re-count the missing entries per column to see how many names were filled
none_values_count = gdf.isnull().sum()

# Print the heading followed by the per-column tally
print("Count of None/NaN values per column:", none_values_count, sep="\n")

Count of None/NaN values per column:
name          24
amenity      142
longitude      0
latitude       0
dtype: int64


The reverse-geocoding lookup did not recover any of the missing facility names (all 24 are still null), so these rows are removed from the dataset.

In [116]:
# Keep only the facilities that have a name
gdf = gdf.loc[gdf['name'].notna()]

# Confirm that no null names remain (expected: 0)
gdf['name'].isnull().sum()

0

In [117]:
# Count duplicated facilities. Name alone is not used as the key because a
# hospital/clinic with multiple outlets shares one name and each outlet counts
# as distinct; a row is a duplicate only when both name and longitude repeat.
# NOTE(review): latitude is not part of the key - confirm that is intended.
duplicates = gdf[['name', 'longitude']].duplicated().sum()

# Report how many duplicate rows were found
print(f"Number of duplicate healthcare facilities: {duplicates}")

Number of duplicate healthcare facilities: 2


In [118]:
# Keep the first occurrence of each (name, longitude) pair and discard the rest
gdf = gdf.loc[~gdf.duplicated(subset=['name', 'longitude'])]

# Re-count to confirm that no duplicated facilities are left
duplicates = gdf[['name', 'longitude']].duplicated().sum()
print(f"Number of duplicate healthcare facilities: {duplicates}")

# Size of the cleaned dataset
num_rows, num_columns = len(gdf.index), len(gdf.columns)

# Report how many rows remain
print(f"Number of rows in the dataset: {num_rows}")

Number of duplicate healthcare facilities: 0
Number of rows in the dataset: 1002


## Save as csv 

In [119]:
# Write the cleaned table to disk, leaving out the index column
csv_file = "healthcare_cleaned.csv"
gdf.to_csv(path_or_buf=csv_file, index=False)

# Read the file back and preview the first five rows to confirm the export
df = pd.read_csv(csv_file)
display(df.head(n=5))


Unnamed: 0,name,amenity,longitude,latitude
0,Family Doctors at 365,doctors,103.720294,1.337734
1,Bukit Merah Polyclinic,clinic,103.817018,1.283691
2,Geylang Polyclinic,clinic,103.88731,1.319575
3,Woodlands Polyclinic,clinic,103.775225,1.430788
4,Jurong Polyclinic,clinic,103.730729,1.349623
