# Hawker Centre Location Dataset

In [28]:
! pip install geopandas beautifulsoup4 pyarrow



In [29]:
import geopandas as gpd
import pandas as pd
from bs4 import BeautifulSoup

## Load datasets

In [30]:
# Hawker centres facilities GeoJSON file 
geojson_file = "HawkerCentresGEOJSON.geojson" 
gdf = gpd.read_file(geojson_file)

# Show first few rows to inspect
display(gdf.head())

Unnamed: 0,Name,Description,geometry
0,kml_1,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.85016 1.28442 0)
1,kml_2,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.77978 1.43354 0)
2,kml_3,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.80472 1.29749 0)
3,kml_4,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.87706 1.39159 0)
4,kml_5,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.90481 1.40819 0)


## Extract the columns 

In [31]:
# Function to extract data from the 'Description' HTML field
def extract_description_info(description):
    soup = BeautifulSoup(description, "html.parser")
    data = {}

    # Extract the table rows from the HTML
    rows = soup.find_all('tr')

    # Loop through each row and extract the column names and values
    for row in rows:
        th_elements = row.find_all('th')
        td_elements = row.find_all('td')
        
        # Check if both <th> and <td> exist before extracting
        if th_elements and td_elements:
            key = th_elements[0].get_text(strip=True)
            value = td_elements[0].get_text(strip=True)
            data[key] = value

    return data

# Apply the extraction function to the 'Description' column
description_data = gdf['Description'].apply(extract_description_info)

# Convert the extracted data into a DataFrame
description_df = pd.json_normalize(description_data)

# Combine the new columns with the original GeoDataFrame (without overwriting the existing ones)
gdf = pd.concat([gdf, description_df], axis=1)

# Now, the GeoDataFrame should have new columns extracted from the Description field
print(gdf.head())


    Name                                        Description  \
0  kml_1  <center><table><tr><th colspan='2' align='cent...   
1  kml_2  <center><table><tr><th colspan='2' align='cent...   
2  kml_3  <center><table><tr><th colspan='2' align='cent...   
3  kml_4  <center><table><tr><th colspan='2' align='cent...   
4  kml_5  <center><table><tr><th colspan='2' align='cent...   

                        geometry AWARDED_DATE LANDYADDRESSPOINT  \
0  POINT Z (103.85016 1.28442 0)    17/7/2017           29650.7   
1  POINT Z (103.77978 1.43354 0)     4/9/2015          46139.03   
2  POINT Z (103.80472 1.29749 0)    11/5/2015          31094.91   
3  POINT Z (103.87706 1.39159 0)    30/7/2018          41500.77   
4  POINT Z (103.90481 1.40819 0)     8/8/2018          43336.13   

                                            PHOTOURL ADDRESSBLOCKHOUSENUMBER  \
0                                                                         50   
1  http://www.nea.gov.sg/images/default-source/Ha...      

In [32]:
# List all column names
print(gdf.columns.tolist())

['Name', 'Description', 'geometry', 'AWARDED_DATE', 'LANDYADDRESSPOINT', 'PHOTOURL', 'ADDRESSBLOCKHOUSENUMBER', 'DESCRIPTION', 'EST_ORIGINAL_COMPLETION_DATE', 'STATUS', 'APPROXIMATE_GFA', 'INFO_ON_CO_LOCATORS', 'NAME', 'ADDRESSBUILDINGNAME', 'HUP_COMPLETION_DATE', 'LANDXADDRESSPOINT', 'ADDRESSSTREETNAME', 'ADDRESSPOSTALCODE', 'IMPLEMENTATION_DATE', 'ADDRESS_MYENV', 'INC_CRC', 'FMEL_UPD_D']


Check for null values 

## Dataset statistics 

In [33]:
# Get the number of rows and columns
num_rows, num_columns = gdf.shape

# Display the number of rows
print(f"Number of rows in the dataset: {num_rows}")

Number of rows in the dataset: 125


In [34]:
# Check for empty strings in the entire GeoDataFrame -- same as checking for null values 
empty_strings = (gdf == "").sum()

# Display columns with empty strings
print("Empty strings per column:")
print(empty_strings)

Empty strings per column:
Name                              0
Description                       0
geometry                          0
AWARDED_DATE                    107
LANDYADDRESSPOINT                 0
PHOTOURL                         12
ADDRESSBLOCKHOUSENUMBER           3
DESCRIPTION                       0
EST_ORIGINAL_COMPLETION_DATE      1
STATUS                            0
APPROXIMATE_GFA                  93
INFO_ON_CO_LOCATORS             110
NAME                              0
ADDRESSBUILDINGNAME             109
HUP_COMPLETION_DATE              28
LANDXADDRESSPOINT                 0
ADDRESSSTREETNAME                 0
ADDRESSPOSTALCODE                 1
IMPLEMENTATION_DATE             107
ADDRESS_MYENV                    12
INC_CRC                           0
FMEL_UPD_D                        0
dtype: int64


In [35]:
# Check for duplicates 
duplicates = gdf.duplicated(subset=['NAME']).sum()

# Display the number of duplicate rows 
print(f"Number of duplicate hawker centre names: {duplicates}")

Number of duplicate hawker centre names: 0


## Clean the dataset 
1. Change geomtry to coordinates 
2. Drop unnecessary columns 
3. Rename and rearrange columns 

In [36]:
# Extract longitude and latitude from the geometry column
gdf['longitude'] = gdf['geometry'].apply(lambda point: point.x)  # Longitude is the x-coordinate
gdf['latitude'] = gdf['geometry'].apply(lambda point: point.y)   # Latitude is the y-coordinate

# Verify the new columns 
print(gdf.head())

    Name                                        Description  \
0  kml_1  <center><table><tr><th colspan='2' align='cent...   
1  kml_2  <center><table><tr><th colspan='2' align='cent...   
2  kml_3  <center><table><tr><th colspan='2' align='cent...   
3  kml_4  <center><table><tr><th colspan='2' align='cent...   
4  kml_5  <center><table><tr><th colspan='2' align='cent...   

                        geometry AWARDED_DATE LANDYADDRESSPOINT  \
0  POINT Z (103.85016 1.28442 0)    17/7/2017           29650.7   
1  POINT Z (103.77978 1.43354 0)     4/9/2015          46139.03   
2  POINT Z (103.80472 1.29749 0)    11/5/2015          31094.91   
3  POINT Z (103.87706 1.39159 0)    30/7/2018          41500.77   
4  POINT Z (103.90481 1.40819 0)     8/8/2018          43336.13   

                                            PHOTOURL ADDRESSBLOCKHOUSENUMBER  \
0                                                                         50   
1  http://www.nea.gov.sg/images/default-source/Ha...      

In [37]:
# Drop the unnecessary columns 
columns_to_drop = ['Name', 'Description', 'geometry', 'AWARDED_DATE', 'LANDYADDRESSPOINT', 'PHOTOURL', 'EST_ORIGINAL_COMPLETION_DATE',
                    'STATUS', 'APPROXIMATE_GFA', 'HUP_COMPLETION_DATE', 'LANDXADDRESSPOINT', 'IMPLEMENTATION_DATE', 'ADDRESS_MYENV', 
                    'INC_CRC', 'FMEL_UPD_D', 'ADDRESSBUILDINGNAME', 'INFO_ON_CO_LOCATORS'] 
gdf = gdf.drop(columns=columns_to_drop)

# Verify that the columns are dropped
print(gdf.head())

  ADDRESSBLOCKHOUSENUMBER             DESCRIPTION  \
0                      50  New Replacement Centre   
1                       4  New Replacement Centre   
2                     38A  New Replacement Centre   
3                      21              New Centre   
4                       1              New Centre   

                              NAME     ADDRESSSTREETNAME ADDRESSPOSTALCODE  \
0      Market Street Hawker Centre         Market Street             48940   
1     Marsiling Mall Hawker Centre   Woodlands Street 12            738623   
2     Margaret Drive Hawker Centre        Margaret Drive            142038   
3  Fernvale Hawker Centre & Market  Sengkang West Avenue            797650   
4        One Punggol Hawker Centre         Punggol Drive            828629   

    longitude  latitude  
0  103.850165  1.284425  
1  103.779785  1.433539  
2  103.804715  1.297486  
3  103.877060  1.391592  
4  103.904806  1.408190  


In [38]:
# Check if columns are dropped correctly
print(gdf.columns.tolist())

['ADDRESSBLOCKHOUSENUMBER', 'DESCRIPTION', 'NAME', 'ADDRESSSTREETNAME', 'ADDRESSPOSTALCODE', 'longitude', 'latitude']


In [None]:
# Rename columns
gdf = gdf.rename(columns={
    'NAME': 'name',
    'ADDRESSSTREETNAME': 'address',
    'ADDRESSPOSTALCODE': 'postalCode',
    'DESCRIPTION': 'hawkerStatus'
})

# Rearrange columns
gdf = gdf[['name', 'address', 'postalCode', 'hawkerStatus', 'longitude', 'latitude']]

# Sort the GeoDataFrame by hawker_centre_name alphabetically
gdf = gdf.sort_values(by='name', ascending=True)

# Verify the column names after renaming
print(gdf.head())

                                                 name        street_name  \
12                              Adam Road Food Centre          Adam Road   
13  Aljunied Ave 2 Blk 117 (Blk 117 Aljunied Marke...     Aljunied Ave 2   
14   Amoy Street Food Centre (Telok Ayer Food Centre)  Telok Ayer Street   
74                   Anchorvale Village Hawker Centre    Anchorvale Road   
15  Ang Mo Kio Ave 1 Blk 226D (Kebun Baru Market a...   Ang Mo Kio Ave 1   

   postal_code block_number                  status   longitude  latitude  
12      289876            2  HUP Standard Upgrading  103.814165  1.324134  
13      380117          117  HUP Standard Upgrading  103.887019  1.320648  
14       69111               HUP Standard Upgrading  103.846607  1.279218  
74      540339          339              New Centre  103.888490  1.396914  
15      564226         226D  HUP Standard Upgrading  103.839231  1.366788  


## Save as csv 

In [40]:
csv_file = "hawker_centres_cleaned.csv"
gdf.to_csv(csv_file, index=False)

# Load and preview CSV file to confirm
df = pd.read_csv(csv_file)
display(df.head())

Unnamed: 0,name,street_name,postal_code,block_number,status,longitude,latitude
0,Adam Road Food Centre,Adam Road,289876.0,2,HUP Standard Upgrading,103.814165,1.324134
1,Aljunied Ave 2 Blk 117 (Blk 117 Aljunied Marke...,Aljunied Ave 2,380117.0,117,HUP Standard Upgrading,103.887019,1.320648
2,Amoy Street Food Centre (Telok Ayer Food Centre),Telok Ayer Street,69111.0,,HUP Standard Upgrading,103.846607,1.279218
3,Anchorvale Village Hawker Centre,Anchorvale Road,540339.0,339,New Centre,103.88849,1.396914
4,Ang Mo Kio Ave 1 Blk 226D (Kebun Baru Market a...,Ang Mo Kio Ave 1,564226.0,226D,HUP Standard Upgrading,103.839231,1.366788
