# Supermarket Dataset

In [35]:
!pip install geopandas pyarrow beautifulsoup4



In [36]:
import geopandas as gpd
import pandas as pd
from bs4 import BeautifulSoup
import requests

# Load dataset 

In [37]:
# Supermarkets GeoJSON file (relative path, resolved against the notebook's working directory)
geojson_file = "SupermarketsGEOJSON.geojson"
gdf = gpd.read_file(geojson_file)

# Show first few rows to inspect
display(gdf.head())

Unnamed: 0,Name,Description,geometry
0,kml_1,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.90126 1.4023 0)
1,kml_2,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.87091 1.31424 0)
2,kml_3,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.88637 1.37332 0)
3,kml_4,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.91494 1.33296 0)
4,kml_5,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.95301 1.35345 0)


# Extract columns

In [38]:
# Function to extract data from the 'Description' HTML field
def extract_description_info(description):
    """Parse the HTML attribute table of a 'Description' cell into a dict.

    Every ``<tr>`` that contains both a ``<th>`` and a ``<td>`` contributes one
    entry: the text of its first ``<th>`` is the key and the text of its first
    ``<td>`` is the value, each with surrounding whitespace stripped. Rows that
    lack either element are skipped.

    Args:
        description: HTML markup to parse, or a missing value (None / NaN).

    Returns:
        dict mapping header text to cell text. Empty when ``description`` is
        missing or contains no row with both a header and a data cell.
    """
    # A missing Description would make BeautifulSoup raise and abort the whole
    # .apply(); treat it as "no attributes" instead.
    if not isinstance(description, (str, bytes)) and pd.isna(description):
        return {}

    soup = BeautifulSoup(description, "html.parser")
    data = {}

    for row in soup.find_all('tr'):
        # find() returns the first matching descendant, or None when absent
        header = row.find('th')
        cell = row.find('td')

        # Only rows carrying both a header and a value describe an attribute
        if header is not None and cell is not None:
            data[header.get_text(strip=True)] = cell.get_text(strip=True)

    return data

# Parse every 'Description' cell into a dict of attribute name -> value
description_data = gdf['Description'].apply(extract_description_info)

# Turn the dicts into a DataFrame with one column per attribute name
description_df = pd.json_normalize(description_data)

# Append the extracted columns next to the existing ones (axis=1); rows are matched on index.
# NOTE(review): gdf is reassigned here, so re-running this cell without reloading gdf
# would append the same columns a second time.
gdf = pd.concat([gdf, description_df], axis=1)

# Preview gdf, which now also holds the columns extracted from the Description field
print(gdf.head())


    Name                                        Description  \
0  kml_1  <center><table><tr><th colspan='2' align='cent...   
1  kml_2  <center><table><tr><th colspan='2' align='cent...   
2  kml_3  <center><table><tr><th colspan='2' align='cent...   
3  kml_4  <center><table><tr><th colspan='2' align='cent...   
4  kml_5  <center><table><tr><th colspan='2' align='cent...   

                        geometry                                     LIC_NAME  \
0   POINT Z (103.90126 1.4023 0)  LI LI CHENG SUPERMARKET (PUNGGOL) PTE. LTD.   
1  POINT Z (103.87091 1.31424 0)              SHENG SIONG SUPERMARKET PTE LTD   
2  POINT Z (103.88637 1.37332 0)        COLD STORAGE SINGAPORE (1983) PTE LTD   
3  POINT Z (103.91494 1.33296 0)        COLD STORAGE SINGAPORE (1983) PTE LTD   
4  POINT Z (103.95301 1.35345 0)                      YES SUPERMARKET PTE LTD   

  BLK_HOUSE              STR_NAME UNIT_NO POSTCODE       LIC_NO  \
0      273C         PUNGGOL PLACE     884   823273  NE12I65N000   


In [39]:
# Show every column label currently present in gdf
print(list(gdf.columns))

['Name', 'Description', 'geometry', 'LIC_NAME', 'BLK_HOUSE', 'STR_NAME', 'UNIT_NO', 'POSTCODE', 'LIC_NO', 'INC_CRC', 'FMEL_UPD_D']


# Dataset statistics

In [40]:
# Count the records held in gdf (one per row)
num_rows = len(gdf)
print(f"Number of rows in the dataset: {num_rows}")

Number of rows in the dataset: 526


In [41]:
# Count empty-string cells per column. The attributes extracted from the HTML are plain text,
# so a blank field appears as "" rather than NaN; genuine nulls are NOT counted by this comparison.
empty_strings = (gdf == "").sum()

# Print the per-column counts
print("Empty strings per column:")
print(empty_strings)

Empty strings per column:
Name            0
Description     0
geometry        0
LIC_NAME        0
BLK_HOUSE       0
STR_NAME        0
UNIT_NO        75
POSTCODE        0
LIC_NO          0
INC_CRC         0
FMEL_UPD_D      0
dtype: int64


# Clean the dataset
1. convert geometry into lat, lon form 
2. drop columns 
3. rename and rearrange columns 

In [42]:
# Extract longitude and latitude from the geometry column.
# GeoSeries exposes vectorised .x / .y accessors for point geometries, so no per-row
# Python lambda is needed; a missing geometry yields NaN instead of raising AttributeError.
gdf['longitude'] = gpd.GeoSeries(gdf['geometry']).x  # Longitude is the x-coordinate
gdf['latitude'] = gpd.GeoSeries(gdf['geometry']).y   # Latitude is the y-coordinate

# Verify the new columns
print(gdf.head())

    Name                                        Description  \
0  kml_1  <center><table><tr><th colspan='2' align='cent...   
1  kml_2  <center><table><tr><th colspan='2' align='cent...   
2  kml_3  <center><table><tr><th colspan='2' align='cent...   
3  kml_4  <center><table><tr><th colspan='2' align='cent...   
4  kml_5  <center><table><tr><th colspan='2' align='cent...   

                        geometry                                     LIC_NAME  \
0   POINT Z (103.90126 1.4023 0)  LI LI CHENG SUPERMARKET (PUNGGOL) PTE. LTD.   
1  POINT Z (103.87091 1.31424 0)              SHENG SIONG SUPERMARKET PTE LTD   
2  POINT Z (103.88637 1.37332 0)        COLD STORAGE SINGAPORE (1983) PTE LTD   
3  POINT Z (103.91494 1.33296 0)        COLD STORAGE SINGAPORE (1983) PTE LTD   
4  POINT Z (103.95301 1.35345 0)                      YES SUPERMARKET PTE LTD   

  BLK_HOUSE              STR_NAME UNIT_NO POSTCODE       LIC_NO  \
0      273C         PUNGGOL PLACE     884   823273  NE12I65N000   


In [43]:
# Check for duplicates: rows with the same licensee name at the same coordinates.
# Both longitude and latitude are part of the key, so two outlets that merely share
# a longitude are not mistaken for the same place.
duplicates = gdf.duplicated(subset=['LIC_NAME', 'longitude', 'latitude']).sum()
print(f"Number of duplicate supermarket: {duplicates}")

Number of duplicate supermarket: 2


In [44]:
# Drop duplicate rows: same licensee name at the same (longitude, latitude).
# Latitude is included in the key so that outlets sharing only a longitude are kept;
# the first occurrence of each duplicate group is retained.
gdf = gdf.drop_duplicates(subset=['LIC_NAME', 'longitude', 'latitude'])

# Check that the duplicated supermarkets have been dropped
duplicates = gdf.duplicated(subset=['LIC_NAME', 'longitude', 'latitude']).sum()
print(f"Number of duplicate supermarket: {duplicates}")

# Get the number of rows and columns
num_rows, num_columns = gdf.shape

# Display the number of rows
print(f"Number of rows in the dataset: {num_rows}")

Number of duplicate supermarket: 0
Number of rows in the dataset: 524


DROP / KEEP : (read in edit mode; the number is the count of empty values in that column)
Name            0 (drop because it only holds kml_1, kml_2, etc.)
Description     0 (drop because it has already been expanded into the upper-case columns)
geometry        0 (drop because it has already been converted into the longitude and latitude columns)
LIC_NAME        0 
BLK_HOUSE       0
STR_NAME        0
UNIT_NO        75 (drop because it is not needed and has empty values)
POSTCODE        0 
LIC_NO          0 (drop because there is no explanation of what this is)
INC_CRC         0 (drop because there is no explanation of what this is)
FMEL_UPD_D      0 (drop because there is no explanation of what this is)


In [45]:
# Drop the columns that are not carried into the cleaned output
columns_to_drop = ['Name', 'Description', 'geometry', 'UNIT_NO', 'LIC_NO', 'INC_CRC', 'FMEL_UPD_D']
gdf = gdf.drop(columns=columns_to_drop)

# Preview the remaining columns
print(gdf.head())

                                      LIC_NAME BLK_HOUSE  \
0  LI LI CHENG SUPERMARKET (PUNGGOL) PTE. LTD.      273C   
1              SHENG SIONG SUPERMARKET PTE LTD        11   
2        COLD STORAGE SINGAPORE (1983) PTE LTD       683   
3        COLD STORAGE SINGAPORE (1983) PTE LTD       631   
4                      YES SUPERMARKET PTE LTD      201B   

               STR_NAME POSTCODE   longitude  latitude  
0         PUNGGOL PLACE   823273  103.901262  1.402303  
1  UPPER BOON KENG ROAD   380011  103.870914  1.314239  
2      HOUGANG AVENUE 8   530683  103.886366  1.373321  
3  BEDOK RESERVOIR ROAD   470631  103.914942  1.332959  
4    TAMPINES STREET 21   522201  103.953010  1.353453  


In [46]:
# Rename the source column codes to descriptive snake_case names
gdf = gdf.rename(columns={
    'LIC_NAME': 'name',
    'BLK_HOUSE': 'block_number',
    'STR_NAME': 'street_name',
    'POSTCODE': 'postal_code',
})

# Rearrange columns into the order used for the output file
gdf = gdf[['name', 'street_name', 'postal_code', 'block_number', 'longitude', 'latitude']]

# Sort the rows alphabetically by the 'name' column
gdf = gdf.sort_values(by='name', ascending=True)

# Preview the renamed, reordered and sorted data
print(gdf.head())

                           name          street_name postal_code block_number  \
64             7SEVEN PTE. LTD.   NORTH COAST AVENUE      756992           51   
15     ABEDIN TRADING PTE. LTD.         GEYLANG ROAD      389538          599   
511  AJMAL SUPER MART PTE. LTD.  TUAS SOUTH AVENUE 1      637285           70   
479  AJMAL TRADE MART PTE. LTD.          KRANJI ROAD      739522           12   
453  AJMAL TRADE MART PTE. LTD.        MANDAI ESTATE      729937           23   

      longitude  latitude  
64   103.786437  1.456060  
15   103.886700  1.314207  
511  103.625765  1.313965  
479  103.759237  1.430694  
453  103.760562  1.406312  


# Save as CSV

In [47]:
csv_file = "supermarket_cleaned.csv"
gdf.to_csv(csv_file, index=False)

# Load and preview CSV file to confirm.
# Postal codes and block numbers are identifiers, not quantities: read them back as text so
# read_csv does not infer integers and strip leading zeros (e.g. "018956" would become 18956).
df = pd.read_csv(csv_file, dtype={'postal_code': str, 'block_number': str})
display(df.head())

Unnamed: 0,name,street_name,postal_code,block_number,longitude,latitude
0,7SEVEN PTE. LTD.,NORTH COAST AVENUE,756992,51,103.786437,1.45606
1,ABEDIN TRADING PTE. LTD.,GEYLANG ROAD,389538,599,103.8867,1.314207
2,AJMAL SUPER MART PTE. LTD.,TUAS SOUTH AVENUE 1,637285,70,103.625765,1.313965
3,AJMAL TRADE MART PTE. LTD.,KRANJI ROAD,739522,12,103.759237,1.430694
4,AJMAL TRADE MART PTE. LTD.,MANDAI ESTATE,729937,23,103.760562,1.406312
