# Set-Up

In [21]:
#Import libraries
import os
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
#---- Google Places API Set-Up
# SECURITY: the key is read from the environment instead of being committed
# in source. The key that was previously hard-coded here should be treated
# as leaked and revoked/rotated in the Google Cloud console.
# An unset variable yields an empty key, which Google rejects at request time.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
TEXT_SEARCH_URL = "https://maps.googleapis.com/maps/api/place/textsearch/json"
GEOCODE_URL = "https://maps.googleapis.com/maps/api/geocode/json"
# DETAILS_URL = "https://maps.googleapis.com/maps/api/place/details/json"
# NEARBY_SEARCH_URL = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"

In [23]:
#---- OneMap API Set-Up
# Search endpoint queried by get_mall_details_onemap with the
# searchVal / returnGeom / getAddrDetails query parameters.
ONEMAP_BASE_URL = "https://www.onemap.gov.sg/api/common/elastic/search"

# Web Scrape from Wikipedia

In [24]:
# Wiki details set-up
wiki_url = "https://en.wikipedia.org/wiki/List_of_shopping_malls_in_Singapore"
wiki_response = requests.get(wiki_url, timeout=5)
# Fail fast on an HTTP error rather than parsing an error page, which would
# silently produce empty mall lists in the extraction cells below.
wiki_response.raise_for_status()
content = BeautifulSoup(wiki_response.content, "html.parser")

In [26]:
#---- 1. Extract mall names from main section
# Every region is introduced by an <h2> header; the malls for that region are
# the <li> items of the next "div-col" block following the header.
regions = content.find_all("h2")
mall_names_main_section = []

for heading in regions:
    # The 'Contents' and 'References' headers do not introduce a region
    if heading.get_text(strip=True) in ["Contents", "References"]:
        continue

    column_block = heading.find_next("div", class_="div-col")
    if not column_block:
        continue

    # get_text handles both linked (<a>) and plain-text entries; blanks are skipped
    item_texts = [item.get_text(strip=True) for item in column_block.find_all("li")]
    mall_names_main_section.extend(text for text in item_texts if text)

In [27]:
#---- 2. Extract mall names from references section (Current malls)

# The navbox cell that holds the current mall names
wiki_ref_mall_section = content.find("td", class_="navbox-list-with-group navbox-list navbox-odd hlist")
mall_names_ref_section = []

if not wiki_ref_mall_section:
    print("Current malls section under references not found.")
else:
    # Only walk the <ul> lists inside the cell so the group headers are skipped
    for bullet_list in wiki_ref_mall_section.find_all("ul"):
        # Each mall name is the text of an <a> link
        mall_names_ref_section.extend(link.text.strip() for link in bullet_list.find_all("a"))

In [28]:
#---- 3. Combine mall names obtained from both sections to get a complete list

# Main-section names followed by reference-section names, sorted into a new list
combined_mall_names = sorted(mall_names_main_section + mall_names_ref_section)

In [29]:
#---- 4. Cleaning of web-scraped data

#-- Standardise all names to uppercase and trim surrounding whitespace
#   (built as a new list, so combined_mall_names is left untouched)
cleansed_wiki_mall_names = [name.upper().strip() for name in combined_mall_names]

#-- Clean mall names 
# e.g. GRID (FORMERLY POMO)[1], KINEX (FORMERLY ONEKM)
def clean_mall_names(dataset):
    """Return the names in *dataset* with annotations stripped.

    Two kinds of annotation are removed from each name: anything enclosed in
    square brackets (brackets included), and any "(FORMERLY ...)" note, matched
    case-insensitively together with the whitespace in front of it. Each
    cleaned name is then trimmed of surrounding whitespace.
    """
    def _strip_annotations(name):
        without_brackets = re.sub(r"\[.*?\]", "", name)
        without_former = re.sub(r"\s*\(FORMERLY .*?\)", "", without_brackets, flags=re.IGNORECASE)
        return without_former.strip()

    return [_strip_annotations(name) for name in dataset]

# Strip "[...]" markers and "(FORMERLY ...)" notes from every scraped name
cleansed_wiki_mall_names = clean_mall_names(cleansed_wiki_mall_names)

#-- Remove duplicates
def remove_duplicates(mall_list):
    """Return the items of *mall_list* without repeats, keeping first-seen order.

    Args:
        mall_list: Iterable of items (mall-name strings in this notebook).

    Returns:
        list: Each distinct item once, in order of first appearance.
    """
    items = list(mall_list)
    try:
        # dict keys are unique and keep insertion order, so this is a single
        # linear pass instead of scanning the result list for every item.
        return list(dict.fromkeys(items))
    except TypeError:
        # Unhashable items cannot be dict keys; fall back to pairwise comparison.
        result = []
        for item in items:
            if item not in result:
                result.append(item)
        return result

# Drop repeated names, keeping the first occurrence of each
cleansed_wiki_mall_names = remove_duplicates(cleansed_wiki_mall_names)

In [30]:
#---- 5. Manual cleaning of web-scraped data
# Names to rewrite to a preferred form
wiki_replacement_dict = {
    'PAYA LEBAR QUARTER (PLQ)': 'PLQ MALL',
    'THE PARAGON': 'PARAGON SHOPPING CENTRE',
    'DJITSUN MALL BEDOK': 'DJITSUN MALL',
}

# Entries to exclude from the list altogether
wiki_malls_to_remove = {
    'TENGAH MALL (2027)',
    'FAIRPRICE HUB',
    'MARINA BAY SANDS',
    'HOLLAND VILLAGE SHOPPING MALL',
    'OD MALL',
    'HOUGANG GREEN SHOPPING MALL',
}

# Rename first, then filter: the exclusion check sees the renamed value
cleansed_wiki_mall_names = [
    renamed
    for renamed in (wiki_replacement_dict.get(mall, mall) for mall in cleansed_wiki_mall_names)
    if renamed not in wiki_malls_to_remove
]

In [31]:
def get_mall_details_onemap(mall_name):
    """Look up a mall on the OneMap search API and return its first match.

    Args:
        mall_name: Search text sent to OneMap as ``searchVal``.

    Returns:
        A dict with ``mall_name`` (the search text as given), ``address``,
        ``latitude`` and ``longitude`` taken from the first search result, or
        ``None`` when the response is not HTTP 200 or holds no results.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    params = {
        'searchVal': mall_name,
        'returnGeom': 'Y',
        'getAddrDetails': 'Y'
    }

    # Bound the wait so one stalled request cannot hang a whole batch of lookups
    response = requests.get(ONEMAP_BASE_URL, params=params, timeout=5)
    if response.status_code != 200:
        return None

    data = response.json()
    results = data.get('results') or []
    # Guard both fields so a payload without them counts as "not found"
    if not (data.get('found', 0) > 0 and results):
        return None

    result = results[0]
    return {
        'mall_name': mall_name,
        'address': result['ADDRESS'],
        'latitude': result['LATITUDE'],
        'longitude': result['LONGITUDE']
    }

In [32]:
# Look up every cleaned Wikipedia name on OneMap, keeping only successful matches
wiki_lookup_results = [get_mall_details_onemap(name) for name in cleansed_wiki_mall_names]
wiki_mall_details = [details for details in wiki_lookup_results if details]
df_malls_wiki = pd.DataFrame(wiki_mall_details)

# Google Places - Text Search

In [33]:
def get_mall_names_text_search():
    """Collect place names for "shopping malls in Singapore" from Google Text Search.

    Requests the first page of results and keeps following the returned
    ``next_page_token`` until the API stops supplying one.

    Returns:
        list[str]: The ``name`` of every result across all pages, in the
        order returned; a result without a name contributes an empty string.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    malls = []
    next_page_token = None

    while True:
        params = {
            "query": "shopping malls in Singapore",
            "region": "sg",
            "key": GOOGLE_API_KEY
        }
        if next_page_token:
            params["pagetoken"] = next_page_token

        # Bound the wait so a stalled request cannot hang the notebook
        response = requests.get(TEXT_SEARCH_URL, params=params, timeout=5)
        data = response.json()

        # Surface API-level failures (bad key, quota, token not yet valid)
        # instead of silently returning a short or empty list.
        status = data.get("status")
        if status not in ("OK", "ZERO_RESULTS"):
            print(f"Text Search request failed with status {status}: {data.get('error_message', '')}")

        # Only the name of each result is stored
        malls.extend(result.get("name", "") for result in data.get("results") or [])

        next_page_token = data.get("next_page_token")
        if not next_page_token:
            break

        # A freshly issued next_page_token only becomes valid after a short delay
        time.sleep(3)

    return malls

In [34]:
text_search_mall_names = get_mall_names_text_search()

# Manual cleaning: uppercase the names, then drop the unwanted entries
text_search_malls_to_remove = ['CLARKE QUAY', 'UNITED SQUARE SHOPPING MALL', 'GREAT WORLD CITY', 'IMM BUILDING', 'TAKASHIMAYA SHOPPING CENTRE']
text_search_mall_names = [mall.upper() for mall in text_search_mall_names]
text_search_mall_names = [mall for mall in text_search_mall_names if mall not in text_search_malls_to_remove]

# get_mall_details_onemap returns None when a name cannot be resolved. Keep
# only successful lookups (as the Wikipedia lookup loop does): a None among
# the dicts is not a valid row for pd.DataFrame.
text_search_mall_details = []
for mall in text_search_mall_names:
    mall_info = get_mall_details_onemap(mall)
    if mall_info:
        text_search_mall_details.append(mall_info)
df_malls_text_search = pd.DataFrame(text_search_mall_details)

# Combining Dataframes

In [35]:
# Stack both sources (text-search rows first), order by mall name and renumber the rows
df_combined = (
    pd.concat([df_malls_text_search, df_malls_wiki], ignore_index=True)
    .sort_values(by='mall_name')
    .reset_index(drop=True)
)

In [36]:
# Build a comparison key from the mall name: remove the word 'the' (any case),
# trim surrounding spaces and convert to uppercase
names_without_the = df_combined['mall_name'].str.replace(r'\bthe\b', '', case=False, regex=True)
df_combined['std_mall_name'] = names_without_the.str.strip().str.upper()

# Keep only the first row for each 'std_mall_name' value
df_combined.drop_duplicates(subset=['std_mall_name'], keep='first', inplace=True)

Fill in addresses that the OneMap API could not resolve by falling back to the Google Geocoding API.
(TBC: alternatively, just drop these rows instead.)

In [37]:
def get_mall_details_google(mall_name):
    """Geocode a mall with the Google Geocoding endpoint (GEOCODE_URL).

    Args:
        mall_name: Mall name; ", Singapore" is appended to restrict the search.

    Returns:
        dict: ``address``, ``longitude`` and ``latitude`` of the first
        geocoding result. All three values are ``None`` when the API status
        is not ``"OK"`` or no result is returned.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    params = {
        "address": mall_name + ", Singapore",  # Restrict to Singapore
        "key": GOOGLE_API_KEY
    }

    # Bound the wait so a stalled request cannot hang the notebook
    response = requests.get(GEOCODE_URL, params=params, timeout=5)
    data = response.json()

    results = data.get("results") or []
    # A missing status or an empty result list is treated like any other failure
    if data.get("status") != "OK" or not results:
        return {"address": None, "longitude": None, "latitude": None}

    top_result = results[0]  # Take the first search result
    location = top_result["geometry"]["location"]
    return {
        "address": top_result["formatted_address"],
        "longitude": location["lng"],
        "latitude": location["lat"],
    }

In [38]:
# Blank out address/longitude/latitude for rows whose address has no 6-digit postal code
def has_postal_code(address):
    """Return True when *address*, converted to text, contains a standalone 6-digit number."""
    # \b on both sides rejects 6 digits embedded in a longer run of digits
    postal_match = re.search(r'\b\d{6}\b', str(address))
    return postal_match is not None
lacks_postal_code = ~df_combined['address'].apply(has_postal_code)
df_combined.loc[lacks_postal_code, ['address', 'longitude', 'latitude']] = None

# Mall names whose address is now missing; these are the ones to look up
# with get_mall_details_google because OneMap did not give a usable address
address_missing = df_combined['address'].isnull()
rmg_mall_names = df_combined.loc[address_missing, 'mall_name'].tolist()
print(rmg_mall_names)

def update_mall_details(mall_names):
    """Return the get_mall_details_google result for each name, in input order."""
    return [get_mall_details_google(name) for name in mall_names]

# Update dataframe: write each fallback lookup result into the rows of the matching mall
rmg_mall_details = update_mall_details(rmg_mall_names)
for mall_name, details in zip(rmg_mall_names, rmg_mall_details):
    matching_rows = df_combined['mall_name'] == mall_name
    df_combined.loc[matching_rows, ['address', 'longitude', 'latitude']] = [
        details['address'],
        details['longitude'],
        details['latitude'],
    ]

['ANCHORVALE VILLAGE', 'CLARKE QUAY CENTRAL']


Final Formatting

In [None]:
# The de-duplication key is no longer needed
df_combined.drop(columns=['std_mall_name'], inplace=True)
# Uppercase every string cell; non-string values pass through unchanged
df_combined = df_combined.map(lambda cell: cell if not isinstance(cell, str) else cell.upper())
df_combined.sort_values(by='mall_name', inplace=True)