The code extracts the names of malls in Singapore from two sources:
1. Wikipedia Web Scraping – Mall names are extracted by scraping Wikipedia.
2. Google Places Text Search – Mall names are retrieved using the Google Places API.

Once the mall names are collected from both sources, the code queries the OneMap API to obtain additional details such as address, longitude, and latitude for each mall.

# Set-Up

In [4]:
#Import libraries
import os
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [5]:
#---- Google Places API Set-Up
# Prefer a key supplied through the GOOGLE_API_KEY environment variable so the
# credential does not have to live in the notebook.
# NOTE(review): the literal fallback below is a credential committed to source.
# It is kept only so existing runs keep working; rotate the key and delete the
# fallback once the environment variable is in place.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or "AIzaSyDpu7X3vaLLr2GhCX6BcNWhfUtcJwU8F-A"
# Endpoint used by the text-search cell to list malls.
TEXT_SEARCH_URL = "https://maps.googleapis.com/maps/api/place/textsearch/json"
GEOCODE_URL = "https://maps.googleapis.com/maps/api/geocode/json"

In [6]:
#---- OneMap API Set-Up
# Search endpoint queried by get_mall_details_onemap to resolve a mall name
# into an address, latitude and longitude.
ONEMAP_BASE_URL = "https://www.onemap.gov.sg/api/common/elastic/search"

# Web Scrape from Wikipedia

In [7]:
# Wiki details set-up
wiki_url = "https://en.wikipedia.org/wiki/List_of_shopping_malls_in_Singapore"
wiki_response = requests.get(wiki_url, timeout = 5)
# Fail loudly on an HTTP error instead of parsing an error page, which would
# silently yield an empty list of malls in the cells below.
wiki_response.raise_for_status()
# Parsed page; every scraping cell below searches this tree.
content = BeautifulSoup(wiki_response.content, "html.parser")

In [8]:
#---- 1. Extract mall names from main section
# Every <h2> is treated as a region header; the mall names are the <li> items
# of the next "div-col" block that follows the header.
regions = content.find_all("h2")
mall_names_main_section = []

for header in regions:
    # 'Contents' and 'References' are not regions
    if header.get_text(strip=True) in ["Contents", "References"]:
        continue

    column_block = header.find_next("div", class_="div-col")
    if not column_block:
        continue

    # Taking the text of each <li> covers both linked (<a>) and plain-text names;
    # items that are empty after stripping are ignored.
    item_texts = (item.get_text(strip=True) for item in column_block.find_all("li"))
    mall_names_main_section.extend(text for text in item_texts if text)

In [9]:
#---- 2. Extract mall names from references section (Current malls)

# The names sit in the first navbox cell carrying exactly this class string
mall_names_ref_section = []
wiki_ref_mall_section = content.find("td", class_="navbox-list-with-group navbox-list navbox-odd hlist")

if not wiki_ref_mall_section:
    print("Current malls section under references not found.")
else:
    # Only walk the <ul> lists so the group headers are not picked up,
    # and take the stripped text of every link inside each list.
    for link_list in wiki_ref_mall_section.find_all("ul"):
        mall_names_ref_section.extend(link.text.strip() for link in link_list.find_all("a"))

In [10]:
#---- 3. Merge the names from both sections into one alphabetically sorted list
# (a new list is built, so the two source lists stay untouched)

combined_mall_names = sorted(mall_names_main_section + mall_names_ref_section)

In [11]:
#---- 4. Cleaning of web-scraped data

#-- Standardise every name: uppercase, then trim surrounding whitespace
cleansed_wiki_mall_names = [mall.upper().strip() for mall in combined_mall_names]

#-- Clean mall names
# e.g. GRID (FORMERLY POMO)[1], KINEX (FORMERLY ONEKM)
def clean_mall_names(dataset):
    """Return a new list of names with citation markers and 'formerly' notes removed.

    For each name, any text in square brackets (brackets included) is deleted,
    then any "(FORMERLY ...)" parenthetical (matched case-insensitively, together
    with the whitespace in front of it), and finally surrounding whitespace.
    """
    def _scrub(name):
        without_citations = re.sub(r"\[.*?\]", "", name)
        without_former = re.sub(r"\s*\(FORMERLY .*?\)", "", without_citations, flags=re.IGNORECASE)
        return without_former.strip()

    return [_scrub(name) for name in dataset]

# Strip bracketed citation markers and "(FORMERLY ...)" notes from every name
cleansed_wiki_mall_names = clean_mall_names(cleansed_wiki_mall_names)

#-- Remove duplicates
def remove_duplicates(mall_list):
    """Return the items of ``mall_list`` with repeats dropped, order preserved.

    The first occurrence of each item is kept. Items must be hashable
    (mall names are strings); dict keys keep insertion order, so this runs
    in one pass instead of re-scanning the result list for every item.
    """
    return list(dict.fromkeys(mall_list))

# Drop repeated names (the same mall can be scraped from both page sections)
cleansed_wiki_mall_names = remove_duplicates(cleansed_wiki_mall_names)

In [12]:
#---- 5. Manual cleaning of web-scraped data
# Scraped names that are swapped for a hand-picked replacement name
wiki_replacement_dict = {
    'PAYA LEBAR QUARTER (PLQ)': 'PLQ MALL',
    'THE PARAGON': 'PARAGON SHOPPING CENTRE',
    'DJITSUN MALL BEDOK': 'DJITSUN MALL'
}

# Names excluded from the list altogether (checked after the renaming above)
wiki_malls_to_remove = {'TENGAH MALL (2027)', 'FAIRPRICE HUB', 'MARINA BAY SANDS', 'HOLLAND VILLAGE SHOPPING MALL', 'OD MALL', 'HOUGANG GREEN SHOPPING MALL'}

# Rename first, then filter, in a single pass over the list
cleansed_wiki_mall_names = [
    mall
    for mall in (wiki_replacement_dict.get(name, name) for name in cleansed_wiki_mall_names)
    if mall not in wiki_malls_to_remove
]

In [13]:
def get_mall_details_onemap(mall_name, timeout=10):
    """Look up a mall on the OneMap search API and return its first match.

    Args:
        mall_name: Search term sent to OneMap as ``searchVal``.
        timeout: Seconds to wait for OneMap before ``requests`` raises a
            timeout error, so one stalled lookup cannot hang the whole run.

    Returns:
        A dict with ``mall_name`` (the search term exactly as given),
        ``address``, ``latitude`` and ``longitude`` taken from the first
        search result, or ``None`` when the response status is not 200 or
        the search found nothing.
    """
    params = {
        'searchVal': mall_name,
        'returnGeom': 'Y',
        'getAddrDetails': 'Y'
    }

    response = requests.get(ONEMAP_BASE_URL, params=params, timeout=timeout)

    if response.status_code != 200:
        return None

    data = response.json()
    results = data.get('results') or []
    # Treat a payload without a positive 'found' count (or without results)
    # as "not found" rather than raising on a missing key.
    if data.get('found', 0) <= 0 or not results:
        return None

    # NOTE: only the first hit is used; OneMap's own building name
    # (result['BUILDING']) is deliberately not part of the returned record.
    result = results[0]
    return {
        'mall_name': mall_name,
        'address': result['ADDRESS'],
        'latitude': result['LATITUDE'],
        'longitude': result['LONGITUDE']
    }

In [14]:
# Query OneMap for every cleaned Wikipedia name and keep only the lookups
# that returned a record (failed lookups come back falsy and are skipped).
wiki_mall_details = [
    mall_info
    for mall_info in (get_mall_details_onemap(mall) for mall in cleansed_wiki_mall_names)
    if mall_info
]
df_malls_wiki = pd.DataFrame(wiki_mall_details)

# Google Places - Text Search

In [15]:
def get_mall_names_text_search():
    """Collect place names from Google Places Text Search for Singapore malls.

    Runs the query "shopping malls in Singapore" and follows
    ``next_page_token`` until the API stops returning one.

    Returns:
        A list of place names in the order the API returned them. A result
        without a "name" field contributes an empty string. If the API
        reports an error status, the problem is printed and the names
        collected so far are returned.
    """
    malls = []
    next_page_token = None
    # These parameters are identical for every page, so build them once.
    base_params = {
        "query": "shopping malls in Singapore",
        "region": "sg",
        "key": GOOGLE_API_KEY
    }

    while True:
        params = dict(base_params)
        if next_page_token:
            params["pagetoken"] = next_page_token

        # A timeout keeps a stalled connection from blocking the run forever.
        response = requests.get(TEXT_SEARCH_URL, params=params, timeout=10)
        data = response.json()

        # Surface API-level failures (bad key, quota, token not yet valid)
        # instead of silently returning a short list.
        status = data.get("status")
        if status not in (None, "OK", "ZERO_RESULTS"):
            print(f"Google Places Text Search returned status {status}: {data.get('error_message', '')}")
            break

        for result in data.get("results", []):
            malls.append(result.get("name", ""))  # Only store the name

        next_page_token = data.get("next_page_token")
        if not next_page_token:
            break

        # A freshly issued next_page_token is not valid immediately;
        # pause before requesting the next page.
        time.sleep(3)

    return malls

In [16]:
text_search_mall_names = get_mall_names_text_search()

# Manual cleaning: uppercase the names, then drop the hand-picked exclusions
# (the exclusion list is written in uppercase, so the order matters)
text_search_malls_to_remove = ['CLARKE QUAY', 'UNITED SQUARE SHOPPING MALL', 'GREAT WORLD CITY', 'IMM BUILDING', 'TAKASHIMAYA SHOPPING CENTRE']
text_search_mall_names = [mall.upper() for mall in text_search_mall_names]
text_search_mall_names = [mall for mall in text_search_mall_names if mall not in text_search_malls_to_remove]

# get_mall_details_onemap returns None when a lookup fails. Skip those so the
# DataFrame is built only from real records, the same way the Wikipedia
# results are handled.
text_search_mall_details = []
for mall in text_search_mall_names:
    mall_info = get_mall_details_onemap(mall)
    if mall_info:
        text_search_mall_details.append(mall_info)
df_malls_text_search = pd.DataFrame(text_search_mall_details)

# Combining Dataframes

In [17]:
# Stack the text-search rows on top of the Wikipedia rows, order the result by
# mall name, and renumber the index from zero.
df_combined = (
    pd.concat([df_malls_text_search, df_malls_wiki], ignore_index=True)
    .sort_values(by='mall_name')
    .reset_index(drop=True)
)

In [18]:
# Build a comparison key from the mall name: remove the word "the" (any case),
# trim surrounding spaces, and convert to uppercase
name_without_the = df_combined['mall_name'].str.replace(r'\bthe\b', '', case=False, regex=True)
df_combined['std_mall_name'] = name_without_the.str.strip().str.upper()

# Keep only the first row for each value of the comparison key
df_combined = df_combined.drop_duplicates(subset='std_mall_name', keep='first')

In [19]:
# Drop rows with missing address information that cannot be found on OneMap API
def has_valid_postal_code(address):
    """Return True when ``address`` contains a standalone run of exactly six digits.

    The value is converted with ``str()`` first, so non-string input such as
    ``None`` or a number is accepted.
    """
    six_digit_run = re.search(r'\b\d{6}\b', str(address))
    return six_digit_run is not None

# Keep only the rows whose address contains a postal code, and report how many
# rows were discarded
initial_rows = len(df_combined)
has_postal_code = df_combined['address'].apply(has_valid_postal_code)
df_combined = df_combined[has_postal_code]
final_rows = len(df_combined)

print(f"Number of rows dropped: {initial_rows - final_rows}")

Number of rows dropped: 4


# Final Formatting

In [20]:
# Remove the helper key column, uppercase every string cell (non-string cells
# are left as they are), and order the rows by mall name.
df_combined = (
    df_combined
    .drop(columns={'std_mall_name'})
    .map(lambda cell: cell.upper() if isinstance(cell, str) else cell)
    .sort_values('mall_name')
)