<h1> Scraper for Shopping Malls in SG </h1>

<h3> Scraping Malls from Wikipedia </h3>
<p> This is a simple webscraper to scrape the list of shopping malls off wikipedia.</p>

In [1]:
# Import libraries
from bs4 import BeautifulSoup
import requests
import re

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_shopping_malls_in_Singapore"
response = requests.get(url, timeout = 5)
content = BeautifulSoup(response.content, "html.parser")

In [244]:
# Parse HTML
soup = BeautifulSoup(response.content, "html.parser")

# Find the 2nd <tbody> tag that contains all malls
tbody_tag = soup.find_all('tbody')[1]

# Check within <tbody> tag
if tbody_tag:
    
    # Iterate over <th> tags within <tbody>
    for th_tag in tbody_tag.find_all('th'):
        # Remove entire <tr> if "Demolished" or "Under construction" is in the string for <th> text
        if any(keyword in th_tag.get_text() for keyword in ['Demolished', 'Under construction']):
            # Extract the parent <tr> tag and remove it
            tr_tag = th_tag.find_parent('tr')
            tr_tag.extract()
            
    # Remove <th> and <tr> lines now
    for th_tag in tbody_tag.find_all('th'):
        # Remove the th_tag
        th_tag.extract()

# Extract all relevant malls
mall_list_html = tbody_tag.find_all("ul")

# Turn it into a mall beautiful soup and extract into list
mall_soup = BeautifulSoup(''.join(str(tag) for tag in list(mall_list_html)))
mall_elements = mall_soup.find_all('li')

In [254]:
# Extract mall name
malls = []

for node in mall_elements:
    mallName = "".join(node.findAll(string = True))
    if len(mallName) > 1:
        malls.append("".join(node.findAll(string = True)))
malls = set(malls)

# Set upper case on all malls
malls = [element.upper() for element in malls]
malls

['HARBOURFRONT CENTRE',
 'NORTHPOINT CITY',
 'SUNTEC CITY MALL',
 'CATHAY CINELEISURE ORCHARD',
 'HEARTLAND MALL',
 '100 AM',
 'YEW TEE SQUARE',
 'SUNSHINE PLACE',
 'YEW TEE POINT',
 'THE CATHAY',
 'SIM LIM SQUARE',
 'CHANGI CITY POINT',
 '313@SOMERSET',
 'JUNCTION 10',
 'ION ORCHARD',
 'CITY SQUARE MALL',
 'THE RAIL MALL',
 'EASTPOINT MALL',
 'MARINA BAY LINK MALL',
 'MARINA SQUARE',
 '888 PLAZA',
 'BUGIS+',
 'COMPASS ONE',
 'GEK POH SHOPPING CENTRE',
 'TIONG BAHRU PLAZA',
 'ESPLANADE MALL',
 'SINGPOST CENTRE',
 'PARKWAY PARADE',
 'MYVILLAGE @ SERANGOON',
 'DJITSUN MALL BEDOK',
 'LUCKY PLAZA',
 'TAMPINES MART',
 'THE PARAGON',
 'THE WOODLEIGH MALL',
 'TAMPINES MALL',
 'DAWSON PLACE',
 'PLQ MALL',
 'WOODLANDS NORTH PLAZA',
 'CLAYMORE CONNECT',
 'SQUARE 2',
 'RIVERVALE PLAZA',
 'CLEMENTI MALL',
 'JEWEL CHANGI AIRPORT',
 'HILLION MALL',
 'PLAZA SINGAPURA',
 'WESTGATE',
 'LEISURE PARK KALLANG',
 'BEDOK MALL',
 'JURONG POINT',
 'CENTURY SQUARE',
 'FU LU SHOU COMPLEX',
 'ANCHORPOINT',
 'QUE

<h3> Use OneMap API for Mall Coordinates </h3>

In [255]:
# Function to get latitude and longitude from onemap api
def get_coordinates(location):
    searchQuery = f"https://developers.onemap.sg/commonapi/search?searchVal={location}&returnGeom=Y&getAddrDetails=Y"
    response = requests.get(searchQuery)
    
    try:
        result = response.json()['results'][0]
        latitude = result['LATITUDE']
        longitude = result['LONGITUDE']
        return [latitude, longitude]
    except: 
        return 'INVALID LOCATION'

In [256]:
# Function to search latitude and longitude for all names in a list
def searchOneMap(locationList):
    lat_long_locations = []
    invalid_locations = []
    for location in locationList:
        lat_long = get_coordinates(location)
        if lat_long != 'INVALID LOCATION': 
            location_details = {}
            location_details['name'] = location
            location_details['latitude'] = lat_long[0]
            location_details['longitude'] = lat_long[1]
            lat_long_locations.append(location_details)
        else:
            invalid_locations.append(location)
    return [lat_long_locations, invalid_locations]

In [257]:
# Retrieve lat long of malls
result = searchOneMap(malls)
mall_lat_long = result[0]
missing_malls = result[1]

In [258]:
# View missing malls
missing_malls

['CITY VIBE',
 'SHAW HOUSE AND CENTRE',
 'CLARKE QUAY CENTRAL',
 'MANDARIN GALLERY',
 'CHANGE ALLEY']

In [27]:
# Manually search for missing malls
def addLocationToList(locationList, mall, lat, long):
    location_details = {}
    location_details['name'] = mall
    location_details['latitude'] = lat
    location_details['longitude'] = long
    locationList.append(location_details)

In [28]:
# Add missing locations
addLocationToList(mall_lat_long, 'CITY GATE', '1.3023159', '103.8623317')
addLocationToList(mall_lat_long, 'THE CENTRAL', '1.2888386', '103.8465580')
addLocationToList(mall_lat_long, 'GOLDEN VILLAGE - YISHUN TEN', '1.4299181', '103.8364202')
addLocationToList(mall_lat_long, 'RAFFLES HOLLAND V', '1.3105635', '103.7961072')
addLocationToList(mall_lat_long, 'JEM', '1.3330614', '103.7435037')
addLocationToList(mall_lat_long, 'MUSTAFA CENTRE', '1.3101122', '103.8552908')
addLocationToList(mall_lat_long, 'SHAW HOUSE', '1.3058098', '103.8315073')
addLocationToList(mall_lat_long, 'UNITED SQUARE', '1.3171945', '103.8436114')
addLocationToList(mall_lat_long, 'MYVILLAGE @ SERANGOON', '1.3651149', '103.8651595')

In [29]:
# View hashmap
mall_lat_long

[{'name': '100 AM',
  'latitude': '1.27458821795427',
  'longitude': '103.84347073660999'},
 {'name': '112 KATONG',
  'latitude': '1.30508681845447',
  'longitude': '103.905098915055'},
 {'name': '313@SOMERSET',
  'latitude': '1.3013851021471399',
  'longitude': '103.837684350436'},
 {'name': '321 CLEMENTI',
  'latitude': '1.3120249182444',
  'longitude': '103.764960537008'},
 {'name': '600 @ TOA PAYOH',
  'latitude': '1.33404171129957',
  'longitude': '103.850955458676'},
 {'name': '888 PLAZA',
  'latitude': '1.4371305244487',
  'longitude': '103.795289911954'},
 {'name': 'AMK HUB',
  'latitude': '1.36922321403002',
  'longitude': '103.848467911464'},
 {'name': 'ADMIRALTY PLACE',
  'latitude': '1.43988095490574',
  'longitude': '103.80179101959499'},
 {'name': 'ALEXANDRA RETAIL CENTRE',
  'latitude': '1.2738426388845099',
  'longitude': '103.801375038176'},
 {'name': 'ANCHORPOINT',
  'latitude': '1.28893477974498',
  'longitude': '103.805607779399'},
 {'name': 'APERIA',
  'latitude': 

In [30]:
# Convert to malls_lat_long to pandas dataframe
df_malls = pd.DataFrame(mall_lat_long)

# Export to csv
df_malls.to_csv('mall_coordinates_updated.csv')