# Active SG 

In [1]:
!pip install geopandas pyarrow beautifulsoup4



In [2]:
import geopandas as gpd
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
import re 

## Web scrape

In [3]:
# Base URL for pages 2 onwards
base_url = "https://www.activesgcircle.gov.sg/facilities?page={}"

# First page URL (different format)
first_page_url = "https://www.activesgcircle.gov.sg/facilities"

# List to store scraped data
facilities_data = []

# Function to scrape a given URL
def scrape_page(url, page_num):
    print(f"Scraping page {page_num}...")  # Debugging output
    
    # Send request with headers to mimic a real browser
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers)

    if response.status_code != 200:
        print(f"❌ Failed to retrieve page {page_num}, skipping...")
        return

    # Parse the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all facility blocks
    cst_cnt_divs = soup.find_all('div', class_='cst-cnt')

    for cst_cnt_div in cst_cnt_divs:
        # Extract facility name (filter out unwanted h4 tags)
        h4_tags = cst_cnt_div.find_all('h4')
        excluded_texts = {"Operating Hours", "Phone Number", "Address"}
        facility_names = [h4.get_text(strip=True) for h4 in h4_tags if h4.get_text(strip=True) not in excluded_texts]

        # Extract additional details
        address = cst_cnt_div.find('div', class_='cst-address')
        type_of_facility = cst_cnt_div.find('div', class_='cst-type-of-facility')
        direction = cst_cnt_div.find('div', class_='cst-direction')

        # Extract text safely
        address_text = address.get_text(strip=True) if address else "N/A"
        type_text = type_of_facility.get_text(strip=True) if type_of_facility else "N/A"
        direction_text = direction.get_text(strip=True) if direction else "N/A"

        # Add data to list
        for facility_name in facility_names:
            facilities_data.append({
                "Facility Name": facility_name,
                "Address": address_text,
                "Type of Facility": type_text,
                "Direction": direction_text
            })

# Scrape first page separately
scrape_page(first_page_url, 1)

# Scrape pages 2 to 39
for page in range(2, 40):  # Pages 2 to 39
    scrape_page(base_url.format(page), page)
    time.sleep(1)  # Be polite! Avoid being blocked

# Convert to Pandas DataFrame
df = pd.DataFrame(facilities_data)


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...


In [4]:
display(df.head())

Unnamed: 0,Facility Name,Address,Type of Facility,Direction
0,ActiveSG Gym @ Ang Mo Kio Community Centre,795 Ang Mo Kio Avenue 1 Singapore 569976,Gym,Northeast
1,ActiveSG Gym @ Enabling Village,20 Lengkok Bahru Singapore 159053,Gym,Central
2,ActiveSG Gym @ Fernvale Square,51A Sengkang West Avenue Singapore 797384,Gym,Northeast
3,ActiveSG Gym @ Serangoon Central,264 Serangoon Central Singapore 550264,Gym,Northeast
4,ActiveSG Gym @ Toa Payoh,127A Lorong 1 Toa Payoh Singapore 319899,Gym,Central


In [5]:
# Create a new column 'postal_code' by extracting the 6-digit postal code from the 'address' column
df['Postal Code'] = df['Address'].apply(lambda x: re.search(r'\d{6}', str(x)).group(0) if re.search(r'\d{6}', str(x)) else None)

print(df.head())

                                Facility Name  \
0  ActiveSG Gym @ Ang Mo Kio Community Centre   
1             ActiveSG Gym @ Enabling Village   
2              ActiveSG Gym @ Fernvale Square   
3            ActiveSG Gym @ Serangoon Central   
4                    ActiveSG Gym @ Toa Payoh   

                                     Address Type of Facility  Direction  \
0   795 Ang Mo Kio Avenue 1 Singapore 569976              Gym  Northeast   
1          20 Lengkok Bahru Singapore 159053              Gym    Central   
2  51A Sengkang West Avenue Singapore 797384              Gym  Northeast   
3     264 Serangoon Central Singapore 550264              Gym  Northeast   
4   127A Lorong 1 Toa Payoh Singapore 319899              Gym    Central   

  Postal Code  
0      569976  
1      159053  
2      797384  
3      550264  
4      319899  


In [6]:
# Get the number of rows
num_rows = df.shape[0]
print(f"Number of rows in the dataset: {num_rows}")

Number of rows in the dataset: 386


In [7]:
# Check for NaN values in each column
none_values_count = df.isna().sum()

# Display the count of NaN values per column
print("Count of None/NaN values per column:")
print(none_values_count)

Count of None/NaN values per column:
Facility Name       0
Address             0
Type of Facility    0
Direction           0
Postal Code         0
dtype: int64


use one map api to get geometry

In [13]:
# Create a function to fetch the address details from OneMap API
def fetch_onemap_geometry(name):
    url = f"https://www.onemap.gov.sg/api/common/elastic/search?searchVal={name}&returnGeom=Y&getAddrDetails=Y&pageNum=1"
    headers = {"Authorization": "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJhYjgzY2U4MjhjOTg2MWNhMWVkN2JlZDcwY2IwNWEwZCIsImlzcyI6Imh0dHA6Ly9pbnRlcm5hbC1hbGItb20tcHJkZXppdC1pdC1uZXctMTYzMzc5OTU0Mi5hcC1zb3V0aGVhc3QtMS5lbGIuYW1hem9uYXdzLmNvbS9hcGkvdjIvdXNlci9wYXNzd29yZCIsImlhdCI6MTc0MDIwMzQ0OSwiZXhwIjoxNzQwNDYyNjQ5LCJuYmYiOjE3NDAyMDM0NDksImp0aSI6IjFsQjhNVUwxRVFLYmU0R2kiLCJ1c2VyX2lkIjo2MDU1LCJmb3JldmVyIjpmYWxzZX0.WdyaSp7VDBMtPG006XCuQjxkZieivgac6Pauoy_wBSE"}  
    
    response = requests.get(url, headers=headers)
    
    if response.status_code == 200:
        data = response.json()
        if data['found'] > 0:
            result = data['results'][0]
            latitude = result.get('LATITUDE', None)
            longitude = result.get('LONGITUDE', None)
            return latitude, longitude
    return None, None  # Return None if no location found


In [14]:
df['latitude'] = None
df['longitude'] = None

# Iterate through rows and update missing values
for index, row in df.iterrows():
    name = row['Facility Name']

    # Fetch latitude & longitude from OneMap API
    lat, lon = fetch_onemap_geometry(name)

    # Update dataset if valid values are returned
    if lat and lon:
        df.at[index, 'latitude'] = lat
        df.at[index, 'longitude'] = lon

# Convert columns to numeric type (just in case they are stored as strings)
df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')
df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce')

# Check for NaN values in each column
none_values_count = df.isna().sum()

# Display the count of NaN values per column
print("Count of None/NaN values per column:")
print(none_values_count)


Count of None/NaN values per column:
Facility Name       0
Address             0
Type of Facility    0
Direction           0
Postal Code         0
latitude            0
longitude           0
dtype: int64


## Save to CSV

In [15]:
# Save to CSV
df.to_csv("ActiveSG_Facilities.csv", index=False)

print("Data saved to ActiveSG_Facilities.csv.")

df.head() 

Data saved to ActiveSG_Facilities.csv.


Unnamed: 0,Facility Name,Address,Type of Facility,Direction,Postal Code,latitude,longitude
0,ActiveSG Gym @ Ang Mo Kio Community Centre,795 Ang Mo Kio Avenue 1 Singapore 569976,Gym,Northeast,569976,1.369519,103.848462
1,ActiveSG Gym @ Enabling Village,20 Lengkok Bahru Singapore 159053,Gym,Central,159053,1.332229,103.720201
2,ActiveSG Gym @ Fernvale Square,51A Sengkang West Avenue Singapore 797384,Gym,Northeast,797384,1.332229,103.720201
3,ActiveSG Gym @ Serangoon Central,264 Serangoon Central Singapore 550264,Gym,Northeast,550264,1.374802,103.84724
4,ActiveSG Gym @ Toa Payoh,127A Lorong 1 Toa Payoh Singapore 319899,Gym,Central,319899,1.334036,103.850978
