# Set-Up

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

pd.set_option('display.max_rows', None)

In [2]:
#---- OneMap API Set-Up
ONEMAP_BASE_URL = "https://www.onemap.gov.sg/api/common/elastic/search"

In [3]:
#---- Google API Set-Up
GOOGLE_API_KEY = "AIzaSyDpu7X3vaLLr2GhCX6BcNWhfUtcJwU8F-A"
TEXT_SEARCH_URL = "https://maps.googleapis.com/maps/api/place/textsearch/json"
GEOCODE_URL = "https://maps.googleapis.com/maps/api/geocode/json"

## General

In [4]:
# Function to get latitude and longitude using OneMap API for postal code
def get_lat_lon_onemap(postal_code):

    # Parameters for the API call (including your API Key)
    params = {
        'searchVal': postal_code,
        'returnGeom': 'Y',
        'getAddrDetails': 'Y'
    }
    
    # Make the API request
    response = requests.get(ONEMAP_BASE_URL, params=params)
    
    # Parse the response JSON
    data = response.json()
    
    # Check if the response contains results
    if data['found'] > 0:
        # Extract latitude and longitude from the response
        latitude = data['results'][0]['LATITUDE']
        longitude = data['results'][0]['LONGITUDE']
        return latitude, longitude
    else:
        # Return None if no results are found
        return None, None


## Pre-School Education

In [5]:
df_preschool_raw = pd.read_csv('preschool_loc_details_raw.csv', na_values=['na', 'NA', 'N/A', 'NULL'])

In [6]:
df_preschool_raw.isnull().sum()

tp_code                         1848
centre_code                       84
centre_name                       48
organisation_code                  0
organisation_description           0
service_model                     38
centre_contact_no                  0
centre_email_address              49
centre_address                     0
postal_code                        0
centre_website                   475
infant_vacancy_current_month       0
infant_vacancy_next_month          0
infant_vacancy_third_month         0
infant_vacancy_fourth_month        0
infant_vacancy_fifth_month         0
infant_vacancy_sixth_month         0
infant_vacancy_seventh_month       0
pg_vacancy_current_month           0
pg_vacancy_next_month              0
pg_vacancy_third_month             0
pg_vacancy_fourth_month            0
pg_vacancy_fifth_month             0
pg_vacancy_sixth_month             0
pg_vacancy_seventh_month           0
n1_vacancy_current_month           0
n1_vacancy_next_month              0
n

In [7]:
# Remove rows where 'centre_name' is null
df_preschool_raw = df_preschool_raw[df_preschool_raw['centre_name'].notnull()]

# Remove duplicates based on both 'centre_name' and 'postal_code'
df_preschool_raw = df_preschool_raw.drop_duplicates(subset=['centre_name', 'postal_code'], keep='first')

# Obtain relevant columns
df_preschool_loc = df_preschool_raw[['centre_name', 'centre_address', 'postal_code']]

In [None]:
# Apply the function to get lat and lon for each preschool based on postal code
# df_preschool_loc['latitude'], df_preschool_loc['longitude'] = zip(*df_preschool_loc['postal_code'].apply(get_lat_lon_onemap))
# df_preschool_loc['category'] = 'Preschool'
df_preschool_loc.loc[:, 'latitude'], df_preschool_loc.loc[:, 'longitude'] = zip(*df_preschool_loc['postal_code'].apply(get_lat_lon_onemap))
df_preschool_loc.loc[:, 'category'] = 'Preschool'
df_preschool_loc.rename(columns={'centre_name': 'school_name'}, inplace=True)
df_preschool_loc.drop(columns={'postal_code'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_preschool_loc.loc[:, 'latitude'], df_preschool_loc.loc[:, 'longitude'] = zip(*df_preschool_loc['postal_code'].apply(get_lat_lon_onemap))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_preschool_loc.loc[:, 'latitude'], df_preschool_loc.loc[:, 'longitude'] = zip(*df_preschool_loc['postal_code'].apply(get_lat_lon_onemap))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pa

## MOE Schools

In [None]:
df_moe_raw = pd.read_csv('moe_schools_raw.csv', na_values=['na', 'NA', 'N/A', 'NULL'])
df_moe_raw = df_moe_raw.drop_duplicates(subset=['school_name', 'postal_code'], keep='first')
df_moe_loc = df_moe_raw[['school_name', 'address', 'postal_code', 'mainlevel_code']]
df_moe_loc.loc[df_moe_loc['mainlevel_code'] == 'MIXED LEVELS', 'mainlevel_code'] = 'IP/IB PROGRAMME'


df_moe_loc['latitude'], df_moe_loc['longitude'] = zip(*df_moe_loc['postal_code'].apply(get_lat_lon_onemap))
df_moe_loc.rename(columns={'mainlevel_code': 'category'}, inplace=True)
df_moe_loc.drop(columns=['postal_code'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_moe_loc['latitude'], df_moe_loc['longitude'] = zip(*df_moe_loc['postal_code'].apply(get_lat_lon_onemap))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_moe_loc['latitude'], df_moe_loc['longitude'] = zip(*df_moe_loc['postal_code'].apply(get_lat_lon_onemap))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_moe_loc.rename(columns={'mainlevel_code': '

# Tertiary Education

In [None]:
#---- Wiki URLs
# School names are obtained from multiple source that give the most complete list of each category.
UNI_WIKI_URL = "https://en.wikipedia.org/wiki/List_of_universities_in_Singapore"
POLY_WIKI_URL = "https://en.wikipedia.org/wiki/Education_in_Singapore"
ITE_WIKI_URL = "https://en.wikipedia.org/wiki/Institute_of_Technical_Education#Colleges"

Obtain school names, category

In [11]:
def get_university_names():
    # Fetch the page content
    response = requests.get(UNI_WIKI_URL)
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the "Universities in Singapore" section by its ID
    section_div = soup.find("div", id="Universities_in_Singapore145")

    # List to store university data
    university_list = []

    if section_div:
        # Locate "Education in Singapore" section to stop before it
        stop_section = soup.find("div", id="Education_in_Singapore254")

        # Find all categories (th elements)
        categories = section_div.find_all_next("th", scope="row", class_="navbox-group")

        for category in categories:
            # Stop if we reach "Education in Singapore"
            if stop_section and category.find_previous("div") == stop_section:
                break

            # Find the corresponding <td> which contains university names
            next_td = category.find_next_sibling("td")
            if next_td:
                for link in next_td.find_all("a"):
                    university_name = link.get_text(strip=True)
                    university_list.append({"school_name": university_name, "category": "University"})
    return pd.DataFrame(university_list)

df_universities = get_university_names()
df_universities = df_universities[df_universities['school_name'] != 'Singapore College of Islamic Studies']

In [12]:
def get_polytechnics():
    response = requests.get(POLY_WIKI_URL)
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the "Polytechnics" section
    polytechnic_section = soup.find("h2", {"id": "Polytechnics"})
    polytechnic_list = []

    if polytechnic_section:
        # Locate the nearest paragraph (<p>) containing polytechnic names
        para = polytechnic_section.find_next("p")

        # Extract all links within that paragraph
        for link in para.find_all("a"):
            polytechnic_name = link.get_text(strip=True)
            polytechnic_list.append({"school_name": polytechnic_name, "category": "Polytechnic"})
    
    return pd.DataFrame(polytechnic_list)

df_polytechnics = get_polytechnics()
df_polytechnics = df_polytechnics[df_polytechnics['school_name'] != '[71]'] # data cleaning

In [13]:
def get_ite_names():
    response = requests.get(ITE_WIKI_URL)
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the "Colleges" section under an <h2> tag
    ite_colleges_header = soup.find("h2", {"id": "Colleges"})
    ite_colleges_list = []

    if ite_colleges_header:
        # Locate the first <ul> after the <h2> heading (which contains the list)
        ul = ite_colleges_header.find_next("ul")

        if ul:
            for link in ul.find_all("a"):
                college_name = link.get_text(strip=True)
                ite_colleges_list.append({"school_name": college_name, "category": "ITE College"})

    return pd.DataFrame(ite_colleges_list)

df_ite_colleges = get_ite_names()

In [14]:
df_tertiary = pd.concat([df_universities, df_polytechnics, df_ite_colleges], ignore_index=True)

Obtain location information

In [15]:
import requests

def get_ter_school_details_google(school_name):
    query = school_name + " Singapore"  # Construct query for text search
    
    params = {
        "query": query,  # The text search query
        "key": GOOGLE_API_KEY
    }
    
    response = requests.get(TEXT_SEARCH_URL, params=params)
    data = response.json()
    
    results_list = []
    
    if data["status"] == "OK":
        for result in data["results"]:  # Loop through all the results
            name = result["name"]
            address = result["formatted_address"]
            lat = result["geometry"]["location"]["lat"]
            lng = result["geometry"]["location"]["lng"]
            
            results_list.append({
                "school_name": school_name,
                "google_school_name": name,  # Store the official name returned
                "address": address,
                "longitude": lng,
                "latitude": lat
            })
    
    # If no valid results, return a placeholder
    if not results_list:
        results_list.append({
            "school_name": school_name,
            "google_school_name": None,
            "address": None,
            "longitude": None,
            "latitude": None
        })
    
    return results_list  # Returns a list of all results

In [16]:
# def get_ter_school_details_onemap(school_name):
#     params = {
#         "searchVal": school_name,  # The school name to search
#         "returnGeom": "Y",  # Return geometry (latitude/longitude)
#         "getAddrDetails": "Y",  # Get detailed address information
#     }
    
#     response = requests.get(ONEMAP_BASE_URL, params=params)
#     data = response.json()
    
#     results_list = []
    
#     if data["found"] > 0:
#         for result in data["results"]:  # Loop through all the results
#             address = result["ADDRESS"]
#             lat = result["LATITUDE"]
#             lng = result["LONGITUDE"]
            
#             results_list.append({
#                 "school_name": school_name,
#                 "onemap_school_name": address,  # Store the address returned by OneMap
#                 "address": address,
#                 "longitude": lng,
#                 "latitude": lat
#             })
    
#     # If no valid results, return a placeholder
#     if not results_list:
#         results_list.append({
#             "school_name": school_name,
#             "onemap_school_name": None,
#             "address": None,
#             "longitude": None,
#             "latitude": None
#         })
    
#     return results_list  # Returns a list of results

In [17]:
ter_loc_details = []
for _, row in df_tertiary.iterrows():
    school_name = row['school_name']
    category = row['category']
    ter_loc_details.extend(get_ter_school_details_google(school_name))  # Extend with multiple results

df_ter_loc_details = pd.DataFrame(ter_loc_details)

# Merge back 
df_tertiary = df_tertiary.merge(df_ter_loc_details, on=['school_name'], how='left')

In [18]:
# Manual cleaning
df_tertiary = df_tertiary[df_tertiary['google_school_name'] != 'Singapore Institute of Manufacturing Technology (SIMTech)']

df_tertiary = df_tertiary[~((df_tertiary['school_name'] == 'University of the Arts Singapore') & 
                             (~df_tertiary['google_school_name'].str.contains('University of the Arts Singapore', na=False)))]

df_tertiary = df_tertiary[~((df_tertiary['school_name'] == 'SDH Institute') & 
                             (~df_tertiary['google_school_name'].str.contains('SDH Institute', na=False)))]

df_tertiary.drop(columns={'google_school_name'}, inplace=True)

## Combining of datasets

In [25]:
df_preschool_loc = df_preschool_loc.rename(columns={'centre_address': 'address'})

In [None]:
desired_col_order = ['school_name', 'category', 'address', 'longitude', 'latitude']

df_preschool_loc = df_preschool_loc[desired_col_order]
df_moe_loc = df_moe_loc[desired_col_order]
df_tertiary = df_tertiary[desired_col_order]

df_combined = pd.concat([df_preschool_loc, df_moe_loc, df_tertiary], ignore_index=True)
df_combined = df_combined.map(lambda x: x.strip().upper() if isinstance(x, str) else x)
df_combined = df_combined.sort_values(by=['category', 'school_name'])