## Part 1 - Data Curation (Web Scraping)

### Web Scraping

In [1]:
# Import necessary dependencies
from bs4 import BeautifulSoup
import urllib
import re
import time
import pandas as pd
import json

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

In [2]:
# Wait settings (seconds): `waittime` is the timeout passed to WebDriverWait,
# `sleeptime` is the short pause used after interactions that change the page.
waittime = 30
sleeptime = 0.5

# Shut down any WebDriver left over from a previous run of this cell.
# quit() ends the whole session (every window plus the driver process),
# whereas close() only closes the current window.
try:
    driver.quit()
except NameError:
    pass  # First run: no driver has been created yet
except Exception as exc:
    # Best effort only, e.g. the previous browser was already closed by hand
    print(f"Could not shut down previous WebDriver: {exc!r}")

# Define target website
home_page = "https://www.99.co/singapore/rent/condos-apartments"

# Set webdriver options
options = webdriver.ChromeOptions()
options.add_argument('--no-sandbox')
options.add_argument('ignore-certificate-errors')

# Initiate webdriver
driver = webdriver.Chrome(options=options)

# Get driver to retrieve homepage
driver.get(home_page)

In [3]:
# Get last page number (of all listings)
def get_last_page_number():
    """Return the last page number shown in the listings pagination bar.

    Reads the text of the first element with class ``SearchPagination`` on the
    page currently loaded in the global ``driver``, splits it on newlines and
    converts the second-to-last entry to an integer (the final entry is
    presumably the "next page" control -- TODO confirm against the site).

    Returns:
        int: The last page number.

    Raises:
        IndexError: If no pagination element is present, or its text has
            fewer than two lines.
        ValueError: If the second-to-last entry is not a number.
    """
    # find_elements_by_* helpers were removed in Selenium 4.3; the By-based
    # locator API works on both old and new Selenium versions.
    pagination_elems = driver.find_elements(By.CLASS_NAME, "SearchPagination")
    page_numbers = pagination_elems[0].text.split("\n")
    return int(page_numbers[-2])

In [4]:
# Collate all web links on current listings page
def collate_web_links():
    """Collect the property page links on the current listings page.

    Looks at every element carrying an ``href`` inside the ``._31ajL``
    container of the page currently loaded in the global ``driver``, keeps
    only links that point to a property page (not an ad), and strips the
    query string from each of them.

    Returns:
        list[str]: Property page URLs without their ``?...`` suffix, in page
        order (duplicates are not removed here).
    """
    property_path = '/singapore/rent/property/'

    # Div class just above div style="opacity:1".
    # find_elements_by_* helpers were removed in Selenium 4.3; use By locators.
    anchors = driver.find_elements(By.CSS_SELECTOR, "._31ajL [href]")
    hrefs = (anchor.get_attribute('href') for anchor in anchors)

    # Keep only property links and truncate everything from the first "?" on;
    # `href and` guards against an attribute that comes back as None.
    return [
        href.split("?", 1)[0]
        for href in hrefs
        if href and property_path in href
    ]

In [5]:
# Scrape detailed page (Old web template)
# Example of old version: https://www.99.co/singapore/rent/property/parc-sophia-condo-kcqL4yS7E38TEc29bHwNrY

def scrape_page_old():
    """Scrape the listing currently loaded in ``driver`` (old web template).

    Example of an old-template page:
    https://www.99.co/singapore/rent/property/parc-sophia-condo-kcqL4yS7E38TEc29bHwNrY

    Returns:
        dict: Scraped fields. Always contains ``title``, ``rental``,
        ``amenities`` (list of str), ``electoral_div`` and the three
        ``travel_time_*`` entries; property/development details are added
        under keys derived from the labels found on the page, and
        ``nearest_mrt_name`` / ``nearest_mrt_dist`` only when present.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If a mandatory
            single element (e.g. the title) is missing.
        selenium.common.exceptions.TimeoutException: If the travel-time
            section does not appear within ``waittime`` seconds.
        IndexError: If an expected indexed element is missing.
    """
    details_dict = {}

    # Headline info
    class_name_1 = 'Z0npN'
    details_dict['title'] = driver.find_element(
        By.XPATH, f"//h1[@class='{class_name_1}']").text
    details_dict['rental'] = driver.find_elements(
        By.XPATH, f"//div[@id='price']/h3[@class='{class_name_1}']")[0].text

    # Property summary, e.g. "2 Beds" -> {'beds': '2'}
    for elem in driver.find_elements(By.XPATH, "//p[@class='JPolj _5q1E-']"):
        parts = elem.text.split(" ")
        if len(parts) == 1:
            # This is to cater to Studio label (single word, used as key and value)
            details_dict[parts[0].lower()] = parts[0].lower()
        else:
            # Catering to the other property details e.g. no. of baths
            details_dict[parts[1].lower()] = parts[0]

    # Property details table: category labels and their values are matched by
    # position. Both lists are fetched once instead of on every iteration.
    class_name_div = '_28kbc'
    class_name_category = '_24Agy'
    class_name_value = 'JPolj'
    category_elems = driver.find_elements(
        By.XPATH, f"//div[@class='{class_name_category}']")
    value_elems = driver.find_elements(
        By.XPATH, f"//div[@class='{class_name_div}']/p[@class='{class_name_value}']")
    for i, category_elem in enumerate(category_elems):
        detail_category = category_elem.text.lower().replace(" ", "_")
        # Indexing (rather than zip) keeps the IndexError on misaligned lists
        details_dict[detail_category] = value_elems[i].text

    # Development details, rendered as "Label: value"
    dev_details = driver.find_elements(
        By.XPATH, "//div[@class='_2U7f7']/div/p[@class='JPolj']")
    for detail in dev_details:
        label, separator, value = detail.text.partition(": ")
        if not separator:
            continue  # Not a "Label: value" line; nothing to record
        # partition keeps the full value even if it contains ": " itself
        details_dict[label.lower().replace(" ", "_")] = value

    # MRT (optional section: not every listing shows a nearest station)
    try:
        details_dict['nearest_mrt_name'] = driver.find_element(
            By.XPATH, "//a[@class='_2aXf0 QE8xc']").text
        details_dict['nearest_mrt_dist'] = driver.find_element(
            By.XPATH, "//p[@class='JPolj _2ZoIs']").text
    except NoSuchElementException:
        pass

    # Amenities
    amenities_elems = driver.find_elements(By.XPATH, "//p[@class='JPolj _26v8n']")
    details_dict['amenities'] = [str(elem.text) for elem in amenities_elems]

    details_dict['electoral_div'] = driver.find_elements(
        By.XPATH, "//h2[@class='Z0npN _3NW6g']")[0].text

    # Move to the location section before waiting for the travel times
    map_element = driver.find_element(By.ID, "location")
    ActionChains(driver).move_to_element(map_element).perform()

    class_name_location_div = '_3OnRG'
    class_name_location = 'yMCxv _1YwzE _1vzK2'
    travel_xpath = (
        f"//div[@class='{class_name_location_div}']"
        f"/h4[@class='{class_name_location}']"
    )
    # Wait for the first travel-time entry, then read the remaining ones
    first_travel_elem = WebDriverWait(driver, waittime).until(
        EC.presence_of_element_located((By.XPATH, travel_xpath)))
    details_dict['travel_time_changi'] = first_travel_elem.text
    travel_elems = driver.find_elements(By.XPATH, travel_xpath)
    details_dict['travel_time_raffles'] = travel_elems[1].text
    details_dict['travel_time_orchard'] = travel_elems[2].text

    return details_dict

In [6]:
# Scrape detailed page (New Web Template)
def scrape_page_new():
    """Scrape the listing currently loaded in ``driver`` (new web template).

    Returns:
        dict: Scraped fields. Always contains ``title``, ``rental``, ``bed``,
        ``bath``, ``sqft``, ``psf``, ``amenities`` (list of str),
        ``electoral_div`` and the three ``travel_time_*`` entries; table
        details are added under keys derived from the labels found on the
        page, and ``nearest_mrt_name`` / ``nearest_mrt_dist`` only when
        present.

    Raises:
        selenium.common.exceptions.TimeoutException: If the title, the
            amenities button or the travel-time section does not appear
            within ``waittime`` seconds.
        selenium.common.exceptions.NoSuchElementException: If a mandatory
            single element is missing.
        IndexError: If an expected indexed element is missing.
    """
    details_dict = {}

    # Headline info: wait for the title, then read the <h2> headline figures.
    # The <h2> list is fetched once and read by position.
    class_name_1 = '_1zGm8 _1vzK2'
    title_elem = WebDriverWait(driver, waittime).until(
        EC.presence_of_element_located((By.XPATH, f"//h1[@class='{class_name_1}']")))
    details_dict['title'] = title_elem.text
    headline_elems = driver.find_elements(By.XPATH, f"//h2[@class='{class_name_1}']")
    details_dict['rental'] = headline_elems[0].text
    details_dict['bed'] = headline_elems[1].text
    details_dict['bath'] = headline_elems[2].text
    details_dict['sqft'] = headline_elems[3].text

    class_name_2 = 'dniCg _2rhE-'
    details_dict['psf'] = driver.find_elements(
        By.XPATH, f"//p[@class='{class_name_2}']")[1].text

    # This section will take into account these details (if present):
    # Availability, Lease, Furnishing, Property Type, Name, Unit Types,
    # Total Units, Built Year, Tenure, Developer, and Neighbourhood.
    # Label cells and value cells are matched by position.
    class_name_td1 = '_3NChA'
    class_name_td2 = 'dm2g6'
    category_elems = driver.find_elements(
        By.XPATH, f"//td[contains(@class, '{class_name_td1}')]")
    value_elems = driver.find_elements(By.XPATH, f"//td[@class='{class_name_td2}']")
    for i, category_elem in enumerate(category_elems):
        detail_category = category_elem.text.lower().replace(" ", "_")
        # Indexing (rather than zip) keeps the IndexError on misaligned lists
        details_dict[detail_category] = value_elems[i].text

    # MRT (optional section: not every listing shows a nearest station)
    try:
        details_dict['nearest_mrt_name'] = driver.find_element(
            By.XPATH, "//a[@class='_2aXf0 QE8xc']").text
        details_dict['nearest_mrt_dist'] = driver.find_element(
            By.XPATH, "//p[@class='JPolj _2ZoIs']").text
    except NoSuchElementException:
        pass

    # Expand amenities, then give the page a moment to render the full list
    button_class = 'cFGt2 _1P_YF'  # For amenities
    expand_button = WebDriverWait(driver, waittime).until(
        EC.presence_of_element_located((By.XPATH, f"//button[@class='{button_class}']")))
    expand_button.click()
    time.sleep(sleeptime)

    # Extract all amenities
    amenities_elems = driver.find_elements(
        By.XPATH, "//p[@class='_2sIc2 AIgs2 _2rhE-']")
    details_dict['amenities'] = [str(elem.text) for elem in amenities_elems]

    details_dict['electoral_div'] = driver.find_elements(
        By.XPATH, "//h2[@class='Z0npN _3NW6g']")[0].text

    # Scroll down to reveal Google map section
    map_element = driver.find_element(By.XPATH, "//div[@class='z3BrQ']")
    ActionChains(driver).move_to_element(map_element).perform()

    class_name_location_div = '_3OnRG'
    class_name_location = 'yMCxv _1YwzE _1vzK2'
    travel_xpath = (
        f"//div[@class='{class_name_location_div}']"
        f"/h4[@class='{class_name_location}']"
    )

    time.sleep(sleeptime)
    # Wait for the first travel-time entry, then read the remaining ones
    first_travel_elem = WebDriverWait(driver, waittime).until(
        EC.presence_of_element_located((By.XPATH, travel_xpath)))
    details_dict['travel_time_changi'] = first_travel_elem.text
    travel_elems = driver.find_elements(By.XPATH, travel_xpath)
    details_dict['travel_time_raffles'] = travel_elems[1].text
    details_dict['travel_time_orchard'] = travel_elems[2].text

    return details_dict

In [7]:
# Open web link and scrape information on that page
def open_and_scrape(web_link, master_list_of_dict):
    """Open a listing page, scrape it and append the result to a list.

    Args:
        web_link: URL of the listing page to load in the global ``driver``.
        master_list_of_dict: List that receives the scraped dictionary; it is
            modified in place.

    Returns:
        None. Any exception raised while loading or scraping the page
        propagates to the caller and nothing is appended.
    """
    # Switch to web link
    driver.get(web_link)

    # Scrape page based on whether old web design or new design.
    # Old version has the class name "_2yeD-" just below the <div id = appContent>.
    # find_elements_by_* helpers were removed in Selenium 4.3; use By locators.
    if driver.find_elements(By.CLASS_NAME, "_2yeD-"):
        details_dict = scrape_page_old()
    else:
        details_dict = scrape_page_new()

    master_list_of_dict.append(details_dict)


In [8]:
# Create an empty list to store the dictionaries (each listing should give one dictionary).
# execute_scraping() appends to this module-level list.
master_list_of_dict = []

# Get the last page number in terms of pages of available listings
# (read from the pagination bar of the page currently loaded in the driver)
last_page_num = get_last_page_number()

In [9]:
def execute_scraping(start_page):
    """Scrape every listing from the current listings page to the last page.

    For each listings page, the property links are collected and each one is
    scraped into the module-level ``master_list_of_dict``. A listing that
    fails to scrape is reported and skipped so that one bad page does not
    stop the run. After the last page (as given by ``last_page_num``) has
    been processed, ``'Scraping complete'`` is printed.

    Args:
        start_page: Page number of the listings page the driver is currently
            showing. It is used for the progress messages and to build the
            URL of the next page; this function does not navigate to
            ``start_page`` itself.
    """
    # At most one iteration per listings page; the loop exits early via
    # `break` once the last page has been scraped.
    for _ in range(1, last_page_num + 1):
        # Read the active page number *before* leaving the listings page
        active_page_elem = WebDriverWait(driver, waittime).until(
            EC.presence_of_element_located((By.XPATH, "//li[@class='active']")))
        current_page = int(active_page_elem.text)

        condo_links = collate_web_links()
        for index, link in enumerate(condo_links):
            try:
                open_and_scrape(link, master_list_of_dict)
            except Exception as exc:
                # Best effort: skip this listing but say why. `Exception`
                # (not a bare except) lets KeyboardInterrupt stop the run.
                print(f"Skipped {link}: {exc!r}")
            print(f"Completed {index+1} out of {len(condo_links)} links of Page {start_page}")

        if current_page >= last_page_num:
            print('Scraping complete')
            break

        start_page += 1
        next_link = f'{home_page}?page_num={start_page}'
        try:
            driver.get(next_link)
        except Exception:
            # Give the site/browser a moment and retry once
            time.sleep(10)
            driver.get(next_link)

In [None]:
# Run the scraper, labelling the listings page currently shown by the driver as page 1
execute_scraping(1)

Completed 1 out of 35 links of Page 1
Completed 2 out of 35 links of Page 1
Completed 3 out of 35 links of Page 1
Completed 4 out of 35 links of Page 1
Completed 5 out of 35 links of Page 1


In [17]:
# Number of dictionaries in the list (one per scraped listing, before de-duplication)
len(master_list_of_dict)

7958

In [12]:
# Create function to remove duplicate entries in the list of dictionaries
def remove_dupe_dicts(l):
    """Return the dictionaries in ``l`` with exact duplicates removed.

    Two dictionaries count as duplicates when they serialise to the same JSON
    text with sorted keys, so key order does not matter and nested lists/dicts
    are compared by value. The first occurrence of each dictionary is kept and
    the original order is preserved, which makes the result reproducible
    between runs.

    Args:
        l: Iterable of JSON-serialisable dictionaries.

    Returns:
        list[dict]: New dictionaries (rebuilt from their JSON form), one per
        distinct input dictionary, in first-seen order.

    Raises:
        TypeError: If a dictionary contains a value ``json.dumps`` cannot
            serialise.
    """
    seen = set()
    unique_dicts = []
    for d in l:
        serialised = json.dumps(d, sort_keys=True)
        if serialised in seen:
            continue
        seen.add(serialised)
        # Round-trip through JSON so the result never aliases the input dicts
        unique_dicts.append(json.loads(serialised))
    return unique_dicts

In [13]:
# Collapse identical listing dictionaries into one entry each
master_list = remove_dupe_dicts(master_list_of_dict)

In [14]:
# Total number of listings successfully scraped (after removing duplicate dictionaries)
len(master_list)

7317

In [18]:
# Persist the de-duplicated listings to disk as pretty-printed JSON
with open('master_list_cleaned_v3.json', 'w') as out_file:
    json.dump(master_list, out_file, indent=4)