## Part 1 - Data Curation (Web Scraping)

### Web Scraping

In [1]:
# Import necessary dependencies
from bs4 import BeautifulSoup
import urllib
import re
import time
import pandas as pd
import json

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager



In [2]:
# Set wait times (seconds)
waittime = 5   # timeout handed to every WebDriverWait in this notebook
sleeptime = 1  # fixed pause handed to time.sleep

# Shut down any WebDriver left over from a previous run of this cell.
# quit() ends the whole browser session (all windows plus the chromedriver
# process); close() would only close the current window and leak the rest.
try:
    driver.quit()
except Exception:
    # Deliberately best-effort: `driver` is undefined on the first run
    # (NameError) and an already-dead session may raise on quit().
    pass

# Define target website
home_page = "https://www.99.co/singapore/rent"

# Set webdriver options
options = webdriver.ChromeOptions()
options.add_argument('--no-sandbox')
options.add_argument('ignore-certificate-errors')

# Initiate webdriver (webdriver_manager downloads/locates a matching chromedriver)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options = options)
# Get driver to retrieve homepage
driver.get(home_page)

In [3]:
# Get last page number (of all listings)
def get_last_page_number():
    """Return the last page number shown by the search pagination bar.

    Uses the global ``driver``. The text of the first element with class
    ``SearchPagination`` is split into lines and the second-to-last line is
    parsed as an integer (presumably the final line is a "next" control --
    verify against the live page).

    Raises:
        IndexError: if no pagination element is found or it has fewer than
            two lines of text.
        ValueError: if the second-to-last line is not an integer.
    """
    pagination_bars = driver.find_elements(By.CLASS_NAME, "SearchPagination")
    labels_per_bar = [bar.text.split("\n") for bar in pagination_bars]
    first_bar_labels = labels_per_bar[0]
    return int(first_bar_labels[-2])

In [4]:
# Collate all web links on current listings page
def collate_web_links():
    """Return the property-page URLs found on the listings page now loaded.

    Uses the global ``driver``. Every element carrying an ``href`` under the
    ``._31ajL`` container is inspected; only links that contain
    ``/singapore/rent/property/`` are kept (this filters out ads), and the
    query string is stripped from each.

    Returns:
        list[str]: unique property URLs in the order they first appear.
        Several anchors can point at the same listing, so duplicates are
        dropped here to avoid scraping one listing more than once.
    """
    elems = driver.find_elements(By.CSS_SELECTOR, "._31ajL [href]") # Div class just above div style="opacity:1"
    substring = '/singapore/rent/property/'

    hrefs = (elem.get_attribute('href') for elem in elems)
    # `href and ...` guards against a missing/empty attribute value
    trimmed = (href.split("?", 1)[0] for href in hrefs if href and substring in href)

    # dict.fromkeys removes duplicates while preserving first-seen order
    return list(dict.fromkeys(trimmed))

In [18]:
# Scrape detailed page (New Web Template)
def scrape_page_new():
    """Scrape the listing detail page currently loaded in the global ``driver``.

    Reads the globals ``driver``, ``waittime`` and ``sleeptime``. All values
    are stored as the raw text shown on the page.

    Returns:
        dict: always contains 'title' and 'rental'. The remaining keys
        ('bed', 'bath', 'sqft', 'address', 'district', one key per row of the
        property-details table, 'nearest_mrt_name', 'travel_time_changi',
        'travel_time_raffles', 'travel_time_orchard') are present only when
        the matching element is found on the page.

    Raises:
        TimeoutException: if the title or the property-details table does not
            appear within ``waittime`` seconds.
        NoSuchElementException: if the rental price, the "propertyDetails"
            element or the "location" element is missing.
    """
    details_dict = {}

    # Class names used by the new page template
    district_class_name = 'dniCg _3j72o _2rhE-'
    rental_price_class_name = "_1zGm8 _3na6W _1vzK2"
    class_name_1 = "_2sIc2 _29qfj _2rhE-"
    title_class_name = "_3Wogd JMF8h lFqTi _1vzK2"

    # Title and rental price (the wait returns the located title element)
    title_xpath = f"//h1[@class='{title_class_name}']"
    title_elem = WebDriverWait(driver, waittime).until(EC.presence_of_element_located((By.XPATH, title_xpath)))
    details_dict['title'] = title_elem.text
    details_dict['rental'] = driver.find_element(By.XPATH, f"//p[@class='{rental_price_class_name}']").text

    # Summary row: two entries are read as bed/sqft, three or more as
    # bed/bath/sqft; with exactly four, the fourth is appended to sqft.
    row = driver.find_elements(By.XPATH, f"//p[@class='{class_name_1}']")
    if len(row) == 2:
        details_dict['bed'] = row[0].text
        details_dict['sqft'] = row[1].text
    elif len(row) >= 3:
        details_dict['bed'] = row[0].text
        details_dict['bath'] = row[1].text
        details_dict['sqft'] = row[2].text
        if len(row) == 4:
            details_dict['sqft'] = details_dict['sqft'] + " / " + row[3].text
    # With fewer than two entries the fields cannot be told apart, so they
    # are left out instead of raising IndexError and losing the listing.

    address_details = driver.find_elements(By.XPATH, f"//p[@class='{district_class_name}']//span")
    if len(address_details) > 1:
        details_dict['address'] = address_details[0].text
        details_dict['district'] = address_details[-1].text
    elif len(address_details) == 1:
        details_dict['district'] = address_details[0].text

    # This section will take into account these details (if present): Availability, Lease, Furnishing, Property Type,
    # Name, Unit Types, Total Units, Built Year, Tenure, Developer, and Neighbourhood
    class_name_td1 = '_3r4yN NomDX'  # label cells
    class_name_td2 = '_3r4yN XCAFU'  # value cells

    # Move to the details section before waiting for its table to be present
    details_section = driver.find_element(By.ID, "propertyDetails")
    ActionChains(driver).move_to_element(details_section).perform()

    WebDriverWait(driver, waittime).until(EC.presence_of_element_located((By.XPATH, "//table[@class='_3NpKo']")))
    time.sleep(sleeptime)
    # Query label and value cells once (instead of once per row) and pair them
    # up; zip stops at the shorter list, so a missing value cell cannot raise.
    label_cells = driver.find_elements(By.XPATH, f"//td[contains(@class, '{class_name_td1}')]")
    value_cells = driver.find_elements(By.XPATH, f"//td[contains(@class, '{class_name_td2}')]")
    for label_cell, value_cell in zip(label_cells, value_cells):
        # e.g. a label "Property Type" becomes the key "property_type"
        detail_category = label_cell.text.lower().replace(" ", "_")
        details_dict[detail_category] = value_cell.text

    # MRT (optional: not every page has this link)
    try:
        details_dict['nearest_mrt_name'] = driver.find_element(By.XPATH, "//p[@class='_2sIc2 _2rhE- _1c-pJ']//a").text
    except NoSuchElementException:
        pass

    # NOTE: amenities / electoral-division scraping is currently disabled.
#     # Expand amenities
#     button_class = 'cFGt2 _1P_YF' # For amenities
#     WebDriverWait(driver, waittime).until(EC.presence_of_element_located((By.XPATH, f"//button[@class='{button_class}']")))
#     driver.find_element(By.XPATH,f"//button[@class='{button_class}']").click()

#     # Extract all amenities 
#     amenities_elems = driver.find_elements(By.XPATH,f"//p[@class='_2sIc2 AIgs2 _2rhE-']")
#     details_dict['amenities'] = [str(elem.text) for elem in amenities_elems]

#     details_dict['electoral_div'] = driver.find_element(By.XPATH,f"//h2[@class='Z0npN _3NW6g']")[0].text

    # Scroll down to reveal Google map section
    location_section = driver.find_element(By.ID, "location")
    ActionChains(driver).move_to_element(location_section).perform()

    class_name_location_div = '_3OnRG'
    class_name_location = 'yMCxv _1YwzE _1vzK2'
    travel_xpath = f"//div[@class='{class_name_location_div}']/div[@class='{class_name_location}']"
    try:
        WebDriverWait(driver, waittime).until(EC.presence_of_element_located((By.XPATH, travel_xpath)))
    except TimeoutException:
        # Travel times are optional; return what has been collected so far
        return details_dict

    # Entries are read in page order as Changi, Raffles, Orchard; zip keeps
    # whichever are present instead of raising IndexError on a short list.
    travel_keys = ('travel_time_changi', 'travel_time_raffles', 'travel_time_orchard')
    travel_elems = driver.find_elements(By.XPATH, travel_xpath)
    for travel_key, travel_elem in zip(travel_keys, travel_elems):
        details_dict[travel_key] = travel_elem.text
    return details_dict

In [6]:
# Open web link and scrape information on that page
def open_and_scrape(web_link, master_list_of_dict):
    """Load one listing page and append its scraped details to the list.

    Args:
        web_link: URL of the listing page to open in the global ``driver``.
        master_list_of_dict: list that receives the scraped dictionary; it is
            modified in place.

    Returns:
        None. Nothing is appended if loading or scraping raises.
    """
    driver.get(web_link)

    # Only the new page template is handled here. (The old template has the
    # class name "_2yeD-" just below <div id = appContent>.)
    master_list_of_dict.append(scrape_page_new())


In [7]:
# Accumulator for the scraped listings: open_and_scrape() appends one dictionary per listing page
master_list_of_dict = []

# Number of the last results page, read from the pagination bar of the page
# currently loaded in the driver
last_page_num = get_last_page_number()

In [8]:
def _read_active_page():
    """Return the page number marked active in the pagination bar, or None if it cannot be read."""
    try:
        active_text = WebDriverWait(driver, waittime).until(
            EC.presence_of_element_located((By.XPATH, "//li[@class='active']"))
        ).text
        return int(active_text)
    except (TimeoutException, ValueError):
        return None


def _load_listings_page(page_num):
    """Navigate the driver to the given results page, retrying once after a 10 second pause."""
    from selenium.common.exceptions import WebDriverException

    next_link = f'{home_page}?page_num={page_num}'
    try:
        driver.get(next_link)
    except WebDriverException:
        time.sleep(10)
        driver.get(next_link)


def execute_scraping(start_page, end_page):
    """Scrape every listing on results pages ``start_page`` .. ``end_page``.

    Uses the globals ``driver``, ``home_page``, ``waittime`` and appends one
    dictionary per scraped listing to the global ``master_list_of_dict``.

    Args:
        start_page: number of the first results page to scrape. If the driver
            is not already showing that page, it is loaded first.
        end_page: number of the last results page to scrape (inclusive).

    Returns:
        None. Progress is reported with ``print``.
    """
    page = start_page

    # One pass per page; the bound guarantees termination even if the site
    # keeps reporting a page number below end_page.
    for _ in range(start_page, end_page + 1):
        current_page = _read_active_page()
        if current_page != page:
            # The indicator was unreadable or the browser shows another page
            # (e.g. start_page > 1 on the first pass): load the page we want.
            _load_listings_page(page)
            current_page = _read_active_page()
        if current_page is None:
            # Fall back to our own counter instead of an unbound/stale value
            print(f"Skip")
            current_page = page

        condo_links = collate_web_links()
        for index, link in enumerate(condo_links, start=1):
            try:
                open_and_scrape(link, master_list_of_dict)
            except Exception as err:
                # Loop boundary of a long-running scrape: report the failed
                # listing and move on rather than abort the whole run.
                print(f"Skip {link} ({type(err).__name__})")
                continue
            print(f"Completed {index} out of {len(condo_links)} links of Page {page}")

        if current_page >= end_page:
            print('Scraping complete')
            break

        page += 1
        _load_listings_page(page)

In [None]:
# Scrape all results pages, from the first up to the last page detected earlier
execute_scraping(start_page=1, end_page=last_page_num)

In [22]:
# Number of dictionaries in the list, i.e. listings scraped so far (duplicates not yet removed)
len(master_list_of_dict)

4510

In [None]:
# Create function to remove duplicate entries in the list of dictionaries
def remove_dupe_dicts(l):
    """Return the distinct dictionaries of ``l`` in first-occurrence order.

    Two dictionaries count as duplicates when they serialise to the same JSON
    text with sorted keys, so key order does not matter. The input list and
    its dictionaries are not modified.

    Args:
        l: iterable of JSON-serialisable dictionaries.

    Returns:
        list[dict]: new dictionaries rebuilt from JSON, with keys in sorted
        order. Values are what a JSON round trip yields (e.g. tuples come
        back as lists).

    Raises:
        TypeError: if a dictionary contains a value that ``json.dumps``
            cannot serialise.
    """
    # dict.fromkeys de-duplicates like a set but keeps insertion order, so
    # the result is the same on every run (set order varies with string hashing).
    unique_strings = dict.fromkeys(json.dumps(d, sort_keys=True) for d in l)

    return [json.loads(s) for s in unique_strings]

In [None]:
# De-duplicated copy of the scraped listings (master_list_of_dict itself is left unchanged)
master_list = remove_dupe_dicts(master_list_of_dict)

In [None]:
# Total number of listings successfully scraped, counted after removing duplicates
len(master_list)

In [None]:
# Save as JSON file: stream the de-duplicated listings to disk, indented by 4 spaces
with open('master_list_cleaned_v3.json', 'w') as file:
    json.dump(master_list, file, indent=4)

In [None]:
# Re-imported so this cell can be run on its own from the saved JSON file
import pandas as pd
# Load the saved listings into a DataFrame (one row per listing)
df = pd.read_json (r'master_list_cleaned_v3.json')
# Export to CSV; index = None is passed so the row index is not written as a column
df.to_csv (r'99_co.csv', index = None)