# Note before Running

Before running, make sure that a `data/raw` directory exists inside the directory the notebook is run from (the current working directory). The downloaded CSV files will be saved there.

In [5]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import os
import time
import re
from tqdm import tqdm_notebook as tqdm
from glob import glob

# Cap how many rows pandas shows when a DataFrame is displayed.
pd.set_option("display.max_rows", 6)

In [6]:
# Search page that the scraper opens before every query.
URL = "https://www.ura.gov.sg/realEstateIIWeb/resiRental/search.action"

# How many district checkboxes the scraper loops over.
DISTRICTS = 28

In [7]:
# First month to request, written exactly as it appears in the site's "from date" dropdown.
start_date = "MAR-2014"

# Downloads are directed to <current working directory>/data/raw.
curr_dir = os.getcwd()
dl_dir = os.path.join(curr_dir, 'data', 'raw')

In [8]:
def _count_csvs(directory):
    '''Returns the number of *.csv files currently in `directory`.'''
    return len(glob(os.path.join(directory, "*.csv")))


def _wait_for_new_csv(directory, count_before, timeout, poll_interval=0.5):
    '''
    Polls `directory` until its CSV count differs from `count_before`.

    Arguments:
        directory: str. Directory being watched for a finished download.
        count_before: int. Number of CSV files present before the download was triggered.
        timeout: number. Maximum number of seconds to keep polling.
        poll_interval: number. Seconds to sleep between checks.

    Returns:
        None

    Raises:
        RuntimeError if the CSV count has not changed after `timeout` seconds.
    '''
    deadline = time.monotonic() + timeout
    while _count_csvs(directory) == count_before:
        if time.monotonic() >= deadline:
            raise RuntimeError(
                "No new CSV appeared in {} within {} seconds".format(directory, timeout))
        time.sleep(poll_interval)


def scrape_all(start_date, download_timeout=120):
    '''
    Initialises a Chrome browser and sends clicks there to download the data.
    Loops over districts and property types and downloads one CSV per
    combination that has results.

    Arguments:
        start_date: str. Date to start scraping, as per the dropdown menu on the website. e.g. MAR-2014
        download_timeout: number. Seconds to wait for each CSV to show up in the
            download directory before that combination is reported as failed.

    Returns:
        None. CSV files will be saved to the 'data/raw' directory (the module-level `dl_dir`).

    Notes:
        Errors raised while selecting the district, searching or downloading are
        caught per combination and reported with a 'Failed at ...' message.
        Errors raised while opening the search page or selecting the property
        type / start date are not caught and abort the run. The browser is
        closed in either case.

    Example:
        scrape_all('MAR-2014')
    '''

    chrome_options = webdriver.ChromeOptions()
    prefs = {"download.default_directory": dl_dir}
    chrome_options.add_experimental_option("prefs", prefs)

    # NOTE(review): newer Selenium releases expect `options=` instead of
    # `chrome_options=`; switch the keyword when upgrading Selenium.
    browser = webdriver.Chrome(chrome_options=chrome_options)

    try:
        wait = WebDriverWait(browser, 10)

        for district in tqdm(range(DISTRICTS), desc='Overall', unit='District'):
            prop_types = tqdm(['landed', 'nonlanded', 'EC'],
                              desc='District {}'.format(district + 1),
                              unit='Property Type')
            # the property type on the page is 1-indexed, hence start=1
            for prop_type, prop_type_txt in enumerate(prop_types, start=1):
                # go to search page
                browser.get(URL)
                browser.find_element(
                    By.LINK_TEXT, 'Search by property type and postal district').click()

                # set property type
                browser.find_element(
                    By.XPATH,
                    '//*[@id="district"]/div[2]/div[{}]/label'.format(prop_type)).click()

                # set from date
                browser.find_element(
                    By.XPATH,
                    '//*[@id="searchForm_from_Date"]/option[text()="{}"]'.format(start_date)).click()

                try:
                    # click on districts
                    browser.find_element(By.ID, "addToPostal_{}".format(district)).click()

                    # click on search button
                    time.sleep(2)
                    browser.find_element(By.ID, "searchForm_2").click()

                    # wait till new page loads
                    wait.until_not(EC.presence_of_element_located((By.ID, 'searchForm_2')))
                    body_text = browser.find_element(
                        By.XPATH, '/html/body/div/div[3]/div[2]/div').text

                    if body_text.startswith("No results were found"):
                        print("No results for property type {}, district {}".format(prop_type_txt, district + 1))
                    else:
                        # Take the baseline right before triggering the download so
                        # that a late file from an earlier attempt is not counted.
                        csvs_before = _count_csvs(dl_dir)

                        # download csv
                        browser.find_element(
                            By.XPATH, '//*[@id="SubmitSortForm"]/div[1]/div[3]/input').click()
                        _wait_for_new_csv(dl_dir, csvs_before, download_timeout)

                # Exception (not a bare except) so that Ctrl-C / kernel interrupt
                # still stops the run instead of being reported as a failure.
                except Exception:
                    print('Failed at property type {}, district {}'.format(prop_type_txt, district + 1))
    finally:
        # Always release the browser and its driver process, even on errors.
        browser.quit()

In [9]:
# Expect 'Failed at ...' messages for district 28 on every property type:
# the website apparently has no district 24, so the last district in the
# loop cannot be selected (not verified here).
scrape_all(start_date=start_date)

No results for property type EC, district 1
No results for property type EC, district 2
No results for property type EC, district 3
No results for property type EC, district 4
No results for property type EC, district 5
No results for property type landed, district 6
No results for property type EC, district 6
No results for property type EC, district 7
No results for property type EC, district 8
No results for property type EC, district 9
No results for property type EC, district 10
No results for property type EC, district 11
No results for property type EC, district 12
No results for property type EC, district 13
No results for property type EC, district 14
No results for property type EC, district 15
No results for property type EC, district 16
No results for property type EC, district 17
No results for property type EC, district 21
No results for property type EC, district 25
Failed at property type landed, district 28
Failed at property type nonlanded, district 28
Failed at prope