Eugene's script for scraping the first level of a property website: it collects the basic data of each listed property together with the links needed to scrape the full property details in a second pass.

In [1]:
from typing import Tuple
from time import sleep

In [2]:
import spacy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
import pandas as pd
import re
print(spacy.__version__)

3.5.0


In [3]:
# Start a Microsoft Edge session that the rest of the notebook drives.

# Location of the msedgedriver binary handed to Selenium.
# NOTE(review): this is an absolute, machine-specific path; point it at the
# msedgedriver binary on the machine running the notebook.
edge_driver_path = '/Users/eugene/projects/address_parser/msedgedriver'

# Capabilities forwarded to the driver when the session is created.
desired_cap = {
    "os": "OS X",
    "os_version": "Ventura",
    "browser": "Edge",
    "browser_version": "109.0.1518.70",
    "browserstack.local": "false",
    "browserstack.selenium_version": "3.141.0",
}

# NOTE(review): `executable_path` / `capabilities` look like the Selenium 3 calling
# convention (matching the 3.141.0 version named above) - confirm against the
# installed Selenium version.
browser = webdriver.Edge(executable_path=edge_driver_path, capabilities=desired_cap)

In [4]:
# Target website for scraping: the first-level page that the browser opens
# before the search form is driven and the result table is read.
init_url = 'http://cppcl.property.hk/tran_prop.php'

In [5]:

# build the URL of a property's detail page (the target of the second-layer scraping) from its (ref, year) pair
def transaction_details_pattern(uid: Tuple[str, str]) -> str:
    """Build the detail-page URL of a single transaction.

    Args:
        uid: Pair whose first item fills the ``ref`` query parameter and whose
            second item fills the ``year`` query parameter.

    Returns:
        The ``tran_prop_detail.php`` URL with both query parameters filled in.
    """
    reference = uid[0]
    year = uid[1]
    return f"http://cppcl.property.hk/tran_prop_detail.php?ref={reference}&year={year}"

# wrap one scraped table row (a flat tuple) into the nested address dictionary; fields the first-level page does not provide are left as None
def create_address_model(data_tuple) -> dict:
    """Turn one flat scraped row into the nested address record.

    Args:
        data_tuple: Indexable row read by position:
            0 -> ``id``, 1 -> ``source``, 2 -> Chinese ``address``,
            3 -> Chinese ``floor``, 4 -> Chinese ``flat``,
            5 -> ``gross_floor_area``, 6 -> ``price(million)``,
            7 -> ``price_per_feet_gross``.

    Returns:
        A dictionary with the keys ``id``, ``source``, ``chinese``, ``english``
        and ``data``. Every field that is not taken from ``data_tuple`` is
        present with the value ``None``.
    """
    record_id = data_tuple[0]
    source = data_tuple[1]

    chinese_fields = {
        'raw': None,
        'address': data_tuple[2],
        'floor': data_tuple[3],
        'flat': data_tuple[4],
    }
    # Same keys in the same order as the Chinese section, all still empty.
    english_fields = dict.fromkeys(chinese_fields)

    # Start with every detail field empty, then fill in the three values the row carries.
    details = dict.fromkeys((
        'registration_number',
        'registration_date',
        'docs_date',
        'gross_floor_area',
        'saleable_floor_area',
        'price(million)',
        'price_per_feet_gross',
        'price_per_feet_saleable',
        'district',
        'usage',
        'occupation_permit',
        'lease_term',
        'renewable_term',
        'facing',
        'layout',
        'seller',
        'buyer',
        'remark',
    ))
    details['gross_floor_area'] = data_tuple[5]
    details['price(million)'] = data_tuple[6]
    details['price_per_feet_gross'] = data_tuple[7]

    return {
        'id': record_id,
        'source': source,
        'chinese': chinese_fields,
        'english': english_fields,
        'data': details,
    }

In [6]:
# XPaths of the result-table columns we want to read.
# They are passed to the browser's element lookup in the scraping cell below.
# Every column shares the same locator up to the cell; only the trailing td index differs.
_results_table_cells = '/html/body/table/tbody/tr[2]/td/table/tbody/tr[3]/td/form/table[3]/tbody//td'

xpath_address = f'{_results_table_cells}[3]'
xpath_floor = f'{_results_table_cells}[4]'
xpath_flat = f'{_results_table_cells}[5]'
xpath_gross_area = f'{_results_table_cells}[6]'
xpath_price = f'{_results_table_cells}[7]'
xpath_price_per_feet = f'{_results_table_cells}[8]'

In [7]:
# Empty list that accumulates the scraped records; the scraping cell below
# extends it with one dictionary per property.
# Re-run this cell before re-running the scrape, otherwise the new records are
# appended after the ones already collected.
data = []

In [None]:
# Scrape the first-level listing pages.
# For each of the first six options of the search form's drop-down: submit the form,
# jump to the last page to learn how many result pages exist, then visit every page
# and read the result table column by column.
# Each table row is first gathered as a flat tuple
# (uid, url, address, floor, flat, gross_area, price, price_per_feet); the rows of one
# page form the list `data_tuple`. `create_address_model` turns every such tuple into
# a nested dictionary, so `data` ends up as a list of dictionaries.

# Extracts the two quoted arguments (ref, year) from an onclick handler of the form
# open_tran_prop_detail("<ref>","<year>"...). Raw string avoids invalid-escape warnings.
_DETAIL_ONCLICK = re.compile(r'open_tran_prop_detail\(\"([a-zA-Z0-9]+)\"\,\"([0-9]+)')


def _cell_texts(xpath):
    """Return the text of every element matched by ``xpath`` except the first match.

    The first match is skipped - presumably it is the table's header cell.
    """
    return [cell.text for cell in browser.find_elements(By.XPATH, xpath)[1:]]


browser.get(init_url)
for j in range(1, 7):
    # choose the j-th option of the form's <select>, then click the form's first input in td[8]
    # (presumably the search button)
    browser.find_element(By.XPATH, f'/html/body/table/tbody/tr[2]/td/table/tbody/tr[3]/td/form/table[1]/tbody[1]/tr[2]/td[4]/select/option[{j}]').click()
    browser.find_element(By.XPATH, '/html/body/table/tbody/tr[2]/td/table/tbody/tr[3]/td/form/table[1]/tbody[1]/tr[2]/td[8]/input[1]').click()
    # go to the last page ('尾頁') so that the pager text contains the highest page number
    browser.find_element(By.XPATH, "//*[contains(text(), '尾頁')]").click()
    # compare the page numbers as integers: as strings, "9" would sort above "10"
    max_page = max(map(int, re.findall(r"\d+", browser.find_element(By.CLASS_NAME, "stdPrevNext").text)))
    for i in range(1, max_page + 1):
        # NOTE(review): no explicit wait follows the page switch; if rows are ever read
        # before the new page has rendered, add a WebDriverWait here - confirm against the site.
        browser.execute_script(f'javascript:findForm_submit("page",{i})')

        # (ref, year) pairs taken from the onclick handler of each row's detail link
        detail_links = browser.find_elements(By.XPATH, '/html/body/table/tbody/tr[2]/td/table/tbody/tr[3]/td/form/table[3]/tbody//td[9]/a')
        uids_tuple = [_DETAIL_ONCLICK.findall(link.get_attribute("onclick"))[0] for link in detail_links]
        uids = [f"{ref}{year}" for ref, year in uids_tuple]
        urls = [transaction_details_pattern(uid) for uid in uids_tuple]

        address = _cell_texts(xpath_address)
        floor = _cell_texts(xpath_floor)
        flat = _cell_texts(xpath_flat)
        gross_area = _cell_texts(xpath_gross_area)
        price = _cell_texts(xpath_price)
        price_per_feet = _cell_texts(xpath_price_per_feet)

        # zip stops at the shortest column, so rows are only as many as the shortest list
        data_tuple = list(zip(uids, urls, address, floor, flat, gross_area, price, price_per_feet))
        data.extend([create_address_model(row) for row in data_tuple])

In [None]:
# sanity check: number of property records collected in `data` so far
len(data)

In [None]:
# Persist the scraped records to a pickle file so they can be reloaded
# without running the scrape again.
import pickle

with open('ccpcl.pickle', mode='wb') as f:
    pickle.dump(data, f)

In [None]:
# Read the pickle file back as a check of what was written.
# pickle.load is called repeatedly until the end of the file, so `objects`
# receives one entry per object that was dumped into the file.
# NOTE: pickle.load can execute arbitrary code - only open pickle files you created yourself.
import pickle

objects = []
with open("ccpcl.pickle", "rb") as openfile:
    try:
        while True:
            objects.append(pickle.load(openfile))
    except EOFError:
        # end of file reached: every stored object has been loaded
        pass