# Preparation

<b>Load Libraries</b>

In [1]:
# bs4
import requests
from bs4 import BeautifulSoup

# selenium
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.webdriver import WebDriver

# data structures
import numpy as np
import pandas as pd

# others
import sys, os, re, datetime

<b>User-Defined Functions (UDF)</b>

In [2]:
# timestamp helper used to tag scraped rows
def get_scrapping_time() -> str:
    """Return the current local date and time as a 'YYYY-MM-DD HH:MM:SS' string."""
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    current_moment = datetime.datetime.now()
    return current_moment.strftime(timestamp_format)

In [3]:
# get pages to iterate
def get_pages(browser: WebDriver) -> list:
    """Collect the pagination buttons whose label is a number.

    Every ``<span class="ButtonLabel">`` element on the current page is
    inspected; those whose stripped text parses as a number are kept, in
    document order.

    Parameters
    ----------
    browser : WebDriver
        Driver whose currently loaded page is searched.

    Returns
    -------
    list
        The matching elements (possibly empty).
    """
    pages = []
    for button in browser.find_elements(By.XPATH, "//span[@class='ButtonLabel']"):
        label = button.text.strip()
        try:
            float(label)
        except ValueError:
            # Non-numeric label: not a page-number button, skip it.
            # Only ValueError is caught so that genuine failures (and
            # KeyboardInterrupt) are no longer silently swallowed.
            continue
        pages.append(button)

    return pages

In [4]:
# extract table from each page
def extract_table(raw_html: str, table: dict) -> None:
    """Append the rows of the second ``<table>`` in ``raw_html`` to ``table``.

    ``table`` maps column names to lists and is modified in place. The
    non-empty ``<td>`` texts of each row are assigned, in order, to the keys
    of ``table`` other than ``'scrapping_date'``; one timestamp is appended
    to ``table['scrapping_date']`` for every row that is stored, so all
    columns stay the same length.

    Parameters
    ----------
    raw_html : str
        HTML source of the page to parse.
    table : dict
        Accumulator of ``column name -> list``; key order defines which cell
        goes to which column.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If ``raw_html`` contains fewer than two ``<table>`` elements.

    Notes
    -----
    Empty cells are dropped before assignment, so a row with an empty cell
    in the middle has its later values shifted one column to the left; the
    missing trailing columns are filled with ``None``.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and bs4 does not warn).
    soup = BeautifulSoup(raw_html, 'html.parser')
    rows = soup.find_all('table')[1].find_all('tr')

    # Columns that receive cell values; the timestamp column is handled apart.
    value_columns = [key for key in table if key != 'scrapping_date']
    scraped_at = get_scrapping_time()

    ## we do not use the last row
    for row in rows[:-1]:
        data = [td.text.strip() for td in row.find_all('td') if td.text.strip() != '']
        if not data:
            # Rows without any data cell (e.g. a header row made of <th>)
            # contribute nothing, not even a timestamp.
            continue
        for position, column in enumerate(value_columns):
            # Pad short rows with None; surplus cells beyond the known
            # columns are ignored.
            table[column].append(data[position] if position < len(data) else None)
        # Append (not assign) so 'scrapping_date' remains a list with one
        # entry per stored row.
        table.setdefault('scrapping_date', []).append(scraped_at)

    return None

In [5]:
# get the button for switching to table format
def get_table_button(browser):
    """Locate the control that switches the results view to table format.

    Two absolute XPaths are tried in order: first the "box" variant, then
    the drop-down ``<option>`` variant.

    Parameters
    ----------
    browser : WebDriver
        Driver with the results page already loaded.

    Returns
    -------
    WebElement
        The first of the two candidates that is found.

    Raises
    ------
    selenium.common.exceptions.NoSuchElementException
        If neither XPath matches an element.
    """
    box_xpath = '/html/body/div[1]/div[8]/div[1]/div[2]/div[1]/div[2]/div/div/div/div[2]/div[2]/ul/li[2]/span'
    dropdown_xpath = '/html/body/div[1]/div[8]/div[1]/div[3]/div[1]/div[1]/div/div/div/div/div[2]/div/div[2]/div[2]/div/div/div/label/div/select/option[2]'

    try:
        ## box
        button = browser.find_element(By.XPATH, box_xpath)
    except NoSuchElementException:
        # Only "element not found" triggers the fallback; other driver
        # errors (and KeyboardInterrupt) now propagate instead of being hidden.
        ## drop-down
        button = browser.find_element(By.XPATH, dropdown_xpath)

    return button

# Scraping - Selenium

In [6]:
## options
# Build the Chrome launch options passed to the driver below.
chrome_options = Options()
# Headless mode is left disabled, so a visible browser window is opened.
# chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
# Present a desktop Chrome-on-Linux user agent string to the site.
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
chrome_options.add_argument(f'user-agent={user_agent}')
## browser
# Start the Chrome session; `browser` is used by all following cells.
browser = webdriver.Chrome(options=chrome_options)
# Element lookups wait up to 10 seconds for a match before failing.
browser.implicitly_wait(10)

In [7]:
# switch to table format
# Load the Chicago listings page, then click the control returned by
# get_table_button so the results are shown as a table.
browser.get('https://www.redfin.com/city/29470/IL/Chicago')
button = get_table_button(browser)
button.click()

In [30]:
## numbered pagination buttons found on the current page
pages = get_pages(browser)
## accumulator: one empty list per output column, in column order
table = {
    column: []
    for column in (
        'address', 'location', 'price', 'beds',
        'baths', 'sq.ft', '$/sq.ft', 'on_redfin',
        'scrapping_date',
    )
}
## click through every page and harvest its rows into `table`
# NOTE(review): the buttons were located before the first click; if the site
# re-renders its pagination after a click these references may go stale —
# confirm against the live page.
for page in pages:
    page.click()
    extract_table(browser.page_source, table)

In [31]:
# End the WebDriver session started in the setup cell.
browser.quit()