# Preparation

<b>Load Libraries</b>

In [1]:
# bs4
import requests
from bs4 import BeautifulSoup

# selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.remote.webelement import WebElement

# data structures
import numpy as np
import pandas as pd

# others
import sys, os, re, datetime, time

<b>UDF</b>

In [2]:
# timestamp helper for scraped records
def get_scrapping_time() -> str:
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    now = datetime.datetime.now()
    return format(now, '%Y-%m-%d %H:%M:%S')

In [3]:
# get pages to iterate
def get_pages(browser: WebDriver) -> list:
    """Collect the ``ButtonLabel`` spans whose text is a number.

    Every ``<span class="ButtonLabel">`` in the current document is inspected
    and kept only when its stripped text parses with ``float()``
    (presumably these are the page-number buttons - confirm against the page).

    Args:
        browser: driver whose current page is scanned.

    Returns:
        The matching span elements, in the order the driver returned them.
    """
    pages = []
    for button in browser.find_elements(By.XPATH, "//span[@class='ButtonLabel']"):
        label = button.text.strip()
        try:
            float(label)
        except ValueError:
            # non-numeric label: skip it. Only ValueError is caught so that
            # driver errors and Ctrl-C are no longer silently swallowed.
            continue
        pages.append(button)

    return pages

In [4]:
# extract data from row
def extract_data_from_row(row: WebElement, table: dict) -> None:
    """Append the non-empty cell texts of one table row to ``table``.

    The stripped, non-empty ``<td>`` texts are assigned in order to the keys
    of ``table`` except the last one; the last key receives the scraping
    timestamp. Every column grows by exactly one entry per call: a row with
    fewer cells than data columns is padded with ``None`` and surplus cells
    are dropped, so the columns always stay the same length.

    Args:
        row: the row element whose ``<td>`` cells are read.
        table: column name -> list of values; mutated in place. Its last key
            is the timestamp column.

    Returns:
        None.
    """
    # read each cell's text once instead of twice (filter + value)
    cell_texts = (td.text.strip() for td in row.find_elements(By.XPATH, ".//child::td"))
    tds = [text for text in cell_texts if text != '']
    *data_keys, stamp_key = table
    # pad short rows / drop surplus cells so no cell lands in the timestamp
    # column and the columns never become ragged
    tds = tds[:len(data_keys)] + [None] * (len(data_keys) - len(tds))
    for key, td in zip(data_keys, tds):
        table[key].append(td)
    table[stamp_key].append(get_scrapping_time())

    return None

In [5]:
# switch row
def switch_row(row: WebElement, max_trials=None) -> None:
    """Click ``row``, retrying once per second until the click succeeds.

    Each failed attempt prints ``Trial: <n>`` and sleeps one second before the
    next try.

    Args:
        row: element to click.
        max_trials: optional cap on the number of attempts. ``None`` (the
            default) keeps retrying indefinitely.

    Returns:
        None, once a click has succeeded.

    Raises:
        Exception: the last click error, re-raised once ``max_trials``
            attempts have failed.
    """
    i = 1
    while True:
        try:
            row.click()
            break
        except Exception:
            # ``Exception`` instead of a bare ``except`` so that Ctrl-C
            # (KeyboardInterrupt) can still stop the retry loop
            if max_trials is not None and i >= max_trials:
                raise
            print(f'Trial: {i}')
            i += 1
            time.sleep(1)

    return None
# extract homecard from row
def extract_homecard_from_row(browser: WebDriver, row: WebElement, homecards: dict,
                              max_reads: int = 10, read_delay: float = 0.5) -> None:
    """Click ``row`` and append the details of its home card to ``homecards``.

    After the row is clicked, the ``TableViewHomecardContent`` div is located
    and read: the agent name goes to ``homecards['agent_name']`` and, for
    every other key of ``homecards``, the value whose title span text equals
    that key is appended (``None`` when the card shows no such title).

    Args:
        browser: driver showing the table view.
        row: the table row to click.
        homecards: field name -> list of values; mutated in place. Must
            contain the key ``'agent_name'``.
        max_reads: maximum number of times the title/value spans are re-read
            while the first title is still blank.
        read_delay: seconds to wait between two reads.

    Returns:
        None.
    """
    switch_row(row)
    ## homecard
    homecard = browser.find_element(By.XPATH, "//div[@class='TableViewHomecardContent flex flex-column']")
    ## agent's name
    # NOTE(review): this XPath starts with '//', so it is evaluated against the
    # whole document rather than only ``homecard``; switch to './/p[...]' if the
    # name must come from this card - confirm against the page structure.
    try:
        agent_name = homecard.find_element(By.XPATH, "//p[@class='ListingAgentCard__name']").text
    except Exception:
        # no agent card (or it could not be read): record a missing value
        agent_name = None
    homecards['agent_name'].append(agent_name)
    ## titles & values
    # Re-read the spans until the first title is a non-empty string (they can
    # come back blank, maybe due to a bad connection). The number of reads is
    # bounded so a card without titles cannot hang or crash the scraper.
    titles, values = [], []
    for attempt in range(max_reads):
        if attempt:
            time.sleep(read_delay)
        titles = [title.text.strip() for title in homecard.find_elements(By.XPATH, ".//descendant::span[@class='title']")]
        values = [value.text.strip() for value in homecard.find_elements(By.XPATH, ".//descendant::span[@class='value']")]
        if titles and titles[0] != '':
            break
    ## title -> value; the first occurrence of a title wins (as list.index did)
    details = {}
    for title, value in zip(titles, values):
        details.setdefault(title, value)
    for key in homecards:
        if key != 'agent_name':
            homecards[key].append(details.get(key))

    return None

In [6]:
# get the button for switching to table format
def get_table_button(browser):
    """Return the element that switches the results to the table format.

    The ``<span data-text="Table">`` box is tried first; when it is not
    present, the ``<option value="table">`` of the combobox ``<select>`` is
    returned instead.

    Args:
        browser: Selenium driver positioned on the results page.

    Returns:
        The located element (not yet clicked).

    Raises:
        NoSuchElementException: neither control exists on the page.
    """
    # local import: the file's import cell does not bring this name in
    from selenium.common.exceptions import NoSuchElementException

    try:
        ## box
        button = browser.find_element(By.XPATH, "//span[@data-text='Table']")
    except NoSuchElementException:
        ## drop-down
        # only "element not found" triggers the fallback; other driver errors
        # and Ctrl-C propagate instead of being hidden by a bare except
        button = browser.find_element(By.XPATH, "//select[@role='combobox']//option[@value='table']")

    return button

In [7]:
# sign in with email + password
def login_to_redfin(email: str, password: str, browser: WebDriver) -> None:
    """Drive the sign-in dialog with the given email and password.

    Sets a 10 second implicit wait, opens the 'Join / Sign in' dialog, types
    the email, continues, chooses 'Sign in with email instead', types the
    password and submits.

    Args:
        email: account email typed into ``#emailInput``.
        password: account password typed into ``#passwordInput``.
        browser: driver showing a page with the 'Join / Sign in' control.
    """
    def locate(xpath):
        # single lookup helper so each step below reads as one line
        return browser.find_element(By.XPATH, xpath)

    browser.implicitly_wait(10)
    ## open the dialog
    locate("//span[text()='Join / Sign in']").click()
    ## email step
    locate("//input[@id='emailInput']").send_keys(email)
    locate("//span[text()='Continue with email']").click()
    locate("//span[text()='Sign in with email instead']").click()
    ## password step
    locate("//input[@id='passwordInput']").send_keys(password)
    ## submit
    locate("//span[text()='Continue with email']").click()

# Scraping - Selenium

In [8]:
## user agent string sent by the browser
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
## chrome options (uncomment the headless flag to run without a window)
chrome_options = Options()
# chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('user-agent=' + user_agent)
## start chrome; element lookups wait up to 10 seconds before failing
browser = webdriver.Chrome(options=chrome_options)
browser.implicitly_wait(10)

In [9]:
# switch to table format
# Open the Chicago listings page, locate the control returned by
# get_table_button (box or drop-down option) and click it.
browser.get('https://www.redfin.com/city/29470/IL/Chicago')
button = get_table_button(browser)
button.click()

In [10]:
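# login to redfin (optional)
# NOTE: never commit real credentials; read them from the environment instead
# email = os.environ['REDFIN_EMAIL']
# password = os.environ['REDFIN_PASSWORD']
# login_to_redfin(email, password, browser)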

In [11]:
# Collect every <tr> under a <tbody class="tableList">; scoping the XPath to
# that tbody excludes the first table, which has no rows.
rows_to_click = browser.find_elements(By.XPATH, "//table//tbody[@class='tableList']//tr")

In [12]:
# column name -> list of scraped values; the last column holds the timestamp
table = {
    column: []
    for column in (
        'address', 'location', 'price',
        'beds', 'baths', 'Sq.Ft',
        '$/Sq.Ft', 'on_redfin', 'scrap_date',
    )
}

In [13]:
# home-card field -> list of values; apart from 'agent_name' the keys are the
# title texts looked up on each card
homecards = {
    field: []
    for field in (
        'agent_name', '$/Sq. Ft.', 'On Redfin',
        'HOA', 'Year Built', 'Status',
    )
}

# Click each collected row in turn and append its home-card details to
# ``homecards`` (one entry per field per row).
for row in rows_to_click:
    extract_homecard_from_row(browser, row, homecards)

# Draft

In [66]:
# <script> element inside the first map home card.
# The nested XPath must start with './/': a leading '//' is evaluated against
# the whole document and would return the page's first <script> instead.
x = browser.find_element(By.XPATH, "//div[@id='MapHomeCard_0']").find_element(By.XPATH, ".//script")

In [9]:
# Fetch the listings page with plain requests and parse it with BeautifulSoup.
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
# explicit timeout: requests waits indefinitely by default, which would hang the cell
r = requests.get('https://www.redfin.com/city/29470/IL/Chicago', headers={'User-Agent': user_agent}, timeout=30)
soup = BeautifulSoup(r.content, 'html.parser')

In [49]:
x = soup.find('div', {'id': 'MapHomeCard_39'}).find('script').text
x = [i for i in x if i not in '[]']
y = ''.join(x)

In [50]:
eval(y)[0]

{'@context': 'http://schema.org',
 'name': '6831 S Oakley Ave, Chicago, IL 60636',
 'url': 'https://www.redfin.com/IL/Chicago/6831-S-Oakley-Ave-60636/home/13936095',
 'address': {'@type': 'PostalAddress',
  'streetAddress': '6831 S Oakley Ave',
  'addressLocality': 'Chicago',
  'addressRegion': 'IL',
  'postalCode': '60636',
  'addressCountry': 'US'},
 'geo': {'@type': 'GeoCoordinates',
  'latitude': 41.7692403,
  'longitude': -87.6805736},
 'numberOfRooms': 4,
 'floorSize': {'@type': 'QuantitativeValue', 'value': 1796, 'unitCode': 'FTK'},
 '@type': 'SingleFamilyResidence'}