# Preparation

<b>Libraries</b>

In [1]:
# bs4
import requests
from bs4 import BeautifulSoup

# selenium
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.remote.webelement import WebElement

# data structures
import numpy as np
import pandas as pd

# database
import sqlite3
import csv

# others
import sys, os, re, glob, datetime, time
import getpass
import json
from lxml import etree

<b>User-defined functions (UDF)</b>

In [2]:
def login_to_redfin(browser: WebDriver, email: str, password: str) -> None:
    """Log in to Redfin by driving the "Join / Sign in" form.

    Clicks the "Join / Sign in" control, then submits the e-mail and the
    password one after the other, confirming each with the
    "Continue with Email" button. Fixed pauses precede every step.

    Args:
        browser: Selenium driver used for all element lookups.
        email: Text typed into the input named ``emailInput``.
        password: Text typed into the input named ``passwordInput``.

    Returns:
        None.
    """
    # (input field XPath, value to type) in the order the form asks for them
    form_steps = (
        ("//input[@name='emailInput']", email),
        ("//input[@name='passwordInput']", password),
    )

    # open the sign-in form
    time.sleep(5)
    sign_in = browser.find_element(By.XPATH, "//span[text()='Join / Sign in']/..")
    sign_in.click()

    # each step: pause, type the value, confirm
    for field_xpath, value in form_steps:
        time.sleep(1)
        field = browser.find_element(By.XPATH, field_xpath)
        field.send_keys(value)
        confirm = browser.find_element(By.XPATH, "//span[text()='Continue with Email']/..")
        confirm.click()

    return None

In [3]:
# get links to cities' pages
def get_cities_links(browser: WebDriver) -> tuple[list, list]:
    """Collect names and URLs of the cities listed under "Search for homes by city".

    Args:
        browser: Selenium driver whose current page contains the
            "Search for homes by city" section.

    Returns:
        ``(names, links)``: two lists of equal length. ``names`` holds the
        normalised text of each list item (lower-cased, " real estate"
        removed, spaces replaced by underscores, dots removed); ``links``
        holds the ``href`` of the first ``<a>`` inside each item.

    Raises:
        selenium.common.exceptions.NoSuchElementException: if a list item
            contains no ``<a>`` element.
    """
    time.sleep(1)

    # Expand the list BEFORE collecting the <li> elements, so that entries
    # revealed by "Show more" are included in the lookup below.
    try:
        browser.find_element(By.XPATH, "//span[text()='Search for homes by city']/parent::*//span[text()='Show more']").click()
        time.sleep(1)  # let the page re-render the expanded list
    except WebDriverException:
        # Best effort: no "Show more" button (or it is not clickable) means we
        # use whatever part of the list is currently available.
        pass

    # get list of cities
    cities = browser.find_elements(By.XPATH, "//span[text()='Search for homes by city']/following-sibling::ul/child::li")

    # get name and link for each city
    def _adjust_format(web_ele: WebElement) -> str:
        """Turn an item's visible text into a file-name friendly city name."""
        return (
            web_ele.text.strip().lower()
            .replace(' real estate', '')
            .replace(' ', '_')
            .replace('.', '')
        )

    names = [_adjust_format(city) for city in cities]
    links = [city.find_element(By.XPATH, ".//child::a").get_attribute('href') for city in cities]

    return names, links

In [4]:
def write_differ(writer, existing_rows: list, current_row: list) -> bool:
    """Write ``current_row`` only if it is not already in ``existing_rows``.

    Args:
        writer: Object exposing ``writerows`` (e.g. a ``csv.writer``).
        existing_rows: Rows that have already been stored.
        current_row: Candidate row.

    Returns:
        True if the row was written, False if it was already present.
    """
    # The previous check, all(x == current_row), was inverted: it only wrote
    # the row when every existing row was identical to it.
    is_new = current_row not in existing_rows
    if is_new:
        writer.writerows([current_row])
    return is_new

<b>User-defined classes (UDC)</b>

In [5]:
class CSV_Transformer():
    """Merge a downloaded CSV from a scratch directory into a named target CSV.

    The first file found in ``intermediate_dir`` is read with its second row
    always skipped. If ``<target_dir>/<fixed_name>.csv`` does not exist, it is
    created from those rows; otherwise only the rows that are not already in
    the target file are appended. ``transform`` finally deletes every file in
    ``intermediate_dir``.
    """

    def __init__(self, intermediate_dir: str, target_dir: str, fixed_name: str) -> None:
        """Store the scratch directory, the target directory and the target file stem."""
        self.intermediate_dir = intermediate_dir
        self.target_dir = target_dir
        self.fixed_name = fixed_name

    # path of the target csv file
    def _target_path(self) -> str:
        return f'{self.target_dir}/{self.fixed_name}.csv'

    # rows of the downloaded file, without its 2nd row
    def _read_downloaded_rows(self) -> list:
        # Raises IndexError when the intermediate directory is empty; callers
        # catch and report it.
        file_name = os.listdir(self.intermediate_dir)[0]
        # newline='' is what the csv module expects for files it reads/writes
        with open(f'{self.intermediate_dir}/{file_name}', 'r', newline='') as f:
            reader = csv.reader(f, delimiter=',')
            return [row for i, row in enumerate(reader) if i != 1]

    # clear intermediate directory
    def _cldir(self):
        """Delete every file in the intermediate directory."""
        for name in os.listdir(self.intermediate_dir):
            os.remove(f'{self.intermediate_dir}/{name}')

    # write difference
    def _write_difference(self, existing_rows: list, current_rows: list):
        """Return the rows of ``current_rows`` that are absent from ``existing_rows``.

        Order is preserved. The previous check, all(x == row), was inverted:
        it kept a row only when every existing row was identical to it.
        """
        return [row for row in current_rows if row not in existing_rows]

    # create a new file
    def _create_new_file(self):
        """Create the target file from the downloaded rows.

        Returns:
            True on success, False if an error occurred (the error is printed).
        """
        try:
            # read first so that no empty target file is left behind on failure
            rows = self._read_downloaded_rows()

            # add new csv file
            with open(self._target_path(), 'w', newline='') as f:
                csv.writer(f).writerows(rows)
            return True

        except Exception as e:
            print(e)
            return False

    # update an existing file
    def _update_file(self):
        """Append the downloaded rows that the target file does not contain yet.

        Returns:
            True on success (even when there was nothing to append), False if
            reading the download or appending failed (the error is printed).
            Errors while reading the existing target file are not caught.
        """
        # reading the existing file
        with open(self._target_path(), 'r', newline='') as f:
            existing_rows = list(csv.reader(f, delimiter=','))

        # reading the downloaded file
        try:
            # get new rows to add (the header row is already in the target
            # file, so it is filtered out together with known rows)
            current_rows = self._read_downloaded_rows()
            new_rows = self._write_difference(existing_rows, current_rows)

            # append new rows to the existing file if available
            if new_rows:
                with open(self._target_path(), 'a', newline='') as f:
                    csv.writer(f).writerows(new_rows)
            return True
        except Exception as e:
            print(e)
            return False

    # transform content inside
    def transform(self):
        """Create or update the target file, then empty the intermediate directory.

        A "Created"/"Updated" message is printed only when the step succeeded.
        """
        # target file does not exist yet
        if not os.path.exists(self._target_path()):
            if self._create_new_file():
                print(f'Created {self.fixed_name}.csv')

        # target file already exists
        else:
            if self._update_file():
                print(f'Updated {self.fixed_name}.csv')

        # clear files for new iteration
        self._cldir()

# Selenium

<b>Preparation</b>

In [5]:
# page to open
url_redfin = 'https://www.redfin.com/'

# Chrome options: send a fixed desktop user-agent header
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
chrome_options = ChromeOptions()
chrome_options.add_argument(f'user-agent={user_agent}')

# start Chrome, load the Redfin homepage and pause before the next cell uses it
browser = webdriver.Chrome(options=chrome_options)
browser.get(url_redfin)
time.sleep(5)

In [6]:
# redfin login
# Credentials must not be stored in the notebook: take them from the
# REDFIN_EMAIL / REDFIN_PASSWORD environment variables, or ask interactively
# (getpass does not echo the password).
email = os.environ.get('REDFIN_EMAIL') or input('Redfin email: ')
password = os.environ.get('REDFIN_PASSWORD') or getpass.getpass('Redfin password: ')
login_to_redfin(browser, email, password)

<b>Store Data as CSV</b>

In [6]:
# prepare directories
# The absolute paths are resolved by temporarily changing the working
# directory. try/finally guarantees the original working directory is
# restored even if one of the directories is missing; otherwise later cells
# that use relative paths would silently resolve against the wrong directory.
cwd = os.getcwd()
try:
    # intermediate
    os.chdir('../resource/temp')
    intermediate_dir = os.getcwd()

    # target (relative to the intermediate directory)
    os.chdir('../data/csv/api')
    target_dir = os.getcwd()
finally:
    os.chdir(cwd)

In [7]:
# options for browser
user_agent = r'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
chrome_options = ChromeOptions()
chrome_options.add_argument(f'user-agent={user_agent}')
chrome_options.add_argument('--headless')

# download preferences: save files into the intermediate directory without prompting
prefs = {"download.default_directory": intermediate_dir, 
         "download.directory_upgrade": True, 
         "download.prompt_for_download": False}
chrome_options.add_experimental_option('prefs', prefs)

# A driver created by an earlier cell (or an earlier run of this cell) would be
# orphaned when `browser` is rebound below, so shut it down first.
if 'browser' in globals():
    try:
        browser.quit()
    except Exception:
        pass  # best-effort cleanup; the old session may already be gone

# connect to redfin homepage
url_redfin = 'https://www.redfin.com/'
browser = webdriver.Chrome(options=chrome_options)
browser.get(url_redfin)
time.sleep(5)

# login
# Credentials must not be stored in the notebook: take them from the
# REDFIN_EMAIL / REDFIN_PASSWORD environment variables, or ask interactively.
email = os.environ.get('REDFIN_EMAIL') or input('Redfin email: ')
password = os.environ.get('REDFIN_PASSWORD') or getpass.getpass('Redfin password: ')
login_to_redfin(browser, email, password)

In [8]:
# cities links
city = dict()
city['names'], city['links'] = get_cities_links(browser)

# bookkeeping files: cities with a csv download / cities that need web scraping
api_links_csv = '../resource/data/csv/api/links.csv'
web_links_csv = '../resource/data/csv/web_scrapping/links.csv'


def _append_unique_row(csv_path: str, row: list) -> bool:
    """Append `row` to the csv at `csv_path` unless an identical row is already stored.

    Returns True if the row was written. The file is created if it is missing.
    The existing rows are read through a separate read handle: reading from an
    'a+' handle starts at the end of the file and therefore never sees them.
    """
    existing_rows = []
    if os.path.exists(csv_path):
        with open(csv_path, 'r', newline='') as f:
            existing_rows = list(csv.reader(f, delimiter=','))
    if row in existing_rows:
        return False
    with open(csv_path, 'a', newline='') as f:
        csv.writer(f, delimiter=',').writerow(row)
    return True


# get data
for name, city_link in zip(city['names'], city['links']):
    browser.get(city_link)
    transfomer = CSV_Transformer(intermediate_dir, target_dir, name)
    time.sleep(1)

    # download csv if available
    try:
        # click download button
        download_button = browser.find_element(By.XPATH, "//a[text()='(Download All)']")
        download_button.click()
        # NOTE(review): a fixed 1 s pause may be too short for the download to
        # finish before transform() reads the intermediate directory - confirm.
        time.sleep(1)
        download_link = download_button.get_attribute('href')

    # when it's not available (button missing or not clickable)
    except WebDriverException:
        # save city for later usage
        _append_unique_row(web_links_csv, [name, city_link])
        continue

    # save download link for later usage ('' when the link has no href)
    _append_unique_row(api_links_csv, [name, download_link or ''])

    # process and push the csv file to the target directory
    transfomer.transform()

Created albuquerque.csv
Created alexandria.csv
Created anchorage.csv
Created arlington.csv
Created ashburn.csv
Created aurora.csv
Created bakersfield.csv
Created baltimore.csv
Created baton_rouge.csv
Created beaverton.csv
Created bend.csv
Created birmingham.csv
Created boca_raton.csv
Created boise.csv
Created boston.csv
Created bowie.csv
Created brentwood.csv
Created buffalo.csv
Created burlington.csv
Created cape_coral.csv
Created chandler.csv
Created charleston.csv
Created charlotte.csv
Created chattanooga.csv
Created chicago.csv
Created cincinnati.csv
Created columbia.csv
Created columbus.csv
Created dallas.csv
Created des_moines.csv
Created detroit.csv
Created el_paso.csv
Created eugene.csv
Created fairfax.csv
Created flagstaff.csv
Created fort_lauderdale.csv
Created fort_myers.csv
Created fort_worth.csv
Created frederick.csv
Created fremont.csv
Created frisco.csv
Created gilbert.csv
Created glenview.csv
Created honolulu.csv
Created houston.csv
Created indianapolis.csv
Created irvi

In [13]:
# quit the Selenium driver stored in `browser` now that scraping is finished
browser.quit()

<b>Draft</b>

In [2]:
with requests.Session() as s:
    # get response of the GET request
    user_agent = r'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
    headers = {'User-Agent': user_agent}
    test_url = 'https://www.redfin.com/city/30818/TX/Austin'
    # timeout: never hang the kernel on an unresponsive server
    r = s.get(test_url, headers=headers, timeout=30)
    # fail here on an HTTP error instead of parsing an error page below
    r.raise_for_status()

    # soup (parser named explicitly so the result does not depend on which
    # parsers happen to be installed; lxml is already a dependency of this notebook)
    soup = BeautifulSoup(r.content, 'lxml')

    # DOM object
    dom = etree.HTML(str(soup))

In [3]:
# DOM object
# NOTE: same statement as the last line of the previous cell; it parses the
# string form of `soup` into an lxml tree again.
dom = etree.HTML(str(soup))

In [4]:
# cards of the listings shown on the page
map_home_cards = dom.xpath("//div[contains(@id, 'MapHomeCard')]")
# SECURITY: the script text comes from a remote page, so it is parsed as JSON
# instead of being passed to eval(), which would execute arbitrary code.
# json.loads also understands true/false/null, which eval() does not.
json.loads(map_home_cards[0].xpath(".//descendant::script")[0].text)[0]

{'@context': 'http://schema.org',
 'name': '2109 Maxwell Ln Unit B, Austin, TX 78741',
 'url': 'https://www.redfin.com/TX/Austin/2109-Maxwell-Ln-78741/unit-B/home/108558574',
 'address': {'@type': 'PostalAddress',
  'streetAddress': '2109 Maxwell Ln Unit B',
  'addressLocality': 'Austin',
  'addressRegion': 'TX',
  'postalCode': '78741',
  'addressCountry': 'US'},
 'geo': {'@type': 'GeoCoordinates',
  'latitude': 30.220883,
  'longitude': -97.6987679},
 'numberOfRooms': 2,
 'floorSize': {'@type': 'QuantitativeValue', 'value': 888, 'unitCode': 'FTK'},
 '@type': 'SingleFamilyResidence'}

In [5]:
# text of the second-to-last <script> tag with every backslash removed
tag_content = re.sub(r'\\', '', dom.xpath("//script")[-2].text)
# the '"homes":[...]' array up to the following "dataSources" key
# (raises IndexError below when the pattern is not found)
text = re.findall(r"\"homes\":\[[^'']*\],\"dataSources\"", tag_content)
# drop the trailing key and the leading '"homes":' (8 characters)
x1 = re.sub(r",\"dataSources\"",'', text[0])[8:]
# SECURITY: remote content is parsed as JSON rather than evaluated as Python.
# This also removes the textual true/false -> True/False rewrite, which
# altered string values containing those words and could not handle null.
json.loads(x1)

[{'mlsId': {'label': 'MLS#', 'value': '6682437'},
  'showMlsId': False,
  'mlsStatus': 'Active',
  'showDatasourceLogo': False,
  'price': {'value': 349000, 'level': 1},
  'hideSalePrice': False,
  'hoa': {'level': 1},
  'isHoaFrequencyKnown': True,
  'sqFt': {'value': 888, 'level': 1},
  'pricePerSqFt': {'value': 393, 'level': 1},
  'lotSize': {'value': 3027, 'level': 1},
  'beds': 2,
  'baths': 1.5,
  'fullBaths': 1,
  'partialBaths': 1,
  'location': {'value': 'Maxwell Condos', 'level': 1},
  'latLong': {'value': {'latitude': 30.220883, 'longitude': -97.6987679},
   'level': 1},
  'streetLine': {'value': '2109 Maxwell Ln Unit B', 'level': 1},
  'unitNumber': {'value': 'Unit B', 'level': 1},
  'city': 'Austin',
  'state': 'TX',
  'zip': '78741',
  'postalCode': {'value': '78741', 'level': 1},
  'countryCode': 'US',
  'showAddressOnMap': True,
  'soldDate': 1623222000000,
  'searchStatus': 1,
  'propertyType': 3,
  'uiPropertyType': 2,
  'listingType': 1,
  'propertyId': 108558574,
  