# Preparation

<b>Libraries</b>

In [1]:
# bs4
import requests
from bs4 import BeautifulSoup

# selenium
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.remote.webelement import WebElement

# data structures
import numpy as np
import pandas as pd

# database
import sqlite3

# others
import sys, os, re, datetime, time

<b>UDF</b>

In [2]:
# login to redfin
def login_to_redfin(browser: WebDriver, email: str, password: str) -> None:
    """Sign in to Redfin through the page currently open in ``browser``.

    Opens the 'Join / Sign in' dialog, then submits the e-mail and the
    password one after the other, confirming each with the
    'Continue with Email' button.

    Args:
        browser: Selenium driver that is already on a Redfin page.
        email: Account e-mail typed into the ``emailInput`` field.
        password: Account password typed into the ``passwordInput`` field.

    Returns:
        None.
    """
    continue_xpath = "//span[text()='Continue with Email']/.."

    # fixed pause before touching the page
    time.sleep(5)
    sign_in_button = browser.find_element(By.XPATH, "//span[text()='Join / Sign in']/..")
    sign_in_button.click()

    # both steps share one shape: pause, fill the field, press continue
    credential_steps = (
        ("//input[@name='emailInput']", email),
        ("//input[@name='passwordInput']", password),
    )
    for input_xpath, value in credential_steps:
        time.sleep(1)
        field = browser.find_element(By.XPATH, input_xpath)
        field.send_keys(value)
        browser.find_element(By.XPATH, continue_xpath).click()

In [3]:
# get links to cities' pages
def get_cities_links(browser: WebDriver) -> tuple[list, list]:
    """Collect city names and city-page links from the 'Search for homes by city' section.

    The 'Show more' button of that section is clicked first (when it can be)
    so that the list is expanded before its entries are collected.

    Args:
        browser: Selenium driver positioned on a page that contains the
            'Search for homes by city' section.

    Returns:
        A ``(names, links)`` tuple of two lists in matching order: ``names``
        holds each entry's text with ' real estate' removed, ``links`` holds
        the ``href`` of the anchor inside each entry.
    """
    section_xpath = "//span[text()='Search for homes by city']"

    time.sleep(1)
    ## expand the list first so every city is present when the entries are collected
    try:
        browser.find_element(By.XPATH, f"{section_xpath}/parent::*//span[text()='Show more']").click()
    except Exception:
        # best effort: the button may be missing or not clickable; unlike a
        # bare ``except`` this still lets KeyboardInterrupt/SystemExit through
        pass
    ## get list of cities
    cities = browser.find_elements(By.XPATH, f"{section_xpath}/following-sibling::ul/child::li")
    ## get name and link for each city
    names = [re.sub(' real estate', '', city.text.strip()) for city in cities]
    links = [city.find_element(By.XPATH, ".//child::a").get_attribute('href') for city in cities]

    return names, links

In [4]:
# get links to download the csv files
def get_csv_link(browser: WebDriver) -> str:
    """Return the ``href`` of the '(Download All)' anchor on the current page.

    Args:
        browser: Selenium driver positioned on the page to inspect.

    Returns:
        The value of the anchor's ``href`` attribute.

    Any exception raised by ``find_element`` when the anchor cannot be
    located is propagated to the caller.
    """
    download_anchor = browser.find_element(By.XPATH, "//a[text()='(Download All)']")
    return download_anchor.get_attribute('href')

# Selenium

<b>Draft</b>

In [5]:
# Start a Chrome session with a custom user-agent and open the Redfin home page.
# The globals defined here (`browser` in particular) are used by the cells below.

# header: user-agent string handed to Chrome through the options object
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
# options: pass the user-agent as a Chrome command-line argument
chrome_options = ChromeOptions()
chrome_options.add_argument(f'user-agent={user_agent}')
# browser: launch Chrome with those options and navigate to Redfin
url_redfin = 'https://www.redfin.com/'
browser = webdriver.Chrome(options=chrome_options)
browser.get(url_redfin)
# fixed 5 s pause after navigation - presumably to let the page finish loading
# before later cells interact with it (TODO confirm)
time.sleep(5)

In [6]:
# Redfin login
# Credentials are read from the environment so that no secret is stored in the
# notebook. Export REDFIN_EMAIL and REDFIN_PASSWORD before starting the kernel;
# a missing variable raises KeyError naming the variable that is not set.
email = os.environ['REDFIN_EMAIL']
password = os.environ['REDFIN_PASSWORD']
login_to_redfin(browser, email, password)

First attempt (forgot to record the city names) - store the CSV download links in a text file

In [51]:
# store the csv download link of every city page in a txt file
# get_cities_links returns (names, links); only the page links are needed here
cities_links = get_cities_links(browser)[1]
for city_link in cities_links:
    browser.get(city_link)
    time.sleep(1)
    try:
        csv_link = get_csv_link(browser)
    except Exception:
        # the download link could not be read: keep the city page for a later retry
        with open('../resource/data/failed_to_get_csv_link.txt', 'a') as f:
            f.write(f'{city_link}\n')
    else:
        # only reached when the lookup succeeded, so a file error is not
        # mistaken for a missing download link
        with open('../resource/data/csv_links.txt', 'a') as f:
            f.write(f'{csv_link}\n')

Attach the name of each city to the link - Store data in a database

In [None]:
# get csv links
## get links to cities' pages
city_names, city_links = get_cities_links(browser)
cities = {'names': city_names, 'links': city_links}
## store one row per city in the csv_links table:
##   available = 1 -> link is the csv download link
##   available = 0 -> link is the city page itself (no download link could be read)
## The connection is closed explicitly: sqlite3's `with conn:` form only
## commits or rolls back the transaction, it does not close the connection.
conn = sqlite3.connect(r'../resource/data/homes_by_city.db')
cursor = conn.cursor()
try:
    cursor.execute('''CREATE TABLE IF NOT EXISTS csv_links (
                          id INTEGER PRIMARY KEY,
                          name TEXT,
                          available INTEGER,
                          link TEXT,
                          scrapped_time TEXT)''')
    conn.commit()
    ### insert data to table
    for name, city_link in zip(cities['names'], cities['links']):
        browser.get(city_link)
        time.sleep(1)
        scrapped_time = datetime.datetime.now().strftime(r'%Y-%m-%d %H-%M-%S')
        ### only the page lookup is guarded, so a database error is raised
        ### instead of being recorded as "csv not available"
        try:
            link, available = get_csv_link(browser), 1
        except Exception:
            link, available = city_link, 0
        cursor.execute('INSERT INTO csv_links (name, available, link, scrapped_time) '
                       'VALUES (?, ?, ?, ?)', (name, available, link, scrapped_time))
        ### commit per row so finished cities survive an interrupted run
        conn.commit()
finally:
    cursor.close()
    conn.close()

Download data from direct csv links

In [5]:
# start a connection
# Placeholder cell: it opens the homes_by_city database, creates a cursor and
# closes the cursor again without running any query.
# NOTE(review): sqlite3's `with` form manages the transaction (commit/rollback)
# but does not close the connection, so `conn` is still open after this cell -
# confirm whether later cells depend on that before adding an explicit close.
with sqlite3.connect(r'../resource/data/homes_by_city.db') as conn:
    cursor = conn.cursor()

    # TODO: read the stored csv links and download the files here
    cursor.close()