# Preparation

<b>Libraries</b>

In [1]:
# bs4
import requests
from bs4 import BeautifulSoup

# selenium
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.remote.webelement import WebElement

# data structures
import numpy as np
import pandas as pd

# database
import sqlite3

# others
import sys, os, re, datetime, time

<b>User-defined functions (UDF)</b>

In [2]:
# login to redfin
def login_to_redfin(browser: WebDriver, email: str, password: str, timeout: float = 10) -> None:
    """Sign in to Redfin through the 'Join / Sign in' dialog of the page currently open.

    Every step waits (up to ``timeout`` seconds) for the control it needs to become
    clickable instead of sleeping for a fixed time, so a slow page load does not
    make the element lookup fail and a fast one is not delayed needlessly.

    Args:
        browser: Selenium driver; the page containing the 'Join / Sign in' button
            must already be loaded or loading.
        email: Value typed into the ``emailInput`` field.
        password: Value typed into the ``passwordInput`` field.
        timeout: Maximum number of seconds to wait for each control.

    Raises:
        selenium.common.exceptions.TimeoutException: If a control does not become
            clickable within ``timeout`` seconds.
    """
    # Local imports: the notebook's import cell does not bring these names into scope.
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    wait = WebDriverWait(browser, timeout)

    def wait_for(xpath: str):
        # Block until the element matched by `xpath` can be clicked, then return it.
        return wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))

    # The same button submits both the e-mail step and the password step.
    continue_button = "//span[text()='Continue with Email']/.."

    ## open the sign-in dialog
    wait_for("//span[text()='Join / Sign in']/..").click()
    ## email
    wait_for("//input[@name='emailInput']").send_keys(email)
    wait_for(continue_button).click()
    ## password
    wait_for("//input[@name='passwordInput']").send_keys(password)
    wait_for(continue_button).click()

In [3]:
# get links to cities' pages
def get_cities_links(browser: WebDriver) -> tuple[list, list]:
    """Collect city names and city-page links listed under 'Search for homes by city'.

    The section's 'Show more' control is clicked *before* the list items are looked
    up, so entries that only appear once the list is expanded are included as well.

    Args:
        browser: Selenium driver whose current page contains the section.

    Returns:
        ``(names, links)`` as two parallel lists. ``names`` holds each list item's
        stripped text with ' real estate' removed; ``links`` holds the ``href`` of
        the first ``<a>`` found inside the same list item. Both are empty when the
        section is not on the page.
    """
    # Local import: the notebook's import cell does not bring this name into scope.
    from selenium.common.exceptions import WebDriverException

    section = "//span[text()='Search for homes by city']"

    time.sleep(1)
    ## expand the list first so every city is present when the items are collected
    try:
        browser.find_element(By.XPATH, f"{section}/parent::*//span[text()='Show more']").click()
    except WebDriverException:
        # Best effort: the button may be missing or not clickable; carry on with
        # whatever is listed. Unlike a bare `except`, Ctrl-C still interrupts.
        pass
    else:
        # Give the page a moment to render the expanded list.
        time.sleep(1)

    ## get list of cities
    cities = browser.find_elements(By.XPATH, f"{section}/following-sibling::ul/child::li")
    ## name and link of each city
    names = [re.sub(' real estate', '', city.text.strip()) for city in cities]
    links = [city.find_element(By.XPATH, ".//child::a").get_attribute('href') for city in cities]

    return names, links

In [4]:
# get links to download the csv files
def get_csv_link(browser: WebDriver) -> str:
    """Return the ``href`` of the '(Download All)' anchor on the browser's current page.

    The anchor is located by its exact text; any exception raised by
    ``find_element`` when it is absent propagates to the caller.
    """
    download_anchor = browser.find_element(By.XPATH, "//a[text()='(Download All)']")
    return download_anchor.get_attribute('href')

# Selenium

<b>Preparation</b>

In [5]:
# Start page of the scraping session.
url_redfin = 'https://www.redfin.com/'

# User-Agent string the automated Chrome instance presents.
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'

# Chrome options carrying the User-Agent override.
chrome_options = ChromeOptions()
chrome_options.add_argument(f'user-agent={user_agent}')

# Launch Chrome, open the start page, then pause 5 seconds before continuing.
browser = webdriver.Chrome(options=chrome_options)
browser.get(url_redfin)
time.sleep(5)

In [6]:
# Redfin login.
# Credentials are read from the environment so they are never stored in the
# notebook or its version history. Export REDFIN_EMAIL and REDFIN_PASSWORD before
# starting the kernel, and rotate any password that was ever committed in plain text.
email = os.environ.get('REDFIN_EMAIL')
password = os.environ.get('REDFIN_PASSWORD')
if not email or not password:
    raise RuntimeError('Set the REDFIN_EMAIL and REDFIN_PASSWORD environment variables before running this cell.')
login_to_redfin(browser, email, password)

<b>Store csv links as a txt file</b>

In [51]:
# Store the CSV download link of every city page in a text file.
# get_cities_links returns (names, links); only the links are needed here.
# Iterating over the returned tuple itself would pass whole lists to browser.get.
_city_names, cities_links = get_cities_links(browser)
for city_link in cities_links:
    browser.get(city_link)
    time.sleep(1)
    try:
        csv_link = get_csv_link(browser)
    except Exception:
        # No download link found on this page: remember the city page for a later retry.
        with open('../resource/data/failed_to_get_csv_link.txt', 'a') as f:
            f.write(f'{city_link}\n')
    else:
        # Written outside the try so a file error is not mistaken for a missing link.
        with open('../resource/data/csv_links.txt', 'a') as f:
            f.write(f'{csv_link}\n')

<b>Store csv links as a table using SQLite</b>

In [7]:
# Scrape the CSV download link of every city and record the result in SQLite.
## names and page links of the cities listed on the page currently open in the browser
cities = {'names': None, 'links': None}
cities['names'], cities['links'] = get_cities_links(browser)
## one row per city: available = 1 with the CSV link, or available = 0 with the city page link
## NOTE: sqlite3's `with` block only commits / rolls back; it does not close `conn`.
with sqlite3.connect(r'../resource/data/homes_by_city.db') as conn:
    cursor = conn.cursor()
    try:
        cursor.execute('CREATE TABLE IF NOT EXISTS csv_links (id INTEGER PRIMARY KEY, \
                                                            name TEXT, \
                                                            available INTEGER, \
                                                            link TEXT, \
                                                            scrapped_time TEXT)')
        conn.commit()
        ### insert data to table
        for name, city_link in zip(cities['names'], cities['links']):
            ### open the city page and timestamp the visit
            browser.get(city_link)
            time.sleep(1)
            scrapped_time = datetime.datetime.now().strftime(r'%Y-%m-%d %H:%M:%S')
            ### only the link lookup is guarded, so a database error is never
            ### recorded as "link unavailable" and Ctrl-C still stops the loop
            try:
                available, link = 1, get_csv_link(browser)
            except Exception:
                available, link = 0, city_link
            cursor.execute("INSERT INTO csv_links (name, available, link, scrapped_time) \
                                VALUES (?, ?, ?, ?)", (name, available, link, scrapped_time))
            # Commit per row so rows already scraped survive a later failure.
            conn.commit()
    finally:
        # Release the cursor even when scraping stops half-way.
        cursor.close()

<b>Load data from csv links</b>

In [11]:
# Fetch one CSV export with plain `requests` to check the link works outside the browser.
# The User-Agent literal had been pasted twice; adjacent string literals concatenate
# silently, so the header carried the browser string doubled. It is sent once now.
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
headers = {'User-Agent': user_agent}
test_url = 'https://www.redfin.com/stingray/api/gis-csv?al=3&has_att_fiber=false&has_deal=false&has_dishwasher=false&has_laundry_facility=false&has_laundry_hookups=false&has_parking=false&has_pool=false&has_short_term_lease=false&include_pending_homes=false&isRentals=false&is_furnished=false&is_income_restricted=false&is_senior_living=false&market=newmexico&num_homes=350&ord=redfin-recommended-asc&page_number=1&pool=false&region_id=513&region_type=6&sf=1,2,3,5,6,7&status=9&travel_with_traffic=false&travel_within_region=false&uipt=1,2,3,4,5,6,7,8&utilities_included=false&v=8'
# A single request is made (there is no retry loop); re-run the cell to try again.
with requests.Session() as s:
    # The timeout keeps the cell from hanging forever on an unresponsive server.
    r = s.get(test_url, headers=headers, timeout=30)
    # Fail loudly on an HTTP error instead of printing an error page as if it were CSV.
    r.raise_for_status()
    print(r.content.decode('utf-8'))

SALE TYPE,SOLD DATE,PROPERTY TYPE,ADDRESS,CITY,STATE OR PROVINCE,ZIP OR POSTAL CODE,PRICE,BEDS,BATHS,LOCATION,SQUARE FEET,LOT SIZE,YEAR BUILT,DAYS ON MARKET,$/SQUARE FEET,HOA/MONTH,STATUS,NEXT OPEN HOUSE START TIME,NEXT OPEN HOUSE END TIME,URL (SEE https://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING),SOURCE,MLS#,FAVORITE,INTERESTED,LATITUDE,LONGITUDE
"In accordance with local MLS rules, some MLS listings are not included in the download"
MLS Listing,,Single Family Residential,1809 Marble Ave NW,Albuquerque,NM,87104,850000,4,4.5,Albuquerque,4548,11325,1936,29,187,,Active,,,https://www.redfin.com/NM/Albuquerque/1809-Marble-Ave-NW-87104/home/92012508,Southwest MLS,1068815,N,Y,35.0951712,-106.6660328
MLS Listing,,Single Family Residential,717 Truman St NE,Albuquerque,NM,87110,324000,3,3.0,Albuquerque,1679,7840,1958,1,193,,Active,,,https://www.redfin.com/NM/Albuquerque/717-Truman-St-NE-87110/home/92044371,Southwest MLS,1070699,N,Y,35.090089,-106.5878185
MLS Lis