# Preparation

<b>Libraries</b>

In [1]:
# bs4
import requests
from bs4 import BeautifulSoup

# selenium
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.remote.webelement import WebElement

# data structures
import numpy as np
import pandas as pd

# database
import sqlite3
import csv

# others
import sys, os, re, glob, datetime, time

<b>UDF</b>

In [2]:
# login to redfin
def login_to_redfin(browser: WebDriver, email: str, password: str) -> None:
    """Sign in to Redfin through the page currently loaded in *browser*.

    The steps are: open the 'Join / Sign in' dialog, type the email and
    submit, then type the password and submit. Fixed pauses (5 s before
    starting, 1 s before each form step) separate the steps -- presumably to
    give the page time to render; there is no explicit wait on the elements.

    Args:
        browser: Selenium driver showing a page with a 'Join / Sign in' button.
        email: Account email typed into the 'emailInput' field.
        password: Account password typed into the 'passwordInput' field.

    Raises:
        Whatever Selenium raises when one of the elements cannot be located
        or interacted with (nothing is caught here).
    """
    submit_xpath = "//span[text()='Continue with Email']/.."

    def _click(xpath: str) -> None:
        # the XPaths end in '/..' so the click lands on the span's parent element
        browser.find_element(By.XPATH, xpath).click()

    def _fill(xpath: str, value: str) -> None:
        browser.find_element(By.XPATH, xpath).send_keys(value)

    # open the sign-in dialog
    time.sleep(5)
    _click("//span[text()='Join / Sign in']/..")

    # email step
    time.sleep(1)
    _fill("//input[@name='emailInput']", email)
    _click(submit_xpath)

    # password step (submitted with the same 'Continue with Email' button)
    time.sleep(1)
    _fill("//input[@name='passwordInput']", password)
    _click(submit_xpath)

In [3]:
# get links to cities' pages
def get_cities_links(browser: WebDriver) -> tuple[list, list]:
    """Collect the name and page link of every city in the city-search list.

    Looks at the 'Search for homes by city' section of the page currently
    loaded in *browser*. The section's 'Show more' button (when present) is
    clicked *before* the list items are collected, so that entries revealed
    or re-rendered by the expansion are included and no stale element
    handles are kept.

    Args:
        browser: Selenium driver showing a page that contains the
            'Search for homes by city' section.

    Returns:
        ``(names, links)`` -- two lists of equal length and matching order.
        Each name is the item's text, stripped and lower-cased, with
        ' real estate' removed, spaces replaced by underscores and dots
        removed. Each link is the ``href`` of the item's anchor.

    Raises:
        Selenium's NoSuchElementException if a list item has no anchor.
    """
    # Imported here because the file's import cell does not bring in
    # Selenium's exception types.
    from selenium.common.exceptions import WebDriverException

    section = "//span[text()='Search for homes by city']"
    time.sleep(1)

    # expand the list first; a missing or unclickable button just means
    # there is nothing to expand, so fall back to the items already shown
    try:
        browser.find_element(
            By.XPATH, f"{section}/parent::*//span[text()='Show more']"
        ).click()
    except WebDriverException:
        pass
    else:
        # short pause so the expanded list is in place before it is read
        time.sleep(1)

    # get list of cities (after the expansion)
    cities = browser.find_elements(
        By.XPATH, f"{section}/following-sibling::ul/child::li"
    )

    def _adjust_format(web_ele: WebElement) -> str:
        # e.g. 'St. Louis real estate' -> 'st_louis'
        return (
            web_ele.text.strip().lower()
            .replace(' real estate', '')
            .replace(' ', '_')
            .replace('.', '')
        )

    names = [_adjust_format(city) for city in cities]
    links = [
        city.find_element(By.XPATH, ".//child::a").get_attribute('href')
        for city in cities
    ]

    return names, links

In [4]:
# get links to download the csv files
def get_csv_link(browser: WebDriver) -> str:
    """Return the ``href`` of the '(Download All)' anchor on the current page.

    Args:
        browser: Selenium driver showing a page expected to contain an
            anchor whose text is exactly '(Download All)'.

    Raises:
        Whatever Selenium raises when no such anchor is found (callers in
        this file treat that as "no csv available").
    """
    download_anchor = browser.find_element(By.XPATH, "//a[text()='(Download All)']")
    return download_anchor.get_attribute('href')

<b>UDC</b>

In [5]:
class CSV_Transformer():
    """Merge a freshly downloaded CSV into a fixed-name CSV in a target folder.

    A download is expected to be sitting in ``intermediate_dir``. Calling
    :meth:`transform` either creates ``<target_dir>/<fixed_name>.csv`` from
    it or appends the rows that file does not contain yet, and then empties
    ``intermediate_dir`` for the next download.

    In both cases the second line of the download (index 1) is dropped --
    presumably a non-data notice line in the export; confirm against a real
    download.
    """

    def __init__(self, intermediate_dir: str, target_dir: str, fixed_name: str) -> None:
        """Store the working directories and the target file's base name.

        Args:
            intermediate_dir: Directory the browser downloads into.
            target_dir: Directory holding the accumulated CSV files.
            fixed_name: Base name (without '.csv') of the target file.
        """
        self.intermediate_dir = intermediate_dir
        self.target_dir = target_dir
        self.fixed_name = fixed_name

    def _target_path(self) -> str:
        """Return the path of the accumulated CSV file."""
        return f'{self.target_dir}/{self.fixed_name}.csv'

    def _read_download(self) -> list:
        """Return the rows of the downloaded file, minus its second line.

        The first entry returned by ``os.listdir`` is taken as the download.

        Raises:
            IndexError: If ``intermediate_dir`` is empty.
            OSError: If the file cannot be read.
        """
        file_name = os.listdir(self.intermediate_dir)[0]
        # newline='' is what the csv module expects of files it reads/writes
        with open(f'{self.intermediate_dir}/{file_name}', 'r', newline='') as f:
            reader = csv.reader(f, delimiter=',')
            return [row for i, row in enumerate(reader) if i != 1]

    def _cldir(self) -> None:
        """Delete every file in ``intermediate_dir``."""
        for name in os.listdir(self.intermediate_dir):
            os.remove(f'{self.intermediate_dir}/{name}')

    def _create_new_file(self) -> bool:
        """Write the download's rows to a new target file.

        Errors are printed rather than raised (best effort).

        Returns:
            True if the target file was written, False otherwise.
        """
        try:
            rows = self._read_download()
            with open(self._target_path(), 'w', newline='') as f:
                csv.writer(f).writerows(rows)
        except Exception as e:
            print(e)
            return False
        return True

    def _update_file(self) -> bool:
        """Append to the target file the download's rows it does not contain.

        Failing to read the existing target file raises; problems with the
        download itself are printed rather than raised (best effort).

        Returns:
            True if the update ran through, False if the download could not
            be processed.
        """
        with open(self._target_path(), 'r', newline='') as f:
            # set of tuples: constant-time membership test per new row
            existing_rows = {tuple(row) for row in csv.reader(f, delimiter=',')}

        try:
            # same row selection as on creation, so the dropped second line
            # of the download is not appended as if it were data
            new_rows = [
                row for row in self._read_download()
                if tuple(row) not in existing_rows
            ]

            # append new rows to the existing file
            if new_rows:
                with open(self._target_path(), 'a', newline='') as f:
                    csv.writer(f).writerows(new_rows)
        except Exception as e:
            print(e)
            return False
        return True

    # transform content inside
    def transform(self):
        """Create or update the target file, then empty ``intermediate_dir``.

        A 'Created ...' / 'Updated ...' line is printed only when the
        corresponding step succeeded. The intermediate directory is cleared
        even when the step raises, so a leftover download can never be
        mistaken for the next one.
        """
        try:
            if not os.path.exists(self._target_path()):
                if self._create_new_file():
                    print(f'Created {self.fixed_name}.csv')
            else:
                if self._update_file():
                    print(f'Updated {self.fixed_name}.csv')
        finally:
            # clear files for new iteration
            self._cldir()


# Selenium

<b>Preparation</b>

In [5]:
# Start a Chrome session on the Redfin home page. No headless flag is set in
# this cell. The names defined here (user_agent, chrome_options, url_redfin,
# browser) are notebook globals that later cells read.
# header: user-agent string passed to Chrome in place of its default one
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
# options: hand the user-agent to Chrome as a command-line argument
chrome_options = ChromeOptions()
chrome_options.add_argument(f'user-agent={user_agent}')
# browser: launch Chrome with these options and request the home page
url_redfin = 'https://www.redfin.com/'
browser = webdriver.Chrome(options=chrome_options)
browser.get(url_redfin)
# fixed 5 s pause after the request -- presumably to let the page finish loading
time.sleep(5)

In [6]:
# redfin login
# The account credentials are no longer stored in the notebook: they are read
# from the REDFIN_EMAIL / REDFIN_PASSWORD environment variables, and asked for
# interactively (password without echo) when a variable is unset or empty.
import getpass

email = os.environ.get('REDFIN_EMAIL') or input('Redfin email: ')
password = os.environ.get('REDFIN_PASSWORD') or getpass.getpass('Redfin password: ')
login_to_redfin(browser, email, password)

<b>Store csv links as a txt file</b>

In [51]:
# store links downloading csv files in a txt file
# get_cities_links returns (names, links); only the page links are needed
# here, so the names are discarded instead of being iterated as if they
# were links.
_, cities_links = get_cities_links(browser)
for city_link in cities_links:
    browser.get(city_link)
    time.sleep(1)
    try:
        csv_link = get_csv_link(browser)
    except Exception:
        # no csv link could be read from this city's page: keep the page
        # link so it can be revisited (Exception rather than a bare except,
        # so Ctrl-C still stops the loop)
        with open('../resource/data/failed_to_get_csv_link.txt', 'a') as f:
            f.write(f'{city_link}\n')
    else:
        with open('../resource/data/csv_links.txt', 'a') as f:
            f.write(f'{csv_link}\n')

<b>Store csv links as a table using SQLite</b>

In [None]:
# get csv links
from contextlib import closing

## names and page links of the listed cities
cities = {'names': None, 'links': None}
cities['names'], cities['links'] = get_cities_links(browser)
## one row per city: its csv download link (available = 1) or, when none
## could be read, the city page link (available = 0)
# closing() is used because sqlite3's own connection context manager only
# commits or rolls back the transaction; it does not close the connection.
with closing(sqlite3.connect(r'../resource/data/homes_by_city.db')) as conn, closing(conn.cursor()) as cursor:
    cursor.execute('CREATE TABLE IF NOT EXISTS csv_links (id INTEGER PRIMARY KEY, \
                                                            name TEXT, \
                                                            available INTEGER, \
                                                            link TEXT, \
                                                            scrapped_time TEXT)')
    conn.commit()
    ### insert data to table
    for name, city_link in zip(cities['names'], cities['links']):
        ### load the city page and note when it was visited
        browser.get(city_link)
        time.sleep(1)
        scrapped_time = datetime.datetime.now().strftime(r'%Y-%m-%d %H:%M:%S')
        ### only the link lookup is guarded, so a database error is not
        ### mistaken for "no csv link on this page"
        try:
            csv_link = get_csv_link(browser)
        except Exception:
            available, stored_link = 0, city_link
        else:
            available, stored_link = 1, csv_link
        cursor.execute("INSERT INTO csv_links (name, available, link, scrapped_time) \
                                VALUES (?, ?, ?, ?)", (name, available, stored_link, scrapped_time))
        # committed per row so progress survives an interrupted run
        conn.commit()

<b>Load data from csv links</b>

<b>Draft</b>

In [None]:
# prepare directories
# The absolute paths are obtained by temporarily changing into each directory
# (which also fails immediately if one of them is missing). The finally
# clause returns to the starting directory even in that case, so the relative
# paths used by the other cells keep working.
cwd = os.getcwd()
try:
    # intermediate
    os.chdir('../resource/temp')
    intermediate_dir = os.getcwd()

    # target (resolved relative to the intermediate directory entered above)
    os.chdir('../data/csv/api')
    target_dir = os.getcwd()
finally:
    os.chdir(cwd)

In [6]:
# options for browser: custom user-agent, headless mode
import getpass

user_agent = r'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
chrome_options = ChromeOptions()
chrome_options.add_argument(f'user-agent={user_agent}')
chrome_options.add_argument('--headless')

# download preferences: save into intermediate_dir without prompting
prefs = {"download.default_directory": intermediate_dir, 
         "download.directory_upgrade": True, 
         "download.prompt_for_download": False}
chrome_options.add_experimental_option('prefs', prefs)

# connect to redfin webpage
url_redfin = 'https://www.redfin.com/'
browser = webdriver.Chrome(options=chrome_options)
browser.get(url_redfin)
# fixed 5 s pause after the request -- presumably to let the page finish loading
time.sleep(5)

# login
# The account credentials are no longer stored in the notebook: they are read
# from the REDFIN_EMAIL / REDFIN_PASSWORD environment variables, and asked for
# interactively (password without echo) when a variable is unset or empty.
email = os.environ.get('REDFIN_EMAIL') or input('Redfin email: ')
password = os.environ.get('REDFIN_PASSWORD') or getpass.getpass('Redfin password: ')
login_to_redfin(browser, email, password)

In [None]:
# cities links
city = dict()
city['names'], city['links'] = get_cities_links(browser)

# get data: for each city, download its csv and merge it into the city's file;
# cities for which that fails are recorded once in links.csv
for name, link in zip(city['names'], city['links']):
    browser.get(link)
    time.sleep(1)
    try:
        # trigger the download and give it a fixed 5 s to arrive
        browser.find_element(By.XPATH, "//a[text()='(Download All)']").click()
        time.sleep(5)
        # merge the downloaded file into <target_dir>/<name>.csv
        transfomer = CSV_Transformer(intermediate_dir, target_dir, name)
        transfomer.transform()
    except Exception:
        # Exception rather than a bare except, so Ctrl-C still stops the loop
        with open('../resource/data/csv/web_scrapping/links.csv', 'a+', newline='') as f:
            # 'a+' opens positioned at the end of the file; rewind first,
            # otherwise nothing is read and every failure is written again
            f.seek(0)
            existing_rows = list(csv.reader(f, delimiter=','))

            current_row = [name, link]
            if current_row not in existing_rows:
                # make sure the write lands at the end of the file
                f.seek(0, os.SEEK_END)
                csv.writer(f).writerow(current_row)

In [19]:
# end the Selenium session held in `browser`
browser.quit()