In [1]:
import re
from random import randint
from time import sleep, ctime
from urllib.parse import quote

import pandas as pd
import psycopg2
from bs4 import BeautifulSoup as bs
from splinter import Browser
from sqlalchemy import create_engine
from webdriver_manager.chrome import ChromeDriverManager

from config import driver, username, password, host, port, database

def init_browser(headless=False):
    """Launch a Chrome browser session driven by splinter.

    The chromedriver binary is resolved (and downloaded if necessary) by
    ``ChromeDriverManager().install()``.

    Args:
        headless: Passed straight through to ``Browser``. Defaults to
            ``False``, matching the previous hard-coded value, so existing
            ``init_browser()`` calls behave exactly as before.

    Returns:
        The ``Browser`` instance created for Chrome.
    """
    executable_path = {'executable_path': ChromeDriverManager().install()}
    return Browser('chrome', **executable_path, headless=headless)

In [2]:
# Every city that can be searched, written as realtor.com URL slugs.
ALL_CITIES = ['Los-Angeles_CA', 'New-York_NY', 'Chicago_IL', 'Houston_TX']

# Cities to search in this run. Only the first city is scraped for now;
# use `cities = ALL_CITIES` to scrape all of them.
cities = ALL_CITIES[:1]

In [3]:
# Start browser.
# `browser` is the single session object used by the scraping cells below
# (browser.visit / browser.html) and is shut down later with browser.quit().
browser = init_browser()



Current google-chrome version is 93.0.4577
Get LATEST driver version for 93.0.4577
Driver [C:\Users\kesam\.wdm\drivers\chromedriver\win32\93.0.4577.63\chromedriver.exe] found in cache


In [4]:
# Result-card attributes: seven independent, initially empty lists.
# The scraping loop appends one entry per listing to each of them.
prices, beds, baths, sizes, addresses, statuses, detail_pages = (
    [] for _ in range(7)
)

In [5]:
# Label -> value mapping filled from a detail page's indicator rows.
listing_indicators = dict()

# Detail-page attributes: one entry per listing is appended to each list.
types, fees, pricesqfts, garages, years = [], [], [], [], []

In [6]:
## Identify listing attributes from result card

In [7]:
# Loop through each city and collect the attributes shown on the result cards.
#
# For every city in `cities`, search-result pages 1-5 are visited. One entry per
# listing is appended to prices, beds, baths, sizes, addresses, statuses and
# detail_pages; a field that cannot be read is stored as 'No Info' so all lists
# stay the same length.


def _card_text(listing, tag, attrs, strip_chars=None):
    """Return the text of the first matching element in a result card.

    Args:
        listing: Result-card element returned by ``soup.find_all``.
        tag: Tag name to look for inside the card.
        attrs: Attribute filter passed to ``listing.find``.
        strip_chars: Optional characters removed from both ends of the text
            with ``str.strip``.

    Returns:
        The element text, or 'No Info' when the element is missing.
    """
    node = listing.find(tag, attrs=attrs)
    if node is None:
        return 'No Info'
    return node.text.strip(strip_chars) if strip_chars else node.text


def _first_number(text, pattern, cast):
    """Return the first `pattern` match in `text` converted with `cast`, else 'No Info'."""
    match = re.search(pattern, text)
    return cast(match.group()) if match else 'No Info'


for city in cities:

    print(f"Searching {city}...{ctime()}\n")

    # Loop through each search result page
    for page in range(1, 6):

        # Set dynamic URL
        url = f"https://www.realtor.com/realestateandhomes-search/{city}/pg-{page}"
        browser.visit(url)

        print(f"Scraping page {page} at {ctime()}")

        # Parse the rendered page and identify all listings
        soup = bs(browser.html, "html.parser")
        listings = soup.find_all('li', attrs={"data-testid": "result-card"})

        for listing in listings:
            # Resolve the detail-page link first: a card without one cannot be
            # visited later, and bailing out before any append keeps every
            # list aligned (previously this case raised mid-row).
            link = listing.find('a')
            detail_page = link.get('href') if link is not None else None
            if not detail_page:
                print("  Skipping a result card without a detail link")
                continue

            prices.append(
                _card_text(listing, 'span', {"data-label": "pc-price"}, strip_chars='$')
            )

            # Take the whole leading number, not just its first character, so
            # values such as "2.5" or "10" are no longer truncated.
            bed_text = _card_text(listing, 'li', {"data-label": "pc-meta-beds"})
            beds.append(_first_number(bed_text, r'\d+', int))

            bath_text = _card_text(listing, 'li', {"data-label": "pc-meta-baths"})
            baths.append(_first_number(bath_text, r'\d+(?:\.\d+)?', float))

            # strip() removes the characters s/q/f/t from both ends of the text
            sizes.append(
                _card_text(listing, 'li', {"data-label": "pc-meta-sqft"}, strip_chars='sqft')
            )

            addresses.append(_card_text(listing, 'div', {"data-label": "pc-address"}))

            # NOTE(review): the "jsx-..." part of this class looks build-generated
            # and may change when the site is redeployed - confirm the selector.
            status = _card_text(listing, 'span', {"class": "jsx-3853574337 statusText"})
            statuses.append(status)

            detail_pages.append(detail_page)

        # Wait a random 2 to 10 seconds before requesting the next page
        sleep(randint(2, 10))

    print(f"\n----------------------------{ctime()}\n")

print('Scraping complete')

Searching Los-Angeles_CA...Thu Sep 23 18:24:08 2021

Scraping page 1 at Thu Sep 23 18:24:13 2021
Scraping page 2 at Thu Sep 23 18:24:22 2021
Scraping page 3 at Thu Sep 23 18:24:28 2021
Scraping page 4 at Thu Sep 23 18:24:38 2021
Scraping page 5 at Thu Sep 23 18:24:47 2021

----------------------------Thu Sep 23 18:24:49 2021

Scraping complete


In [8]:
## Identify listing attributes from detail page

In [None]:
# Visit every listing detail page and read its "listing-indicator" panel.
#
# One entry per listing is appended to types, fees, pricesqfts, garages and
# years; a missing value is stored as 'No Info' so these lists stay aligned
# with the result-card lists.

# Compiled once: the pattern does not change between listings.
# NOTE(review): 'rui*' means "ru" followed by zero or more "i", so it is looser
# than a "starts with rui" filter - confirm this is the intended match.
indicator_class = re.compile('rui*')

num_page = len(detail_pages)
print(f"Total of {num_page} listings found\n")

# Loop through each listing detail page
for i, detail_page in enumerate(detail_pages, start=1):

    print(f"Scraping details from listing {i} of {num_page}")

    # Navigate to the listing and parse the rendered page
    browser.visit(f"https://www.realtor.com{detail_page}")
    detail_soup = bs(browser.html, "html.parser")

    # Identify the listing-indicator panel
    details = detail_soup.find('div', attrs={"data-testid": "listing-indicator"})

    # Start from an empty mapping for every listing. Reusing one dict across
    # iterations let values from an earlier listing (e.g. its HOA fee) leak
    # into later listings that do not show that field.
    listing_indicators = {}

    try:
        for item in details.find_all('li', indicator_class):
            cells = item.find_all('div', attrs={'class': indicator_class})
            # First matching div holds the label, second the value
            listing_indicators[cells[0].text] = cells[1].text
    except (AttributeError, IndexError):
        # AttributeError: the panel was not found (details is None).
        # IndexError: a row did not contain a label/value pair.
        types.append('No Info')
        fees.append('No Info')
        pricesqfts.append('No Info')
        garages.append('No Info')
        years.append('No Info')

        # Wait only after a failed page, as before.
        sleep(randint(2, 15))
        continue

    types.append(listing_indicators.get('Property Type', 'No Info'))

    hoa_fee = listing_indicators.get('HOA Fees')
    fees.append('No Info' if hoa_fee is None else hoa_fee.strip('/mo').strip('$'))

    pricesqft = listing_indicators.get('Price per sqft')
    pricesqfts.append('No Info' if pricesqft is None else pricesqft.strip('$'))

    # strip(' cars') removes the same characters as the former
    # strip(' car') followed by strip(' cars').
    garage = listing_indicators.get('Garage')
    garages.append('No Info' if garage is None else garage.strip(' cars'))

    years.append(listing_indicators.get('Year Built', 'No Info'))

print('\nScraping complete')

Total of 42 listings found

Scraping details from listing 1 of 42
Scraping details from listing 2 of 42
Scraping details from listing 3 of 42
Scraping details from listing 4 of 42
Scraping details from listing 5 of 42
Scraping details from listing 6 of 42
Scraping details from listing 7 of 42
Scraping details from listing 8 of 42
Scraping details from listing 9 of 42
Scraping details from listing 10 of 42
Scraping details from listing 11 of 42
Scraping details from listing 12 of 42
Scraping details from listing 13 of 42
Scraping details from listing 14 of 42
Scraping details from listing 15 of 42
Scraping details from listing 16 of 42
Scraping details from listing 17 of 42
Scraping details from listing 18 of 42
Scraping details from listing 19 of 42
Scraping details from listing 20 of 42
Scraping details from listing 21 of 42
Scraping details from listing 22 of 42
Scraping details from listing 23 of 42
Scraping details from listing 24 of 42
Scraping details from listing 25 of 42
Scrapi

In [None]:
# Add attributes to dataframe.
# 'Status' uses the per-listing `statuses` list. The scalar `status` left over
# from the scraping loop would be broadcast to every row.
df = pd.DataFrame({
    'Address': addresses,
    'Status': statuses,
    'Property Type': types,
    'Price': prices,
    'Price per sqft': pricesqfts,
    'HOA Fees': fees,
    'Bed': beds,
    'Bath': baths,
    'Size': sizes,
    'Garage': garages,
    'Year Built': years,
})

# Extract address into Street, City, State, Zip.
# Splitting from the right (at most two splits) keeps any extra comma inside
# the street part instead of shifting City/State/Zip into the wrong columns.
street_city = (
    df['Address']
    .str.rsplit(',', n=2, expand=True)
    .rename(columns={0: 'Street', 1: 'City', 2: 'state_zip'})
)

# 'state_zip' starts with a space, so splitting on ' ' yields an empty first
# column (0) that is dropped; columns 1 and 2 are State and Zip.
state_zip = (
    street_city['state_zip']
    .str.split(' ', expand=True)
    .rename(columns={1: 'State', 2: 'Zip'})
    .drop(columns=0)
)
street_city = street_city.drop(columns='state_zip')

# Create merged_df. All three frames derive from `df` and share its index,
# so no index reset is needed before concatenating column-wise.
address_df = pd.concat([street_city, state_zip], axis=1)

merged_df = pd.concat([df, address_df], axis=1)

In [None]:
# Write the merged listings table to listings.csv without the row index
merged_df.to_csv('listings.csv', index=False)

In [None]:
# Scraping is finished: shut down the browser session
browser.quit()

In [None]:
# Show the merged listings table as the cell output
merged_df

In [None]:
# Setup database connection.
# All connection settings come from config.py. The username and password are
# percent-encoded so characters such as '@', ':' or '/' inside them cannot be
# mistaken for URL delimiters.
connection_string = (
    f"{driver}://{quote(username, safe='')}:{quote(password, safe='')}"
    f"@{host}:{port}/{database}"
)
engine = create_engine(connection_string)
connection = engine.connect()