In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [2]:
def scrape_redfin(base_url, num_pages=9, delay=3, timeout=30):
    """Scrape home-card fields from paginated Redfin search-result pages.

    Page 1 is fetched from ``base_url`` itself; page ``n`` (n > 1) is fetched
    from ``f"{base_url}/page-{n}"``. Scraping stops early when a page contains
    no price elements or when a request fails.

    Each row is built from a single listing card, so a listing that lacks a
    field (for example a lot with no beds/baths/sqft) gets ``None`` in that
    column instead of shifting later values onto the wrong listing or making
    the columns unequal in length.

    Args:
        base_url: URL of the first results page.
        num_pages: Maximum number of pages to request.
        delay: Seconds to wait before each request after the first one.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        pandas.DataFrame with string columns ``sqft``, ``price``, ``beds``,
        ``baths`` and ``address`` (in that order), one row per price element
        found. Empty, but with those columns, if nothing was scraped.

    Side effects:
        Prints one progress or error line per page attempted.
    """
    # Column name -> (tag name, exact value of the tag's class attribute).
    fields = {
        'sqft': ('span', 'bp-Homecard__Stats--sqft text-nowrap'),
        'price': ('span', 'bp-Homecard__Price--value'),
        'beds': ('span', 'bp-Homecard__Stats--beds text-nowrap'),
        'baths': ('span', 'bp-Homecard__Stats--baths text-nowrap'),
        'address': ('div', 'bp-Homecard__Address flex align-center color-text-primary font-body-xsmall-compact'),
    }
    price_tag_name, price_class = fields['price']

    # NOTE(review): requests only decodes br/zstd bodies when the optional
    # brotli/zstandard packages are installed -- confirm, or drop Accept-Encoding.
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br, zstd',
        'Accept-Language': 'en-US,en;q=0.5',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:132.0) Gecko/20100101 Firefox/132.0'
    }

    rows = []
    for page in range(1, num_pages + 1):
        url = base_url if page == 1 else f"{base_url}/page-{page}"

        try:
            # Throttle between pages only; nothing has been requested before page 1.
            if page > 1 and delay > 0:
                time.sleep(delay)
            # Without a timeout a stalled connection would block the notebook forever.
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error scraping page {page}: {e}")
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        prices = soup.find_all(price_tag_name, class_=price_class)

        if not prices:
            print(f"No more listings found on page {page}")
            break

        for price in prices:
            card = _listing_card(price, price_tag_name, price_class)
            row = {
                column: _field_text(card, tag_name, css_class)
                for column, (tag_name, css_class) in fields.items()
            }
            # The price tag is already in hand; read it directly rather than re-searching.
            row['price'] = price.text.strip()
            rows.append(row)

        print(f"Scraped page {page} successfully")

    # Explicit columns keep the schema stable even when no rows were collected.
    return pd.DataFrame(rows, columns=list(fields))


def _listing_card(price_tag, price_tag_name, price_class):
    """Return the largest ancestor of ``price_tag`` that contains no other price element."""
    card = price_tag
    for ancestor in price_tag.parents:
        # limit=2 is enough to tell "exactly one price" from "more than one".
        if len(ancestor.find_all(price_tag_name, class_=price_class, limit=2)) > 1:
            break
        card = ancestor
    return card


def _field_text(card, tag_name, css_class):
    """Return the stripped text of the first matching tag inside ``card``, or None if absent."""
    match = card.find(tag_name, class_=css_class)
    return match.text.strip() if match is not None else None

In [3]:
# First results page for Salt Lake City, UT; scrape_redfin derives the
# follow-on page URLs from this address.
base_url = 'https://www.redfin.com/city/17150/UT/Salt-Lake-City'
df = scrape_redfin(base_url=base_url, num_pages=9)

Scraped page 1 successfully
Scraped page 2 successfully
Scraped page 3 successfully
Scraped page 4 successfully
Scraped page 5 successfully
Scraped page 6 successfully
Scraped page 7 successfully
Scraped page 8 successfully
Scraped page 9 successfully


In [8]:
# Save the scraped listings next to the notebook. With pandas defaults the
# row index is written as the first (unnamed) CSV column.
df.to_csv(path_or_buf='real_estate.csv')