In [5]:
import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin
import time

def clean_text(text):
    """Clean text by removing extra whitespace and special characters"""
    return ' '.join(text.strip().split()) if text else 'N/A'

def scrape_redfin_properties(city_url, max_pages=100):
    """Scrape property listings from Redfin"""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
    }
    
    properties = []
    base_url = "https://www.redfin.com"
    
    for page in range(1, max_pages + 1):
        try:
            # Construct URL for pagination
            page_url = f"{city_url}/page-{page}" if page > 1 else city_url
            response = requests.get(page_url, headers=headers, timeout=10)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, "html.parser")
            homecards = soup.find_all("div", class_="HomeCardContainer")
            
            if not homecards:
                break
                
            for card in homecards:
                try:
                    # Extract link
                    link_elem = card.find("a", class_="link-and-anchor")
                    link = urljoin(base_url, link_elem["href"]) if link_elem else 'N/A'
                    
                    # Extract price
                    price_elem = card.find("span", class_="bp-Homecard__Price--value")
                    price = clean_text(price_elem.text) if price_elem else 'N/A'
                    
                    # Extract beds
                    beds_elem = card.find("span", class_="bp-Homecard__Stats--beds")
                    beds = clean_text(beds_elem.text) if beds_elem else 'N/A'
                    
                    # Extract baths
                    baths_elem = card.find("span", class_="bp-Homecard__Stats--baths")
                    baths = clean_text(baths_elem.text) if baths_elem else 'N/A'
                    
                    # Extract square footage
                    sqft_elem = card.find("span", class_="bp-Homecard__Stats--sqft")
                    if not sqft_elem:
                        # Alternative class names for square footage
                        sqft_elem = card.find("span", {"data-rf-test-id": "homecard-stats-sqft"})
                    if not sqft_elem:
                        # Try finding by text pattern
                        stats_section = card.find("div", class_="bp-Homecard__Stats")
                        if stats_section:
                            stats_text = stats_section.get_text()
                            import re
                            sqft_match = re.search(r'(\d{1,3}(?:,\d{3})*)\s*sq\s*ft', stats_text, re.IGNORECASE)
                            sqft = sqft_match.group(1) + " sq ft" if sqft_match else 'N/A'
                        else:
                            sqft = 'N/A'
                    else:
                        sqft = clean_text(sqft_elem.text)
                    
                    # Extract address
                    address_elem = card.find("div", class_="bp-Homecard__Address")
                    address = clean_text(address_elem.text) if address_elem else 'N/A'
                    
                    # Store property data - now including sqft
                    property_data = {
                        "Price": price,
                        "Beds": beds,
                        "Baths": baths,
                        "Sqft": sqft,
                        "Address": address,
                        "Link": link
                    }
                    properties.append(property_data)
                    
                except AttributeError as e:
                    print(f"Error parsing property card: {e}")
                    continue
            
            time.sleep(1)
            
        except requests.RequestException as e:
            print(f"Error fetching page {page}: {e}")
            break
    
    return properties

def save_to_csv(properties, filename="redfin_properties.csv"):
    """Save properties to CSV file"""
    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=["Price", "Beds", "Baths", "Sqft", "Address", "Link"])
        writer.writeheader()
        writer.writerows(properties)

if __name__ == "__main__":
    city_urls = {
        'Florida': "https://www.redfin.com/city/13655/FL/Orlando",
        "FLorida": "https://www.redfin.com/city/18142/FL/Tampa",
        'Florida': "https://www.redfin.com/county/442/FL/Broward-County",
        'Florida': "https://www.redfin.com/city/11458/FL/Miami",
    }
    all_properties = []
    for city, url in city_urls.items():
        print(f"Scraping {city}...")
        properties = scrape_redfin_properties(url)
        for prop in properties:
            prop["City"] = city
        all_properties.extend(properties)
    if all_properties:
        with open("redfin_properties_all_cities.csv", mode="w", newline="", encoding="utf-8") as file:
            writer = csv.DictWriter(file, fieldnames=["City", "Price", "Beds", "Baths", "Sqft", "Address", "Link"])
            writer.writeheader()
            writer.writerows(all_properties)
        print("All properties saved to redfin_properties_all_cities.csv")
    else:
        print("No properties found for any city.")


Scraping Florida...
Scraping FLorida...
All properties saved to redfin_properties_all_cities.csv
