# Zillow Web Scraper
A simple web scraper written using scrapy.  
Based off of https://www.scrapehero.com/how-to-scrape-real-estate-listings-on-zillow-com-using-python-and-lxml/

In [None]:
!pip install lxml requests unicodecsv

In [6]:
import os
from lxml import html
import requests
import unicodecsv as csv
import argparse

# List of San Francsisco zip codes to scrape
SF_ZIPCODES = [94102,
             94104,
             94103,
             94105,
             94108,
             94107,
             94110,
             94109,
             94112,
             94111,
             94115,
             94114,
             94117,
             94116,
             94118,
             94121,
             94123,
             94122,
             94124,
             94127,
             94126,
             94129,
             94131,
             94133,
             94132,
             94134,
             94139,
             94143,
             94146,
             94151,
             94159,
             94158,
             94188,
             94177]

In [7]:
def parse(zipcode: str, filter: str = None):
    if filter == "newest":
        url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/days_sort".format(zipcode)
    elif filter == "cheapest":
        url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/pricea_sort/".format(zipcode)
    else:
        url = "https://www.zillow.com/homes/for_sale/{0}_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy".format(
            zipcode)

    for i in range(5):
        # try:
        headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, sdch, br',
            'accept-language': 'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        print(response.status_code)
        parser = html.fromstring(response.text)
        search_results = parser.xpath("//div[@id='search-results']//article")
        properties_list = []

        for properties in search_results:
            raw_address = properties.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()")
            raw_city = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()")
            raw_state = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressRegion']//text()")
            raw_postal_code = properties.xpath(".//span[@itemprop='address']//span[@itemprop='postalCode']//text()")
            raw_price = properties.xpath(".//span[@class='zsg-photo-card-price']//text()")
            raw_info = properties.xpath(".//span[@class='zsg-photo-card-info']//text()")
            raw_broker_name = properties.xpath(".//span[@class='zsg-photo-card-broker-name']//text()")
            url = properties.xpath(".//a[contains(@class,'overlay-link')]/@href")
            raw_title = properties.xpath(".//h4//text()")

            address = ' '.join(' '.join(raw_address).split()) if raw_address else None
            city = ''.join(raw_city).strip() if raw_city else None
            state = ''.join(raw_state).strip() if raw_state else None
            postal_code = ''.join(raw_postal_code).strip() if raw_postal_code else None
            price = ''.join(raw_price).strip() if raw_price else None
            info = ' '.join(' '.join(raw_info).split()).replace(u"\xb7", ',')
            broker = ''.join(raw_broker_name).strip() if raw_broker_name else None
            title = ''.join(raw_title) if raw_title else None
            property_url = "https://www.zillow.com" + url[0] if url else None
            is_forsale = properties.xpath('.//span[@class="zsg-icon-for-sale"]')
            properties = {
                'address': address,
                'city': city,
                'state': state,
                'postal_code': postal_code,
                'price': price,
                'facts and features': info,
                'real estate provider': broker,
                'url': property_url,
                'title': title
            }
            if is_forsale:
                properties_list.append(properties)
        return properties_list

In [8]:
def scrape_all_sf_area_codes(output_folder: str, zipcodes: list):
    """Scrape from all SF zip codes"""
    sort = 'newest'
    for zipcode in zipcodes:
        print ("Fetching data for %s" % (zipcode))
        scraped_data = parse(str(zipcode), sort)
        print ("Writing data to output file")
        with open(f"{output_folder}/properties-{zipcode}.csv", 'wb')as csvfile:
            fieldnames = ['title', 'address', 'city', 'state', 'postal_code', 'price', 'facts and features',
                          'real estate provider', 'url']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for row in scraped_data:
                writer.writerow(row)
def do_scraping(output_folder: str, zipcodes: list):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    scrape_all_sf_area_codes(output_folder=output_folder, zipcodes=zipcodes)

In [9]:
data_dir = './scraped_data'
do_scraping(output_folder=data_dir, zipcodes=SF_ZIPCODES)

Fetching data for 94102
200
Writing data to output file
Fetching data for 94104
200
Writing data to output file
Fetching data for 94103
200
Writing data to output file
Fetching data for 94105
200
Writing data to output file
Fetching data for 94108
200
Writing data to output file
Fetching data for 94107
200
Writing data to output file
Fetching data for 94110
200
Writing data to output file
Fetching data for 94109
200
Writing data to output file
Fetching data for 94112
200
Writing data to output file
Fetching data for 94111
200
Writing data to output file
Fetching data for 94115
200
Writing data to output file
Fetching data for 94114
200
Writing data to output file
Fetching data for 94117
200
Writing data to output file
Fetching data for 94116
200
Writing data to output file
Fetching data for 94118
200
Writing data to output file
Fetching data for 94121
200
Writing data to output file
Fetching data for 94123
200
Writing data to output file
Fetching data for 94122
200
Writing data to outp

## Data is saved to a .csv for each zip code

In [20]:
%%bash -s "$data_dir"
ls $1

properties-94102.csv
properties-94103.csv
properties-94104.csv
properties-94105.csv
properties-94107.csv
properties-94108.csv
properties-94109.csv
properties-94110.csv
properties-94111.csv
properties-94112.csv
properties-94114.csv
properties-94115.csv
properties-94116.csv
properties-94117.csv
properties-94118.csv
properties-94121.csv
properties-94122.csv
properties-94123.csv
properties-94124.csv
properties-94126.csv
properties-94127.csv
properties-94129.csv
properties-94131.csv
properties-94132.csv
properties-94133.csv
properties-94134.csv
properties-94139.csv
properties-94143.csv
properties-94146.csv
properties-94151.csv
properties-94158.csv
properties-94159.csv
properties-94177.csv
properties-94188.csv


# Load and view data in DataFrame

In [21]:
import glob
import pandas as pd
def load_data_to_dataframe(data_dir: str) -> pd.DataFrame:
    """
    Load all .csv files from data_dir and concatenate them into a single DataFrame.  
    Args:
        data_dir: path to data directory
    Returns:
        pd.DataFrame: all data from files in data_dir
    Notes:
        All duplicate rows will be discarded
    """
    all_csvs = []
    # load the csv files from all scraping runs
    csv_filenames = os.path.join(data_dir, '*.csv')
    print('loading data {csv_filenames}'.format(csv_filenames=csv_filenames))
    for filename in glob.glob(csv_filenames):
        all_csvs.append(pd.read_csv(filename))
    # combine all dataframes together and drop any duplicate entries
    df = pd.concat(all_csvs, ignore_index=True).drop_duplicates()
    print("Found a total of {count} data points".format(count=len(df)))
    # save this combined dataframe as csv for safe keeping
    df.to_csv(os.path.join(data_dir, 'all_data.csv'), index=False)
    return df

df = load_data_to_dataframe(data_dir=data_dir)
df.head()

loading data ./scraped_data/*.csv
Found a total of 481 data points


Unnamed: 0,title,address,city,state,postal_code,price,facts and features,real estate provider,url
0,New Construction,288 Pacific Ave # YWAK0X,San Francisco,CA,94111,"$2,300,000+","2 bds , 3 ba , 1,207+ sqft",,https://www.zillow.com/community/288-pacific/2...
1,For Sale by Owner,1234 Sansome And Un,San Francisco,CA,94111,"$800,000","3 bds , 2 ba , 1,000 sqft",,https://www.zillow.com/homedetails/1234-Sansom...
2,Condo For Sale,101 Lombard St APT 303E,SAN FRANCISCO,CA,94111,"$1,999,000","2 bds , 2 ba , -- sqft",Sotheby's International Realty - San Francisco...,https://www.zillow.com/homedetails/101-Lombard...
3,Condo For Sale,288 Pacific Ave # 2A,San Francisco,CA,94111,"$2,300,000","2 bds , 3 ba , 1,207 sqft",Pacific Union International Inc.,https://www.zillow.com/homedetails/288-Pacific...
4,Condo For Sale,733 Front St UNIT 606,SAN FRANCISCO,CA,94111,"$1,195,000","1 bd , 1 ba , 920 sqft",Climb Real Estate,https://www.zillow.com/homedetails/733-Front-S...
