# Retrieve latest data from sold homes in a given zip code

## Seeking sq ft, num bed, num bath, year built, lot size, home type, garage, num floors, swimming pool, walkability

can also consider engineering price per sq foot, age of home, distance to major highway, recent renovation

In [1]:
import requests
from bs4 import BeautifulSoup
import random
import pandas as pd

In [2]:
# randomize user agent
UAS = ("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1", 
       "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
       "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
       )

ua = UAS[random.randrange(len(UAS))]
headers = {'user-agent': ua}

In [3]:
zipcode = '97035'
more_pages = True
homes = []
new_content = True

# starting page
page = 1

while new_content == True:
    new_content = False
    url = 'https://www.redfin.com/zipcode/' + zipcode + '/filter/sort=hi-sale-date,property-type=house+condo+townhouse,include=sold-6mo/page-' + str(page)
    
    response = requests.get(url, headers=headers)
    print(f"scraping page {page}. URL: {url}")
    if response.status_code == 200:
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.text, "html.parser")
    
        hyperlinks = soup.find_all("a")
        for hyperlink in hyperlinks:
            if 'href' in hyperlink.attrs:
                href = hyperlink['href']
                # identifies the links ending in 8 digit UUID that redfin assigns
                if href[-8:].isdigit():
                    if href not in homes:
                        homes.append(href)
                        new_content = True
        page += 1
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")

print(f'All pages scanned. {len(homes)} homes found:')

scraping page 1. URL: https://www.redfin.com/zipcode/97035/filter/sort=hi-sale-date,property-type=house+condo+townhouse,include=sold-6mo/page-1
scraping page 2. URL: https://www.redfin.com/zipcode/97035/filter/sort=hi-sale-date,property-type=house+condo+townhouse,include=sold-6mo/page-2
scraping page 3. URL: https://www.redfin.com/zipcode/97035/filter/sort=hi-sale-date,property-type=house+condo+townhouse,include=sold-6mo/page-3
scraping page 4. URL: https://www.redfin.com/zipcode/97035/filter/sort=hi-sale-date,property-type=house+condo+townhouse,include=sold-6mo/page-4
scraping page 5. URL: https://www.redfin.com/zipcode/97035/filter/sort=hi-sale-date,property-type=house+condo+townhouse,include=sold-6mo/page-5
scraping page 6. URL: https://www.redfin.com/zipcode/97035/filter/sort=hi-sale-date,property-type=house+condo+townhouse,include=sold-6mo/page-6
All pages scanned. 178 homes found:


## [Optional] View homes

In [4]:
for home in homes:
    print(f'https://www.redfin.com{home}')

https://www.redfin.com/OR/Lake-Oswego/4965-Park-Bluff-Pl-97035/home/183781964
https://www.redfin.com/OR/Lake-Oswego/3569-Sunwood-Ct-97035/home/26080164
https://www.redfin.com/OR/Lake-Oswego/5225-Firwood-Rd-97035/home/25995645
https://www.redfin.com/OR/Lake-Oswego/38-Cervantes-Cir-97035/home/143114291
https://www.redfin.com/OR/Lake-Oswego/3119-Douglas-Cir-97035/home/26031594
https://www.redfin.com/OR/Lake-Oswego/6338-Washington-Ct-97035/home/25845395
https://www.redfin.com/OR/Lake-Oswego/4672-Winthrop-Ct-97035/home/25833476
https://www.redfin.com/OR/Lake-Oswego/16900-Gassner-Ln-97035/home/26079578
https://www.redfin.com/OR/Lake-Oswego/47-Eagle-Crest-Dr-97035/unit-4/home/26653782
https://www.redfin.com/OR/Lake-Oswego/4968-Bilford-Ln-97035/home/26024417
https://www.redfin.com/OR/Lake-Oswego/15988-White-Oaks-Dr-97035/home/26038327
https://www.redfin.com/OR/Lake-Oswego/12-Cervantes-Cir-97035/home/25763801
https://www.redfin.com/OR/Lake-Oswego/12-Cervantes-Cir-97035/unit-7A/home/26633869
htt

In [5]:
homes_data = {}
unused_spans = []

for home in homes:
    url = 'https://www.redfin.com'+home

    response = requests.get(url, headers=headers)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")
    
        divs = soup.find_all("div")
        num_divs = len(divs)
        i = 0
        for div in divs:
            if 'class' in div.attrs:
                if 'street-address' in div['class']:
                    st_addr = div['title']
                    homes_data[st_addr] = {}
                    current_house = homes_data[st_addr]
                    current_house['link'] = url
                if 'beds-section' in div['class']:
                    if 'price' in div['data-rf-test-id']:
                        current_house['price'] = div.div.text
                    else:
                        current_house['beds'] = divs[i+1].text
                if 'baths-section' in div['class']:
                    current_house['baths'] = divs[i+1].text
                if 'sqft-section' in div['class']:
                    current_house['sqft'] = div.span.text
                
                if 'super-group-content' in div['class']:
                    spans = div.find_all('span')
                    for span in spans:
                        if 'class' in span.attrs:
                            if 'entryItemContent' in span['class']:
                                if 'Parking Features' in span.text:
                                    current_house['parking_features'] = span.span.text
                                elif 'Parking Total' in span.text:
                                    current_house['parking_total'] = span.span.text
                                elif 'Garage Type' in span.text:
                                    current_house['garage_type'] = span.span.text
                                elif 'Garage Spaces' in span.text:
                                    current_house['garage_spaces'] = span.span.text
                                elif 'Hot Water Description' in span.text:
                                    current_house['hot_water_desc'] = span.span.text
                                elif 'Fireplace YN' in span.text:
                                    current_house['fireplace_yn'] = span.span.text
                                elif 'Basement' in span.text:
                                    current_house['basement'] = span.span.text
                                elif 'Lot Size Square Feet' in span.text:
                                    current_house['lot_size_sqft'] = span.span.text
                                elif 'Property Type' in span.text:
                                    current_house['property_type'] = span.span.text
                                elif 'Year Built' in span.text:
                                    current_house['yr_built'] = span.span.text
                                elif 'Fuel Description' in span.text:
                                    current_house['fuel_desc'] = span.span.text
                                elif 'Has HOA' in span.text:
                                    current_house['has_hoa'] = span.span.text
                                elif 'View YN' in span.text:
                                    current_house['view_yn'] = span.span.text
                                elif 'Fireplaces Total' in span.text:
                                    current_house['fireplace_total'] = span.span.text
                                elif 'Main Level Area Total' in span.text:
                                    current_house['main_level_area'] = span.span.text
                                elif 'Sewer' in span.text:
                                    current_house['sewer'] = span.span.text
                                elif 'Cooling YN' in span.text:
                                    current_house['cooling_yn'] = span.span.text
                                elif 'Senior Community YN' in span.text:
                                    current_house['senior_community_yn'] = span.span.text
                                elif 'Lot Size' in span.text:
                                    current_house['lot_size'] = span.span.text
                                elif 'Style' in span.text:
                                    current_house['style'] = span.span.text
                                elif 'Year Renovated' in span.text:
                                    current_house['yr_renovated'] = span.span.text
                                elif 'County' in span.text:
                                    current_house['county'] = span.span.text
                                elif 'New Construction YN' in span.text:
                                    current_house['new_construction_yn'] = span.span.text
                                elif 'Stories' in span.text:
                                    current_house['stories'] = span.span.text
                                elif 'Roof' in span.text:
                                    current_house['roof'] = span.span.text
                                # to see what data I'm not using
                                else:
                                    if span.text not in unused_spans:
                                        unused_spans.append(span.text)
            i += 1
    
    else:
            print(f"Failed to retrieve the page. Status code: {response.status_code}")

In [6]:
df = pd.DataFrame(homes_data).transpose()
df.head()

Unnamed: 0,link,price,beds,baths,sqft,parking_features,parking_total,garage_type,garage_spaces,hot_water_desc,...,fuel_desc,sewer,cooling_yn,has_hoa,senior_community_yn,stories,style,yr_renovated,county,new_construction_yn
4965 SW Park Bluff Pl,https://www.redfin.com/OR/Lake-Oswego/4965-Par...,"$2,099,000",5,4.5,4568,"Driveway, On Street",2,Attached,2,Gas,...,Gas,Public Sewer,Yes,Yes,No,—,Single Family Residential,—,Clackamas County,Yes
3569 Sunwood Ct,https://www.redfin.com/OR/Lake-Oswego/3569-Sun...,"$700,000",3,1.5,1350,"Driveway, Off Street",2,"Attached, Oversized",2,Gas,...,Gas,Public Sewer,No,,No,1.0,Single Family Residential,—,Clackamas County,No
5225 Firwood Rd,https://www.redfin.com/OR/Lake-Oswego/5225-Fir...,"$1,550,000",3,2.5,3240,"Driveway, RV Access/Parking",2,"Attached, Extra Deep, Oversized",2,Gas,...,Gas,Septic Tank,Yes,,No,1.0,Single Family Residential,—,Clackamas County,No
38 Cervantes Cir,https://www.redfin.com/OR/Lake-Oswego/38-Cerva...,"$293,000",2,1.5,912,Off Street,1,Other,1,Electricity,...,Electricity,"All Landscaping, Athletic Court, Basketball Co...",No,Yes,No,—,Condo/Co-op,—,Multnomah County,No
3119 Douglas Cir,https://www.redfin.com/OR/Lake-Oswego/3119-Dou...,"$2,675,000",4,6.0,6524,RV Access/Parking,14,"Attached, Detached, Oversized",14,Gas,...,"Electricity, Gas",Public Sewer,Yes,,No,2.0,Single Family Residential,—,Clackamas County,No


In [7]:
df.to_csv(f'{zipcode}_homes_data.csv')

In [8]:
# review later to see if there is any data to use later
for span in unused_spans:
    print(span)
    print()

Has Attached Garage: Yes

Virtual Tour Type: Video

Has Virtual Tour Unbranded: Yes

Virtual Tour (Unbranded)

Virtual Tour Type 2: Video

Has Virtual Tour Unbranded 2: Yes

Virtual Tour 2 (Unbranded)

Virtual Tour Type 3: Video

Has Virtual Tour Unbranded 3: Yes

Virtual Tour 3 (Unbranded)

Appliances: Built-in Oven, Built-in Range, Built-in Refrigerator, Cooktop, Dishwasher, Disposal, Island, Microwave, Quartz, Stainless Steel Appliance(s), Wine Cooler

Interior Features: 3rd Floor, Air Cleaner, Engineered Hardwood, Garage Door Opener, Hardwood Floors, High Ceilings, Laundry, Quartz, Soaking Tub, Vaulted Ceiling(s), Wall to Wall Carpet, Wood Floors

Fireplace Features: Gas

Window Features: Double Pane Windows

Bathrooms Full: 4

Bathrooms Full Lower Level: 1

Bathrooms Full Main Level: 1

Bathrooms Full Upper Level: 2

Bathrooms Partial: 1

Bathrooms Partial Main Level: 1

Bathrooms Total Integer: 5

Bathrooms Total Lower Level: 1

Bathrooms Total Main Level: 1.1

Bathrooms Total Up