In [168]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys 

import numpy as np
import pandas as pd

import pickle
import re
import time
import os

chromedriver = f"{os.environ['HOME']}/.local/bin/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver

driver = webdriver.Chrome(chromedriver)

In [189]:
# For test purposes
url = "https://www.realtor.com/realestateandhomes-detail/4745-16th-Ave-NE_Seattle_WA_98105_M14191-38985https://www.realtor.com/realestateandhomes-detail/4745-16th-Ave-NE_Seattle_WA_98105_M14191-38985"
driver.get(url)
soup1 = BeautifulSoup(driver.page_source, 'html.parser')
type(soup1)

bs4.BeautifulSoup

In [190]:
def floorplan_scrapper(soup, room_type):
    content = soup.find(id=room_type)
    
    df = pd.DataFrame(columns=['Room Type','Bath','Sqft','Price'])
    
    for row in content.find_all('tr'):
        name = room_type.replace('floorplan-','')
        bath = float(row.find(class_="col-bath").text.strip()[0])
        sqft = float(row.
                     find(class_="col-sqft").
                     text.strip().
                     split()[0].
                     replace(',',''))
        price = row.find(class_="col-price").text.strip()
        if price == "N/A":
            continue
        price = float(price.replace('$','').replace(',',''))
        
        temp = [name,bath,sqft,price]
        df = df.append(pd.DataFrame([temp],columns=['Room Type','Bath','Sqft','Price']),ignore_index=True)
        
    return df

In [191]:
def floorplan_scrapper1(soup):
    ids = []
    floorplan=soup.find(class_="list-floorplans")
    df = pd.DataFrame(columns=['Room Type','Bath','Sqft','Price'])
    if floorplan: 
        for panel in floorplan.find_all(class_="panel"):
            ids.append(panel["id"])

        for id_ in ids:
            df = df.append(floorplan_scrapper(soup, id_),ignore_index=True)
        
        return df

In [192]:
# for test purposes
df_floorplan = floorplan_scrapper1(soup1)
df_floorplan

Unnamed: 0,Room Type,Bath,Sqft,Price
0,3-bedroom,2.0,1200.0,2495.0


In [193]:
def get_address_str(address):
    a = address.text.strip().split('\n')[1:]
    b = []
    for add in a:
        b.append(add.strip())
    return ' '.join(b)

In [194]:
def get_features_list(list_a):    
    la = []
    for l in list_a:
        la.append(l.text.strip().split('\n'))
    lb = []
    for l in la:
        lbb = []
        for ll in l:
            lbb.append(ll.strip())
        lb.append(lbb)
    return lb

In [195]:
def trait_scrapper(soup):
    ty = soup.find("li", attrs={"data-label":"property-type"})
    bu = soup.find("li", attrs={"data-label":"property-year"})
        
    # Add columns of property type and year-built
    df = pd.DataFrame(columns=["Type", "Built"])
    if ty is not None:
        df["Type"] = (ty.
                      find(text=re.compile("Type")).
                      find_next("div").
                      contents) 
    if bu is not None:
        df["Built"] = (bu.
                       find(text=re.compile("Built")).
                       find_next("div").
                       contents)
    
    # Add columns of property name and address
    if soup.find(class_="ldp-header-address "):
        address = soup.find(class_="ldp-header-address ").find(itemprop="address")
        name = address.text.strip().split('\n')[0]
        if name[0].isnumeric():
            df["Address"] = name + ' ' + get_address_str(address)
        else:    
            df["Address"] = get_address_str(address)
            df["Name"] = name

    # Add columns of Community Features and Unit Features,
    # which are stored in a list
    section = soup.find(class_="listing-subsection-features")
    if section:
        list1 = get_features_list(section.find_all(class_="col-sm-6"))
    
    # Change df into object type to enable it to store list values
    df.astype(object)
    if soup.find(text=re.compile("Community Features")):
        df["Community Features"] = [list1[0]+list1[1]]
        if soup.find(text=re.compile("Unit Features")):       
            df["Unit Features"] = [list1[2]+list1[3]]
    elif soup.find(text=re.compile("Unit Features")):
        df["Unit Features"] = [list1[0]+list1[1]]
    
    # Add columns of number of surrounding rated schools
    # and their average ratings 
    school = soup.find_all(class_="school-rating")
    
    rating_list = []
    for s in school:
        rating_list.append(s.text)
    
    rating = [int(a) for a in rating_list if a.isnumeric() ]
    df["Number of Schools"] = len(rating)
    df["Average School Rating"] = round(np.mean(rating),2)
    
    # Add columns of median rental price and median listing price
    # in the neighborhood
    nb = soup.find_all(class_ = "neighborhood-flex-item")
    
    if len(nb) != 0:
        nb_list = []
        for nb1 in nb:
            nb_list.append(nb1.text.strip().split('\n'))

        title1 = nb_list[0][-1].strip()
        title2 = nb_list[1][-1].strip()
        value1 = float(nb_list[0][0].replace('$','').replace(',',''))
        value2 = float(nb_list[1][0].replace('$','').replace(',',''))
        df[title1] = value1
        df[title2] = value2

    return df

In [196]:
soup1.find(text=re.compile("Seattle, WA Real Estate & Homes for Sale"))

In [197]:
soup1.find(text=re.compile("Seattle, WA Real Estate & Homes for Sale")) is not None

False

In [198]:
trait_scrapper(soup1)

Unnamed: 0,Type,Built,Address,Unit Features,Number of Schools,Average School Rating,Median Rental Price,Median Listing Price
0,Apartment,,"4745 16th ave NE 4745 16th Ave NE, Seattle, WA...","[Dishwasher, Disposal]",5,8.0,2650.0,684990.0


In [199]:
def meta_extract(soup, info):
    meta = soup.find(id="ldp-property-meta")
    if meta:
        l = meta.find("li", attrs={"data-label": f"property-meta-{info}"})
        if l:
            return l.text.strip().split()

In [200]:
l = soup1.find(id="ldp-property-meta")
l

<div class="ldp-header-meta mobile-wrapper margin-bottom" id="ldp-property-meta">
<ul class="property-meta list-horizontal list-style-disc list-spaced" itemprop="description">
<li data-label="property-meta-beds">
<span class="data-value">3</span>
      beds
    </li>
<li data-label="property-meta-bath">
<span class="data-value">2</span>
      baths
    </li>
<li data-label="property-meta-sqft">
<span class="data-value">1,200</span> sq ft
    </li>
</ul>
</div>

In [214]:
def single_page_scraper(soup):
    df_trait = trait_scrapper(soup)

    # Add floorplan to properties which have a table of floorplans,
    # othrewise extract meta property information
    if floorplan_scrapper1(soup) is not None:
        df_floorplan = floorplan_scrapper1(soup)
        # Avoid expanding the DataFrame for multiple times by accident
        if(df_trait.shape[0] != df_floorplan.shape[0]):
            df_trait = pd.concat([df_trait] * df_floorplan.shape[0], 
                           ignore_index=True)  
        # Right append the trait columns to the floorplan DataFram
        df1 = pd.concat([df_trait,df_floorplan], axis=1)

    else:
        df1 = df_trait
        beds = meta_extract(soup, "beds")
        bath = meta_extract(soup, "bath")
        sqft = meta_extract(soup, "sqft")
        pets = meta_extract(soup, "pets")
        price = soup.find(itemprop="price").text
        if beds:
            df1["Room Type"] = beds[0] + "-bedroom" 
        if bath:
            if len(bath) <= 2:
                df1["Bath"] = float(bath[0])
            else:
                df1["Bath"] = float(bath[0]) + float(bath[2])*0.5 
        if sqft:        
            df1["Sqft"] = float(sqft[0].replace(',',''))
        if pets:
            df1["Unit Features"] = ' '.join(pets)
        if len(price) != 0:
            df1["Price"] = float(price.
                                 strip().
                                 replace('$','').
                                 replace(',',''))

    # Reorder columns and fill missing values as Nan objects
    cols = ["Name","Type","Address","Built",
                     "Room Type","Bath","Sqft","Price",
                    "Number of Schools","Average School Rating",
                    "Median Rental Price", "Median Listing Price",
                    "Community Features","Unit Features"]     
    df2 = pd.DataFrame(columns = cols)
    df = pd.concat([df2,df1], ignore_index = True)
    df = df[cols]

    return df

In [205]:
soup1.find(itemprop = "price")

In [203]:
# for test purposes
df1 = single_page_scraper(soup1)
df1

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




Unnamed: 0,Name,Type,Address,Built,Room Type,Bath,Sqft,Price,Number of Schools,Average School Rating,Median Rental Price,Median Listing Price,Community Features,Unit Features
0,,Apartment,"4745 16th ave NE 4745 16th Ave NE, Seattle, WA...",,3-bedroom,2.0,1200.0,2495.0,5,8.0,2650.0,684990.0,,"[Dishwasher, Disposal]"


## Let's try multiple pages

In [16]:
def url_scrapper(total_page):
    driver.get("https://www.realtor.com/apartments/Seattle_WA")
    
    # Scrap 22 pages of urls of apartment for leasing into a list
    j = 0 
    urls = []
    while j < total_page:
        next_button = driver.find_element_by_class_name("next ")
        next_button.click()
        time.sleep(5) # Let's give the computer 5 seconds rest 
                      # after scraping a whole page of urls
        soup = BeautifulSoup(driver.page_source,'html.parser')
        for link in soup.find_all(class_="photo-wrap"): 
            urls.append("https://www.realtor.com"+link.find("a")["href"])
        j += 1

    # Save the url list in a csv file
    url_file = open("data/urls.csv", 'w')
    for url in urls:
        url_file.write(url + '\n')
    url_file.close()

In [90]:
url_scrapper(25)

In [91]:
with open("data/urls.csv", 'r') as f:
    urls = f.read().split('\n')
    urls.remove('') # Remove empty strings
    f.close()

with open("data/urls.pkl",'wb') as picklefile:
    pickle.dump(urls, picklefile)

In [184]:
len(urls)

1200

In [185]:
urls = list(set(urls)) # Keep unique urls

In [186]:
len(urls)

1167

In [93]:
temp = []
for url in urls:
    if url.startswith("https://www.realtor.com/realestateandhomes-detail/"):
        temp.append(url)
len(temp)

1200

In [83]:
def pickle_files(name):
    if name.split('.')[1] == "csv":
        data = pd.read_csv(name)
    else: 
        f = open(name, 'r')
        data = f.read().split('\n')

    pkl = name.split('.')[0] + ".pkl"
    with open(pkl,'wb') as picklefile:
        pickle.dump(data, picklefile)

In [84]:
urls[:5]

['https://www.realtor.com/realestateandhomes-detail/2324-W-Newton-St-Apt-302_Seattle_WA_98199_M21149-70194',
 'https://www.realtor.com/realestateandhomes-detail/3418A-22nd-Ave-W_Seattle_WA_98199_M29660-20775',
 'https://www.realtor.com/realestateandhomes-detail/835-S-142nd-St_Burien_WA_98168_M24971-16354',
 'https://www.realtor.com/realestateandhomes-detail/4308-Winslow-Pl-N-B_Seattle_WA_98103_M29900-95950',
 'https://www.realtor.com/realestateandhomes-detail/8004-25th-Ave-NW_Seattle_WA_98117_M22901-40086']

In [85]:
cols = ["Name","Type","Address","Built",
           "Room Type","Bath","Sqft","Price",
           "Number of Schools","Average School Rating",
           "Median Rental Price", "Median Listing Price",
           "Community Features","Unit Features","url"]

In [229]:
df = pd.DataFrame(columns = cols)
i = 0
for url in urls[526:600]:
    driver.get(url)
    soup = BeautifulSoup(driver.page_source,"html5lib")
    df_property = single_page_scraper(soup)
    df_property["url"] = url
    df = pd.concat([df, df_property], ignore_index=True)
    df.to_csv("data/rental_data13.csv")
    time.sleep(1)
    i += 1
    print(i)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21


ValueError: No objects to concatenate

In [230]:
def multiple_page_scraper(url_source, data_destination, total_url): 
    
    cols = ["Name","Type","Address","Built",
           "Room Type","Bath","Sqft","Price",
           "Number of Schools","Average School Rating",
           "Median Rental Price", "Median Listing Price",
           "Community Features","Unit Features","url"]
    df = pd.DataFrame(columns = cols)
    
    with open(url_source, 'r') as f:
        urls = f.read().split('\n')
        f.close()

    i = 0
    
    for url in urls[:total_url]:
        driver.get(url)
        soup = BeautifulSoup(driver.page_source,"html5lib")
        df_property = single_page_scraper(soup)
        df_property["url"] = url
        df = pd.concat([df, df_property], ignore_index=True)
        df.to_csv(data_destination)
        time.sleep(1)
        i += 1
        print(i)
        

In [None]:
pickle_files("data/rental_data.csv")

!ls