In [326]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys 

import numpy as np
import pandas as pd

import pickle
import re
import time
import os

# Path to the chromedriver executable under the current user's home directory.
# expanduser("~") is used instead of os.environ['HOME'] so the cell does not
# raise KeyError when HOME is unset (it yields the same path when HOME is set).
chromedriver = os.path.join(os.path.expanduser("~"), ".local", "bin", "chromedriver")
os.environ["webdriver.chrome.driver"] = chromedriver

# Shared browser session used by every scraping cell below.
driver = webdriver.Chrome(chromedriver)

In [307]:
# Smoke test: load a single listing page in the Selenium-driven browser and
# parse the rendered HTML. `soup1` is the sample page used by the test cells
# further down.
url = "https://www.realtor.com/realestateandhomes-detail/8-W-Howe-St_Seattle_WA_98119_M17329-57568"
driver.get(url)
soup1 = BeautifulSoup(driver.page_source, 'html.parser')
type(soup1)

bs4.BeautifulSoup

In [76]:
def floorplan_scrapper(soup, room_type):
    """Parse one floor-plan table of a listing page into a DataFrame.

    Parameters
    ----------
    soup : parsed page (BeautifulSoup-like object exposing ``find``).
    room_type : str
        The ``id`` of the element holding the table rows, e.g.
        ``"floorplan-1-bedroom"``. The ``"floorplan-"`` prefix is stripped to
        build the value of the ``Room Type`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Room Type``, ``Bath``, ``Sqft``, ``Price``; one row per
        ``<tr>`` whose price is not ``"N/A"``. Empty (but with the same
        columns) when no element with that id exists or no row has a price.
    """
    columns = ['Room Type', 'Bath', 'Sqft', 'Price']

    content = soup.find(id=room_type)
    if content is None:
        # No such table on this page: report "no rows" instead of crashing.
        return pd.DataFrame(columns=columns)

    name = room_type.replace('floorplan-', '')

    # Collect plain records and build the frame once; DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every row.
    records = []
    for row in content.find_all('tr'):
        # NOTE(review): only the first character of the bath cell is used, so
        # a value such as "1.5" would be read as 1.0 - confirm the page format.
        bath = float(row.find(class_="col-bath").text.strip()[0])
        # First whitespace-separated token of the cell, thousands separators removed.
        sqft_text = row.find(class_="col-sqft").text.strip().split()[0]
        sqft = float(sqft_text.replace(',', ''))
        price = row.find(class_="col-price").text.strip()
        if price == "N/A":
            # Rows without a listed price are skipped.
            continue
        price = float(price.replace('$', '').replace(',', ''))

        records.append([name, bath, sqft, price])

    return pd.DataFrame(records, columns=columns)

In [274]:
def floorplan_scrapper1(soup):
    """Collect every floor-plan table on a listing page into one DataFrame.

    The ids of all ``panel`` elements inside the ``list-floorplans`` element
    are passed, in page order, to ``floorplan_scrapper`` and the results are
    stacked.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when the page has no ``list-floorplans`` element; otherwise a
        frame with columns ``Room Type``, ``Bath``, ``Sqft``, ``Price`` and a
        fresh 0..n-1 index (possibly empty).
    """
    columns = ['Room Type', 'Bath', 'Sqft', 'Price']

    floorplan = soup.find(class_="list-floorplans")
    if floorplan is None:
        # Callers test for None to detect listings without floor plans.
        return None

    ids = [panel["id"] for panel in floorplan.find_all(class_="panel")]
    frames = [floorplan_scrapper(soup, id_) for id_ in ids]
    # Skip empty tables so concat never has to guess dtypes from empty frames.
    frames = [frame for frame in frames if not frame.empty]
    if not frames:
        return pd.DataFrame(columns=columns)

    # One concat instead of repeated DataFrame.append (removed in pandas 2.0).
    return pd.concat(frames, ignore_index=True)

In [275]:
# Smoke test on the sample page: gather its floor-plan tables.
# floorplan_scrapper1 returns None when the page has no "list-floorplans"
# element, in which case this cell displays nothing.
df_floorplan = floorplan_scrapper1(soup1)
df_floorplan

In [276]:
def get_address_str(address):
    """Return the address text minus its first line, as a single string.

    The stripped ``address.text`` is split on newlines; the first line is
    dropped and the remaining lines are individually stripped and joined with
    single spaces.
    """
    _first_line, *remaining = address.text.strip().split('\n')
    return ' '.join(piece.strip() for piece in remaining)

In [277]:
def get_features_list(list_a):
    """Turn each element's text into a list of its stripped lines.

    For every item in ``list_a`` the stripped ``.text`` is split on newlines
    and each resulting line is stripped again. The result is a list of lists,
    one inner list per input item, in input order.
    """
    return [
        [entry.strip() for entry in node.text.strip().split('\n')]
        for node in list_a
    ]

In [329]:
def trait_scrapper(soup):
    """Extract listing-level traits of one property page into a DataFrame.

    Columns produced:

    * one column per key fact found in the overview section (the first
      whitespace-separated token of each fact is the column name, the
      remaining tokens become the rows below it);
    * ``Address`` and, when the first address line does not start with a
      digit, ``Name``;
    * ``Community Features`` / ``Unit Features`` (each cell holds a list),
      only when the heading text is on the page and enough feature columns
      were found;
    * ``Number of Schools`` and ``Average School Rating`` (NaN when no
      numeric rating is present);
    * one column per neighborhood item (at most the first two), named after
      the item's last text line and holding the dollar amount on its first line.

    Parameters
    ----------
    soup : parsed page (BeautifulSoup-like object).

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    AttributeError
        If the overview section or the address header is missing.
    """
    section = soup.find(class_="listing-subsection-overview")
    items = section.find_all(class_="ldp-key-fact-item")
    traits = [item.text.split() for item in items]

    # Each key fact is one row of tokens; after transposing, row 0 holds the
    # fact names, which become the column headers.
    df = pd.DataFrame(traits).transpose()
    headers = df.iloc[0]
    df = df.iloc[1:].rename(columns=headers).reset_index(drop=True)

    # Property name and address.
    address = soup.find(class_="ldp-header-address ").find(itemprop="address")
    name = address.text.strip().split('\n')[0]
    # name[:1] (not name[0]) so an empty first line does not raise IndexError.
    if name[:1].isnumeric():
        # First line is itself a street address, not a building name.
        df["Address"] = name + ' ' + get_address_str(address)
    else:
        df["Address"] = get_address_str(address)
        df["Name"] = name

    # Community / unit features. The feature list stays empty when the
    # section is missing, so the heading checks below can no longer hit an
    # unbound variable.
    section = soup.find(class_="listing-subsection-features")
    feature_columns = []
    if section is not None:
        feature_columns = get_features_list(section.find_all(class_="col-sm-6"))

    # Feature columns 0-1 are treated as community features and 2-3 as unit
    # features; each cell stores the combined list. No cast is needed for a
    # column to hold list values (the former `df.astype(object)` call threw
    # its result away and had no effect).
    if len(feature_columns) >= 2 and soup.find(text=re.compile("Community Features")):
        df["Community Features"] = [feature_columns[0] + feature_columns[1]] * len(df)
    if len(feature_columns) >= 4 and soup.find(text=re.compile("Unit Features")):
        df["Unit Features"] = [feature_columns[2] + feature_columns[3]] * len(df)

    # Number of rated schools and their mean rating; non-numeric entries are ignored.
    rating = [
        int(tag.text)
        for tag in soup.find_all(class_="school-rating")
        if tag.text.isnumeric()
    ]
    df["Number of Schools"] = len(rating)
    # Explicit NaN avoids numpy's "mean of empty slice" RuntimeWarning.
    df["Average School Rating"] = round(np.mean(rating), 2) if rating else np.nan

    # Neighborhood figures: first text line is the amount, last line the title.
    # Only the first two items are read; a page with a single item is handled too.
    for block in soup.find_all(class_="neighborhood-flex-item")[:2]:
        lines = block.text.strip().split('\n')
        title = lines[-1].strip()
        df[title] = float(lines[0].replace('$', '').replace(',', ''))

    return df

In [309]:
# Look for a <div> carrying the text "Property Features" on the sample page.
soup1.find("div", text="Property Features")

In [310]:
# Smoke test: listing-level traits extracted from the sample page.
trait_scrapper(soup1)

Unnamed: 0,Type,Address,Number of Schools,Average School Rating,Median Rental Price,Median Listing Price
0,Apartment,"8 W Howe Street 8 W Howe St, Seattle, WA 98119",5,8.4,2650.0,684990.0


In [311]:
def meta_extract(soup, info):
    """Return the whitespace-separated tokens of one property-meta entry.

    Looks inside the element with id ``ldp-property-meta`` for the ``<li>``
    whose ``data-label`` is ``property-meta-<info>`` (e.g. ``info="beds"``).

    Returns
    -------
    list[str] or None
        The stripped text of the entry split on whitespace, or ``None`` when
        either the meta container or the requested entry is absent.
    """
    meta = soup.find(id="ldp-property-meta")
    if meta is None:
        # Page has no meta block at all: same "not found" answer as a missing entry.
        return None

    entry = meta.find("li", attrs={"data-label": f"property-meta-{info}"})
    if entry is None:
        return None
    return entry.text.strip().split()

In [312]:
# Inspect the raw property-meta element of the sample page (the markup that
# meta_extract searches).
l = soup1.find(id="ldp-property-meta")
l

<div class="ldp-header-meta mobile-wrapper margin-bottom" id="ldp-property-meta">
<ul class="property-meta list-horizontal list-style-disc list-spaced" itemprop="description">
<li data-label="property-meta-beds">
<span class="data-value">0</span>
      beds
    </li>
<li data-label="property-meta-bath">
<span class="data-value">1</span>
      bath
    </li>
<li data-label="property-meta-sqft">
<span class="data-value">400</span> sq ft
    </li>
</ul>
</div>

In [322]:
def single_page_scraper(soup):
    """Scrape one listing page into a DataFrame with a fixed column layout.

    Listing-level traits come from ``trait_scrapper``. If the page has a
    non-empty floor-plan table (``floorplan_scrapper1``), the traits are
    repeated once per floor plan and joined column-wise with the floor-plan
    rows. Otherwise unit details are taken from the property-meta block
    (``meta_extract``) and the page's ``itemprop="price"`` element.

    Returns
    -------
    pandas.DataFrame
        Exactly the columns listed in ``cols`` below, in that order; values
        the page does not provide are NaN.
    """
    df_trait = trait_scrapper(soup)

    # Parse the floor plans once (previously parsed twice per page).
    df_floorplan = floorplan_scrapper1(soup)

    # An empty floor-plan table is treated like a missing one; replicating the
    # traits zero times would otherwise make pd.concat fail on an empty list.
    if df_floorplan is not None and len(df_floorplan) > 0:
        # Avoid expanding the trait rows again if they already match.
        if df_trait.shape[0] != df_floorplan.shape[0]:
            df_trait = pd.concat([df_trait] * df_floorplan.shape[0],
                                 ignore_index=True)
        # Trait columns on the left, floor-plan columns on the right.
        df1 = pd.concat([df_trait, df_floorplan], axis=1)
    else:
        df1 = df_trait
        beds = meta_extract(soup, "beds")
        bath = meta_extract(soup, "bath")
        sqft = meta_extract(soup, "sqft")
        pets = meta_extract(soup, "pets")
        if beds:
            df1["Room Type"] = beds[0] + "-bedroom"
        if bath:
            if len(bath) <= 2:
                df1["Bath"] = int(bath[0])
            else:
                # NOTE(review): the third token is counted as half baths
                # (0.5 each) - confirm against the page's wording.
                df1["Bath"] = int(bath[0]) + int(bath[2]) * 0.5
        if sqft:
            df1["Sqft"] = float(sqft[0].replace(',', ''))
        if pets:
            df1["Unit Features"] = ' '.join(pets)

        price_tag = soup.find(itemprop="price")
        if price_tag is not None:
            # A page without a price element leaves Price as NaN, like the
            # other optional fields above.
            df1["Price"] = float(price_tag.text.strip()
                                 .replace('$', '')
                                 .replace(',', ''))

    # Fixed output layout; reindex adds missing columns as NaN and drops any
    # extras, without the column-sorting FutureWarning that concatenating
    # with an empty frame produced.
    cols = ["Name", "Type", "Address", "Built",
            "Room Type", "Bath", "Sqft", "Price",
            "Number of Schools", "Average School Rating",
            "Median Rental Price", "Median Listing Price",
            "Community Features", "Unit Features"]
    return df1.reindex(columns=cols).reset_index(drop=True)

In [314]:
# Smoke test: run the full single-page scraper on the sample page and show
# the resulting one-listing DataFrame.
df1 = single_page_scraper(soup1)
df1

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




Unnamed: 0,Name,Type,Address,Built,Room Type,Bath,Sqft,Price,Number of Schools,Average School Rating,Median Rental Price,Median Listing Price,Community Features,Unit Features
0,,Apartment,"8 W Howe Street 8 W Howe St, Seattle, WA 98119",,studio,1.0,400.0,1245.0,5,8.4,2650.0,684990.0,,


## Let's try multiple pages

In [27]:
def url_scrapper(n_pages=22, out_path="urls.csv", pause=5):
    """Collect listing URLs from the Seattle apartment search results.

    Uses the module-level Selenium ``driver``. Starting from the first
    results page, the listing links of ``n_pages`` consecutive pages are
    gathered and written to ``out_path``, one URL per line.

    Parameters
    ----------
    n_pages : int
        Number of result pages to read, starting with the first one.
    out_path : str
        File that receives the URLs (overwritten).
    pause : float
        Seconds to wait after clicking "next" so the new page can render.

    Notes
    -----
    The first results page is now included: previously "next" was clicked
    before anything was read, so page 1 was never collected.
    """
    driver.get("https://www.realtor.com/apartments/Seattle_WA")

    urls = []
    for page in range(n_pages):
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        for link in soup.find_all(class_="photo-wrap"):
            anchor = link.find("a")
            # Skip cards without a usable link instead of raising TypeError.
            if anchor is not None and anchor.get("href"):
                urls.append("https://www.realtor.com" + anchor["href"])

        if page == n_pages - 1:
            break  # last requested page: nothing further to load

        # "class name" is the value of By.CLASS_NAME; the generic
        # find_element call is used because find_element_by_class_name was
        # removed in Selenium 4.3.
        next_button = driver.find_element("class name", "next ")
        next_button.click()
        time.sleep(pause)

    # Save the URL list, one per line; `with` closes the file even on error.
    with open(out_path, 'w') as url_file:
        for url in urls:
            url_file.write(url + '\n')

In [29]:
# Run the crawl: drives the browser through the search results and writes
# the collected links to urls.csv (slow - it pauses between pages).
url_scrapper()

In [44]:
# Load the scraped URLs, dropping blank lines and duplicates. dict.fromkeys
# keeps first-seen order, so slices such as urls[:10] select the same
# listings on every run (a set gave an arbitrary order), and filtering blanks
# cannot fail the way urls.remove('') did when no empty entry existed.
with open("urls.csv", 'r') as f:
    urls = list(dict.fromkeys(line.strip() for line in f if line.strip()))

# Keep a pickled copy of the de-duplicated list.
with open("urls.pkl", 'wb') as picklefile:
    pickle.dump(urls, picklefile)

In [317]:
def pickle_files(name):
    """Pickle the contents of a data file next to the original.

    A ``.csv`` file is loaded with ``pandas.read_csv``; any other file is
    read as text and split into a list of lines on ``'\\n'`` (so a trailing
    newline yields a final empty string). The object is written to the same
    path with the extension replaced by ``.pkl``.

    Parameters
    ----------
    name : str
        Path of the file to convert.
    """
    # splitext only looks at the last extension of the final path component,
    # so dotted directories ("v1.0/urls.txt", "./urls.txt") and names without
    # any extension are handled correctly.
    root, ext = os.path.splitext(name)

    if ext.lower() == ".csv":
        data = pd.read_csv(name)
    else:
        # `with` closes the handle, which the bare open() never did.
        with open(name, 'r') as f:
            data = f.read().split('\n')

    pkl = root + ".pkl"
    with open(pkl, 'wb') as picklefile:
        pickle.dump(data, picklefile)

In [49]:
# Preview the first few collected listing URLs.
urls[:5]

['https://www.realtor.com/realestateandhomes-detail/300-11th-Ave_Seattle_WA_98122_M10614-86517',
 'https://www.realtor.com/realestateandhomes-detail/5912-California-Ave-SW-Apt-101_Seattle_WA_98136_M21371-66358',
 'https://www.realtor.com/realestateandhomes-detail/924-16th-Ave_Seattle_WA_98122_M10894-90024',
 'https://www.realtor.com/realestateandhomes-detail/715-2nd-Ave-N_Seattle_WA_98109_M29948-78771',
 'https://www.realtor.com/realestateandhomes-detail/1909-Franklin-Ave-E_Seattle_WA_98102_M21110-14592']

In [62]:
# Column layout of the combined dataset: the columns returned by
# single_page_scraper followed by the source "url" of each listing.
cols = ["Name","Type","Address","Built",
           "Room Type","Bath","Sqft","Price",
           "Number of Schools","Average School Rating",
           "Median Rental Price", "Median Listing Price",
           "Community Features","Unit Features","url"]

In [335]:
# Scrape the first ten listing URLs into one DataFrame laid out as `cols`.
df = pd.DataFrame(columns = cols)

for url in urls[:10]:
    driver.get(url)
    # NOTE(review): this cell parses with "html5lib" while the sample-page
    # test cell uses 'html.parser' - confirm the difference is intentional.
    soup = BeautifulSoup(driver.page_source,"html5lib")
    df_property = single_page_scraper(soup)
    # Remember which page each row came from.
    df_property["url"] = url
    df = pd.concat([df, df_property], ignore_index=True)
    # The CSV is rewritten after every listing (presumably as a checkpoint
    # in case a later page fails).
    df.to_csv("rental_data2.csv")
    # Pause between page loads.
    time.sleep(2)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




In [318]:
# Pickle the scraped dataset; pickle_files writes rental_data.pkl.
# NOTE(review): the scraping loop saves "rental_data2.csv", not
# "rental_data.csv" - confirm which file is meant here.
pickle_files("rental_data.csv")

# List the working directory to check the generated files.
!ls