# Aggregates the metadata for apartments in an area

In [1]:
from selenium import webdriver
import pandas as pd
import os

## Create dictionaries that will represent information groupings 

In [2]:
# Column-oriented accumulators: every key maps to its own list, and the
# scraping functions below append one value per record to each list.

# One entry per property page.
vendor_info = {
    column: []
    for column in (
        "propertyName",
        "street",
        "city",
        "state",
        "zipCode",
        "neighborhood",
        "reviewScore",
        "reviewCount",
        "walkScore",
        "transitScore",
    )
}

# One entry per "rentRollup" element found on a property page.
aptType_info = {column: [] for column in ("propertyName", "room", "costRange")}

# One entry per amenity found on a property page.
amenites_info = {
    column: [] for column in ("propertyName", "amenitytype", "amenity")
}

# One entry per row of a property's availability table.
listing_info = {
    column: []
    for column in (
        "propertyName",
        "bedRoomType",
        "bedRoomNumber",
        "baths",
        "rent",
        "sqft",
        "availability",
        "deposit",
        "unit",
        "leaseLength",
        "name",
        "new",
        "applyNow",
    )
}

## Get vendor Information

In [3]:
def _parse_number(text):
    """Convert scraped numeric text to ``int`` (or ``float``) without ``eval``.

    Raises:
        ValueError: if ``text`` is not a plain number.
    """
    cleaned = text.strip()
    try:
        return int(cleaned)
    except ValueError:
        return float(cleaned)


def getVendorInformation(vendor_info, driver):
    """Append the property-level details of the loaded page to ``vendor_info``.

    Reads the property name, the address spans, the neighborhood link, the
    star rating / review count and the first two "score" elements from
    ``driver``. Every value is gathered before anything is appended, so an
    exception part-way through leaves ``vendor_info`` unchanged and its
    columns stay the same length.

    Args:
        vendor_info: dict of lists with the keys ``propertyName``, ``street``,
            ``city``, ``state``, ``zipCode``, ``neighborhood``,
            ``reviewScore``, ``reviewCount``, ``walkScore`` and
            ``transitScore``; one value is appended to each.
        driver: object exposing ``find_element_by_class_name`` and
            ``find_elements_by_class_name`` for the page being scraped.

    Raises:
        ValueError: if the zip code or a score is not numeric text.
        IndexError: if fewer address spans or score elements exist than are
            indexed below.
        Exception: whatever ``driver`` raises when a required element is
            missing (the rating block is optional and never raises).
    """
    propertyname = driver.find_element_by_class_name("propertyName").text

    address_field = driver.find_element_by_class_name("propertyAddress")
    address = address_field.find_elements_by_tag_name("span")

    # When the fourth span is an en dash the property name is used as the
    # street and city/state/zip start at index 0 (presumably the page then has
    # no separate street span -- confirm against the site markup).
    if address[3].text == '–':
        street = propertyname
        offset = 0
    else:
        street = address[0].text
        offset = 1

    city = address[offset].text
    state = address[offset + 1].text
    zipcode = int(address[offset + 2].text)

    neighborhood_field = driver.find_element_by_class_name("neighborhoodAddress")
    neighborhood = neighborhood_field.find_element_by_tag_name("a").text

    # The rating block is optional: any failure while reading it falls back
    # to zero stars and zero reviews.
    reviews = 0
    reviewCount = 0
    try:
        rating_field = driver.find_element_by_class_name("rating")
        reviews = len(rating_field.find_elements_by_class_name("starFullIcon"))

        count_text = rating_field.find_element_by_class_name("reviewCount").text
        count_text = count_text.replace('(', "").replace(')', "")
        reviewCount = int(count_text.split()[0])
    except Exception:
        reviews = 0
        reviewCount = 0

    # Scores are parsed as plain numbers; scraped page text must never be
    # handed to eval().
    score_fields = driver.find_elements_by_class_name("score")
    walkScore = _parse_number(score_fields[0].text)
    transitScore = _parse_number(score_fields[1].text)

    record = {
        "propertyName": propertyname,
        "street": street,
        "city": city,
        "state": state,
        "zipCode": zipcode,
        "neighborhood": neighborhood,
        "reviewScore": reviews,
        "reviewCount": reviewCount,
        "walkScore": walkScore,
        "transitScore": transitScore,
    }
    for column, value in record.items():
        vendor_info[column].append(value)

## Get Apartment type Information

### Step 1: Clean the rental values by collapsing any listed range into a single numerical value. For our purposes this will be a simple average of the range's two bounds.

In [4]:
def _parse_money(text):
    """Turn one scraped price fragment into a number, or ``None`` if blank.

    Strips ``$``, thousands separators and the ``/ Unit`` / ``/ Person``
    suffixes, then parses the remainder as ``int`` (falling back to ``float``).

    Raises:
        ValueError: if what remains is not a plain number.
    """
    cleaned = text
    for token in ("$", ",", "/ Unit", "/ Person"):
        cleaned = cleaned.replace(token, "")
    cleaned = cleaned.strip()
    if not cleaned:
        return None
    try:
        return int(cleaned)
    except ValueError:
        return float(cleaned)


def transformRent(rent):
    """Convert scraped rent text into a single number.

    Args:
        rent: text such as ``"$1,200"``, ``"$1,000 - 2,000 / Person"``,
            ``""`` or ``"Call for Rent"``. En dashes are treated as hyphens.

    Returns:
        ``0`` for blank text or ``"Call for Rent"``; the parsed amount for a
        single price; for a range, the average of its lower and upper bound
        truncated to ``int``. If only one bound of a range is present, that
        bound is returned.

    Raises:
        ValueError: if a price fragment is not numeric.
    """
    rent = rent.replace("–", "-")
    if rent.strip() in ("", "Call for Rent"):
        return 0

    if "-" in rent:
        parts = rent.split("-")
        # Average the two bounds (the previous code added the lower bound to
        # itself), ignoring a bound that is blank.
        bounds = [
            amount
            for amount in (_parse_money(parts[0]), _parse_money(parts[1]))
            if amount is not None
        ]
        if not bounds:
            return 0
        return int(sum(bounds) / len(bounds))

    amount = _parse_money(rent)
    return 0 if amount is None else amount

### Step 2 aggregate the basic apartment information on the page

In [5]:
def getAptInformation(propertyName, aptType_info, driver):
    """Append one row per "rentRollup" element on the page to ``aptType_info``.

    Each element's text is split into a room label and a price fragment; the
    price fragment is converted with ``transformRent``. A row is appended only
    after it has been parsed completely, so a parsing error cannot leave the
    three columns with different lengths.

    Args:
        propertyName: name stored alongside every row.
        aptType_info: dict of lists with the keys ``propertyName``, ``room``
            and ``costRange``.
        driver: object exposing ``find_elements_by_class_name``.
    """
    rollups = driver.find_elements_by_class_name("rentRollup")

    for rollup in rollups:
        data = rollup.text.split('$')
        if len(data) == 1:
            # No dollar amount: split on " C" and restore the "C" that the
            # split consumed (e.g. "... Call for Rent").
            data = data[0].split(" C")
            if len(data) > 1:
                data[1] = "C" + data[1]
            else:
                # No price text at all; transformRent maps "" to 0.
                data.append("")

        costRange = transformRent(data[1])
        room = data[0]

        aptType_info['propertyName'].append(propertyName)
        aptType_info["costRange"].append(costRange)
        aptType_info['room'].append(room)
    
    

## Get Amenities Information

In [6]:
def getAmenityInformation(propertyName, amenites_info, driver):
    """Append the amenities listed on the loaded page to ``amenites_info``.

    Looks up the first "specGroup" element and walks its "specList" children.
    For each child the ``<h3>`` text is the amenity type; the text of an
    optional ``<p>``, of every ``<li>`` inside an optional ``<ul>`` (with the
    leading bullet marker removed) and of an optional ``<h4>`` are each
    recorded as one amenity.

    Args:
        propertyName: name stored alongside every amenity.
        amenites_info: dict of lists with the keys ``propertyName``,
            ``amenitytype`` and ``amenity``.
        driver: object exposing ``find_element_by_class_name``.

    Returns:
        None. Collection is best-effort: if the section or a heading cannot
        be read the function stops quietly, keeping what was recorded so far.
    """
    def record(amenitytype, amenity):
        # Keep the three columns in lock-step.
        amenites_info["amenity"].append(amenity)
        amenites_info["propertyName"].append(propertyName)
        amenites_info["amenitytype"].append(amenitytype)

    try:
        section_field = driver.find_element_by_class_name("specGroup")
        specList = section_field.find_elements_by_class_name("specList")

        for spec in specList:
            amenitytype = spec.find_element_by_tag_name("h3").text

            # Each of the three sub-elements is optional; a failed lookup
            # simply means this spec has nothing of that kind to record.
            try:
                p_field = spec.find_element_by_tag_name("p")
                record(amenitytype, p_field.text)
            except Exception:
                pass

            try:
                ul_field = spec.find_element_by_tag_name("ul")
                for li in ul_field.find_elements_by_tag_name("li"):
                    record(amenitytype, li.text.replace("•\n", ""))
            except Exception:
                pass

            try:
                h4_field = spec.find_element_by_tag_name("h4")
                record(amenitytype, h4_field.text)
            except Exception:
                pass
    except Exception:
        return

## Get listing Information

### Step 1: Create helper functions that convert certain data points into numerical values

In [7]:
def transfromsqft(sqft):
    """Convert scraped square-footage text into a number.

    Args:
        sqft: text such as ``"850 Sq Ft"``, ``"1,000 - 1,200 Sq Ft"`` or
            ``""``. En dashes are treated as hyphens and the spaces around
            the dash are optional.

    Returns:
        ``0`` for blank text, an ``int`` for a single value, and the mean of
        the two bounds (a ``float``) for a range.

    Raises:
        ValueError: if a value is not an integer once ``Sq Ft`` and thousands
            separators are removed.
    """
    cleaned = sqft.replace("–", "-").replace("Sq Ft", "").replace(",", "").strip()
    if not cleaned:
        return 0
    if "-" in cleaned:
        bounds = cleaned.split("-")
        return (int(bounds[0]) + int(bounds[1])) / 2
    return int(cleaned)

In [8]:
def transformLeaseLength(lease):
    """Return the month count from text such as ``"12 Month Lease"``.

    Every occurrence of ``" Month Lease"`` is removed and the remainder is
    parsed with ``int``, which raises ``ValueError`` for non-numeric text.
    """
    months_text = lease.replace(" Month Lease", "")
    return int(months_text)

In [9]:
def transformBedRooms(bedroom):
    """Classify scraped bedroom text as ``(bedRoomType, bedRoomNumber)``.

    Text containing ``"Studio"`` yields ``("Studio", 1)``. Anything else must
    consist of exactly two whitespace-separated tokens, the first being an
    integer count, and yields ``("Traditional", count)``; other shapes raise
    ``ValueError``.
    """
    if "Studio" in bedroom:
        return ("Studio", 1)

    count_text, _label = bedroom.split()
    return ("Traditional", int(count_text))
    

In [10]:
def transformBathRooms(bathroom):
    """Convert scraped bathroom text such as ``"1½ Baths"`` into a number.

    Returns ``0`` for an empty string. Otherwise the first fraction glyph
    found (``½`` is checked before ``¼``) is removed and its value added to
    the integer count; the remaining text must be exactly two
    whitespace-separated tokens, else ``ValueError`` is raised.
    """
    if bathroom == "":
        return 0

    extra = 0
    for glyph, fraction in (('½', .5), ('¼', .25)):
        if glyph in bathroom:
            bathroom = bathroom.replace(glyph, '')
            extra = fraction
            break

    whole_text, _label = bathroom.split()
    return int(whole_text) + extra

In [11]:
def transformDeposit(deposit):
    """Parse deposit text such as ``"$1,500"`` into an ``int``.

    Dollar signs and commas are removed before parsing; ``int`` raises
    ``ValueError`` if anything non-numeric remains.
    """
    digits = deposit
    for symbol in ('$', ','):
        digits = digits.replace(symbol, '')
    return int(digits)

In [12]:
def transformAvailabiliy(status):
    """Return ``True`` only when ``status`` is exactly ``"Available Now"``."""
    is_available_now = (status == "Available Now")
    return is_available_now

### Step 2 aggregate all the listings from a given vendor on the page

In [13]:
def _optional_cell(row, class_name, default, transform=None):
    """Read an optional cell of a listing row, falling back to ``default``.

    Returns the text of the first element with ``class_name`` inside ``row``,
    passed through ``transform`` when one is given. Any failure (element
    missing, text that ``transform`` cannot parse) yields ``default``.
    """
    try:
        text = row.find_element_by_class_name(class_name).text
        return text if transform is None else transform(text)
    except Exception:
        return default


def getListingInformation(propertyName, listing_info, driver):
    """Append one row per "rentalGridRow" of the availability table.

    The beds, baths, rent and sqft cells are required and converted with the
    ``transform*`` helpers; deposit, unit, lease length, name, availability,
    "new" and "applyNow" are optional and fall back to ``0``, ``""``, ``0``,
    ``""``, ``True``, ``""`` and ``""`` respectively. A row is appended only
    after it has been read completely, so an exception cannot leave the
    columns of ``listing_info`` with different lengths.

    Args:
        propertyName: name stored alongside every listing.
        listing_info: dict of lists with the keys ``propertyName``,
            ``bedRoomType``, ``bedRoomNumber``, ``baths``, ``rent``, ``sqft``,
            ``availability``, ``deposit``, ``unit``, ``leaseLength``,
            ``name``, ``new`` and ``applyNow``.
        driver: object exposing ``find_element_by_class_name``.

    Raises:
        Exception: whatever ``driver`` or a ``transform*`` helper raises for
            the table itself or a required cell.
    """
    table_field = driver.find_element_by_class_name("availabilityTable")
    table_rows = table_field.find_elements_by_class_name("rentalGridRow")

    for row in table_rows:
        # Cells are read in the same order as before: required beds / baths /
        # rent first, sqft between the optional unit and lease-length cells.
        bedRoomType, bedRoomNumber = transformBedRooms(
            row.find_element_by_class_name("beds").text
        )
        bath = transformBathRooms(row.find_element_by_class_name("baths").text)
        rent = transformRent(row.find_element_by_class_name("rent").text)

        deposit = _optional_cell(row, "deposit", 0, transformDeposit)
        unit = _optional_cell(row, "unit", "")

        sqft = transfromsqft(row.find_element_by_class_name("sqft").text)

        leaseLength = _optional_cell(row, "leaseLength", 0, transformLeaseLength)
        name = _optional_cell(row, "name", "")
        available = _optional_cell(row, "available", True, transformAvailabiliy)
        new = _optional_cell(row, "new", "")
        applyNow = _optional_cell(row, "applyNow", "")

        record = {
            "propertyName": propertyName,
            "bedRoomType": bedRoomType,
            "bedRoomNumber": bedRoomNumber,
            "baths": bath,
            "rent": rent,
            "deposit": deposit,
            "unit": unit,
            "sqft": sqft,
            "leaseLength": leaseLength,
            "name": name,
            "availability": available,
            "new": new,
            "applyNow": applyNow,
        }
        for column, value in record.items():
            listing_info[column].append(value)
        

## Create Web Driver

In [14]:
# Location of the chromedriver executable, expressed relative to the current
# working directory.
driverpath = os.path.relpath("./drivers/chromedriver.exe") 
# Start the Chrome session that every scraping function below receives as
# its ``driver`` argument.
driver = webdriver.Chrome(executable_path=driverpath)

## load data links

In [15]:
# CSV of listing links for the area; the scraping loop reads the URL from
# column index 1 of each row.
apartment_links_path = os.path.relpath("./data/Seattle, WA_links.csv")
apartment_links = pd.read_csv(apartment_links_path)

## Aggregate data from links

In [26]:
import time

# Row of ``apartment_links`` at which scraping starts (presumably the resume
# point after an earlier, interrupted run -- adjust when starting over).
START_INDEX = 529

# ``i`` counts the vendors recorded so far. Deriving it from the accumulator
# keeps it correct on a fresh kernel, where no earlier cell has defined it.
i = len(vendor_info["propertyName"])

for link in apartment_links.values[START_INDEX:]:
    url = link[1]
    print(url)

    try:
        driver.get(url)

        # Test to check if listing still exists: pages without a
        # "nearbySection" element are skipped.
        driver.find_element_by_class_name("nearbySection")
        # Pause before scraping (presumably to let the page finish loading).
        time.sleep(2)
    except Exception:
        # Not a bare ``except`` so that Ctrl-C can still stop the run.
        continue

    getVendorInformation(vendor_info, driver)

    # The vendor just appended is always the last entry, whatever ``i`` is.
    propertyName = vendor_info["propertyName"][-1]

    getAptInformation(propertyName, aptType_info, driver)

    getAmenityInformation(propertyName, amenites_info, driver)

    getListingInformation(propertyName, listing_info, driver)
    i += 1


https://www.apartments.com/939-martin-luther-king-jr-way-s-seattle-wa-unit-1/05d69p0/
https://www.apartments.com/1808-minor-ave-seattle-wa-unit-3206/6n37bgt/
https://www.apartments.com/329-23rd-ave-s-seattle-wa-unit-407/6v3yjms/
https://www.apartments.com/6706-17th-ave-sw-seattle-wa/k3eehcb/
https://www.apartments.com/6236-36th-ave-ne-seattle-wa/gsr7nfy/
https://www.apartments.com/1808-minor-ave-seattle-wa-unit-1406/v3vj52e/
https://www.apartments.com/5015-38th-ave-ne-seattle-wa-unit-cottage/fzm7csr/
https://www.apartments.com/5015-38th-ave-ne-seattle-wa-unit-2nd-floor/v7n83hf/
https://www.apartments.com/8372-wabash-ave-s-seattle-wa/whw9phj/
https://www.apartments.com/403-roy-st-seattle-wa-unit-8/1x96zc7/
https://www.apartments.com/6244-25th-ave-ne-seattle-wa/99k5tq5/
https://www.apartments.com/1110-8th-ave-seattle-wa-unit-10/glb7ezw/
https://www.apartments.com/park-south-seattle-wa/39gy0kf/
https://www.apartments.com/1102-8th-ave-seattle-wa-unit-5/v6cg46f/
https://www.apartments.com/3

https://www.apartments.com/2345-ne-104th-way-seattle-wa/6t2en6v/
https://www.apartments.com/3018-nw-85th-st-seattle-wa/dbs35wh/
https://www.apartments.com/1115-17th-ave-seattle-wa/6mfmhpr/
https://www.apartments.com/bowman-seattle-wa/3xvcwwe/
https://www.apartments.com/6012-5th-ave-nw-seattle-wa/2ely0z9/
https://www.apartments.com/11038-lake-city-way-ne-seattle-wa/3dbwrxg/
https://www.apartments.com/13341-15th-ave-ne-seattle-wa/mwkxd7h/
https://www.apartments.com/611-n-50th-st-seattle-wa/hvftj51/
https://www.apartments.com/4001-ne-50th-st-seattle-wa/140yt06/
https://www.apartments.com/4051-gilman-ave-w-seattle-wa/vlw403f/
https://www.apartments.com/7101-roosevelt-way-ne-seattle-wa-unit-107/n04g75m/
https://www.apartments.com/1-br-1-bath-condo-1000-aurora-ave-n-seattle-wa/1f2cp37/
https://www.apartments.com/8626-10th-ave-sw-seattle-wa/v6z2cvg/
https://www.apartments.com/1038-lakeview-blvd-e-seattle-wa-unit-b/56cg9wm/
https://www.apartments.com/ascent-south-lake-union-seattle-wa/gg967yx/

In [24]:
# Show the current value of the counter that the scraping loop increments
# once per fully processed link.
print(i)

528


## Create dataframes 

In [27]:
# Every output file is written as "./data/<area>_<table>.csv".
area = "Seattle, WA"
path = "./data/"+area

# Property-level table: one column per key of ``vendor_info``.
vendor_info_df = pd.DataFrame(vendor_info)
vendor_csv_path = path + "_vendor_info.csv"
vendor_info_df.to_csv(vendor_csv_path)

In [28]:
# Apartment-type table: one column per key of ``aptType_info``.
aptType_info_df = pd.DataFrame(aptType_info)
aptType_csv_path = path + "_aptType_info.csv"
aptType_info_df.to_csv(aptType_csv_path)

In [29]:
# Amenity table: one column per key of ``amenites_info``.
amenites_info_df = pd.DataFrame(amenites_info)
amenites_csv_path = path + "_amenites_info.csv"
amenites_info_df.to_csv(amenites_csv_path)

In [30]:
# Listing table: one column per key of ``listing_info``.
listing_info_df = pd.DataFrame(listing_info)
listing_csv_path = path + "_listing_info.csv"
listing_info_df.to_csv(listing_csv_path)

In [31]:
# End the whole WebDriver session. ``close()`` only closes the current
# window, whereas ``quit()`` closes every window and shuts the driver down.
driver.quit()