### Import Packages

In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import pickle
import requests
from bs4 import BeautifulSoup as bs
import time, os
import re
import random

import pandas as pd

chromedriver = "/Applications/chromedriver" 
os.environ["webdriver.chrome.driver"] = chromedriver

In [None]:
# request the html from the website

url = 'https://www.cityrealty.com/nyc/apartments-for-rent/search-results#?page=1'

response = requests.get(url)

if response.status_code == 200:
    print(response.text[:300])
else: 
    print(f'opps! Received status code {status}')

In [2]:
# I searched any apartments in NYC. Each search result page shows around 50 apartments. 
# so I need to loop through the pages to get 1000+ apartments.

results_page = 'https://www.cityrealty.com/nyc/apartments-for-rent/search-results#?page='
apt_page = []

for page in range (1, 51):
    driver = webdriver.Chrome(chromedriver)
    driver.get(results_page + str(page))
    soup = bs(driver.page_source, 'html.parser')
    for link in soup.find_all('a', class_ = "ng-scope"):
        if re.match('^/nyc/', link['href']):
            apt_page.append(link['href'])
    
    driver.close()
    time.sleep(.5+2*random.random())

In [3]:
len(apt_page) # shows how many apartment pages are in the list

1500

In [4]:
# some of apartment in jersey city or hoboken nearby nyc are included so let's drop those.

for i in apt_page:
    if 'jersey-city' in i:
        apt_page.remove(i)
    elif 'hoboken' in i:
        apt_page.remove(i)
        
len(apt_page)

1430

In [None]:
# pickle it in case I need the list of apartment in the future. 
# with open('apt_page.pickle', 'wb') as apt_page_list:
#     pickle.dump(apt_page, apt_page_list)

In [None]:
# let's scrape each page for every single apartment
apt_pipeline = []

url_base = 'https://www.cityrealty.com'
url_list = [url_base + str(i) for i in apt_page]

for url in url_list:
    driver = webdriver.Chrome(chromedriver)
    driver.get(url)
    soup = bs(driver.page_source, 'html.parser')
    
    # each apartment page may show a subscribe pop-up page which prevents from scraping the data at first hand.
    # if the subscribe page exists, I need to click the close buttom to remove that pop up screen.
    close = driver.find_element_by_xpath('//*[@id="lst"]/registration/div[2]/div[1]/i')
    
    if close.is_displayed():
        close.click()
    else:
        None
        
    
    if soup.find('title') == None:
        address = 'missing'
    else: address = soup.find('title').text.split(',')[0]
    
    if soup.find('span', class_ = "_content _price") == None:
        price = 'missing'
    else: price = soup.find('span', class_ = "_content _price").text.replace('$','')\
                .replace(',','').split('\n')[1]
    
    if soup.find('span', class_= "_content _beds") == None:
        num_beds = 'missing'
    else: num_beds = soup.find('span', class_= "_content _beds").text.replace('\n', '').split(' ')[0]
    
    if soup.find('span', class_= "_content _baths") == None:
        num_baths = 'missing'
    else: num_baths = soup.find('span', class_= "_content _baths").text\
                      .replace('\n', '').replace(',', '').split(' ')[1]
    
    if soup.find('span', class_ = "_option") == None:
        sq_ft = 'missing'
    else: sq_ft = soup.find('span', class_ = "_option").next.next.replace(' ft', '')

    if soup.find('span', class_ = "_init") == None:
        listed_date = 'missing'
    else: listed_date = soup.find('span', class_ = "_init").text.replace('Listed ', '')
    
    if soup.find('i', class_ = "fa fa-map-signs") == None:
        neighborhood = 'missing'
    else: neighborhood = soup.find('i', class_ = "fa fa-map-signs")\
            .next.next.next.text.replace('\n', '')
    
    if soup.find('i', class_ = "fa fa-building-o big") == None:
        fee = 'missing'
    else: fee = soup.find('i', class_ = "fa fa-building-o big").next
    
    if soup.find('i', class_ = "fa fa-wrench") == None:
        built_yr = 'missing'
    else: built_yr = soup.find('i', class_ = "fa fa-wrench").next.split(' ')[2]
        
        
    if soup.find('i', class_ = "fa fa-key big") == None:
        unit = 'missing'
    else: unit = soup.find('i', class_ = "fa fa-key big").next.split(' ')[0]
        
    if soup.find('li', class_ = "last") == None:
        floors = 'missing'
    else: floors = soup.find('li', class_ = "last").text.split(' ')[0]
    
    if soup.find('span', class_ = 'value') == None:
        listed_date = 'missing'
    else: listed_date = soup.find('span', class_ = 'value').text
    #parsing a list of amentities and number of amenities
    
    if soup.find('span', class_ = 'value') == None:
        listed_date = 'missing'
    else: listed_date = soup.find('span', class_ = 'value').text.replace('\n', '')
        
    amenities = []
    if soup.find('ul', class_ = 'w_list') == None:
        amenity_root = 'missing'
    else: amenity_root = soup.find('ul', class_ = 'w_list').text.split('\n')
    

    if amenity_root == 'missing':
        None
    else:
        while "" in amenity_root:
            amenity_root.remove("")
    
    if amenity_root == 'missing':
        None
    else: amenity_list = [amenity.strip(' ') for amenity in amenity_root]
    
    
    num_amenities = 0
    
    if amenity_root == 'missing':
        num_amenities = 0
    else:
        for i in amenity_list:
            num_amenities = num_amenities +1
    
    if soup.find("span", class_ = "distance") == None:
        closest_dist = 'missing'
    else: closest_dist = soup.find("span", class_ = "distance").text.split(' ')[0]
        
    apt_pipeline.append({'address': address, 'price': price, 'num_beds': num_beds,
                     'num_baths': num_baths, 'sq_ft': sq_ft, 'listed_date': listed_date,
                     'neighborhood': neighborhood,
                     'fee': fee, 'built_yr': built_yr, 'unit': unit,
                     'floors': floors, 'num_amenities': num_amenities,
                     'closest_dist_station': closest_dist, 'url':url}) 
    
    driver.close()
    time.sleep(.5+2*random.random()) # this breaks some time between loading each page.        

In [None]:
apt_pipeline

In [None]:
pipeline_df = pd.DataFrame(apt_pipeline)

# for col in pipeline_df.columns:
#   pipeline_df.loc[0, col] in pipeline_df[col].values  #first required post
#   pipeline_df.loc[999, col] in pipeline_df[col].values #last required post

pipeline_df.info()

In [None]:
#save as a csv file 
pipeline_df.to_csv('apt_info2.csv', sep='\t', encoding='utf-8', header='true')