## MSDS696 Data Science Practicum II
### Clustering and Linear Regression with Real Estate Data
### Part 1a - Web Scraping a Real Estate Website

### Libraries

In [1]:
from bs4 import BeautifulSoup #library for webscraping (take care of html)
from requests import get #HTTP library for making http requests in python.
from time import sleep # use for pauses during code execution
from random import randint # use for psudo-random number generation
from selenium import webdriver # use for web-scraping
#import re # library for regular expressions
import regex as re # library for regular expressions
import pandas as pd # library for data analysis and manipulation
import numpy as np # library for working with arrays
import itertools # library that implements iterator building blocks

import matplotlib.pyplot as plt # library for ploting data
import seaborn as sns # library for plotting data
sns.set()


### Web-Scraping Part 1:  Getting a list of property URLs

The website I will be scraping from is REALTOR.com, searching for properties in Douglas County, Colorado.

"https://www.realtor.com/realestateandhomes-search/Douglas-County_CO"

The search returns 37 pages of results (1,544 Homes), which means that I am going to need to create code that iterates through all 37 pages to collect the html that identifies the URL for each property that can be used in another scrape to collect the data I am interested in. 

The Steps for part 1 of the web scrape will be:

   1) use "get" to obtain the HTML code from the website.
   
   2) use BeautifulSoup and RE to parse the HTML for the property URL

In [3]:
# Be polite when scrapping! - try to not overwhelm the website.

# pass the header in the "get" command to mimic actual user behavior when web scrapping.  
headers = ({'User-Agent':
            'Mozilla/5.0 (Linux; U; Android 4.0.4; en-gb; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'}) 

            #'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'})

# set the location to scrape as the website with the filtered information (first page of results)
website = "https://www.realtor.com/realestateandhomes-search/Douglas-County_CO"

# use get from the response library to extract the htlm from the first page of the website referenced.
#response = get(website, headers=headers)
response = webdriver.Firefox()
res_get = response.get(website)

# view the response code
print(response)

WebDriverException: Message: 'geckodriver' executable needs to be in PATH. 


In [None]:
# View the html returned from the get command.
print(response.text[:1000])

In [None]:
# Use BeautifulSoup to parse though the html obtained using get.
soup = BeautifulSoup(response.content, "html.parser")

# Getting the relevant links from the html - identified using "inspect" from google chrome on the website
# Example:  <a rel="noopener" href="/realestateandhomes-detail/9396-Desert-Willow-Rd_Highlands-Ranch_CO_80129_M16601-58901" target="_self" data-testid="property-anchor"><picture><source srcset="https://ap.rdcpix.com/74d2956d9a69906854ddf4fa93ecf3efl-m2340783516od-w480_h360.webp, https://ap.rdcpix.com/74d2956d9a69906854ddf4fa93ecf3efl-m2340783516od-w480_h360_x2.webp 2x" type="image/webp" data-testid="img-webp"><img alt="9396 Desert Willow Rd, Highlands Ranch, CO 80129 with Three Car Garage" data-src="https://ap.rdcpix.com/74d2956d9a69906854ddf4fa93ecf3efl-m2340783516od-w480_h360.jpg" src="https://ap.rdcpix.com/74d2956d9a69906854ddf4fa93ecf3efl-m2340783516od-w480_h360.jpg" srcset="https://ap.rdcpix.com/74d2956d9a69906854ddf4fa93ecf3efl-m2340783516od-w480_h360.jpg, https://ap.rdcpix.com/74d2956d9a69906854ddf4fa93ecf3efl-m2340783516od-w480_h360_x2.jpg 2x" itemprop="image" class="fade bottom" data-label="pc-photo" data-atf="false" data-fmp="false"></picture><picture><source srcset="https://ap.rdcpix.com/74d2956d9a69906854ddf4fa93ecf3efl-m793780717od-w480_h360.webp, https://ap.rdcpix.com/74d2956d9a69906854ddf4fa93ecf3efl-m793780717od-w480_h360_x2.webp 2x" type="image/webp" data-testid="img-webp"><img alt="9396 Desert Willow Rd, Highlands Ranch, CO 80129 with Three Car Garage" data-src="https://ap.rdcpix.com/74d2956d9a69906854ddf4fa93ecf3efl-m793780717od-w480_h360.jpg" src="https://ap.rdcpix.com/74d2956d9a69906854ddf4fa93ecf3efl-m793780717od-w480_h360.jpg" srcset="https://ap.rdcpix.com/74d2956d9a69906854ddf4fa93ecf3efl-m793780717od-w480_h360.jpg, https://ap.rdcpix.com/74d2956d9a69906854ddf4fa93ecf3efl-m793780717od-w480_h360_x2.jpg 2x" itemprop="image" class="fade top" data-label="pc-photo" data-atf="true" data-fmp="false"></picture></a>

links_html = soup.find_all('a', rel = "noopener")


In [None]:
# view the object type-
type(links_html)

In [None]:
# Turning the soup into a list of strings using str(x)
links_v1 = [str(x) for x in links_html]

In [None]:
# view the object type-
type(links_v1)

In [None]:
# view the size of the object
len(links_v1)

In [None]:
# View the first few items in the list
links_v1[0:6]

In [None]:
# Extract the property URL from the html
# Use regex to identify a specific string
url = '<a.*href="(.*)"noopener"'

# Use a list comprehension to loop through each item in the list and search for the regex pattern.
links_v2a = [re.search(url, x) for x in links_v1]

# Use a list comprehension to loop through each item in the list to remove the "None" matches (allow for use of .group)
links_v2b = [i for i in links_v2a if i]

# Use a list comprehension to change each item in the list to a string
links_v2c = [str(x) for x in links_v2b]

In [None]:
# view the object type-
type(links_v2c)

In [None]:
# view the first few items in the list
links_v2c[1:10]

In [None]:
# Extract the property URL from the html

pattern = 'href="(.*)".rel="noopener"' #- use with group(1)
links_v2d = [re.search(pattern, x).group(1) for x in links_v2c]

In [None]:
# view the object type-
type(links_v2d)

In [None]:
# view the first few items in the list
links_v2d[1:10]

In [None]:
# The URL needs the website added to the beginning of the string
pre_url = 'https://www.realtor.com'
pre_url += '% s'
web_url =  [pre_url % i for i in links_v2d]
web_url[0:10]


In [None]:
# Append the URL's individually to a list (this preps the list for the next step of iterating through multiple web pages)
url_list = []

for url in web_url:
    url_list.append(url)

In [None]:
# View how many items are in the url list
len(url_list)

In [None]:
# View the conent of the URL list
url_list

In [None]:
len(url_list)

#### Set up a for loop to iterate through multiple web pages

In [None]:
#Create an numpy array of values 1 through 37 to match the number of pages of results from the website
pages = np.arange(2, 39)
print(pages)

In [None]:
# view the object type
type(pages)

In [None]:
# Create a for loop to iterate through each of the 38 pages.

for page in pages:
       
    url_page_lp = get("https://www.realtor.com/realestateandhomes-search/Douglas-County_CO/pg-" + str(page), headers=headers)
    
    soup_lp = BeautifulSoup(url_page_lp.content, "html.parser")
    
    links_html_lp = soup_lp.find_all('a', rel = "noopener")
    
    links_v1_lp = [str(x) for x in links_html_lp]
    
    # Extract the property URL from the html
    # Use regex to identify a specific string
    url_lp = '<a.*href="(.*)"noopener"'

    # Use a list comprehension to loop through each item in the list and search for the regex pattern.
    links_v2a_lp = [re.search(url_lp, x) for x in links_v1_lp]

    # Use a list comprehension to loop through each item in the list to remove the "None" matches (allow for use of .group)
    links_v2b_lp = [i for i in links_v2a_lp if i]

    # Use a list comprehension to change each item in the list to a string
    links_v2c_lp = [str(x) for x in links_v2b_lp]
    
    # Extract the property URL from the html

    pattern_lp = 'href="(.*)".rel="noopener"' #- use with group(1)
    links_v2d_lp = [re.search(pattern_lp, x).group(1) for x in links_v2c_lp]
    
    pre_url_lp = 'https://www.realtor.com'
    pre_url_lp += '% s'
    web_url_lp =  [pre_url_lp % i for i in links_v2d_lp]
    
    for url_item in web_url_lp:
        url_list.append(url_item)
    
    print(page)

    
    sleep(randint(60,120))

In [None]:
# View the number of items in the list
len(url_list)

In [None]:
# View the first few items in the list
url_list[0:300]

In [None]:
# Identify if there are any duplicated URLs in the list and create a new list with only unique URLs
url_unique = []
for i in url_list:
    if i not in url_unique:
        url_unique.append(i)

In [None]:
# View the number of items in the new list
len(url_unique)

In [None]:
# View the fir
url_unique

### REFERENCE

###### Realtor.com Robot.txt file
https://www.realtor.com/robots.txt

###### Webscraping real estate market data
https://data4help.medium.com/webscraping-real-estate-market-data-515c0b85b494

###### Scraping a real estate website
https://towardsdatascience.com/looking-for-a-house-build-a-web-scraper-to-help-you-5ab25badc83e

##### Scraping Realtor.com
https://www.proxiesapi.com/blog/scraping-listings-from-realtor-with-python-and-bea.html.php

###### Scraping multiple pages of a website
https://betterprogramming.pub/how-to-scrape-multiple-pages-of-a-website-using-a-python-web-scraper-4e2c641cff8

###### How to web scrape, and avoid being tagged as a bot:
https://www.scrapehero.com/how-to-prevent-getting-blacklisted-while-scraping/

