In [1]:
import re
import time
from datetime import date
from urllib.parse import quote

import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import credentials

In [2]:
def accept_cookies_if_asked(driver):
    """Click the cookie-consent "accept" button if it can be found.

    This is deliberately best-effort: any ordinary error raised while
    locating or clicking the button (for example because no banner is
    shown) is ignored and the function returns ``None``.

    Args:
        driver: object exposing ``find_element(by, value)`` whose result has
            a ``click()`` method (a Selenium WebDriver in this notebook).
    """
    try:
        driver.find_element(By.ID, "onetrust-accept-btn-handler").click()
    except Exception:
        # Unlike a bare ``except:``, this still lets KeyboardInterrupt and
        # SystemExit propagate, so the notebook cell can be interrupted.
        pass

In [3]:
class Data:
    """Accumulates scraped rental listings and prepares them for loading.

    The listings are stored as parallel lists: index ``i`` of ``ids``,
    ``prices``, ``address``, ``postcodes``, ``latitudes`` and ``longitudes``
    all describe the same listing. The first three are filled by
    ``add_page_data``; the last three by ``transform_location_data``.
    """

    # First run of digits (optionally with thousands separators) in a label.
    _PRICE_PATTERN = re.compile(r'\d[\d,]*')

    def __init__(self):
        self.ids = []
        self.prices = []
        self.address = []
        self.postcodes = []
        self.latitudes = []
        self.longitudes = []

    def add_page_data(self, driver):
        """Append id, price label and address of every listing on the current page.

        Args:
            driver: Selenium WebDriver positioned on a search-results page.
        """
        cards = driver.find_elements(By.CLASS_NAME, "l-searchResult")
        for card in cards:
            # Read all three fields before appending any of them, so that a
            # card with a missing element cannot leave the parallel lists
            # with different lengths.
            prop_id = card.find_element(By.CLASS_NAME, "propertyCard-anchor").get_attribute('id')
            price = card.find_element(By.CLASS_NAME, "propertyCard-priceValue").text
            address = card.find_element(By.CLASS_NAME, "propertyCard-address").get_attribute('title')
            self.ids.append(prop_id)
            self.prices.append(price)
            self.address.append(address)

    def transform_location_data(self):
        """Geocode every address and rebuild postcodes, latitudes and longitudes.

        One HTTP request is sent per address. A field missing from the
        response is stored as ``None`` so the lists stay aligned with
        ``address``. The three attributes are only replaced once every
        address has been processed.

        Raises:
            Exception: if the geocoding service answers with a status >= 300.
        """
        postcodes, latitudes, longitudes = [], [], []
        for addy in self.address:
            # Percent-encode the whole query (not just spaces) so characters
            # such as '&' or '#' in an address cannot alter the URL.
            query = quote(addy + ", London, UK", safe=',')
            r = requests.get(
                rf'https://dev.virtualearth.net/REST/v1/Locations?q={query}&key={credentials.bingmaps_api_key}',
                timeout=30,  # never hang the notebook on a stalled connection
            )
            if r.status_code >= 300:
                print(r.status_code)
                raise Exception(r.reason)
            postcode, latitude, longitude = self._parse_location(r.json())
            postcodes.append(postcode)
            latitudes.append(latitude)
            longitudes.append(longitude)
        self.postcodes = postcodes
        self.latitudes = latitudes
        self.longitudes = longitudes

    @staticmethod
    def _parse_location(payload):
        """Return ``(postcode, latitude, longitude)`` from a decoded response; ``None`` for missing parts."""
        try:
            resource = payload['resourceSets'][0]['resources'][0]
        except (KeyError, IndexError):
            # The geocoder returned no match for this address.
            return None, None, None
        postcode = resource.get('address', {}).get('postalCode')
        try:
            latitude, longitude = resource['geocodePoints'][0]['coordinates'][:2]
        except (KeyError, IndexError, ValueError):
            latitude = longitude = None
        return postcode, latitude, longitude

    def transform_prices(self):
        """Convert every price label (e.g. ``'£1,750 pcm'``) to an ``int``.

        Values that are already integers are kept, so the method can be run
        more than once.

        Raises:
            ValueError: if a label contains no digits.
        """
        self.prices = [self._parse_price(price) for price in self.prices]

    @classmethod
    def _parse_price(cls, price):
        """Return the integer amount contained in one price label."""
        if isinstance(price, int):
            return price
        match = cls._PRICE_PATTERN.search(price)
        if match is None:
            raise ValueError(f"no numeric price found in {price!r}")
        return int(match.group().replace(',', ''))

    @staticmethod
    def _numeric_id(prop_id):
        """Strip the ``'prop'`` marker from an element id and return it as ``int``."""
        return int(prop_id.replace('prop', ''))

    @staticmethod
    def _round_coord(value):
        """Round a coordinate to 8 decimals, passing ``None`` through."""
        return None if value is None else round(value, 8)

    def make_df(self):
        """Return all six attributes as the columns of a ``pandas.DataFrame``."""
        return pd.DataFrame({'id': self.ids, 'price': self.prices, 'address': self.address,
                             'postcode': self.postcodes, 'latitude': self.latitudes, 'longitude': self.longitudes})

    def zip_property(self):
        '''This method zips up all of the attributes needed to
        insert into the property table in the database.

        Returns a list of ``(prop_id, address, postcode, latitude, longitude)``
        tuples; coordinates are rounded to 8 decimals.'''
        return [
            (self._numeric_id(prop_id), address, postcode,
             self._round_coord(lat), self._round_coord(lon))
            for prop_id, address, postcode, lat, lon
            in zip(self.ids, self.address, self.postcodes, self.latitudes, self.longitudes)
        ]

    def zip_price(self, date_today):
        '''This method zips up all of the attributes needed to
        insert into the price table in the database.

        Returns a list of ``(prop_id, str(date_today), price)`` tuples.'''
        return [(self._numeric_id(prop_id), str(date_today), self.prices[i])
                for i, prop_id in enumerate(self.ids)]



In [4]:
# Scrape every page of the search results into `data`.
date_today = date.today()
url = r'''https://www.rightmove.co.uk/property-to-rent/find.html?locationIdentifier=REGION%5E92829&maxBedrooms=2&minBedrooms=2&propertyTypes=&includeLetAgreed=false&mustHave=&dontShow=&furnishTypes=&keywords='''
NEXT_PAGE_XPATH = "//button[@title='Next page']"

driver = webdriver.Chrome()
try:
    driver.get(url)

    # Give the cookie banner a moment to render before trying to dismiss it.
    time.sleep(2)
    accept_cookies_if_asked(driver)

    data = Data()
    data.add_page_data(driver)
    pages_scanned = 1

    button = driver.find_element(By.XPATH, NEXT_PAGE_XPATH)

    # The "Next page" button is disabled on the last page of results.
    while button.is_enabled():
        WebDriverWait(driver, 20).until(EC.element_to_be_clickable(button))
        button.click()
        accept_cookies_if_asked(driver)
        # NOTE(review): this wait can be satisfied by cards of the previous
        # page if the results have not been replaced yet - confirm the site
        # re-renders before the wait returns.
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, 'propertyCard-anchor')))
        data.add_page_data(driver)
        pages_scanned += 1
        # Look the button up again after each page change.
        button = driver.find_element(By.XPATH, NEXT_PAGE_XPATH)
finally:
    # Always end the browser session, even when a step above raised;
    # quit() also shuts down the driver process, which close() does not.
    driver.quit()



In [5]:
# Display how many result pages the scraping loop visited.
pages_scanned

42

In [6]:
# Convert the scraped price labels to integers (replaces data.prices).
data.transform_prices()
# Geocode the addresses: sends one HTTP request per entry in data.address.
data.transform_location_data()

In [7]:
import pyodbc

In [8]:
# Open the database connection; it stays open for the price load that follows.
endpoint = credentials.db_endpoint
conn = pyodbc.connect('Driver={SQL Server};'
                  'Server='+endpoint+';'
                  'Database='+credentials.db_database+';'
                  'UID='+credentials.db_user+';'
                  'PWD='+credentials.db_password+';'
                  'Trusted_Connection=no;')

# load to property table
# Insert only properties whose prop_id is not already in dbo.property.
PROPERTY_INSERT_SQL = '''
                    WITH source AS (
                    SELECT * FROM (VALUES (?, ?, ?, ?, ?)) s(prop_id, address, postcode, latitude, longitude)
                    )
                    INSERT INTO dbo.property
                    (prop_id, address, postcode, latitude, longitude)
                    SELECT source.prop_id, source.address, source.postcode, 
                    source.latitude, source.longitude 
                    FROM source
                    WHERE NOT EXISTS
                    (SELECT 1 FROM dbo.property target
                    WHERE target.prop_id = source.prop_id)
                   '''
property_tuples = data.zip_property()
cursor = conn.cursor()
try:
    # executemany rejects an empty parameter list, so skip when nothing was scraped.
    if property_tuples:
        cursor.executemany(PROPERTY_INSERT_SQL, property_tuples)
        conn.commit()
except Exception:
    # Leave no half-applied batch behind on the still-open connection.
    conn.rollback()
    raise
finally:
    cursor.close()

In [22]:
# load to price table
# Insert today's price per property unless that (prop_id, date) row already exists.
PRICE_INSERT_SQL = '''
                    WITH source AS (
                    SELECT * FROM (VALUES (?, ?, ?)) s(prop_id, date, price)
                    )
                    INSERT INTO dbo.price
                    (prop_id, date, price)
                    SELECT source.prop_id, source.date, source.price 
                    FROM source
                    WHERE NOT EXISTS
                    (SELECT 1 FROM dbo.price target
                    WHERE target.prop_id = source.prop_id
                    AND target.date = source.date)
                   '''
# Use the class method instead of repeating its comprehension inline.
price_tuples = data.zip_price(date_today=date_today)
try:
    cursor = conn.cursor()
    try:
        # executemany rejects an empty parameter list, so skip when nothing was scraped.
        if price_tuples:
            cursor.executemany(PRICE_INSERT_SQL, price_tuples)
            conn.commit()
    finally:
        cursor.close()
finally:
    # Last database step of the notebook: release the connection even on
    # failure (closing discards any uncommitted work).
    conn.close()