In [36]:
# import the necessary libraries
from bs4 import BeautifulSoup
import requests
import json
import re
import datetime

In [2]:
# Some pages embed their data as JSON inside a <script> tag; this function extracts and decodes that JSON
def javascript_html_parse(source_arg):
	"""Extract the page's preloaded state and return it as a dictionary.

	The page assigns a JSON literal to ``window.__PRELOADED_STATE__`` inside a
	``<script>`` tag. This function slices out the text between that
	assignment and the next ``</script`` and decodes it with ``json.loads``.

	Args:
		source_arg: Full HTML text of the page.

	Returns:
		The decoded JSON value.

	Raises:
		ValueError: If the opening marker or the closing ``</script`` cannot
			be found. ``json.JSONDecodeError`` (a ``ValueError`` subclass) is
			raised when the enclosed text is not valid JSON.
	"""
	start = "<script>window.__PRELOADED_STATE__ = "
	end = "</script"

	start_index = source_arg.find(start)
	if start_index == -1:
		# str.find returns -1 on a miss; slicing with that value would
		# silently pick an unrelated part of the page instead of failing.
		raise ValueError("window.__PRELOADED_STATE__ script not found in page source")

	payload_start = start_index + len(start)
	# Search for the closing tag only after the payload begins.
	end_index = source_arg.find(end, payload_start)
	if end_index == -1:
		raise ValueError("closing </script not found after window.__PRELOADED_STATE__")

	return json.loads(source_arg[payload_start:end_index])

In [3]:
# Collect every URL that needs to be scraped.
# Starting from the London index page, follow the <a class="head"> links twice:
# the links on the index page (london_urls), the links found on each of those
# pages (borough_urls), and the links found on each of those (local_area_urls).
def _head_links(page_url):
	"""Return the href of every <a class="head"> link on the page at page_url."""
	page_soup = BeautifulSoup(requests.get(page_url).text, 'lxml')
	# href=True skips anchors that have no href attribute instead of raising KeyError.
	return [link["href"] for link in page_soup.find_all("a", class_="head", href=True)]


url = "https://www.rightmove.co.uk/house-prices-in-London.html"
london_urls = _head_links(url)

borough_urls = []
for london_url in london_urls:
	borough_urls.extend(_head_links(london_url))

local_area_urls = []
for borough_url in borough_urls:
	local_area_urls.extend(_head_links(borough_url))

In [29]:
# Iterate over the local-area URLs, scrape each one, and store the scraped data
# in a dictionary called houses, keyed by address.
def _scrape_area_listings(area_url):
	"""Return {address: record} for the properties listed on every page of one area.

	Each record holds the property type, the display price and sold date of the
	first entry in the listing's "transactions" list, and the detail-page URL.
	"""
	first_page = javascript_html_parse(requests.get(area_url).text)
	# The first page also carries the page count, so it is downloaded only once.
	num_pages = first_page["pagination"]["last"]

	records = {}
	page_number = 1
	while page_number <= num_pages:
		if page_number == 1:
			page = first_page
		else:
			page = javascript_html_parse(requests.get(area_url+f"?page={page_number}").text)

		for house in page["results"]["properties"]:
			sale = house["transactions"][0]
			records[house["address"]] = {
				"property_type": house["propertyType"],
				"price": sale["displayPrice"],
				"date": sale["dateSold"],
				"url": house["detailUrl"],
			}
		page_number += 1
	return records


def _add_property_details(record):
	"""Download the page at record['url'] and add its details to record in place."""
	source = requests.get(record['url']).text
	soup = BeautifulSoup(source, 'lxml')
	script = soup.find("script", {"type":"text/javascript"}).text
	# The script assigns a JSON literal to window.PAGE_MODEL; capture everything
	# after the assignment and decode it into a python dict.
	json_script = re.findall(("(?s)(?<=window.PAGE_MODEL = )(.*$)"), script)[0]
	json_dict = json.loads(json_script)

	# The bullet pointed features
	try:
		features = json_dict["soldPropertyData"]["property"]["keyFeatures"]
	except TypeError:
		pass
	else:
		record["features"] = features

	# The floorplan URL is optional: a TypeError and an IndexError have both been
	# seen here, so any indexing failure just leaves the key out.
	try:
		floorplan_url = json_dict["soldPropertyData"]["property"]["floorplans"][0]['url']
	except (TypeError, LookupError):
		pass
	else:
		record["floorplan_url"] = floorplan_url

	prop = json_dict["soldPropertyData"]["property"]

	# Pictures of the house
	record["house_image_urls"] = [image['url'] for image in prop["images"]]

	# Location of the house
	location = prop["location"]
	record["latitude"] = location["latitude"]
	record["longitude"] = location["longitude"]

	# Proximity to the stations: {station name: distance}
	record["station_proximities"] = {
		station["name"]: station["distance"] for station in prop["nearestStations"]
	}

	# Size of the property: {unit: maximum size}
	record["property_size"] = {
		sizing['unit']: sizing['maximumSize'] for sizing in prop["sizings"]
	}

	# Number of bedrooms and bathrooms
	record["bedrooms"] = prop['bedrooms']
	record["bathrooms"] = prop['bathrooms']

	# Whether the property is a new build or not
	record["new_build"] = json_dict["soldPropertyData"]["transactions"][0]['newBuild']


houses = {}
# NOTE: only the first two local areas are scraped; drop the [:2] slice for a full run.
for area_url in local_area_urls[:2]:
	area_houses = _scrape_area_listings(area_url)
	houses.update(area_houses)

	# Visit the detail pages of this area's houses only. Looping over all of
	# `houses` here would re-download every previously scraped house for each area.
	for record in area_houses.values():
		if record['url'] == '':
			continue
		_add_property_details(record)


In [30]:
houses

{'Flat 131, Lauderdale Tower, Barbican, London, Greater London EC2Y 8BY': {'property_type': 'Flat',
  'price': '£1,500,000',
  'date': '14 Apr 2022',
  'url': 'https://www.rightmove.co.uk/house-prices/details/england-111198221-14949232',
  'features': ['Type 1A',
   'Contemporary Kitchen',
   'Utility Room',
   'Wraparound Balcony',
   'Use Of Communal Gardens',
   'West Facing Views'],
  'floorplan_url': 'https://media.rightmove.co.uk/72k/71134/111198221/71134_30837396_FLP_00_0002.jpeg',
  'house_image_urls': ['https://media.rightmove.co.uk/72k/71134/111198221/71134_30837396_IMG_00_0000.jpeg',
   'https://media.rightmove.co.uk/72k/71134/111198221/71134_30837396_IMG_01_0000.jpeg',
   'https://media.rightmove.co.uk/72k/71134/111198221/71134_30837396_IMG_02_0000.jpeg',
   'https://media.rightmove.co.uk/72k/71134/111198221/71134_30837396_IMG_13_0000.jpeg',
   'https://media.rightmove.co.uk/72k/71134/111198221/71134_30837396_IMG_14_0001.jpeg',
   'https://media.rightmove.co.uk/72k/71134/11

In [34]:
# change price from a string into an int
def price_int(x):
    """Convert a displayed price such as '£1,500,000' into an int.

    Args:
        x: Price text with an optional leading currency symbol and comma
            thousands separators, or an int (returned unchanged, so the
            conversion can safely be applied more than once).

    Returns:
        The price as an int.

    Raises:
        ValueError: If what remains after removing the symbol and commas is
            not a whole number.
    """
    if isinstance(x, int):
        return x
    text = x.strip()
    # Drop the first character only when it is a symbol, so a price that
    # already starts with a digit keeps its leading digit.
    if text and not text[0].isdigit():
        text = text[1:]
    return int(text.replace(",", ""))

# Convert every stored price to an int. Prices that are no longer strings are
# skipped, so re-running this cell does not fail on already-converted values.
for house in houses.values():
    if isinstance(house['price'], str):
        house['price'] = price_int(house['price'])
    

In [35]:
houses

{'Flat 131, Lauderdale Tower, Barbican, London, Greater London EC2Y 8BY': {'property_type': 'Flat',
  'price': 1500000,
  'date': '14 Apr 2022',
  'url': 'https://www.rightmove.co.uk/house-prices/details/england-111198221-14949232',
  'features': ['Type 1A',
   'Contemporary Kitchen',
   'Utility Room',
   'Wraparound Balcony',
   'Use Of Communal Gardens',
   'West Facing Views'],
  'floorplan_url': 'https://media.rightmove.co.uk/72k/71134/111198221/71134_30837396_FLP_00_0002.jpeg',
  'house_image_urls': ['https://media.rightmove.co.uk/72k/71134/111198221/71134_30837396_IMG_00_0000.jpeg',
   'https://media.rightmove.co.uk/72k/71134/111198221/71134_30837396_IMG_01_0000.jpeg',
   'https://media.rightmove.co.uk/72k/71134/111198221/71134_30837396_IMG_02_0000.jpeg',
   'https://media.rightmove.co.uk/72k/71134/111198221/71134_30837396_IMG_13_0000.jpeg',
   'https://media.rightmove.co.uk/72k/71134/111198221/71134_30837396_IMG_14_0001.jpeg',
   'https://media.rightmove.co.uk/72k/71134/1111982

In [None]:
# Change the format of the date