In [1256]:
# import the necessary libraries
from bs4 import BeautifulSoup
import requests
import json
import re
import datetime
import easyocr
import cv2
import geopy.distance

In [1257]:
# this function handles the situation when the html is embedded within javascript
def javascript_html_parse(source_arg):
	"""Return the JSON object as a dictionary"""
	start = "<script>window.__PRELOADED_STATE__ = "
	end = "</script"
	x = source_arg[source_arg.find(start)+len(start):]
	x = x[:x.find(end)]
	x = json.loads(x)
	return x

In [851]:
# Obtain all of the URL's that need to be scraped from 
# This is the first URL we start at, here we are collecting all of the URL's to be scraped from into 
# local area URL's 
url = "https://www.rightmove.co.uk/house-prices-in-London.html"
source = requests.get(url).text
soup = BeautifulSoup(source, 'lxml')
b = soup.find_all("a", class_="head")
london_urls = []
for i in b:
	london_urls.append(i["href"])

borough_urls = []
for url in london_urls:
	source = requests.get(url).text
	soup = BeautifulSoup(source, 'lxml')
	b = soup.find_all("a", class_="head")
	for i in b:
		borough_urls.append(i["href"])

local_area_urls = []
for url in borough_urls:
	source = requests.get(url).text
	soup = BeautifulSoup(source, 'lxml')
	b = soup.find_all("a", class_="head")
	for i in b:
		local_area_urls.append(i["href"])	

In [1293]:
# iterate over all of the URL's and scrape from them, and store the scraped data into a variable called houses
m_values = ["square meters", "square metres", "square meter","square metre","square m","square mt","sqmt","sq mt","sq.mt", "sq. mt", "sqm", "sq m", "sq.m", "sq. m", "sq: m", "sq:m", "sq :m", "sq:  m", "sq : m", "meters2", "metres2", "meter2", "metre2", "mt2", "m2"]
f_values = ['square feet','square ft','square f','sqft','sq ft','sq.ft','sq. ft','sqf','sq f','sq.f','sq. f','sq: f','sq:f','sq :f','sq : f','sq :f','feet2','ft2','f2']
houses = {}
for area_url in local_area_urls[80:90]:

    # while loop that obtains the address, property type, last sold price, last sold date, and its url (if it has one) from each house. 
    # Stores each value into a list. This is from the web page which shows different houses from a particular area.

    page_number = 1
    addresses, property_type, last_sold_price, last_sold_date, url = [], [], [], [], []
    # obtain the number of pages to use in the while loop condition
    source = requests.get(area_url).text
    a = javascript_html_parse(source)
    num_pages = a["pagination"]["last"]
    
    while page_number <= num_pages:

        if page_number == 1:
            source = requests.get(area_url).text
        else:
            source = requests.get(area_url+f"?page={page_number}").text

        # Scrape the address, property type, the price & date of the last sale, and the URL.
        x = javascript_html_parse(source)
        for house in x["results"]["properties"]:
            addresses.append(house["address"])
            property_type.append(house["propertyType"])
            last_sold_price.append(house["transactions"][0]["displayPrice"])
            last_sold_date.append(house["transactions"][0]["dateSold"])
            url.append(house["detailUrl"])



        page_number += 1


    # Store the scraped data for each property into the houses dictionary
    for i in range(len(addresses)):
        houses[addresses[i]] = {"property_type":property_type[i], "price":last_sold_price[i], "date":last_sold_date[i], "url": url[i]}


    # Now loop through all of the stored URL's for each property, and scrape more data from it.

    for v in houses.values():
        if v['url'] == '':
            continue
        else:
            source = requests.get(v['url']).text
            soup = BeautifulSoup(source, 'lxml')
            script = soup.find("script", {"type":"text/javascript"}).text
            # Regex used to convert the JSON data structure into a python dict 
            # Use regex to extract json data from the script text
            json_script = re.findall(("(?s)(?<=window.PAGE_MODEL = )(.*$)"), script)[0]
            # Transforming json data within string into dictionary
            json_dict = json.loads(json_script)

            # The bullet pointed features 
            try:
                features = json_dict["soldPropertyData"]["property"]["keyFeatures"]
            except TypeError: 
                pass
            else:
                v["features"] = features
            # The floorplan URL
            try:
                floorplan_url = json_dict["soldPropertyData"]["property"]["floorplans"][0]['url']
            except Exception:
                # seen a TypeError and an IndexError so I'm going to capture all exceptions raised
                pass
            else:
                v["floorplan_url"] = floorplan_url
            # Scrape the pictures of the house ### commented this out for now, as I'm not using the image data
            #house_image_urls = []
            #for image in json_dict["soldPropertyData"]["property"]["images"]:
            #	house_image_urls.append(image['url'])
            #v["house_image_urls"] = house_image_urls
            # location of the house
            try:
                latitude = json_dict["soldPropertyData"]["property"]["location"]["latitude"]
                longitude = json_dict["soldPropertyData"]["property"]["location"]["longitude"]
            except TypeError:
                pass
            else:
                v["latitude"] = latitude
                v["longitude"] = longitude
            # proximity to the stations 
            station_names, distances = [], []
            try:
                for station in json_dict["soldPropertyData"]["property"]["nearestStations"]:
                    station_names.append(station["name"])
                    distances.append(station["distance"])
            except TypeError:
                station_proximities = {}
            else:
                station_proximities = {station:distance for station, distance in zip(station_names, distances)}
            v["station_proximities"] = station_proximities
            # size of the property
            units, sizes = [], []
            try:
                for i in json_dict["soldPropertyData"]["property"]["sizings"]:
                    units.append(i['unit'])
                    sizes.append(i['maximumSize'])
            except TypeError:
                dimensions = {}
            else:
                dimensions = {unit:size for unit, size in zip(units, sizes)}
            #for label, size in dimensions.items():
             #   if label not in m_values and label not in f_values:
              #      del dimensions[label]

            # still haven't delt with the potential of having both a sqm and sqf, or just a sqf

            v["property_size"] = dimensions
            
            # JUST ABOVE HERE IS WHERE I COULD MAKE IT SO ONLY SQ FT IS OBTAINED - Still needs to be finished off

            # number of bedrooms and bathrooms
            try:
                bedrooms = json_dict["soldPropertyData"]["property"]['bedrooms']
            except TypeError:
                bedrooms = ''
            try:
                bathrooms = json_dict["soldPropertyData"]["property"]['bathrooms']
            except TypeError:
                bathrooms = ''
                
            v["bedrooms"] = bedrooms
            v["bathrooms"] = bathrooms

            # Whether the property is a new build or not
            try:
                new_build = json_dict["soldPropertyData"]["transactions"][0]['newBuild']
            except TypeError:
                pass
            else:
                v["new_build"] = new_build

KeyboardInterrupt: 

In [1294]:
# change price from a string into an int
def price_int(x):
    """Return an integer from a price string"""
    a = x[1:]
    b = a.replace(",", "")
    b = int(b)
    return b

# run the above function for all of the stored prices
for features in houses.values():
    y = features['price']
    z = price_int(y)
    features['price'] = z

In [1295]:
def size_floorplan_2(floorplan):
    """Return the size data (in sq ft) from the floorplan"""
    reader = easyocr.Reader(['en']) # GPU=false taken out for now
    result = reader.readtext(floorplan)
    
    scraped_text = ''
    for i in result:
        scraped_text += i[1]
        scraped_text += ","
    text_list = scraped_text.split(",")
    
    m_patterns = re.compile(r"[+-]? *((?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?)\s*(square meters|square metres|square meter|square metre|square m|square mt|sqmt|sq mt|sq.mt|sq. mt|sqm|sq m|sq.m|sq. m|sq: m|sq:m|sq :m|sq : m|sq :m|meters2|metres2|meter2|metre2|mt2|m2)", re.IGNORECASE)
    f_patterns = re.compile(r"[+-]? *((?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?)\s*(square feet|square ft|square f|sqft|sq ft|sq.ft|sq. ft|sqf|sq f|sq.f|sq. f|sq: f|sq:f|sq :f|sq : f|sq :f|feet2|ft2|f2)", re.IGNORECASE)
    house_size = []
    for x in text_list:
        for match in re.finditer(m_patterns, x):
            #print('%s : %s' % (match.groups()))
            house_size.append(list(match.groups()))
        for match in re.finditer(f_patterns, x):
            #print('%s : %s' % (match.groups())) 
            house_size.append(list(match.groups()))
    for a in house_size:
        try:
            a[0] = int(a[0])
        except ValueError:
            a[0] = float(a[0])
            
    # then here you could find the
    house_size = dict((house_size))
    #print(house_size)
    house_size_swap = {}
    # code to swap keys and values
    for k, v in house_size.items():
        house_size_swap[v] = k

    house_size = house_size_swap
    
    # Code to take the largest returned value from the floorplan numbers
    amount = max(house_size.values())
    for i in house_size.keys():
        if house_size[i] == amount:
            size_label = i.lower()
    
    # Remove punctuation from the size labels       
    if "," in size_label:
        size_label = size_label.replace(",", "")
    if ":" in size_label:
        size_label = size_label.replace(":", "")
    if "." in size_label:
        size_label = size_label.replace(".", "")
    
    largest_size = {size_label:amount}
    
    
            
    # need to add values with colons, :
    # alternatively, you could just remove the colon. This would also nehate this issue

    metre_values = ["square meters", "square metres", "square meter", "square metre", "square m", "square mt", "sqmt", "sq mt", "sq mt", "sq mt", "sqm", "sq m", "sq m", "sq m", "meters2", "metres2", "meter2", "metre2", "mt2", "m2"]
    feet_values = ["square feet","square ft","square f","sqft","sq ft","sq  ft","sq. ft","sqf","sq f","sq f","sq f","feet2", "ft2","f2"]


    if size_label in metre_values:
        # convert the size into square ft if sq m is obtained
        largest_size["sq ft"] = amount*10.764
        del largest_size[size_label]
    elif size_label in feet_values:
        largest_size["sq ft"] = amount
    

        
    return largest_size["sq ft"]

In [1296]:
# function that returns the sq foot value from the dictionaries that are scraped from the sold houses
def sq_foot_return(x):
    values_m = ["square meters", "sq meters", "square metres", "sq metres", "square meter", "sq meter", "square metre", "sq metre","square m", "square mt", "sqmt", "sq mt", "sq.mt", "sq. mt", "sqm", "sq m", "sq.m", "sq. m", "sq: m", "sq:m", "sq :m", "sq : m", "sq :m", "meters2", "metres2", "meter2", "metre2", "mt2", "m2"]
    values_f = ["square feet", "square ft", "square f", "sqft", "sq ft", "sq.ft", "sq. ft", "sqf", "sq f", "sq.f", "sq. f", "sq: f", "sq:f", "sq :f", "sq : f", "sq :f", "feet2", "ft2", "f2"]
    for k, v in x.items():
        if k.strip() in values_f:
            return x[k]
        if k.strip() in values_m:
            return x[k]*10.764
    return "Na"

In [1297]:
# here is where the code should go to create this outstanding schools list
outstanding_schools = [[51.4974948, -0.1356583],
 [51.5202607, -0.0293396],
 [51.4935082, -0.1178424],
 [51.6569225, -0.1949252],
 [51.5202607, -0.0293396],
 [51.5345448, -0.2043853],
 [51.5202607, -0.0293396],
 [51.458373, -0.1891356],
 [51.4990156, -0.22915],
 [51.6569225, -0.1949252],
 [51.5906113, -0.1109709],
 [51.5202607, -0.0293396],
 [51.538621, -0.1028346],
 [51.5202607, -0.0293396],
 [51.5672808, -0.2710568],
 [51.6569225, -0.1949252],
 [51.5132537, -0.3043136],
 [51.5255162, 0.0352163],
 [51.5202607, -0.0293396],
 [51.4609218, -0.373149],
 [51.6569225, -0.1949252],
 [51.4990805, -0.1938253],
 [51.5255162, 0.0352163],
 [51.5255162, 0.0352163],
 [51.4935082, -0.1178424],
 [51.5436387, -0.0553621],
 [51.5202607, -0.0293396],
 [51.5436387, -0.0553621],
 [51.4935082, -0.1178424],
 [51.458373, -0.1891356],
 [51.4990156, -0.22915],
 [51.5886383, -0.0117625],
 [51.5390261, -0.1425516],
 [51.5132537, -0.3043136],
 [51.5906113, -0.1109709],
 [51.4990805, -0.1938253],
 [51.5886121, 0.0823982],
 [51.5436387, -0.0553621],
 [51.3769529, -0.0956895],
 [51.5886383, -0.0117625],
 [51.5132537, -0.3043136],
 [51.5906113, -0.1109709],
 [51.458373, -0.1891356],
 [51.5202607, -0.0293396],
 [51.4990805, -0.1938253],
 [51.5202607, -0.0293396],
 [51.5672808, -0.2710568],
 [51.5886383, -0.0117625],
 [51.4990156, -0.22915],
 [51.5132537, -0.3043136],
 [51.5202607, -0.0293396],
 [51.4609218, -0.373149],
 [51.4974948, -0.1356583],
 [51.5906113, -0.1109709],
 [51.4990805, -0.1938253],
 [51.4933675, 0.0098214],
 [51.5886121, 0.0823982],
 [51.5886121, 0.0823982],
 [51.502781, -0.087738],
 [51.4935082, -0.1178424],
 [51.5540666, 0.134017],
 [51.4935082, -0.1178424],
 [51.5886121, 0.0823982],
 [51.5886383, -0.0117625],
 [51.4611509, -0.0073177],
 [51.5390261, -0.1425516],
 [51.4609218, -0.373149],
 [51.5390261, -0.1425516],
 [51.4990805, -0.1938253],
 [51.5132537, -0.3043136],
 [51.4990805, -0.1938253],
 [51.4935082, -0.1178424],
 [51.5436387, -0.0553621],
 [51.5886121, 0.0823982],
 [51.5672808, -0.2710568],
 [51.5436387, -0.0553621],
 [51.5886121, 0.0823982],
 [51.5202607, -0.0293396],
 [51.5390261, -0.1425516],
 [51.502781, -0.087738],
 [51.4609218, -0.373149],
 [51.5436387, -0.0553621],
 [51.4974948, -0.1356583],
 [51.4609218, -0.373149],
 [51.538621, -0.1028346],
 [51.5123443, -0.0909852],
 [51.538621, -0.1028346],
 [51.5672808, -0.2710568],
 [51.5436387, -0.0553621],
 [51.5255162, 0.0352163],
 [51.5886383, -0.0117625],
 [51.538621, -0.1028346],
 [51.4935082, -0.1178424],
 [51.4990156, -0.22915],
 [51.538621, -0.1028346],
 [51.5906113, -0.1109709],
 [51.5906113, -0.1109709],
 [51.5906113, -0.1109709],
 [51.502781, -0.087738],
 [51.4935082, -0.1178424],
 [51.5202607, -0.0293396],
 [51.502781, -0.087738],
 [51.5202607, -0.0293396],
 [51.5540666, 0.134017],
 [51.4990156, -0.22915],
 [51.4609218, -0.373149],
 [51.4935082, -0.1178424],
 [51.5132537, -0.3043136],
 [51.6569225, -0.1949252],
 [51.4611509, -0.0073177],
 [51.4935082, -0.1178424],
 [51.538621, -0.1028346],
 [51.5202607, -0.0293396],
 [51.4933675, 0.0098214],
 [51.406025, 0.013156],
 [51.5886383, -0.0117625],
 [51.458373, -0.1891356],
 [51.5886383, -0.0117625],
 [51.502781, -0.087738],
 [51.4990805, -0.1938253],
 [51.4097742, -0.2108084],
 [51.4933675, 0.0098214],
 [51.6522994, -0.0807119],
 [51.5672808, -0.2710568],
 [51.502781, -0.087738],
 [51.4974948, -0.1356583],
 [51.580559, -0.341995],
 [51.5436387, -0.0553621],
 [51.4609218, -0.373149],
 [51.6569225, -0.1949252],
 [51.4611509, -0.0073177],
 [51.6522994, -0.0807119],
 [51.5132537, -0.3043136],
 [51.6522994, -0.0807119],
 [51.5132537, -0.3043136],
 [51.458373, -0.1891356],
 [51.4097742, -0.2108084],
 [51.6522994, -0.0807119],
 [51.5436387, -0.0553621],
 [51.4611509, -0.0073177],
 [51.502781, -0.087738],
 [51.6522994, -0.0807119],
 [51.4974948, -0.1356583],
 [51.458373, -0.1891356],
 [51.502781, -0.087738],
 [51.458373, -0.1891356],
 [51.5540666, 0.134017],
 [51.461311, -0.303742],
 [51.502781, -0.087738],
 [51.5672808, -0.2710568],
 [51.461311, -0.303742],
 [51.5906113, -0.1109709],
 [51.5886121, 0.0823982],
 [51.4990156, -0.22915],
 [51.458373, -0.1891356],
 [51.5886383, -0.0117625],
 [51.5886121, 0.0823982],
 [51.5672808, -0.2710568],
 [51.4609218, -0.373149],
 [51.4935082, -0.1178424],
 [51.577924, 0.2120829],
 [51.4990805, -0.1938253],
 [51.5886383, -0.0117625],
 [51.461311, -0.303742],
 [51.4935082, -0.1178424],
 [51.4990156, -0.22915],
 [51.5255162, 0.0352163],
 [51.502781, -0.087738],
 [51.5255162, 0.0352163],
 [51.5132537, -0.3043136],
 [51.5255162, 0.0352163],
 [51.4609218, -0.373149],
 [51.461311, -0.303742],
 [51.538621, -0.1028346],
 [51.3769529, -0.0956895],
 [51.6569225, -0.1949252],
 [51.580559, -0.341995],
 [51.5436387, -0.0553621],
 [51.5255162, 0.0352163],
 [51.4990805, -0.1938253],
 [51.439933, 0.154327],
 [51.4935082, -0.1178424],
 [51.5436387, -0.0553621],
 [51.3769529, -0.0956895],
 [51.3769529, -0.0956895],
 [51.5436387, -0.0553621],
 [51.439933, 0.154327],
 [51.6569225, -0.1949252],
 [51.4609218, -0.373149],
 [51.461311, -0.303742],
 [51.406025, 0.013156],
 [51.5351832, -0.4481378],
 [51.406025, 0.013156],
 [51.4097742, -0.2108084],
 [51.6522994, -0.0807119],
 [51.3769529, -0.0956895],
 [51.4609218, -0.373149],
 [51.538621, -0.1028346],
 [51.5886121, 0.0823982],
 [51.41233, -0.300689],
 [51.5436387, -0.0553621],
 [51.406025, 0.013156],
 [51.3769529, -0.0956895],
 [51.502781, -0.087738],
 [51.580559, -0.341995],
 [51.439933, 0.154327],
 [51.5906113, -0.1109709],
 [51.5351832, -0.4481378],
 [51.5255162, 0.0352163],
 [51.4974948, -0.1356583],
 [51.4097742, -0.2108084],
 [51.5436387, -0.0553621],
 [51.5202607, -0.0293396],
 [51.5390261, -0.1425516],
 [51.6569225, -0.1949252],
 [51.406025, 0.013156],
 [51.4990156, -0.22915],
 [51.3769529, -0.0956895],
 [51.4097742, -0.2108084],
 [51.4935082, -0.1178424],
 [51.5540666, 0.134017],
 [51.4974948, -0.1356583],
 [51.4933675, 0.0098214],
 [51.5540666, 0.134017],
 [51.3769529, -0.0956895],
 [51.5351832, -0.4481378],
 [51.4990156, -0.22915],
 [51.4935082, -0.1178424],
 [51.4611509, -0.0073177],
 [51.4611509, -0.0073177],
 [51.5886121, 0.0823982],
 [51.5906113, -0.1109709],
 [51.6522994, -0.0807119],
 [51.6522994, -0.0807119],
 [51.3769529, -0.0956895],
 [51.458373, -0.1891356],
 [51.4933675, 0.0098214],
 [51.406025, 0.013156],
 [51.4990156, -0.22915],
 [51.4097742, -0.2108084],
 [51.4935082, -0.1178424],
 [51.4611509, -0.0073177],
 [51.502781, -0.087738],
 [51.577924, 0.2120829],
 [51.406025, 0.013156],
 [51.5906113, -0.1109709],
 [51.461311, -0.303742],
 [51.406025, 0.013156],
 [51.439933, 0.154327],
 [51.3769529, -0.0956895],
 [51.461311, -0.303742],
 [51.5132537, -0.3043136],
 [51.4611509, -0.0073177],
 [51.5436387, -0.0553621],
 [51.3769529, -0.0956895],
 [51.461311, -0.303742],
 [51.5202607, -0.0293396],
 [51.4935082, -0.1178424],
 [51.4611509, -0.0073177],
 [51.4990805, -0.1938253],
 [51.6522994, -0.0807119],
 [51.5886121, 0.0823982],
 [51.439933, 0.154327],
 [51.4935082, -0.1178424],
 [51.5906113, -0.1109709],
 [51.41233, -0.300689],
 [51.5906113, -0.1109709],
 [51.4611509, -0.0073177],
 [51.4611509, -0.0073177],
 [51.439933, 0.154327],
 [51.406025, 0.013156],
 [51.4609218, -0.373149],
 [51.4611509, -0.0073177],
 [51.5906113, -0.1109709],
 [51.580559, -0.341995],
 [51.3769529, -0.0956895],
 [51.3769529, -0.0956895],
 [51.4097742, -0.2108084],
 [51.5886121, 0.0823982],
 [51.461311, -0.303742],
 [51.4935082, -0.1178424],
 [51.5132537, -0.3043136],
 [51.6569225, -0.1949252],
 [51.4097742, -0.2108084],
 [51.5906113, -0.1109709],
 [51.6569225, -0.1949252],
 [51.5886383, -0.0117625],
 [51.461311, -0.303742],
 [51.406025, 0.013156],
 [51.461311, -0.303742],
 [51.4609218, -0.373149],
 [51.577924, 0.2120829],
 [51.5351832, -0.4481378],
 [51.439933, 0.154327],
 [51.406025, 0.013156],
 [51.5351832, -0.4481378],
 [51.406025, 0.013156],
 [51.5906113, -0.1109709],
 [51.5886121, 0.0823982],
 [51.5672808, -0.2710568],
 [51.439933, 0.154327],
 [51.5436387, -0.0553621],
 [51.3769529, -0.0956895],
 [51.439933, 0.154327],
 [51.406025, 0.013156],
 [51.5886121, 0.0823982],
 [51.439933, 0.154327],
 [51.5351832, -0.4481378],
 [51.4609218, -0.373149],
 [51.5540666, 0.134017],
 [51.6569225, -0.1949252],
 [51.3769529, -0.0956895],
 [51.439933, 0.154327],
 [51.6569225, -0.1949252],
 [51.4609218, -0.373149],
 [51.4990805, -0.1938253],
 [51.3769529, -0.0956895],
 [51.580559, -0.341995],
 [51.4974948, -0.1356583],
 [51.3769529, -0.0956895],
 [51.4990156, -0.22915],
 [51.5351832, -0.4481378],
 [51.5202607, -0.0293396],
 [51.458373, -0.1891356],
 [51.461311, -0.303742],
 [51.5906113, -0.1109709],
 [51.5886121, 0.0823982],
 [51.4990156, -0.22915],
 [51.5132537, -0.3043136],
 [51.4609218, -0.373149],
 [51.5436387, -0.0553621],
 [51.5390261, -0.1425516],
 [51.5436387, -0.0553621],
 [51.406025, 0.013156],
 [51.502781, -0.087738],
 [51.458373, -0.1891356],
 [51.406025, 0.013156],
 [51.461311, -0.303742],
 [51.461311, -0.303742],
 [51.5436387, -0.0553621],
 [51.5351832, -0.4481378],
 [51.5351832, -0.4481378],
 [51.406025, 0.013156],
 [51.4611509, -0.0073177],
 [51.577924, 0.2120829],
 [51.3769529, -0.0956895],
 [51.4974948, -0.1356583],
 [51.5390261, -0.1425516],
 [51.458373, -0.1891356],
 [51.4974948, -0.1356583],
 [51.458373, -0.1891356],
 [51.5255162, 0.0352163],
 [51.4974948, -0.1356583],
 [51.6522994, -0.0807119],
 [51.4990156, -0.22915],
 [51.4990805, -0.1938253],
 [51.5390261, -0.1425516],
 [51.4990805, -0.1938253],
 [51.5202607, -0.0293396],
 [51.5886383, -0.0117625],
 [51.3769529, -0.0956895],
 [51.461311, -0.303742],
 [51.5202607, -0.0293396],
 [51.5540666, 0.134017],
 [51.406025, 0.013156],
 [51.461311, -0.303742],
 [51.4097742, -0.2108084],
 [51.5390261, -0.1425516],
 [51.4990156, -0.22915],
 [51.458373, -0.1891356],
 [51.5123443, -0.0909852],
 [51.461311, -0.303742],
 [51.458373, -0.1891356],
 [51.4097742, -0.2108084],
 [51.538621, -0.1028346],
 [51.5886121, 0.0823982],
 [51.4974948, -0.1356583],
 [51.458373, -0.1891356],
 [51.6569225, -0.1949252],
 [51.5132537, -0.3043136],
 [51.5436387, -0.0553621],
 [51.6569225, -0.1949252],
 [51.4935082, -0.1178424],
 [51.580559, -0.341995],
 [51.4935082, -0.1178424],
 [51.538621, -0.1028346],
 [51.6522994, -0.0807119],
 [51.5672808, -0.2710568],
 [51.5255162, 0.0352163],
 [51.4990805, -0.1938253],
 [51.4974948, -0.1356583],
 [51.4990156, -0.22915],
 [51.5672808, -0.2710568],
 [51.6522994, -0.0807119],
 [51.538621, -0.1028346],
 [51.5906113, -0.1109709],
 [51.6522994, -0.0807119],
 [51.4974948, -0.1356583],
 [51.5132537, -0.3043136],
 [51.5202607, -0.0293396],
 [51.5132537, -0.3043136],
 [51.5390261, -0.1425516],
 [51.4990805, -0.1938253],
 [51.5351832, -0.4481378],
 [51.5390261, -0.1425516],
 [51.580559, -0.341995],
 [51.458373, -0.1891356],
 [51.538621, -0.1028346],
 [51.4935082, -0.1178424],
 [51.580559, -0.341995],
 [51.5351832, -0.4481378],
 [51.502781, -0.087738],
 [51.4990156, -0.22915],
 [51.5886121, 0.0823982],
 [51.6569225, -0.1949252],
 [51.4990805, -0.1938253],
 [51.458373, -0.1891356],
 [51.4990156, -0.22915],
 [51.5132537, -0.3043136],
 [51.3769529, -0.0956895],
 [51.458373, -0.1891356],
 [51.5351832, -0.4481378],
 [51.4990805, -0.1938253],
 [51.502781, -0.087738],
 [51.458373, -0.1891356],
 [51.6522994, -0.0807119],
 [51.6569225, -0.1949252],
 [51.5672808, -0.2710568],
 [51.538621, -0.1028346],
 [51.5202607, -0.0293396],
 [51.4990805, -0.1938253],
 [51.6522994, -0.0807119],
 [51.4974948, -0.1356583],
 [51.4611509, -0.0073177],
 [51.580559, -0.341995],
 [51.577924, 0.2120829],
 [51.3769529, -0.0956895],
 [51.6569225, -0.1949252],
 [51.458373, -0.1891356],
 [51.502781, -0.087738],
 [51.41233, -0.300689],
 [51.5540666, 0.134017],
 [51.4609218, -0.373149],
 [51.538621, -0.1028346],
 [51.5351832, -0.4481378],
 [51.6522994, -0.0807119],
 [51.5390261, -0.1425516],
 [51.5132537, -0.3043136],
 [51.502781, -0.087738],
 [51.5132537, -0.3043136],
 [51.5351832, -0.4481378],
 [51.5202607, -0.0293396],
 [51.4990805, -0.1938253],
 [51.5672808, -0.2710568],
 [51.3769529, -0.0956895],
 [51.4990805, -0.1938253],
 [51.4990156, -0.22915],
 [51.4974948, -0.1356583],
 [51.5886121, 0.0823982],
 [51.461311, -0.303742],
 [51.5351832, -0.4481378],
 [51.5390261, -0.1425516],
 [51.5255162, 0.0352163],
 [51.538621, -0.1028346],
 [51.4097742, -0.2108084],
 [51.5886121, 0.0823982],
 [51.3769529, -0.0956895],
 [51.5390261, -0.1425516],
 [51.577924, 0.2120829],
 [51.5886383, -0.0117625],
 [51.406025, 0.013156],
 [51.4990805, -0.1938253],
 [51.3769529, -0.0956895],
 [51.6522994, -0.0807119],
 [51.4935082, -0.1178424],
 [51.461311, -0.303742],
 [51.580559, -0.341995],
 [51.3769529, -0.0956895],
 [51.580559, -0.341995],
 [51.406025, 0.013156],
 [51.6522994, -0.0807119],
 [51.4990156, -0.22915],
 [51.4611509, -0.0073177],
 [51.4933675, 0.0098214],
 [51.4974948, -0.1356583],
 [51.6522994, -0.0807119],
 [51.458373, -0.1891356],
 [51.4990156, -0.22915],
 [51.5886383, -0.0117625],
 [51.439933, 0.154327],
 [51.406025, 0.013156],
 [51.4609218, -0.373149],
 [51.406025, 0.013156],
 [51.458373, -0.1891356],
 [51.4609218, -0.373149],
 [51.5672808, -0.2710568],
 [51.5672808, -0.2710568],
 [51.4990805, -0.1938253],
 [51.5202607, -0.0293396],
 [51.5202607, -0.0293396],
 [51.580559, -0.341995],
 [51.5436387, -0.0553621],
 [51.538621, -0.1028346],
 [51.458373, -0.1891356],
 [51.458373, -0.1891356],
 [51.580559, -0.341995],
 [51.5390261, -0.1425516],
 [51.3769529, -0.0956895],
 [51.4609218, -0.373149],
 [51.458373, -0.1891356],
 [51.577924, 0.2120829],
 [51.5906113, -0.1109709],
 [51.406025, 0.013156],
 [51.4611509, -0.0073177],
 [51.4609218, -0.373149],
 [51.458373, -0.1891356],
 [51.458373, -0.1891356],
 [51.461311, -0.303742],
 [51.5886383, -0.0117625],
 [51.5390261, -0.1425516],
 [51.458373, -0.1891356],
 [51.580559, -0.341995],
 [51.5255162, 0.0352163],
 [51.5906113, -0.1109709],
 [51.4990156, -0.22915],
 [51.4990805, -0.1938253],
 [51.5436387, -0.0553621],
 [51.4609218, -0.373149],
 [51.4611509, -0.0073177],
 [51.3769529, -0.0956895],
 [51.4933675, 0.0098214],
 [51.577924, 0.2120829],
 [51.6522994, -0.0807119],
 [51.458373, -0.1891356],
 [51.4933675, 0.0098214],
 [51.4611509, -0.0073177],
 [51.4933675, 0.0098214],
 [51.4609218, -0.373149],
 [51.406025, 0.013156],
 [51.4935082, -0.1178424],
 [51.5436387, -0.0553621],
 [51.6522994, -0.0807119],
 [51.4933675, 0.0098214],
 [51.4974948, -0.1356583],
 [51.5906113, -0.1109709],
 [51.5351832, -0.4481378],
 [51.5436387, -0.0553621],
 [51.4990805, -0.1938253],
 [51.6569225, -0.1949252],
 [51.461311, -0.303742],
 [51.6569225, -0.1949252],
 [51.5436387, -0.0553621],
 [51.4609218, -0.373149],
 [51.4990805, -0.1938253],
 [51.5886121, 0.0823982],
 [51.5351832, -0.4481378],
 [51.6522994, -0.0807119],
 [51.5255162, 0.0352163],
 [51.461311, -0.303742],
 [51.4974948, -0.1356583],
 [51.502781, -0.087738],
 [51.5436387, -0.0553621],
 [51.5906113, -0.1109709],
 [51.580559, -0.341995],
 [51.5202607, -0.0293396],
 [51.5886121, 0.0823982],
 [51.439933, 0.154327],
 [51.5436387, -0.0553621],
 [51.458373, -0.1891356],
 [51.5255162, 0.0352163],
 [51.6569225, -0.1949252],
 [51.4933675, 0.0098214],
 [51.580559, -0.341995],
 [51.4974948, -0.1356583],
 [51.406025, 0.013156],
 [51.4611509, -0.0073177],
 [51.4974948, -0.1356583],
 [51.502781, -0.087738],
 [51.5132537, -0.3043136],
 [51.5132537, -0.3043136],
 [51.5436387, -0.0553621],
 [51.580559, -0.341995],
 [51.439933, 0.154327],
 [51.4611509, -0.0073177],
 [51.4933675, 0.0098214],
 [51.458373, -0.1891356],
 [51.4097742, -0.2108084],
 [51.5351832, -0.4481378],
 [51.406025, 0.013156],
 [51.580559, -0.341995],
 [51.4935082, -0.1178424],
 [51.461311, -0.303742],
 [51.439933, 0.154327],
 [51.3769529, -0.0956895],
 [51.4935082, -0.1178424],
 [51.458373, -0.1891356],
 [51.439933, 0.154327],
 [51.580559, -0.341995],
 [51.4933675, 0.0098214],
 [51.461311, -0.303742],
 [51.3769529, -0.0956895],
 [51.577924, 0.2120829],
 [51.5436387, -0.0553621],
 [51.406025, 0.013156],
 [51.5132537, -0.3043136],
 [51.4974948, -0.1356583],
 [51.5886383, -0.0117625],
 [51.4990156, -0.22915],
 [51.406025, 0.013156],
 [51.3769529, -0.0956895],
 [51.4609218, -0.373149],
 [51.3769529, -0.0956895],
 [51.538621, -0.1028346],
 [51.5436387, -0.0553621],
 [51.538621, -0.1028346],
 [51.439933, 0.154327],
 [51.458373, -0.1891356],
 [51.3769529, -0.0956895],
 [51.5672808, -0.2710568],
 [51.4935082, -0.1178424],
 [51.41233, -0.300689],
 [51.577924, 0.2120829],
 [51.6569225, -0.1949252],
 [51.5202607, -0.0293396],
 [51.6569225, -0.1949252],
 [51.41233, -0.300689],
 [51.3769529, -0.0956895],
 [51.458373, -0.1891356],
 [51.5132537, -0.3043136],
 [51.5436387, -0.0553621],
 [51.5906113, -0.1109709],
 [51.4990805, -0.1938253],
 [51.4609218, -0.373149],
 [51.5886121, 0.0823982],
 [51.458373, -0.1891356],
 [51.5132537, -0.3043136],
 [51.458373, -0.1891356],
 [51.4611509, -0.0073177],
 [51.461311, -0.303742],
 [51.4933675, 0.0098214],
 [51.4097742, -0.2108084],
 [51.5436387, -0.0553621],
 [51.577924, 0.2120829],
 [51.4990156, -0.22915],
 [51.406025, 0.013156],
 [51.580559, -0.341995],
 [51.406025, 0.013156],
 [51.4990156, -0.22915],
 [51.4990156, -0.22915],
 [51.458373, -0.1891356],
 [51.5123443, -0.0909852],
 [51.4990156, -0.22915],
 [51.5906113, -0.1109709],
 [51.3769529, -0.0956895],
 [51.580559, -0.341995],
 [51.461311, -0.303742],
 [51.406025, 0.013156],
 [51.3769529, -0.0956895],
 [51.4097742, -0.2108084],
 [51.3769529, -0.0956895],
 [51.406025, 0.013156],
 [51.461311, -0.303742],
 [51.5886121, 0.0823982],
 [51.4609218, -0.373149],
 [51.3769529, -0.0956895],
 [51.538621, -0.1028346],
 [51.6569225, -0.1949252],
 [51.5351832, -0.4481378],
 [51.6569225, -0.1949252],
 [51.4933675, 0.0098214],
 [51.458373, -0.1891356],
 [51.4935082, -0.1178424],
 [51.4974948, -0.1356583],
 [51.4609218, -0.373149],
 [51.6569225, -0.1949252],
 [51.406025, 0.013156],
 [51.458373, -0.1891356],
 [51.4609218, -0.373149],
 [51.458373, -0.1891356],
 [51.461311, -0.303742],
 [51.461311, -0.303742],
 [51.5351832, -0.4481378],
 [51.5540666, 0.134017],
 [51.439933, 0.154327],
 [51.406025, 0.013156],
 [51.4611509, -0.0073177],
 [51.458373, -0.1891356],
 [51.5672808, -0.2710568],
 [51.458373, -0.1891356],
 [51.4974948, -0.1356583],
 [51.4974948, -0.1356583],
 [51.5906113, -0.1109709],
 [51.5906113, -0.1109709],
 [51.5540666, 0.134017],
 [51.5390261, -0.1425516],
 [51.4990805, -0.1938253],
 [51.5390261, -0.1425516],
 [51.4609218, -0.373149],
 [51.4990156, -0.22915],
 [51.6522994, -0.0807119],
 [51.5672808, -0.2710568],
 [51.5436387, -0.0553621],
 [51.502781, -0.087738],
 [51.461311, -0.303742],
 [51.5436387, -0.0553621],
 [51.5351832, -0.4481378],
 [51.5886121, 0.0823982],
 [51.580559, -0.341995],
 [51.4097742, -0.2108084],
 [51.5886383, -0.0117625],
 [51.4611509, -0.0073177],
 [51.577924, 0.2120829],
 [51.5132537, -0.3043136],
 [51.5390261, -0.1425516],
 [51.5132537, -0.3043136],
 [51.502781, -0.087738],
 [51.538621, -0.1028346],
 [51.4097742, -0.2108084],
 [51.5132537, -0.3043136],
 [51.461311, -0.303742],
 [51.5886383, -0.0117625],
 [51.6569225, -0.1949252],
 [51.41233, -0.300689],
 [51.577924, 0.2120829],
 [51.3769529, -0.0956895],
 [51.461311, -0.303742],
 [51.458373, -0.1891356],
 [51.5202607, -0.0293396],
 [51.502781, -0.087738],
 [51.577924, 0.2120829],
 [51.5906113, -0.1109709],
 [51.4609218, -0.373149],
 [51.4609218, -0.373149],
 [51.5540666, 0.134017],
 [51.577924, 0.2120829],
 [51.5132537, -0.3043136],
 [51.5436387, -0.0553621],
 [51.4933675, 0.0098214],
 [51.5886383, -0.0117625],
 [51.406025, 0.013156],
 [51.439933, 0.154327],
 [51.3769529, -0.0956895],
 [51.5202607, -0.0293396],
 [51.461311, -0.303742],
 [51.4974948, -0.1356583],
 [51.461311, -0.303742],
 [51.5886383, -0.0117625],
 [51.458373, -0.1891356],
 [51.577924, 0.2120829],
 [51.458373, -0.1891356],
 [51.5886121, 0.0823982],
 [51.4097742, -0.2108084],
 [51.458373, -0.1891356],
 [51.458373, -0.1891356],
 [51.4611509, -0.0073177],
 [51.458373, -0.1891356],
 [51.577924, 0.2120829],
 [51.4935082, -0.1178424],
 [51.406025, 0.013156],
 [51.458373, -0.1891356],
 [51.4990805, -0.1938253],
 [51.6569225, -0.1949252],
 [51.6569225, -0.1949252],
 [51.5672808, -0.2710568],
 [51.5132537, -0.3043136],
 [51.5886383, -0.0117625],
 [51.458373, -0.1891356],
 [51.3769529, -0.0956895],
 [51.5390261, -0.1425516],
 [51.4990156, -0.22915],
 [51.3769529, -0.0956895],
 [51.5886121, 0.0823982],
 [51.406025, 0.013156],
 [51.5202607, -0.0293396],
 [51.461311, -0.303742],
 [51.580559, -0.341995],
 [51.6569225, -0.1949252],
 [51.5390261, -0.1425516],
 [51.406025, 0.013156],
 [51.5886121, 0.0823982],
 [51.538621, -0.1028346],
 [51.461311, -0.303742],
 [51.461311, -0.303742],
 [51.5886121, 0.0823982],
 [51.6569225, -0.1949252],
 [51.580559, -0.341995],
 [51.5906113, -0.1109709],
 [51.4990805, -0.1938253],
 [51.461311, -0.303742],
 [51.580559, -0.341995],
 [51.4990156, -0.22915],
 [51.5351832, -0.4481378],
 [51.5132537, -0.3043136],
 [51.5255162, 0.0352163],
 [51.5255162, 0.0352163],
 [51.538621, -0.1028346],
 [51.4609218, -0.373149],
 [51.5390261, -0.1425516],
 [51.5202607, -0.0293396],
 [51.580559, -0.341995],
 [51.458373, -0.1891356],
 [51.4609218, -0.373149],
 [51.4990156, -0.22915],
 [51.4609218, -0.373149],
 [51.4935082, -0.1178424],
 [51.5390261, -0.1425516],
 [51.461311, -0.303742],
 [51.5132537, -0.3043136],
 [51.461311, -0.303742],
 [51.3769529, -0.0956895],
 [51.5540666, 0.134017],
 [51.4933675, 0.0098214],
 [51.4609218, -0.373149],
 [51.461311, -0.303742],
 [51.5906113, -0.1109709],
 [51.5255162, 0.0352163],
 [51.580559, -0.341995],
 [51.5886121, 0.0823982],
 [51.458373, -0.1891356],
 [51.5886121, 0.0823982],
 [51.4990156, -0.22915],
 [51.6569225, -0.1949252],
 [51.5672808, -0.2710568],
 [51.5672808, -0.2710568],
 [51.538621, -0.1028346],
 [51.5540666, 0.134017],
 [51.406025, 0.013156],
 [51.461311, -0.303742],
 [51.406025, 0.013156],
 [51.5132537, -0.3043136],
 [51.406025, 0.013156],
 [51.461311, -0.303742],
 [51.4933675, 0.0098214],
 [51.5436387, -0.0553621],
 [51.4609218, -0.373149],
 [51.4609218, -0.373149],
 [51.538621, -0.1028346],
 [51.461311, -0.303742],
 [51.538621, -0.1028346],
 [51.4990156, -0.22915],
 [51.4935082, -0.1178424],
 [51.3769529, -0.0956895],
 [51.4097742, -0.2108084],
 [51.41233, -0.300689],
 [51.406025, 0.013156],
 [51.5351832, -0.4481378],
 [51.5132537, -0.3043136],
 [51.538621, -0.1028346],
 [51.41233, -0.300689],
 [51.538621, -0.1028346],
 [51.4990805, -0.1938253],
 [51.5436387, -0.0553621],
 [51.4609218, -0.373149],
 [51.538621, -0.1028346],
 [51.4974948, -0.1356583],
 [51.4935082, -0.1178424],
 [51.4609218, -0.373149],
 [51.458373, -0.1891356],
 [51.5255162, 0.0352163],
 [51.3769529, -0.0956895],
 [51.4609218, -0.373149],
 [51.4974948, -0.1356583],
 [51.538621, -0.1028346],
 [51.538621, -0.1028346],
 [51.458373, -0.1891356],
 [51.4609218, -0.373149],
 [51.461311, -0.303742],
 [51.5202607, -0.0293396],
 [51.458373, -0.1891356],
 [51.4935082, -0.1178424],
 [51.4933675, 0.0098214],
 [51.3769529, -0.0956895],
 [51.4990156, -0.22915],
 [51.5390261, -0.1425516],
 [51.4097742, -0.2108084],
 [51.4990805, -0.1938253],
 [51.4097742, -0.2108084],
 [51.5202607, -0.0293396],
 [51.580559, -0.341995],
 [51.577924, 0.2120829],
 [51.4974948, -0.1356583],
 [51.406025, 0.013156],
 [51.5886121, 0.0823982],
 [51.5436387, -0.0553621],
 [51.5351832, -0.4481378],
 [51.580559, -0.341995],
 [51.4974948, -0.1356583],
 [51.502781, -0.087738],
 [51.4609218, -0.373149],
 [51.5255162, 0.0352163],
 [51.580559, -0.341995],
 [51.5886383, -0.0117625],
 [51.4990805, -0.1938253],
 [51.502781, -0.087738],
 [51.3769529, -0.0956895],
 [51.577924, 0.2120829],
 [51.577924, 0.2120829],
 [51.406025, 0.013156],
 [51.4990156, -0.22915],
 [51.538621, -0.1028346],
 [51.4609218, -0.373149],
 [51.406025, 0.013156],
 [51.5390261, -0.1425516],
 [51.4990805, -0.1938253],
 [51.4990805, -0.1938253],
 [51.4935082, -0.1178424],
 [51.406025, 0.013156],
 [51.4933675, 0.0098214],
 [51.458373, -0.1891356],
 [51.6569225, -0.1949252],
 [51.458373, -0.1891356],
 [51.577924, 0.2120829],
 [51.502781, -0.087738],
 [51.5886121, 0.0823982],
 [51.4990156, -0.22915],
 [51.5886121, 0.0823982],
 [51.577924, 0.2120829],
 [51.4611509, -0.0073177],
 [51.4611509, -0.0073177]]

In [1298]:
def nearest_outstanding_school(lat, lon):
    """Return the KM distance to the nearest outstanding school"""
    coords_1 = (lat, lon)
    lower_boundary, upper_boundary = -0.05, 0.05
    school_distances = []
    for i in outstanding_schools:
        if (i[0] - lat > upper_boundary) or i[0] - lat < lower_boundary:
            continue
        elif (i[1] - lon > upper_boundary) or i[1] - lon < lower_boundary:
            continue
        else:
            coords_2 = (i[0], i[1])
            school_distances.append(geopy.distance.geodesic(coords_1, coords_2).km)
            
    try:
        return min(school_distances)
    except ValueError:
        return "Na"

In [1301]:
houses

{'102, Eyhurst Avenue, Hornchurch, Greater London RM12 4RA': {'property_type': 'Flat',
  'price': 540000,
  'date': '6 May 2022',
  'url': 'https://www.rightmove.co.uk/house-prices/details/england-113320763-15073456?s=b78b2774144e4e6348541fa9baf724516be6301d318cc11332ffda03d7283107',
  'features': ['Ground Floor Shower Room &amp; First Floor Bathroom',
   'Two Receptions',
   'Double Glazed &amp; Central Heating',
   'Extended To Front And Rear',
   '0.3 Miles From Elm Park District Line Station',
   'Four Bedroom Semi-Detached House'],
  'floorplan_url': 'https://media.rightmove.co.uk/203k/202751/113320763/202751_1252954-1_FLP_00_0000.jpeg',
  'latitude': 51.553873,
  'longitude': 0.201187,
  'station_proximities': {'Elm Park Station': 0.3252095751095814,
   'Hornchurch Station': 0.7566512540661113,
   'Emerson Park Station': 1.314927814727093},
  'property_size': {},
  'bedrooms': 4,
  'bathrooms': 2,
  'new_build': False},
 '106, Laburnum Avenue, Hornchurch, Greater London RM12 4HA'

In [1302]:
# get all of the latitude and longitudes for all of the houses in the data set, and add this to the data.
import pgeocode
nomi = pgeocode.Nominatim('GB')
for x, y in houses.items():
    if 'latitude' in y.keys() and 'longitude' in y.keys():
        continue
    else:
        postcode = x[-8:]
        if postcode[0] == " ":
            postcode = x[-7:]
        postcode_split = postcode.split(" ")
        if len(postcode_split) == 2:
            postcode = postcode_split[0]+" "+postcode_split[1]
        elif len(postcode_split) == 3:
            postcode = postcode_split[1]+" "+postcode_split[2]
        lat = float(nomi.query_postal_code(postcode).latitude)
        lon = float(nomi.query_postal_code(postcode).longitude)
        
        y['latitude'] = lat
        y['longitude'] = lon

In [783]:

# for y in houses.values():
#     # Generate the feature of distance to the nearest outstanding school
#     if 'latitude' in y.keys() and 'longitude' in y.keys():
#         distance_km = nearest_outstanding_school(y['latitude'], y['longitude'])
#         y['nearest_outstanding_school'] = distance_km

    
#     # Attempt to generate the feature of the sq ft of the property
#     if 'property_size' in y.keys():
#         if (y['property_size'] == {}) and ('floorplan_url' in y.keys()):
#             try:
#                 ocr_size = size_floorplan_2(y['floorplan_url'])
#             except Exception:
#                 y['property_size'] = 'Na'
#             else:
#                 if ocr_size == None:
#                     y['property_size'] = 'Na'
#                 y['property_size'] = ocr_size
#     # If a property_size dictionary already exists, use it to find the sq ft of the property 
#     # code that handles when the property data has a dictionary with sqf and/or sq/f values in it
#     # Now you just need the f_values and m_values lists in the block of code
#         elif len(y['property_size']) >= 1:
#         # loop through the this dictionary
#         # if the dictionary contains a sq m value, set the value of property size to its sq f equivalent, 
#         # if the dictionary contains a sq f value, set it
#             for i, v in y['property_size'].items():
#                 p = i.strip().lower()
#                 if p in f_values:
#                     y['property_size'] = y['property_size'][v]
#                 elif p in m_values:
#                     y['property_size'] = y['property_size'][v]*10.764
                    
#         #elif (y['property_size'] == {}) and ('floorplan_url' not in y.keys()):
#          #   y['property_size'] = 'Na'
                
    
# # This still needs to be looked at
                
#     # if floorplan not in keys and property_size is {}
    
    
#     # This is where you could do an else statement, and set all of the prop_size dictionaries which end up in the data set 
#     # as ''
        

            
            
        
#  # need to handle for the situation when there is data in the property size dict, but its not related to sq sizes, 
# # and there is a floorplan URL
        
    
# #     else:
# #         if 'floorplan_url' in y.keys():
# #             ocr_size = size_floorplan_2(y['floorplan_url'])
# #             if ocr_size == None:
# #                 y['property_size'] = ''
# #             y['property_size'] = ocr_size
    
        

In [861]:
# loop through all of the houses, if they don't have a property size but do have a floorplan, apply the OCR, and then return 
# a value (note a nonetype may be returned so you'll have to deal with that). 

for y in houses.values():
    if 'property_size' in y.keys():
        if type(y['property_size']) == int or type(y['property_size']) == float:
            continue
        if (y['property_size'] == {} or type(y['property_size']) == str) and ('floorplan_url' in y.keys()):
            try:
                ocr_size = size_floorplan_2(y['floorplan_url'])
            except Exception:
                y['property_size'] = 'Na'
            else:
                if ocr_size == None or ocr_size == '':
                    y['property_size'] = 'Na'
                y['property_size'] = ocr_size
                # The line below this just sets the dictionary as the property size
                #y['property_size'] = ocr_size
                # This code actually sets the sq ft property size 
                ##for f in f_values:
                  ##  try:
                    ##    y['property_size'] = ocr_size[f]
                    ##except KeyError:
                      ##  pass
                    ##else:
                      ##  break
                            
    

CUDA not available - defaulting to CPU. Note: This module is much faster with a GPU.


Progress: |--------------------------------------------------| 0.0% CompleteProgress: |████----------------------------------------------| 8.3% CompleteProgress: |████████------------------------------------------| 16.6% CompleteProgress: |████████████--------------------------------------| 24.8% CompleteProgress: |████████████████----------------------------------| 33.1% CompleteProgress: |████████████████████------------------------------| 41.4% CompleteProgress: |████████████████████████--------------------------| 49.7% CompleteProgress: |████████████████████████████----------------------| 58.0% CompleteProgress: |█████████████████████████████████-----------------| 66.2% CompleteProgress: |█████████████████████████████████████-------------| 74.5% CompleteProgress: |█████████████████████████████████████████---------| 82.8% CompleteProgress: |█████████████████████████████████████████████-----| 91.1% CompleteProgress: |█████████████████████████████████████████████████-| 99

CUDA not available - defaulting to CPU. Note: This module is much faster with a GPU.


Progress: |--------------------------------------------------| 0.0% CompleteProgress: |███████-------------------------------------------| 15.7% CompleteProgress: |███████████████-----------------------------------| 31.4% CompleteProgress: |███████████████████████---------------------------| 47.0% CompleteProgress: |███████████████████████████████-------------------| 62.7% CompleteProgress: |███████████████████████████████████████-----------| 78.4% CompleteProgress: |███████████████████████████████████████████████---| 94.1% CompleteProgress: |██████████████████████████████████████████████████████| 109.7% Complete

CUDA not available - defaulting to CPU. Note: This module is much faster with a GPU.


Progress: |--------------------------------------------------| 0.0% CompleteProgress: |█-------------------------------------------------| 3.6% CompleteProgress: |███-----------------------------------------------| 7.3% CompleteProgress: |█████---------------------------------------------| 10.9% CompleteProgress: |███████-------------------------------------------| 14.5% CompleteProgress: |█████████-----------------------------------------| 18.2% CompleteProgress: |██████████----------------------------------------| 21.8% CompleteProgress: |████████████--------------------------------------| 25.5% CompleteProgress: |██████████████------------------------------------| 29.1% CompleteProgress: |████████████████----------------------------------| 32.7% CompleteProgress: |██████████████████--------------------------------| 36.4% CompleteProgress: |███████████████████-------------------------------| 40.0% CompleteProgress: |█████████████████████-----------------------------| 43.

KeyboardInterrupt: 

In [1303]:
# BROKEN THIS AWAY FROM THE ABOVE CODE AS THE OCR WAS TAKING TOO LONG # 
# # If they have a don't have a floorplan, but a property_size dictionary has been scraped, apply sq_foot_return()
# # Loop through all of the houses, if they have their lon and lat available, apply the nearest_oustanding_school()
# to return the distance to the nearest outstanding school
for y in houses.values():
    if 'property_size' in y.keys():
        try:
            if len(y['property_size']) >= 1 and type(y['property_size']) == dict:
                value = sq_foot_return(y['property_size'])
                y['property_size'] = value
        except TypeError:
            pass
        if 'latitude' in y.keys() and 'longitude' in y.keys():
            distance_km = nearest_outstanding_school(y['latitude'], y['longitude'])
            y['nearest_outstanding_school'] = distance_km

In [1304]:
houses

{'102, Eyhurst Avenue, Hornchurch, Greater London RM12 4RA': {'property_type': 'Flat',
  'price': 540000,
  'date': '6 May 2022',
  'url': 'https://www.rightmove.co.uk/house-prices/details/england-113320763-15073456?s=b78b2774144e4e6348541fa9baf724516be6301d318cc11332ffda03d7283107',
  'features': ['Ground Floor Shower Room &amp; First Floor Bathroom',
   'Two Receptions',
   'Double Glazed &amp; Central Heating',
   'Extended To Front And Rear',
   '0.3 Miles From Elm Park District Line Station',
   'Four Bedroom Semi-Detached House'],
  'floorplan_url': 'https://media.rightmove.co.uk/203k/202751/113320763/202751_1252954-1_FLP_00_0000.jpeg',
  'latitude': 51.553873,
  'longitude': 0.201187,
  'station_proximities': {'Elm Park Station': 0.3252095751095814,
   'Hornchurch Station': 0.7566512540661113,
   'Emerson Park Station': 1.314927814727093},
  'property_size': {},
  'bedrooms': 4,
  'bathrooms': 2,
  'new_build': False,
  'nearest_outstanding_school': 2.780506863387836},
 '106, La

In [1305]:
# Code for finding the average price of houses in the same postcode area with the same number of bedrooms of each property
# This column will be used in the first regression 

for i, v in houses.items():
    local_similar_prices = []
    # need to store the postcode, each house has a postcode so this will be easier
    postcode = i[-8:]
    if postcode[0] == " ":
        postcode = i[-7:]
    postcode_area = postcode.split(" ")[0]
    if postcode_area == 'n':
        postcode_area = postcode.split(" ")[1]


    try:
        prop_type = v['property_type']
        num_bedrooms = v["bedrooms"]
    except KeyError:
        continue
    else:
        for a, b in houses.items():
            postcode = i[-8:]
            if postcode[0] == " ":
                postcode = i[-7:]
            postcode_area1 = postcode.split(" ")[0]
            if postcode_area1 == 'n':
                postcode_area1 = postcode.split(" ")[1]
            if postcode_area == postcode_area1:
                try:
                    prop_type1 = b['property_type']
                    num_bedrooms1 = b['bedrooms']
                except KeyError:
                    pass
                else:
                    if prop_type1 == prop_type and num_bedrooms1 == num_bedrooms:
                        local_similar_prices.append(b['price'])
    
    avg_local_price = sum(local_similar_prices) / len(local_similar_prices)
    v["avg_local_price"] = avg_local_price

In [1306]:
houses

{'102, Eyhurst Avenue, Hornchurch, Greater London RM12 4RA': {'property_type': 'Flat',
  'price': 540000,
  'date': '6 May 2022',
  'url': 'https://www.rightmove.co.uk/house-prices/details/england-113320763-15073456?s=b78b2774144e4e6348541fa9baf724516be6301d318cc11332ffda03d7283107',
  'features': ['Ground Floor Shower Room &amp; First Floor Bathroom',
   'Two Receptions',
   'Double Glazed &amp; Central Heating',
   'Extended To Front And Rear',
   '0.3 Miles From Elm Park District Line Station',
   'Four Bedroom Semi-Detached House'],
  'floorplan_url': 'https://media.rightmove.co.uk/203k/202751/113320763/202751_1252954-1_FLP_00_0000.jpeg',
  'latitude': 51.553873,
  'longitude': 0.201187,
  'station_proximities': {'Elm Park Station': 0.3252095751095814,
   'Hornchurch Station': 0.7566512540661113,
   'Emerson Park Station': 1.314927814727093},
  'property_size': {},
  'bedrooms': 4,
  'bathrooms': 2,
  'new_build': False,
  'nearest_outstanding_school': 2.780506863387836,
  'avg_loc

In [1039]:
# THIS HAS BEEN COMMENTED OUT AFTER SPEAKING TO ADAM 

# # Code that makes a unique number for each postcode area from our scraped data
# postcodes = []
# for i, v in houses.items():
#     postcode = i[-8:]
#     if postcode[0] == " ":
#         postcode = i[-7:]
#     postcode_area = postcode.split(" ")[0]
#     if postcode_area == 'n':
#         postcode_area = postcode.split(" ")[1]
#     postcodes.append(postcode_area)
# unique_postcodes = set(postcodes)
# unique_postcodes = list(unique_postcodes)

# # What I need to do is order the unique postcodes in terms of average sold price
# postcode_prices = {}
# for i in unique_postcodes:
#     prices = []
#     for a, b in houses.items():
#         if i in a:
#             prices.append(b['price'])
#     postcode_avg = sum(prices) / len(prices)
#     postcode_prices[i] = postcode_avg
    
# postcode_prices = sorted(postcode_prices.items(), key=lambda x: x[1])


# postcodes_ordered = [postcode_prices[i][0] for i in range(len(postcode_prices))]
# postcodes_ordered
# area_codes = {}
# for i in range(len(postcodes_ordered)):
#     area_codes[postcodes_ordered[i]] = i
# area_codes

{'WC2A': 0,
 'EC4R': 1,
 'EC3A': 2,
 'EC3N': 3,
 'EC4M': 4,
 'EC4V': 5,
 'E1': 6,
 'EC3V': 7,
 'EC4Y': 8,
 'EC4A': 9,
 'EC1V': 10,
 'EC3M': 11,
 'EC1Y': 12,
 'EC1M': 13,
 'EC2Y': 14,
 'EC4N': 15,
 'EC1A': 16,
 'EC3R': 17,
 'EC1N': 18,
 'WC2R': 19}

In [1333]:
# Need to convert the data into a quantitative format, which regression algorithms can learn/predict from
# This is done for each data set
properties = {}
index = 0
# loop through all of the 
for h, i in houses.items():
    house_data = []
    
    # Area code has been commented out after speaking to Adam
    # area_code - Want to know which codes
#     postcode = h[-8:]
#     if postcode[0] == " ":
#         postcode = h[-7:]
#     postcode_area = postcode.split(" ")[0]
#     if postcode_area == 'n':
#         postcode_area = postcode.split(" ")[1]
    
#     area_code = area_codes[postcode_area]
#     house_data.append(area_code)
    
    house_data.append(i['latitude'])
    house_data.append(i['longitude'])
    
    
    if i['property_type'] == 'Flat':
        prop_type = 1
    elif i['property_type'] == 'Terraced':
        prop_type == 2
    elif i['property_type'] == 'Semi-Detached':
        prop_type = 3 
    elif i['property_type'] == 'Detached':
        prop_type = 4
    else:
        prop_type = 5
    house_data.append(prop_type)
    
    # Convert the data into the necessary string format
    date_obj = datetime.datetime.strptime(i['date'], "%d %b %Y")
    date_str = date_obj.strftime("%Y-%m-%d")
    house_data.append(date_str)
    
    if 'station_proximities' in i.keys() and i['station_proximities'] != {}:
        station_distance = min(i['station_proximities'].values())
        house_data.append(station_distance)
    else:
        house_data.append("NA")
        
    # You should have handled for the property_size difficulties by now
    if 'property_size' not in i.keys():
        house_data.append("NA")
    elif i['property_size'] == {} or i['property_size'] == 'Na' or i["property_size"] == None:
        house_data.append("NA")
    else:
        # This should return the sq foot of the property
        house_data.append(i["property_size"])
    
    # Add the bedrooms and bathrooms data   
    if "bedrooms" not in i.keys():
        house_data.append("NA")
    elif i['bedrooms'] == None or i['bedrooms'] == '':
        house_data.append("NA")
    else:
        house_data.append(i["bedrooms"])
        
    if 'bathrooms' not in i.keys():
        house_data.append("NA")
    elif i['bathrooms'] == None or i['bathrooms'] == '':
        house_data.append("NA")
    else:
        house_data.append(i["bathrooms"])
        
        
    # New build
    if 'new_build' not in i.keys():
        house_data.append("NA")
    elif i['new_build'] == '':
        house_data.append("NA")
    else:
        if i["new_build"]:
            house_data.append(1)
        else:
            house_data.append(0)
    
    if 'nearest_outstanding_school' not in i.keys():
        house_data.append("NA")
    else:
        house_data.append(i["nearest_outstanding_school"])
    
    if 'avg_local_price' not in i.keys():
        house_data.append("NA")
    else:
        house_data.append(i["avg_local_price"])
        
    if 'floorplan_url' in i.keys():
        house_data.append("Yes")
    else:
        house_data.append("NA")
        
    
    
    house_data.append(i["price"])
    
    
        
    properties[h] = house_data
    index += 1
    
    

In [1334]:
properties

{'102, Eyhurst Avenue, Hornchurch, Greater London RM12 4RA': [51.553873,
  0.201187,
  1,
  '2022-05-06',
  0.3252095751095814,
  'NA',
  4,
  2,
  0,
  2.780506863387836,
  409000.0,
  'Yes',
  540000],
 '106, Laburnum Avenue, Hornchurch, Greater London RM12 4HA': [51.556609,
  0.1913,
  1,
  '2022-04-20',
  0.554390444675729,
  'NA',
  3,
  'NA',
  0,
  2.7749919023057745,
  354941.0054200542,
  'Yes',
  465000],
 '23, Suttons Avenue, Hornchurch, Greater London RM12 4LE': [51.555864,
  0.217558,
  3,
  '2022-04-04',
  0.14699358984489788,
  'NA',
  3,
  1,
  0,
  2.483563068087039,
  380391.95324427483,
  'Yes',
  520000],
 '2, Hayburn Way, Hornchurch, Greater London RM12 4BH': [51.564437,
  0.189416,
  3,
  '2022-03-31',
  0.7776265160282667,
  'NA',
  3,
  'NA',
  0,
  2.1728832923785975,
  354941.0054200542,
  'Yes',
  490000],
 '23, Laburnum Avenue, Hornchurch, Greater London RM12 4HE': [51.55846,
  0.193053,
  3,
  '2022-03-25',
  0.6402629534917887,
  'NA',
  4,
  1,
  0,
  2.5

In [1335]:
len(properties)

4772

# Now I build the Regression Model


In [1336]:
import pandas as pd
import numpy as np

In [1337]:
properties_df = pd.DataFrame(properties)

In [1338]:
properties_df

Unnamed: 0,"102, Eyhurst Avenue, Hornchurch, Greater London RM12 4RA","106, Laburnum Avenue, Hornchurch, Greater London RM12 4HA","23, Suttons Avenue, Hornchurch, Greater London RM12 4LE","2, Hayburn Way, Hornchurch, Greater London RM12 4BH","23, Laburnum Avenue, Hornchurch, Greater London RM12 4HE","24, Saunton Road, Hornchurch, Greater London RM12 4HG","147, Warren Drive, Hornchurch, Greater London RM12 4QU","Flat 27, Uphavering House, Parkhill Close, Hornchurch, Greater London RM12 4YX","56, Albany Road, Hornchurch, Greater London RM12 4AF","28, Gordon Avenue, Hornchurch, Greater London RM12 4EA",...,"Granary Barn, Church Lane, North Ockendon, Upminster, Greater London RM14 3QA","3, Bury Farm Cottages, St Marys Lane, Upminster, Greater London RM14 3PH","East View, Clay Tye Road, Upminster, Greater London RM14 3PL","Chubs Nook, Clay Tye Road, Upminster, Greater London RM14 3PL","8, St Marys Lane, North Ockendon, Upminster, Greater London RM14 3PA","Cranham Place, 4, Ockendon Road, Upminster, Greater London RM14 3QJ","Greystones, St Marys Lane, Upminster, Essex RM14 3PB","Hazeldene, Clay Tye Road, Upminster, Greater London RM14 3PL","Kosi Kot, Clay Tye Road, Upminster, Greater London RM14 3PL","Bankes House, Ockendon Road, Upminster, Greater London RM14 3QJ"
0,51.553873,51.556609,51.555864,51.564437,51.55846,51.560335,51.55443,51.557593,51.562709,51.560883,...,51.542,51.542,51.542,51.542,51.542,51.542,51.542,51.542,51.542,51.542
1,0.201187,0.1913,0.217558,0.189416,0.193053,0.194025,0.19956,0.206603,0.195238,0.186045,...,0.273875,0.273875,0.273875,0.273875,0.273875,0.273875,0.273875,0.273875,0.273875,0.273875
2,1,1,3,3,3,3,3,1,3,3,...,4,3,4,3,3,1,4,3,3,4
3,2022-05-06,2022-04-20,2022-04-04,2022-03-31,2022-03-25,2022-03-24,2022-03-18,2022-03-18,2022-03-14,2022-03-11,...,2000-10-25,2000-07-28,2000-04-18,2000-01-14,1999-05-28,1999-03-18,1997-11-13,1996-09-26,1995-11-30,1995-06-28
4,0.32521,0.55439,0.146994,0.777627,0.640263,0.754248,0.337395,0.583074,0.907514,0.924955,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,4,3,3,3,4,3,2,2,5,3,...,,,,,,,,,,
7,2,,1,,1,1,1,1,3,1,...,,,,,,,,,,
8,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
9,2.780507,2.774992,2.483563,2.172883,2.535864,2.323195,2.754377,2.2937,2.056607,2.618002,...,,,,,,,,,,


In [1339]:
properties_df = properties_df.T

In [1340]:
properties_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
"102, Eyhurst Avenue, Hornchurch, Greater London RM12 4RA",51.553873,0.201187,1,2022-05-06,0.32521,,4,2,0,2.780507,409000.0,Yes,540000
"106, Laburnum Avenue, Hornchurch, Greater London RM12 4HA",51.556609,0.1913,1,2022-04-20,0.55439,,3,,0,2.774992,354941.00542,Yes,465000
"23, Suttons Avenue, Hornchurch, Greater London RM12 4LE",51.555864,0.217558,3,2022-04-04,0.146994,,3,1,0,2.483563,380391.953244,Yes,520000
"2, Hayburn Way, Hornchurch, Greater London RM12 4BH",51.564437,0.189416,3,2022-03-31,0.777627,,3,,0,2.172883,354941.00542,Yes,490000
"23, Laburnum Avenue, Hornchurch, Greater London RM12 4HE",51.55846,0.193053,3,2022-03-25,0.640263,,4,1,0,2.535864,439632.160804,Yes,475000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Cranham Place, 4, Ockendon Road, Upminster, Greater London RM14 3QJ",51.542,0.273875,1,1999-03-18,,,,,,,,,76000
"Greystones, St Marys Lane, Upminster, Essex RM14 3PB",51.542,0.273875,4,1997-11-13,,,,,,,,,175000
"Hazeldene, Clay Tye Road, Upminster, Greater London RM14 3PL",51.542,0.273875,3,1996-09-26,,,,,,,,,98000
"Kosi Kot, Clay Tye Road, Upminster, Greater London RM14 3PL",51.542,0.273875,3,1995-11-30,,,,,,,,,83500


In [1341]:
properties_df.columns = ["Latitude", "Longitude", "Property Type", "Sold_Date", "Closest Station", "Size", "Bedrooms", "Bathrooms", "New Build", "Nearest OS", "Local_Similar_Prices", "Floorplan?", "Price"]

In [1342]:
properties_df[0:25]

Unnamed: 0,Latitude,Longitude,Property Type,Sold_Date,Closest Station,Size,Bedrooms,Bathrooms,New Build,Nearest OS,Local_Similar_Prices,Floorplan?,Price
"102, Eyhurst Avenue, Hornchurch, Greater London RM12 4RA",51.553873,0.201187,1,2022-05-06,0.32521,,4.0,2.0,0.0,2.780507,409000.0,Yes,540000
"106, Laburnum Avenue, Hornchurch, Greater London RM12 4HA",51.556609,0.1913,1,2022-04-20,0.55439,,3.0,,0.0,2.774992,354941.00542,Yes,465000
"23, Suttons Avenue, Hornchurch, Greater London RM12 4LE",51.555864,0.217558,3,2022-04-04,0.146994,,3.0,1.0,0.0,2.483563,380391.953244,Yes,520000
"2, Hayburn Way, Hornchurch, Greater London RM12 4BH",51.564437,0.189416,3,2022-03-31,0.777627,,3.0,,0.0,2.172883,354941.00542,Yes,490000
"23, Laburnum Avenue, Hornchurch, Greater London RM12 4HE",51.55846,0.193053,3,2022-03-25,0.640263,,4.0,1.0,0.0,2.535864,439632.160804,Yes,475000
"24, Saunton Road, Hornchurch, Greater London RM12 4HG",51.560335,0.194025,3,2022-03-24,0.754248,,3.0,1.0,0.0,2.323195,380391.953244,Yes,456000
"147, Warren Drive, Hornchurch, Greater London RM12 4QU",51.55443,0.19956,3,2022-03-18,0.337395,,2.0,1.0,0.0,2.754377,349309.378788,Yes,455000
"Flat 27, Uphavering House, Parkhill Close, Hornchurch, Greater London RM12 4YX",51.557593,0.206603,1,2022-03-18,0.583074,,2.0,1.0,0.0,2.2937,228292.707071,Yes,220000
"56, Albany Road, Hornchurch, Greater London RM12 4AF",51.562709,0.195238,3,2022-03-14,0.907514,,5.0,3.0,0.0,2.056607,514224.983333,Yes,655000
"28, Gordon Avenue, Hornchurch, Greater London RM12 4EA",51.560883,0.186045,3,2022-03-11,0.924955,,3.0,1.0,0.0,2.618002,380391.953244,Yes,470000


In [1343]:
# Set them to NaN values
properties_df = properties_df.replace("NA", np.NaN)
properties_df = properties_df.replace("Na", np.NaN)

In [1344]:
properties_df

Unnamed: 0,Latitude,Longitude,Property Type,Sold_Date,Closest Station,Size,Bedrooms,Bathrooms,New Build,Nearest OS,Local_Similar_Prices,Floorplan?,Price
"102, Eyhurst Avenue, Hornchurch, Greater London RM12 4RA",51.553873,0.201187,1,2022-05-06,0.325210,,4.0,2.0,0.0,2.780507,409000.000000,Yes,540000
"106, Laburnum Avenue, Hornchurch, Greater London RM12 4HA",51.556609,0.191300,1,2022-04-20,0.554390,,3.0,,0.0,2.774992,354941.005420,Yes,465000
"23, Suttons Avenue, Hornchurch, Greater London RM12 4LE",51.555864,0.217558,3,2022-04-04,0.146994,,3.0,1.0,0.0,2.483563,380391.953244,Yes,520000
"2, Hayburn Way, Hornchurch, Greater London RM12 4BH",51.564437,0.189416,3,2022-03-31,0.777627,,3.0,,0.0,2.172883,354941.005420,Yes,490000
"23, Laburnum Avenue, Hornchurch, Greater London RM12 4HE",51.558460,0.193053,3,2022-03-25,0.640263,,4.0,1.0,0.0,2.535864,439632.160804,Yes,475000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Cranham Place, 4, Ockendon Road, Upminster, Greater London RM14 3QJ",51.542000,0.273875,1,1999-03-18,,,,,,,,,76000
"Greystones, St Marys Lane, Upminster, Essex RM14 3PB",51.542000,0.273875,4,1997-11-13,,,,,,,,,175000
"Hazeldene, Clay Tye Road, Upminster, Greater London RM14 3PL",51.542000,0.273875,3,1996-09-26,,,,,,,,,98000
"Kosi Kot, Clay Tye Road, Upminster, Greater London RM14 3PL",51.542000,0.273875,3,1995-11-30,,,,,,,,,83500


In [1345]:
properties_df.isnull().sum(axis = 0)

Latitude                   0
Longitude                  0
Property Type              0
Sold_Date                  0
Closest Station          881
Size                    4739
Bedrooms                 886
Bathrooms               2183
New Build                880
Nearest OS               883
Local_Similar_Prices     880
Floorplan?              1874
Price                      0
dtype: int64

In [888]:
# calculate the median values 
mean_cs = properties_df['Closest Station'].mean()
mean_bedrooms = properties_df['Bedrooms'].mean()
mean_bathrooms = properties_df['Bathrooms'].mean()
median_build = properties_df['New Build'].median()
mean_os = properties_df['Nearest OS'].mean()
mean_sim_price = properties_df['Local_Similar_Prices'].mean()
mean_size = properties_df['Size'].mean()

mean_sim_price


863625.7422680412

In [889]:
prop_df1 = properties_df
prop_df1['Closest Station'] = prop_df1['Closest Station'].fillna(mean_cs)
prop_df1['Size'] = prop_df1['Size'].fillna(mean_size)
prop_df1['Bedrooms'] = prop_df1['Bedrooms'].fillna(mean_bedrooms)
prop_df1['Bathrooms'] = prop_df1['Bathrooms'].fillna(mean_bathrooms)
prop_df1['New Build'] = prop_df1['New Build'].fillna(median_build)
prop_df1['Nearest OS'] = prop_df1['Nearest OS'].fillna(mean_os)
prop_df1['Local_Similar_Prices'] = prop_df1['Local_Similar_Prices'].fillna(mean_sim_price)





In [1320]:
# Here we've dropped all of the rows without all of the data for the prop_df
prop_df = properties_df.dropna()

In [1321]:
len(prop_df)

28

In [1322]:
prop_df

Unnamed: 0,Latitude,Longitude,Property Type,Sold_Date,Closest Station,Size,Bedrooms,Bathrooms,New Build,Nearest OS,Local_Similar_Prices,Price
"93, Grenfell Avenue, Hornchurch, Greater London RM12 4DS",51.56183,0.18768,4,2021-06-29,0.927076,1174.0,3.0,1.0,0.0,2.463531,354941.00542,435000
"96, Elm Park Avenue, Hornchurch, Greater London RM12 4SW",51.55253,0.20397,3,2021-06-04,0.332076,699.66,2.0,1.0,0.0,2.880775,318253.159091,350000
"78, Albany Road, Hornchurch, Greater London RM12 4AF",51.56322,0.1951,4,2021-02-03,0.943309,1980.576,4.0,2.0,0.0,2.01564,545558.705882,645000
"44, Hayburn Way, Hornchurch, Greater London RM12 4BH",51.5643,0.19039,3,2020-12-07,0.802863,1496.196,3.0,1.0,0.0,2.135349,380391.953244,442000
"Flat 3, Hammond Court, Grenfell Avenue, Hornchurch, Greater London RM12 4FJ",51.56398,0.187352,1,2020-10-20,0.77947,430.56,1.0,1.0,0.0,2.312334,180830.0,170000
"17, Laburnum Avenue, Hornchurch, Greater London RM12 4HE",51.55813,0.19327,1,2020-03-16,0.615657,947.232,2.0,1.0,0.0,2.559591,318253.159091,322000
"106, Bancroft Chase, Hornchurch, Greater London RM12 4DR",51.55804,0.18549,3,2018-11-16,0.783826,1173.0,3.0,3.0,0.0,2.879939,354941.00542,415000
"64, Chestnut Avenue, Hornchurch, Greater London RM12 4HJ",51.5595,0.1919,4,2018-01-22,0.724644,1675.0,5.0,2.0,0.0,2.481968,465764.705882,480000
"9, Harold Court Road, Romford, Greater London RM3 0YU",51.602004,0.247532,4,2017-07-03,0.857375,828.828,3.0,1.0,0.0,3.635021,354941.00542,350000
"71, Cross Road, Mawneys, Romford, Greater London RM7 8DX",51.5865,0.15696,3,2021-09-30,1.369125,1076.0,3.0,2.0,0.0,3.943573,380391.953244,490000


In [1211]:
# Remove the sold date for now, will need to find a way to quantify this.
del prop_df['Sold_Date']

In [901]:

del prop_df1['Sold_Date']
del prop_df1['Closest Station']

In [1212]:
prop_df

Unnamed: 0,Latitude,Longitude,Property Type,Closest Station,Size,Bedrooms,Bathrooms,New Build,Nearest OS,Local_Similar_Prices,Price
"Flat 131, Lauderdale Tower, Barbican, London, Greater London EC2Y 8BY",51.519886,-0.096743,1,0.050882,1087.900,3.0,2.0,0.0,0.929406,1.681475e+06,1500000
"Flat 27, Bayer House, Golden Lane Estate, London, Greater London EC1Y 0RN",51.522525,-0.095795,1,0.187232,618.800,2.0,1.0,0.0,1.180866,8.969366e+05,660000
"Flat 192, Cromwell Tower, Barbican, London, Greater London EC2Y 8DD",51.520535,-0.092780,1,0.220410,1140.984,3.0,2.0,0.0,0.919759,1.681475e+06,1620000
"Flat 336, Willoughby House, Barbican, London, Greater London EC2Y 8BL",51.519515,-0.090796,1,0.110809,890.000,2.0,2.0,0.0,0.797907,8.969366e+05,1075000
"Flat 246, Ben Jonson House, Barbican, London, Greater London EC2Y 8DL",51.520886,-0.094049,1,0.171891,729.000,1.0,1.0,0.0,0.973840,6.733066e+05,670000
...,...,...,...,...,...,...,...,...,...,...,...
"Flat 5, Amen Lodge, Warwick Lane, London, Greater London EC4M 7BY",51.514671,-0.100893,1,0.127141,441.000,1.0,1.0,0.0,0.734900,6.733066e+05,445000
"Flat 99, Clifford's Inn, Fetter Lane, London, Greater London EC4A 1BX",51.514670,-0.109900,1,0.251315,401.000,1.0,1.0,0.0,1.338317,6.733066e+05,475000
"Flat 141, Clifford's Inn, Fetter Lane, London, Greater London EC4A 1BY",51.514653,-0.110204,1,0.248524,418.000,1.0,1.0,0.0,1.358670,6.733066e+05,470000
"Flat 125, Clifford's Inn, Fetter Lane, London, Greater London EC4A 1BY",51.514434,-0.110411,1,0.260957,462.852,1.0,1.0,0.0,1.368437,6.733066e+05,620000


In [989]:
#del prop_df['Local_Similar_Prices']

In [1214]:
x = prop_df.drop(['Price'], axis=1).values
y = prop_df['Price'].values

In [1215]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model

In [1216]:
# split the data set into a training and testing set
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=0)

In [1217]:
from sklearn.linear_model import LinearRegression
# Traing the model (on the training set)
ml = LinearRegression()
ml.fit(x_train,y_train)

In [1218]:
# Predict the test set results
y_pred = ml.predict(x_test)
print(y_pred)

[ 753382.20546743  919565.7000591   871763.78149562 1726476.76912479
  859366.12380391  792064.79673551  504148.90166409  775893.31477404
  929116.64806697  572671.31456921  609330.33170813 1354758.02605364
  930992.23517607  750875.67456909  789345.45843987  866243.42633919
  836635.21145651  859086.16429031  921534.41077711  731604.65734923
  672980.51629199  812723.05712621 1142862.67770942 1599947.07813488
  821821.64827231 1778781.12928459 1206798.12369834  916589.64520925
 1608672.44600216  742629.14892823  923154.69973606  906881.36245675
  613212.62193593  824664.58995736  891080.99707472  819020.80456488
  748681.78160143  383452.35123505  912809.91527829  428105.78984119
 1678534.55478762  929096.94837926 1599015.89992952  790169.43147902
  601330.70985666  758180.60895404  718139.33312276 1546684.24650033
  918865.9503437   871673.48413076  697011.2988608  1648036.22889908
  252845.58444397  870451.53005587 1029578.57907888 1577742.727304
  693060.95358264  925065.64887947 1

In [1219]:
# evaluate the model 
from sklearn.metrics import r2_score, mean_absolute_error
print(r2_score(y_test, y_pred))

print(mean_absolute_error(y_test, y_pred))
# 80 % score with the r score
# 55495 MAE (this will of)

0.7291869337938297
141774.0662980607


In [None]:
# Now lets do the linear regression without the local avg prices, and replace this column with numbers based of the KNN 
# to find the average price of local similar houses

In [1221]:
prop_dfx = properties_df
del prop_dfx["Local_Similar_Prices"]
del prop_dfx["Sold_Date"] 


In [1229]:
prop_dfx = prop_dfx.dropna()

In [1231]:
prop_dfx

Unnamed: 0,Latitude,Longitude,Property Type,Closest Station,Size,Bedrooms,Bathrooms,New Build,Nearest OS,Price
"Flat 131, Lauderdale Tower, Barbican, London, Greater London EC2Y 8BY",51.519886,-0.096743,1,0.050882,1087.900,3.0,2.0,0.0,0.929406,1500000
"Flat 27, Bayer House, Golden Lane Estate, London, Greater London EC1Y 0RN",51.522525,-0.095795,1,0.187232,618.800,2.0,1.0,0.0,1.180866,660000
"Flat 192, Cromwell Tower, Barbican, London, Greater London EC2Y 8DD",51.520535,-0.092780,1,0.220410,1140.984,3.0,2.0,0.0,0.919759,1620000
"Flat 336, Willoughby House, Barbican, London, Greater London EC2Y 8BL",51.519515,-0.090796,1,0.110809,890.000,2.0,2.0,0.0,0.797907,1075000
"Flat 246, Ben Jonson House, Barbican, London, Greater London EC2Y 8DL",51.520886,-0.094049,1,0.171891,729.000,1.0,1.0,0.0,0.973840,670000
...,...,...,...,...,...,...,...,...,...,...
"Flat 5, Amen Lodge, Warwick Lane, London, Greater London EC4M 7BY",51.514671,-0.100893,1,0.127141,441.000,1.0,1.0,0.0,0.734900,445000
"Flat 99, Clifford's Inn, Fetter Lane, London, Greater London EC4A 1BX",51.514670,-0.109900,1,0.251315,401.000,1.0,1.0,0.0,1.338317,475000
"Flat 141, Clifford's Inn, Fetter Lane, London, Greater London EC4A 1BY",51.514653,-0.110204,1,0.248524,418.000,1.0,1.0,0.0,1.358670,470000
"Flat 125, Clifford's Inn, Fetter Lane, London, Greater London EC4A 1BY",51.514434,-0.110411,1,0.260957,462.852,1.0,1.0,0.0,1.368437,620000


In [1232]:
# Lets try the KNN to get local similar prices
from sklearn.neighbors import NearestNeighbors, KNeighborsRegressor
from sklearn.model_selection import cross_val_predict

In [1233]:
x = prop_dfx[["Latitude", "Longitude", "Property Type", "Size", "Bedrooms", "Bathrooms", "New Build"]]
y = prop_dfx["Price"]
knn = KNeighborsRegressor(n_neighbors=5)
# Using a cross validation of 5 folds
y_pred = cross_val_predict(knn, x, y, cv=5)

In [1239]:
# Set the KNN predictions to the local_similar_prices column 
y_pred

array([1259000.,  684999., 1703000.,  829500.,  847650.,  857400.,
        909990.,  892400.,  848800.,  921990.,  980100.,  958000.,
        908400.,  864600.,  980100.,  908400.,  847650.,  980100.,
        857000.,  492500.,  868000.,  843000.,  722600.,  868000.,
        980100.,  611199.,  796000.,  889000., 1651000.,  854800.,
       1027000.,  848800.,  894000.,  980100.,  654400., 1405000.,
        655400.,  857800., 1658600.,  894000.,  672500.,  439400.,
        721000.,  756000.,  689000.,  829000.,  862000., 1564000.,
       1564000., 1564000.,  587700., 1684800., 1624600.,  729999.,
        487500.,  961000.,  394400.,  839000.,  505700., 1695000.,
        814000., 1764800.,  862000.,  961000.,  889000.,  863000.,
        729999.,  897000.,  858000.,  961000.,  788100.,  729999.,
        706500., 1682000.,  803500.,  869150.,  845000.,  836000.,
        897000.,  609700.,  788100.,  925000., 1606600., 1184000.,
        709900.,  706499., 1623600.,  890990.,  697000.,  5976

In [1247]:
#for i in range(len(prop_dfx)):
 #   prop_dfx.iloc[i:"local_similar_price"] = y_pred[i] 
prop_dfx["Local Similar Price"] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prop_dfx["Local Similar Price"] = y_pred


In [1248]:
prop_dfx

Unnamed: 0,Latitude,Longitude,Property Type,Closest Station,Size,Bedrooms,Bathrooms,New Build,Nearest OS,Price,Local Similar Price
"Flat 131, Lauderdale Tower, Barbican, London, Greater London EC2Y 8BY",51.519886,-0.096743,1,0.050882,1087.900,3.0,2.0,0.0,0.929406,1500000,1259000.0
"Flat 27, Bayer House, Golden Lane Estate, London, Greater London EC1Y 0RN",51.522525,-0.095795,1,0.187232,618.800,2.0,1.0,0.0,1.180866,660000,684999.0
"Flat 192, Cromwell Tower, Barbican, London, Greater London EC2Y 8DD",51.520535,-0.092780,1,0.220410,1140.984,3.0,2.0,0.0,0.919759,1620000,1703000.0
"Flat 336, Willoughby House, Barbican, London, Greater London EC2Y 8BL",51.519515,-0.090796,1,0.110809,890.000,2.0,2.0,0.0,0.797907,1075000,829500.0
"Flat 246, Ben Jonson House, Barbican, London, Greater London EC2Y 8DL",51.520886,-0.094049,1,0.171891,729.000,1.0,1.0,0.0,0.973840,670000,847650.0
...,...,...,...,...,...,...,...,...,...,...,...
"Flat 5, Amen Lodge, Warwick Lane, London, Greater London EC4M 7BY",51.514671,-0.100893,1,0.127141,441.000,1.0,1.0,0.0,0.734900,445000,672500.0
"Flat 99, Clifford's Inn, Fetter Lane, London, Greater London EC4A 1BX",51.514670,-0.109900,1,0.251315,401.000,1.0,1.0,0.0,1.338317,475000,1152000.0
"Flat 141, Clifford's Inn, Fetter Lane, London, Greater London EC4A 1BY",51.514653,-0.110204,1,0.248524,418.000,1.0,1.0,0.0,1.358670,470000,532400.0
"Flat 125, Clifford's Inn, Fetter Lane, London, Greater London EC4A 1BY",51.514434,-0.110411,1,0.260957,462.852,1.0,1.0,0.0,1.368437,620000,576300.0


In [1249]:
# Now let's do the regression again and see if the perfomance has improved ...

In [1251]:
x_1 = prop_dfx.drop(['Price'], axis=1).values
y_1 = prop_dfx['Price'].values

In [1252]:
# split the data set into a training and testing set
x_train,x_test,y_train,y_test = train_test_split(x_1,y_1,test_size=0.3,random_state=0)

In [1253]:
# Traing the model (on the training set)
ml = LinearRegression()
ml.fit(x_train,y_train)

In [1254]:
# Predict the test set results
y_pred_1 = ml.predict(x_test)
print(y_pred_1)

[ 737854.6753013   887357.48475264  765711.49108534 1663560.96420386
  816160.91838332  778503.45778124  477031.42455273  737104.01322397
  903001.65962626  772536.99775033  724401.03734774 1188189.82530782
  975041.57494328  748060.92102129  782186.77854176  797567.59978613
  807855.5155183   932326.92235805  861936.81329089  713452.93771266
  657550.111773    715710.66318236 1317804.24267732 1724711.67515079
  813770.76870721 1439278.87631117  416231.40875237  913321.10947716
 1758459.65112163  719028.49155222  902233.88235638  914937.94759457
  784897.90474931  833071.97266486  910971.46355152  753721.80205702
  977631.5886904   356917.42095482  894165.51605394  687044.17400561
 1642524.53248314  884522.39499502 1745421.37904895  769050.57602014
  893324.30115665  733808.748527    664318.83250147 1336320.88251821
  875472.42823293  838446.88810945  691283.27079973 1622156.91045057
  474481.44284448  796925.49841778 1039850.2314298  1476212.09257288
  670928.85964915  871986.28339362

In [1255]:
# evaluate the model 
from sklearn.metrics import r2_score, mean_absolute_error
print(r2_score(y_test, y_pred_1))

print(mean_absolute_error(y_test, y_pred_1))

0.7219647062759831
142010.945907698


In [None]:
 # Essentially no change in the results

In [999]:
# NLP 


2554

In [1001]:
dataset_features = []
for i in houses.values():
    if 'features' in i.keys():
        if i['features'] == []:
            pass
        else:
            dataset_features.append(i['features'])

In [1002]:
dataset_features

[['Type 1A',
  'Contemporary Kitchen',
  'Utility Room',
  'Wraparound Balcony',
  'Use Of Communal Gardens',
  'West Facing Views'],
 ['Two Bedrooms',
  'Duplex',
  'Waitrose Nearby',
  'City of London',
  'On-Site Leisure Centre',
  'Grade II Listed'],
 ['3 Bedrooms',
  "Views of St Paul's, London Eye &amp; Other Iconic Landmarks",
  'Bathroom &amp; Separate Shower Room',
  'Open Plan Living Area',
  'Modernised Kitchen',
  '24 Hour Concierge',
  'Extended Lease'],
 ['Two-bedroom, split-level apartment',
  'Large living room with adjoining kitchen',
  'Bathroom and separate WC',
  'Two private balconies',
  'Central London location',
  'Views over the Barbican lake and gardens to the City',
  'Exquisite original features',
  'Closest stations: Moorgate (0.2 miles) &amp; Barbican (0.8 miles)',
  'Leasehold (175 years remain)',
  '890 sq. ft / 82.68 sq. m'],
 ['garden',
  'sought-after-location',
  'balcony',
  'close-to-local-amenities',
  'double-glazed-windows',
  'period-features-c

# KNN Approach


In [1041]:
houses

{'Flat 131, Lauderdale Tower, Barbican, London, Greater London EC2Y 8BY': {'property_type': 'Flat',
  'price': 1500000,
  'date': '14 Apr 2022',
  'url': 'https://www.rightmove.co.uk/house-prices/details/england-111198221-14949232?s=6d33d105e453da910edf69d474e18a926d96b6c59d64c2d3aa33091244ad5ad5',
  'features': ['Type 1A',
   'Contemporary Kitchen',
   'Utility Room',
   'Wraparound Balcony',
   'Use Of Communal Gardens',
   'West Facing Views'],
  'floorplan_url': 'https://media.rightmove.co.uk/72k/71134/111198221/71134_30837396_FLP_00_0002.jpeg',
  'latitude': 51.519886,
  'longitude': -0.096743,
  'station_proximities': {'Barbican Station': 0.05088188581023095,
   'Moorgate Station': 0.3460585418348957,
   "St. Paul's Station": 0.35797745876363984},
  'property_size': 1087.9,
  'bedrooms': 3,
  'bathrooms': 2,
  'new_build': False,
  'nearest_outstanding_school': 0.9294055612957386,
  'avg_local_price': 1681475.4385964912},
 'Flat 27, Bayer House, Golden Lane Estate, London, Greate

In [1044]:
knn_data = {}
for i,v in houses.items():
    knn_data[i] = [v['latitude'], v['longitude'], v['price']]

{'Flat 131, Lauderdale Tower, Barbican, London, Greater London EC2Y 8BY': [51.519886,
  -0.096743,
  1500000],
 'Flat 27, Bayer House, Golden Lane Estate, London, Greater London EC1Y 0RN': [51.522525,
  -0.095795,
  660000],
 'Flat 192, Cromwell Tower, Barbican, London, Greater London EC2Y 8DD': [51.520535,
  -0.09278,
  1620000],
 'Flat 336, Willoughby House, Barbican, London, Greater London EC2Y 8BL': [51.519515,
  -0.090796,
  1075000],
 '211, Bunyan Court, Barbican, London, Greater London EC2Y 8DH': [51.5198,
  -0.0948,
  519500],
 '802, Frobisher Crescent, London, Greater London EC2Y 8HD': [51.52051,
  -0.09372,
  559000],
 'Flat 3, John Trundle Court, Barbican, London, Greater London EC2Y 8DJ': [51.520673,
  -0.097013,
  475000],
 'Flat 51, Breton House, Barbican, London, Greater London EC2Y 8DQ': [51.52133,
  -0.09377,
  535000],
 'Flat 246, Ben Jonson House, Barbican, London, Greater London EC2Y 8DL': [51.520886,
  -0.094049,
  670000],
 'Flat 16, Chequer Court, 3, Chequer Stre

In [1052]:
knn_df = pd.DataFrame(knn_data)
knn_df = knn_df.T
knn_df.columns = ["Latitude", "Longitude", "Price"]

In [1053]:
knn_df

Unnamed: 0,Latitude,Longitude,Price
"Flat 131, Lauderdale Tower, Barbican, London, Greater London EC2Y 8BY",51.519886,-0.096743,1500000.0
"Flat 27, Bayer House, Golden Lane Estate, London, Greater London EC1Y 0RN",51.522525,-0.095795,660000.0
"Flat 192, Cromwell Tower, Barbican, London, Greater London EC2Y 8DD",51.520535,-0.092780,1620000.0
"Flat 336, Willoughby House, Barbican, London, Greater London EC2Y 8BL",51.519515,-0.090796,1075000.0
"211, Bunyan Court, Barbican, London, Greater London EC2Y 8DH",51.519800,-0.094800,519500.0
...,...,...,...
"10, Bolt Court, London, Greater London EC4A 3DQ",51.508500,-0.125700,1025000.0
"Flat 107, Clifford's Inn, Fetter Lane, London, Greater London EC4A 1BX",51.514670,-0.109900,89000.0
"Flat 68, Clifford's Inn, Fetter Lane, London, Greater London EC4A 1BX",51.514670,-0.109900,145000.0
"Flat 8, Pemberton House, 6, East Harding Street, London, Greater London EC4A 3AS",51.508500,-0.125700,165000.0


In [1064]:
knn_df.min() 

Latitude        51.5051
Longitude       -0.1257
Price        20000.0000
dtype: float64

In [1076]:
# establish the price categories 

    

In [1069]:
# You need to establish the feature_names () and the target_names (category names)
def categorise(row):
    if 20000<=row['Price']<100000:
        return 'A'
    if 100000<=row['Price']<200000:
        return 'B'
    if 200000<=row['Price']<300000:
        return 'C'
    if 300000<=row['Price']<400000:
        return 'D'
    if 400000<=row['Price']<500000:
        return 'E'
    if 500000<=row['Price']<600000:
        return 'F'
    if 600000<=row['Price']<700000:
        return 'G'
    if 700000<=row['Price']<800000:
        return 'H'
    if 800000<=row['Price']<900000:
        return 'I'
    if 900000<=row['Price']<1000000:
        return 'J'
    if 1000000<=row['Price']<1100000:
        return 'K'
    if 1100000<=row['Price']<1200000:
        return 'L'
    if 1200000<=row['Price']<1300000:
        return 'M'
    if 1300000<=row['Price']<1400000:
        return 'N'
    if 1400000<=row['Price']<1500000:
        return 'O'
    if 1500000<=row['Price']<1600000:
        return 'P'
    if 1600000<=row['Price']<1700000:
        return 'Q'
    if 1700000<=row['Price']<1800000:
        return 'R'
    if 1800000<=row['Price']<1900000:
        return 'S'
    if 1900000<=row['Price']<2000000:
        return 'T'
    if row['Price']>=2000000:
        return 'L'
    

In [1075]:
# apply the categorise function on the dataframe
knn_df['price_cat'] = knn_df.apply(lambda row: categorise(row), axis=1)

In [1074]:
knn_df

Unnamed: 0,Latitude,Longitude,Price,price_cat
"Flat 131, Lauderdale Tower, Barbican, London, Greater London EC2Y 8BY",51.519886,-0.096743,1500000.0,P
"Flat 27, Bayer House, Golden Lane Estate, London, Greater London EC1Y 0RN",51.522525,-0.095795,660000.0,G
"Flat 192, Cromwell Tower, Barbican, London, Greater London EC2Y 8DD",51.520535,-0.092780,1620000.0,Q
"Flat 336, Willoughby House, Barbican, London, Greater London EC2Y 8BL",51.519515,-0.090796,1075000.0,K
"211, Bunyan Court, Barbican, London, Greater London EC2Y 8DH",51.519800,-0.094800,519500.0,F
...,...,...,...,...
"10, Bolt Court, London, Greater London EC4A 3DQ",51.508500,-0.125700,1025000.0,K
"Flat 107, Clifford's Inn, Fetter Lane, London, Greater London EC4A 1BX",51.514670,-0.109900,89000.0,A
"Flat 68, Clifford's Inn, Fetter Lane, London, Greater London EC4A 1BX",51.514670,-0.109900,145000.0,B
"Flat 8, Pemberton House, 6, East Harding Street, London, Greater London EC4A 3AS",51.508500,-0.125700,165000.0,B


In [1110]:
# Now we can try to do our KNN
from sklearn.neighbors import NearestNeighbors, KNeighborsRegressor
from sklearn.model_selection import cross_val_predict

In [1111]:
x = knn_df[["Latitude", "Longitude"]]
y = knn_df["Price"]

In [1153]:
knn = KNeighborsRegressor(n_neighbors=5)
# Using a cross validation of 5 folds
y_pred = cross_val_predict(knn, x, y, cv=5)

In [1154]:
# Our predictions in an array 
y_pred

array([1494000. ,  606053.8, 1815600. , ...,  491000. ,  461200. ,
        461200. ])

In [1155]:
# Evaluation of the predictions
print(r2_score(y, y_pred))
print(mean_absolute_error(y, y_pred))

-0.39388112752444404
474608.72106499603


In [1158]:
knn_df['Price'].max()

11250000.0

In [1163]:
# I think a good idea is to add features to the house which makes the houses similar 
# and then apply the KNN. This include, bedrooms, bathrooms, new_build, size, lat & lon



In [1179]:
knn_df1 = properties_df 
del knn_df1["Postcode Area"]
del knn_df1["Sold_Date"]
del knn_df1["Local_Similar_Prices"]

In [1180]:
knn_df1 = knn_df1.dropna()

In [1181]:
knn_df1

Unnamed: 0,Latitude,Longitude,Property Type,Closest Station,Size,Bedrooms,Bathrooms,New Build,Nearest OS,Price
"Flat 131, Lauderdale Tower, Barbican, London, Greater London EC2Y 8BY",51.519886,-0.096743,1,0.050882,1087.900,3.0,2.0,0.0,0.929406,1500000
"Flat 27, Bayer House, Golden Lane Estate, London, Greater London EC1Y 0RN",51.522525,-0.095795,1,0.187232,618.800,2.0,1.0,0.0,1.180866,660000
"Flat 192, Cromwell Tower, Barbican, London, Greater London EC2Y 8DD",51.520535,-0.092780,1,0.220410,1140.984,3.0,2.0,0.0,0.919759,1620000
"Flat 336, Willoughby House, Barbican, London, Greater London EC2Y 8BL",51.519515,-0.090796,1,0.110809,890.000,2.0,2.0,0.0,0.797907,1075000
"Flat 246, Ben Jonson House, Barbican, London, Greater London EC2Y 8DL",51.520886,-0.094049,1,0.171891,729.000,1.0,1.0,0.0,0.973840,670000
...,...,...,...,...,...,...,...,...,...,...
"Flat 5, Amen Lodge, Warwick Lane, London, Greater London EC4M 7BY",51.514671,-0.100893,1,0.127141,441.000,1.0,1.0,0.0,0.734900,445000
"Flat 99, Clifford's Inn, Fetter Lane, London, Greater London EC4A 1BX",51.514670,-0.109900,1,0.251315,401.000,1.0,1.0,0.0,1.338317,475000
"Flat 141, Clifford's Inn, Fetter Lane, London, Greater London EC4A 1BY",51.514653,-0.110204,1,0.248524,418.000,1.0,1.0,0.0,1.358670,470000
"Flat 125, Clifford's Inn, Fetter Lane, London, Greater London EC4A 1BY",51.514434,-0.110411,1,0.260957,462.852,1.0,1.0,0.0,1.368437,620000


In [1192]:
# Lets do a KNN regression on this data to predict the price
x1 = knn_df1[["Latitude", "Longitude", "Property Type", "Closest Station", "Size", "Bedrooms", "Bathrooms", "New Build", "Nearest OS"]]
y1 = knn_df1["Price"]

In [1193]:
knn = KNeighborsRegressor(n_neighbors=5)
# Using a cross validation of 5 folds
y_pred1 = cross_val_predict(knn, x1, y1, cv=5)

In [1194]:
y_pred1

array([1259000.,  684999., 1703000.,  829500.,  847650.,  857400.,
        909990.,  892400.,  848800.,  921990.,  980100.,  958000.,
        908400.,  864600.,  980100.,  908400.,  847650.,  980100.,
        857000.,  492500.,  868000.,  843000.,  722600.,  868000.,
        980100.,  611199.,  796000.,  889000., 1651000.,  889000.,
       1027000.,  848800.,  894000.,  980100.,  654400., 1405000.,
        655400.,  857800., 1658600.,  894000.,  672500.,  439400.,
        721000.,  756000.,  689000.,  829000.,  862000., 1564000.,
       1564000., 1564000.,  587700., 1684800., 1624600.,  729999.,
        487500.,  961000.,  394400.,  839000.,  505700., 1695000.,
        814000., 1764800.,  862000.,  961000.,  889000.,  863000.,
        729999.,  897000.,  858000.,  961000.,  788100.,  729999.,
        706500., 1682000.,  803500.,  869150.,  845000.,  836000.,
        897000.,  609700.,  788100.,  925000., 1606600., 1184000.,
        709900.,  706499., 1623600.,  886990.,  697000.,  5976

In [1195]:
# Evaluation of the predictions
print(r2_score(y1, y_pred1))
print(mean_absolute_error(y1, y_pred1))

0.5331607027261885
184037.38604651162
