In [1256]:
# import the necessary libraries
from bs4 import BeautifulSoup
import requests
import json
import re
import datetime
import easyocr
import cv2
import geopy.distance

In [1257]:
# this function handles the situation when the html is embedded within javascript
def javascript_html_parse(source_arg):
	"""Return the JSON object as a dictionary"""
	start = "<script>window.__PRELOADED_STATE__ = "
	end = "</script"
	x = source_arg[source_arg.find(start)+len(start):]
	x = x[:x.find(end)]
	x = json.loads(x)
	return x

In [851]:
# Obtain all of the URL's that need to be scraped from 
# This is the first URL we start at, here we are collecting all of the URL's to be scraped from into 
# local area URL's 
url = "https://www.rightmove.co.uk/house-prices-in-London.html"
source = requests.get(url).text
soup = BeautifulSoup(source, 'lxml')
b = soup.find_all("a", class_="head")
london_urls = []
for i in b:
	london_urls.append(i["href"])

borough_urls = []
for url in london_urls:
	source = requests.get(url).text
	soup = BeautifulSoup(source, 'lxml')
	b = soup.find_all("a", class_="head")
	for i in b:
		borough_urls.append(i["href"])

local_area_urls = []
for url in borough_urls:
	source = requests.get(url).text
	soup = BeautifulSoup(source, 'lxml')
	b = soup.find_all("a", class_="head")
	for i in b:
		local_area_urls.append(i["href"])	

In [1473]:
# iterate over all of the URL's and scrape from them, and store the scraped data into a variable called houses
m_values = ["square meters", "square metres", "square meter","square metre","square m","square mt","sqmt","sq mt","sq.mt", "sq. mt", "sqm", "sq m", "sq.m", "sq. m", "sq: m", "sq:m", "sq :m", "sq:  m", "sq : m", "meters2", "metres2", "meter2", "metre2", "mt2", "m2"]
f_values = ['square feet','square ft','square f','sqft','sq ft','sq.ft','sq. ft','sqf','sq f','sq.f','sq. f','sq: f','sq:f','sq :f','sq : f','sq :f','feet2','ft2','f2']
houses = {}
for area_url in local_area_urls[:20]:

    # while loop that obtains the address, property type, last sold price, last sold date, and its url (if it has one) from each house. 
    # Stores each value into a list. This is from the web page which shows different houses from a particular area.

    page_number = 1
    addresses, property_type, last_sold_price, last_sold_date, url = [], [], [], [], []
    # obtain the number of pages to use in the while loop condition
    source = requests.get(area_url).text
    a = javascript_html_parse(source)
    num_pages = a["pagination"]["last"]
    
    while page_number <= num_pages:

        if page_number == 1:
            source = requests.get(area_url).text
        else:
            source = requests.get(area_url+f"?page={page_number}").text

        # Scrape the address, property type, the price & date of the last sale, and the URL.
        x = javascript_html_parse(source)
        for house in x["results"]["properties"]:
            addresses.append(house["address"])
            property_type.append(house["propertyType"])
            last_sold_price.append(house["transactions"][0]["displayPrice"])
            last_sold_date.append(house["transactions"][0]["dateSold"])
            url.append(house["detailUrl"])



        page_number += 1


    # Store the scraped data for each property into the houses dictionary
    for i in range(len(addresses)):
        houses[addresses[i]] = {"property_type":property_type[i], "price":last_sold_price[i], "date":last_sold_date[i], "url": url[i]}


    # Now loop through all of the stored URL's for each property, and scrape more data from it.

    for v in houses.values():
        if v['url'] == '':
            continue
        else:
            source = requests.get(v['url']).text
            soup = BeautifulSoup(source, 'lxml')
            script = soup.find("script", {"type":"text/javascript"}).text
            # Regex used to convert the JSON data structure into a python dict 
            # Use regex to extract json data from the script text
            json_script = re.findall(("(?s)(?<=window.PAGE_MODEL = )(.*$)"), script)[0]
            # Transforming json data within string into dictionary
            json_dict = json.loads(json_script)

            # The bullet pointed features 
            try:
                features = json_dict["soldPropertyData"]["property"]["keyFeatures"]
            except TypeError: 
                pass
            else:
                v["features"] = features
            # The floorplan URL
            try:
                floorplan_url = json_dict["soldPropertyData"]["property"]["floorplans"][0]['url']
            except Exception:
                # seen a TypeError and an IndexError so I'm going to capture all exceptions raised
                pass
            else:
                v["floorplan_url"] = floorplan_url
            # Scrape the pictures of the house ### commented this out for now, as I'm not using the image data
            #house_image_urls = []
            #for image in json_dict["soldPropertyData"]["property"]["images"]:
            #	house_image_urls.append(image['url'])
            #v["house_image_urls"] = house_image_urls
            # location of the house
            try:
                latitude = json_dict["soldPropertyData"]["property"]["location"]["latitude"]
                longitude = json_dict["soldPropertyData"]["property"]["location"]["longitude"]
            except TypeError:
                pass
            else:
                v["latitude"] = latitude
                v["longitude"] = longitude
            # proximity to the stations 
            station_names, distances = [], []
            try:
                for station in json_dict["soldPropertyData"]["property"]["nearestStations"]:
                    station_names.append(station["name"])
                    distances.append(station["distance"])
            except TypeError:
                station_proximities = {}
            else:
                station_proximities = {station:distance for station, distance in zip(station_names, distances)}
            v["station_proximities"] = station_proximities
            # size of the property
            units, sizes = [], []
            try:
                for i in json_dict["soldPropertyData"]["property"]["sizings"]:
                    units.append(i['unit'])
                    sizes.append(i['maximumSize'])
            except TypeError:
                dimensions = {}
            else:
                dimensions = {unit:size for unit, size in zip(units, sizes)}
            #for label, size in dimensions.items():
             #   if label not in m_values and label not in f_values:
              #      del dimensions[label]

            # still haven't delt with the potential of having both a sqm and sqf, or just a sqf

            v["property_size"] = dimensions
            
            # JUST ABOVE HERE IS WHERE I COULD MAKE IT SO ONLY SQ FT IS OBTAINED - Still needs to be finished off

            # number of bedrooms and bathrooms
            try:
                bedrooms = json_dict["soldPropertyData"]["property"]['bedrooms']
            except TypeError:
                bedrooms = ''
            try:
                bathrooms = json_dict["soldPropertyData"]["property"]['bathrooms']
            except TypeError:
                bathrooms = ''
                
            v["bedrooms"] = bedrooms
            v["bathrooms"] = bathrooms

            # Whether the property is a new build or not
            try:
                new_build = json_dict["soldPropertyData"]["transactions"][0]['newBuild']
            except TypeError:
                pass
            else:
                v["new_build"] = new_build

In [1474]:
# change price from a string into an int
def price_int(x):
    """Return an integer from a price string"""
    a = x[1:]
    b = a.replace(",", "")
    b = int(b)
    return b

# run the above function for all of the stored prices
for features in houses.values():
    y = features['price']
    z = price_int(y)
    features['price'] = z

In [1475]:
def size_floorplan_2(floorplan):
    """Return the size data (in sq ft) from the floorplan"""
    reader = easyocr.Reader(['en']) # GPU=false taken out for now
    result = reader.readtext(floorplan)
    
    scraped_text = ''
    for i in result:
        scraped_text += i[1]
        scraped_text += ","
    text_list = scraped_text.split(",")
    
    m_patterns = re.compile(r"[+-]? *((?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?)\s*(square meters|square metres|square meter|square metre|square m|square mt|sqmt|sq mt|sq.mt|sq. mt|sqm|sq m|sq.m|sq. m|sq: m|sq:m|sq :m|sq : m|sq :m|meters2|metres2|meter2|metre2|mt2|m2)", re.IGNORECASE)
    f_patterns = re.compile(r"[+-]? *((?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?)\s*(square feet|square ft|square f|sqft|sq ft|sq.ft|sq. ft|sqf|sq f|sq.f|sq. f|sq: f|sq:f|sq :f|sq : f|sq :f|feet2|ft2|f2)", re.IGNORECASE)
    house_size = []
    for x in text_list:
        for match in re.finditer(m_patterns, x):
            #print('%s : %s' % (match.groups()))
            house_size.append(list(match.groups()))
        for match in re.finditer(f_patterns, x):
            #print('%s : %s' % (match.groups())) 
            house_size.append(list(match.groups()))
    for a in house_size:
        try:
            a[0] = int(a[0])
        except ValueError:
            a[0] = float(a[0])
            
    # then here you could find the
    house_size = dict((house_size))
    #print(house_size)
    house_size_swap = {}
    # code to swap keys and values
    for k, v in house_size.items():
        house_size_swap[v] = k

    house_size = house_size_swap
    
    # Code to take the largest returned value from the floorplan numbers
    amount = max(house_size.values())
    for i in house_size.keys():
        if house_size[i] == amount:
            size_label = i.lower()
    
    # Remove punctuation from the size labels       
    if "," in size_label:
        size_label = size_label.replace(",", "")
    if ":" in size_label:
        size_label = size_label.replace(":", "")
    if "." in size_label:
        size_label = size_label.replace(".", "")
    
    largest_size = {size_label:amount}
    
    
            
    # need to add values with colons, :
    # alternatively, you could just remove the colon. This would also nehate this issue

    metre_values = ["square meters", "square metres", "square meter", "square metre", "square m", "square mt", "sqmt", "sq mt", "sq mt", "sq mt", "sqm", "sq m", "sq m", "sq m", "meters2", "metres2", "meter2", "metre2", "mt2", "m2"]
    feet_values = ["square feet","square ft","square f","sqft","sq ft","sq  ft","sq. ft","sqf","sq f","sq f","sq f","feet2", "ft2","f2"]


    if size_label in metre_values:
        # convert the size into square ft if sq m is obtained
        largest_size["sq ft"] = amount*10.764
        del largest_size[size_label]
    elif size_label in feet_values:
        largest_size["sq ft"] = amount
    

        
    return largest_size["sq ft"]

In [1476]:
# function that returns the sq foot value from the dictionaries that are scraped from the sold houses
def sq_foot_return(x):
    values_m = ["square meters", "sq meters", "square metres", "sq metres", "square meter", "sq meter", "square metre", "sq metre","square m", "square mt", "sqmt", "sq mt", "sq.mt", "sq. mt", "sqm", "sq m", "sq.m", "sq. m", "sq: m", "sq:m", "sq :m", "sq : m", "sq :m", "meters2", "metres2", "meter2", "metre2", "mt2", "m2"]
    values_f = ["square feet", "square ft", "square f", "sqft", "sq ft", "sq.ft", "sq. ft", "sqf", "sq f", "sq.f", "sq. f", "sq: f", "sq:f", "sq :f", "sq : f", "sq :f", "feet2", "ft2", "f2"]
    for k, v in x.items():
        if k.strip() in values_f:
            return x[k]
        if k.strip() in values_m:
            return x[k]*10.764
    return "Na"

In [1477]:
# here is where the code should go to create this outstanding schools list
outstanding_schools = [[51.4974948, -0.1356583],
 [51.5202607, -0.0293396],
 [51.4935082, -0.1178424],
 [51.6569225, -0.1949252],
 [51.5202607, -0.0293396],
 [51.5345448, -0.2043853],
 [51.5202607, -0.0293396],
 [51.458373, -0.1891356],
 [51.4990156, -0.22915],
 [51.6569225, -0.1949252],
 [51.5906113, -0.1109709],
 [51.5202607, -0.0293396],
 [51.538621, -0.1028346],
 [51.5202607, -0.0293396],
 [51.5672808, -0.2710568],
 [51.6569225, -0.1949252],
 [51.5132537, -0.3043136],
 [51.5255162, 0.0352163],
 [51.5202607, -0.0293396],
 [51.4609218, -0.373149],
 [51.6569225, -0.1949252],
 [51.4990805, -0.1938253],
 [51.5255162, 0.0352163],
 [51.5255162, 0.0352163],
 [51.4935082, -0.1178424],
 [51.5436387, -0.0553621],
 [51.5202607, -0.0293396],
 [51.5436387, -0.0553621],
 [51.4935082, -0.1178424],
 [51.458373, -0.1891356],
 [51.4990156, -0.22915],
 [51.5886383, -0.0117625],
 [51.5390261, -0.1425516],
 [51.5132537, -0.3043136],
 [51.5906113, -0.1109709],
 [51.4990805, -0.1938253],
 [51.5886121, 0.0823982],
 [51.5436387, -0.0553621],
 [51.3769529, -0.0956895],
 [51.5886383, -0.0117625],
 [51.5132537, -0.3043136],
 [51.5906113, -0.1109709],
 [51.458373, -0.1891356],
 [51.5202607, -0.0293396],
 [51.4990805, -0.1938253],
 [51.5202607, -0.0293396],
 [51.5672808, -0.2710568],
 [51.5886383, -0.0117625],
 [51.4990156, -0.22915],
 [51.5132537, -0.3043136],
 [51.5202607, -0.0293396],
 [51.4609218, -0.373149],
 [51.4974948, -0.1356583],
 [51.5906113, -0.1109709],
 [51.4990805, -0.1938253],
 [51.4933675, 0.0098214],
 [51.5886121, 0.0823982],
 [51.5886121, 0.0823982],
 [51.502781, -0.087738],
 [51.4935082, -0.1178424],
 [51.5540666, 0.134017],
 [51.4935082, -0.1178424],
 [51.5886121, 0.0823982],
 [51.5886383, -0.0117625],
 [51.4611509, -0.0073177],
 [51.5390261, -0.1425516],
 [51.4609218, -0.373149],
 [51.5390261, -0.1425516],
 [51.4990805, -0.1938253],
 [51.5132537, -0.3043136],
 [51.4990805, -0.1938253],
 [51.4935082, -0.1178424],
 [51.5436387, -0.0553621],
 [51.5886121, 0.0823982],
 [51.5672808, -0.2710568],
 [51.5436387, -0.0553621],
 [51.5886121, 0.0823982],
 [51.5202607, -0.0293396],
 [51.5390261, -0.1425516],
 [51.502781, -0.087738],
 [51.4609218, -0.373149],
 [51.5436387, -0.0553621],
 [51.4974948, -0.1356583],
 [51.4609218, -0.373149],
 [51.538621, -0.1028346],
 [51.5123443, -0.0909852],
 [51.538621, -0.1028346],
 [51.5672808, -0.2710568],
 [51.5436387, -0.0553621],
 [51.5255162, 0.0352163],
 [51.5886383, -0.0117625],
 [51.538621, -0.1028346],
 [51.4935082, -0.1178424],
 [51.4990156, -0.22915],
 [51.538621, -0.1028346],
 [51.5906113, -0.1109709],
 [51.5906113, -0.1109709],
 [51.5906113, -0.1109709],
 [51.502781, -0.087738],
 [51.4935082, -0.1178424],
 [51.5202607, -0.0293396],
 [51.502781, -0.087738],
 [51.5202607, -0.0293396],
 [51.5540666, 0.134017],
 [51.4990156, -0.22915],
 [51.4609218, -0.373149],
 [51.4935082, -0.1178424],
 [51.5132537, -0.3043136],
 [51.6569225, -0.1949252],
 [51.4611509, -0.0073177],
 [51.4935082, -0.1178424],
 [51.538621, -0.1028346],
 [51.5202607, -0.0293396],
 [51.4933675, 0.0098214],
 [51.406025, 0.013156],
 [51.5886383, -0.0117625],
 [51.458373, -0.1891356],
 [51.5886383, -0.0117625],
 [51.502781, -0.087738],
 [51.4990805, -0.1938253],
 [51.4097742, -0.2108084],
 [51.4933675, 0.0098214],
 [51.6522994, -0.0807119],
 [51.5672808, -0.2710568],
 [51.502781, -0.087738],
 [51.4974948, -0.1356583],
 [51.580559, -0.341995],
 [51.5436387, -0.0553621],
 [51.4609218, -0.373149],
 [51.6569225, -0.1949252],
 [51.4611509, -0.0073177],
 [51.6522994, -0.0807119],
 [51.5132537, -0.3043136],
 [51.6522994, -0.0807119],
 [51.5132537, -0.3043136],
 [51.458373, -0.1891356],
 [51.4097742, -0.2108084],
 [51.6522994, -0.0807119],
 [51.5436387, -0.0553621],
 [51.4611509, -0.0073177],
 [51.502781, -0.087738],
 [51.6522994, -0.0807119],
 [51.4974948, -0.1356583],
 [51.458373, -0.1891356],
 [51.502781, -0.087738],
 [51.458373, -0.1891356],
 [51.5540666, 0.134017],
 [51.461311, -0.303742],
 [51.502781, -0.087738],
 [51.5672808, -0.2710568],
 [51.461311, -0.303742],
 [51.5906113, -0.1109709],
 [51.5886121, 0.0823982],
 [51.4990156, -0.22915],
 [51.458373, -0.1891356],
 [51.5886383, -0.0117625],
 [51.5886121, 0.0823982],
 [51.5672808, -0.2710568],
 [51.4609218, -0.373149],
 [51.4935082, -0.1178424],
 [51.577924, 0.2120829],
 [51.4990805, -0.1938253],
 [51.5886383, -0.0117625],
 [51.461311, -0.303742],
 [51.4935082, -0.1178424],
 [51.4990156, -0.22915],
 [51.5255162, 0.0352163],
 [51.502781, -0.087738],
 [51.5255162, 0.0352163],
 [51.5132537, -0.3043136],
 [51.5255162, 0.0352163],
 [51.4609218, -0.373149],
 [51.461311, -0.303742],
 [51.538621, -0.1028346],
 [51.3769529, -0.0956895],
 [51.6569225, -0.1949252],
 [51.580559, -0.341995],
 [51.5436387, -0.0553621],
 [51.5255162, 0.0352163],
 [51.4990805, -0.1938253],
 [51.439933, 0.154327],
 [51.4935082, -0.1178424],
 [51.5436387, -0.0553621],
 [51.3769529, -0.0956895],
 [51.3769529, -0.0956895],
 [51.5436387, -0.0553621],
 [51.439933, 0.154327],
 [51.6569225, -0.1949252],
 [51.4609218, -0.373149],
 [51.461311, -0.303742],
 [51.406025, 0.013156],
 [51.5351832, -0.4481378],
 [51.406025, 0.013156],
 [51.4097742, -0.2108084],
 [51.6522994, -0.0807119],
 [51.3769529, -0.0956895],
 [51.4609218, -0.373149],
 [51.538621, -0.1028346],
 [51.5886121, 0.0823982],
 [51.41233, -0.300689],
 [51.5436387, -0.0553621],
 [51.406025, 0.013156],
 [51.3769529, -0.0956895],
 [51.502781, -0.087738],
 [51.580559, -0.341995],
 [51.439933, 0.154327],
 [51.5906113, -0.1109709],
 [51.5351832, -0.4481378],
 [51.5255162, 0.0352163],
 [51.4974948, -0.1356583],
 [51.4097742, -0.2108084],
 [51.5436387, -0.0553621],
 [51.5202607, -0.0293396],
 [51.5390261, -0.1425516],
 [51.6569225, -0.1949252],
 [51.406025, 0.013156],
 [51.4990156, -0.22915],
 [51.3769529, -0.0956895],
 [51.4097742, -0.2108084],
 [51.4935082, -0.1178424],
 [51.5540666, 0.134017],
 [51.4974948, -0.1356583],
 [51.4933675, 0.0098214],
 [51.5540666, 0.134017],
 [51.3769529, -0.0956895],
 [51.5351832, -0.4481378],
 [51.4990156, -0.22915],
 [51.4935082, -0.1178424],
 [51.4611509, -0.0073177],
 [51.4611509, -0.0073177],
 [51.5886121, 0.0823982],
 [51.5906113, -0.1109709],
 [51.6522994, -0.0807119],
 [51.6522994, -0.0807119],
 [51.3769529, -0.0956895],
 [51.458373, -0.1891356],
 [51.4933675, 0.0098214],
 [51.406025, 0.013156],
 [51.4990156, -0.22915],
 [51.4097742, -0.2108084],
 [51.4935082, -0.1178424],
 [51.4611509, -0.0073177],
 [51.502781, -0.087738],
 [51.577924, 0.2120829],
 [51.406025, 0.013156],
 [51.5906113, -0.1109709],
 [51.461311, -0.303742],
 [51.406025, 0.013156],
 [51.439933, 0.154327],
 [51.3769529, -0.0956895],
 [51.461311, -0.303742],
 [51.5132537, -0.3043136],
 [51.4611509, -0.0073177],
 [51.5436387, -0.0553621],
 [51.3769529, -0.0956895],
 [51.461311, -0.303742],
 [51.5202607, -0.0293396],
 [51.4935082, -0.1178424],
 [51.4611509, -0.0073177],
 [51.4990805, -0.1938253],
 [51.6522994, -0.0807119],
 [51.5886121, 0.0823982],
 [51.439933, 0.154327],
 [51.4935082, -0.1178424],
 [51.5906113, -0.1109709],
 [51.41233, -0.300689],
 [51.5906113, -0.1109709],
 [51.4611509, -0.0073177],
 [51.4611509, -0.0073177],
 [51.439933, 0.154327],
 [51.406025, 0.013156],
 [51.4609218, -0.373149],
 [51.4611509, -0.0073177],
 [51.5906113, -0.1109709],
 [51.580559, -0.341995],
 [51.3769529, -0.0956895],
 [51.3769529, -0.0956895],
 [51.4097742, -0.2108084],
 [51.5886121, 0.0823982],
 [51.461311, -0.303742],
 [51.4935082, -0.1178424],
 [51.5132537, -0.3043136],
 [51.6569225, -0.1949252],
 [51.4097742, -0.2108084],
 [51.5906113, -0.1109709],
 [51.6569225, -0.1949252],
 [51.5886383, -0.0117625],
 [51.461311, -0.303742],
 [51.406025, 0.013156],
 [51.461311, -0.303742],
 [51.4609218, -0.373149],
 [51.577924, 0.2120829],
 [51.5351832, -0.4481378],
 [51.439933, 0.154327],
 [51.406025, 0.013156],
 [51.5351832, -0.4481378],
 [51.406025, 0.013156],
 [51.5906113, -0.1109709],
 [51.5886121, 0.0823982],
 [51.5672808, -0.2710568],
 [51.439933, 0.154327],
 [51.5436387, -0.0553621],
 [51.3769529, -0.0956895],
 [51.439933, 0.154327],
 [51.406025, 0.013156],
 [51.5886121, 0.0823982],
 [51.439933, 0.154327],
 [51.5351832, -0.4481378],
 [51.4609218, -0.373149],
 [51.5540666, 0.134017],
 [51.6569225, -0.1949252],
 [51.3769529, -0.0956895],
 [51.439933, 0.154327],
 [51.6569225, -0.1949252],
 [51.4609218, -0.373149],
 [51.4990805, -0.1938253],
 [51.3769529, -0.0956895],
 [51.580559, -0.341995],
 [51.4974948, -0.1356583],
 [51.3769529, -0.0956895],
 [51.4990156, -0.22915],
 [51.5351832, -0.4481378],
 [51.5202607, -0.0293396],
 [51.458373, -0.1891356],
 [51.461311, -0.303742],
 [51.5906113, -0.1109709],
 [51.5886121, 0.0823982],
 [51.4990156, -0.22915],
 [51.5132537, -0.3043136],
 [51.4609218, -0.373149],
 [51.5436387, -0.0553621],
 [51.5390261, -0.1425516],
 [51.5436387, -0.0553621],
 [51.406025, 0.013156],
 [51.502781, -0.087738],
 [51.458373, -0.1891356],
 [51.406025, 0.013156],
 [51.461311, -0.303742],
 [51.461311, -0.303742],
 [51.5436387, -0.0553621],
 [51.5351832, -0.4481378],
 [51.5351832, -0.4481378],
 [51.406025, 0.013156],
 [51.4611509, -0.0073177],
 [51.577924, 0.2120829],
 [51.3769529, -0.0956895],
 [51.4974948, -0.1356583],
 [51.5390261, -0.1425516],
 [51.458373, -0.1891356],
 [51.4974948, -0.1356583],
 [51.458373, -0.1891356],
 [51.5255162, 0.0352163],
 [51.4974948, -0.1356583],
 [51.6522994, -0.0807119],
 [51.4990156, -0.22915],
 [51.4990805, -0.1938253],
 [51.5390261, -0.1425516],
 [51.4990805, -0.1938253],
 [51.5202607, -0.0293396],
 [51.5886383, -0.0117625],
 [51.3769529, -0.0956895],
 [51.461311, -0.303742],
 [51.5202607, -0.0293396],
 [51.5540666, 0.134017],
 [51.406025, 0.013156],
 [51.461311, -0.303742],
 [51.4097742, -0.2108084],
 [51.5390261, -0.1425516],
 [51.4990156, -0.22915],
 [51.458373, -0.1891356],
 [51.5123443, -0.0909852],
 [51.461311, -0.303742],
 [51.458373, -0.1891356],
 [51.4097742, -0.2108084],
 [51.538621, -0.1028346],
 [51.5886121, 0.0823982],
 [51.4974948, -0.1356583],
 [51.458373, -0.1891356],
 [51.6569225, -0.1949252],
 [51.5132537, -0.3043136],
 [51.5436387, -0.0553621],
 [51.6569225, -0.1949252],
 [51.4935082, -0.1178424],
 [51.580559, -0.341995],
 [51.4935082, -0.1178424],
 [51.538621, -0.1028346],
 [51.6522994, -0.0807119],
 [51.5672808, -0.2710568],
 [51.5255162, 0.0352163],
 [51.4990805, -0.1938253],
 [51.4974948, -0.1356583],
 [51.4990156, -0.22915],
 [51.5672808, -0.2710568],
 [51.6522994, -0.0807119],
 [51.538621, -0.1028346],
 [51.5906113, -0.1109709],
 [51.6522994, -0.0807119],
 [51.4974948, -0.1356583],
 [51.5132537, -0.3043136],
 [51.5202607, -0.0293396],
 [51.5132537, -0.3043136],
 [51.5390261, -0.1425516],
 [51.4990805, -0.1938253],
 [51.5351832, -0.4481378],
 [51.5390261, -0.1425516],
 [51.580559, -0.341995],
 [51.458373, -0.1891356],
 [51.538621, -0.1028346],
 [51.4935082, -0.1178424],
 [51.580559, -0.341995],
 [51.5351832, -0.4481378],
 [51.502781, -0.087738],
 [51.4990156, -0.22915],
 [51.5886121, 0.0823982],
 [51.6569225, -0.1949252],
 [51.4990805, -0.1938253],
 [51.458373, -0.1891356],
 [51.4990156, -0.22915],
 [51.5132537, -0.3043136],
 [51.3769529, -0.0956895],
 [51.458373, -0.1891356],
 [51.5351832, -0.4481378],
 [51.4990805, -0.1938253],
 [51.502781, -0.087738],
 [51.458373, -0.1891356],
 [51.6522994, -0.0807119],
 [51.6569225, -0.1949252],
 [51.5672808, -0.2710568],
 [51.538621, -0.1028346],
 [51.5202607, -0.0293396],
 [51.4990805, -0.1938253],
 [51.6522994, -0.0807119],
 [51.4974948, -0.1356583],
 [51.4611509, -0.0073177],
 [51.580559, -0.341995],
 [51.577924, 0.2120829],
 [51.3769529, -0.0956895],
 [51.6569225, -0.1949252],
 [51.458373, -0.1891356],
 [51.502781, -0.087738],
 [51.41233, -0.300689],
 [51.5540666, 0.134017],
 [51.4609218, -0.373149],
 [51.538621, -0.1028346],
 [51.5351832, -0.4481378],
 [51.6522994, -0.0807119],
 [51.5390261, -0.1425516],
 [51.5132537, -0.3043136],
 [51.502781, -0.087738],
 [51.5132537, -0.3043136],
 [51.5351832, -0.4481378],
 [51.5202607, -0.0293396],
 [51.4990805, -0.1938253],
 [51.5672808, -0.2710568],
 [51.3769529, -0.0956895],
 [51.4990805, -0.1938253],
 [51.4990156, -0.22915],
 [51.4974948, -0.1356583],
 [51.5886121, 0.0823982],
 [51.461311, -0.303742],
 [51.5351832, -0.4481378],
 [51.5390261, -0.1425516],
 [51.5255162, 0.0352163],
 [51.538621, -0.1028346],
 [51.4097742, -0.2108084],
 [51.5886121, 0.0823982],
 [51.3769529, -0.0956895],
 [51.5390261, -0.1425516],
 [51.577924, 0.2120829],
 [51.5886383, -0.0117625],
 [51.406025, 0.013156],
 [51.4990805, -0.1938253],
 [51.3769529, -0.0956895],
 [51.6522994, -0.0807119],
 [51.4935082, -0.1178424],
 [51.461311, -0.303742],
 [51.580559, -0.341995],
 [51.3769529, -0.0956895],
 [51.580559, -0.341995],
 [51.406025, 0.013156],
 [51.6522994, -0.0807119],
 [51.4990156, -0.22915],
 [51.4611509, -0.0073177],
 [51.4933675, 0.0098214],
 [51.4974948, -0.1356583],
 [51.6522994, -0.0807119],
 [51.458373, -0.1891356],
 [51.4990156, -0.22915],
 [51.5886383, -0.0117625],
 [51.439933, 0.154327],
 [51.406025, 0.013156],
 [51.4609218, -0.373149],
 [51.406025, 0.013156],
 [51.458373, -0.1891356],
 [51.4609218, -0.373149],
 [51.5672808, -0.2710568],
 [51.5672808, -0.2710568],
 [51.4990805, -0.1938253],
 [51.5202607, -0.0293396],
 [51.5202607, -0.0293396],
 [51.580559, -0.341995],
 [51.5436387, -0.0553621],
 [51.538621, -0.1028346],
 [51.458373, -0.1891356],
 [51.458373, -0.1891356],
 [51.580559, -0.341995],
 [51.5390261, -0.1425516],
 [51.3769529, -0.0956895],
 [51.4609218, -0.373149],
 [51.458373, -0.1891356],
 [51.577924, 0.2120829],
 [51.5906113, -0.1109709],
 [51.406025, 0.013156],
 [51.4611509, -0.0073177],
 [51.4609218, -0.373149],
 [51.458373, -0.1891356],
 [51.458373, -0.1891356],
 [51.461311, -0.303742],
 [51.5886383, -0.0117625],
 [51.5390261, -0.1425516],
 [51.458373, -0.1891356],
 [51.580559, -0.341995],
 [51.5255162, 0.0352163],
 [51.5906113, -0.1109709],
 [51.4990156, -0.22915],
 [51.4990805, -0.1938253],
 [51.5436387, -0.0553621],
 [51.4609218, -0.373149],
 [51.4611509, -0.0073177],
 [51.3769529, -0.0956895],
 [51.4933675, 0.0098214],
 [51.577924, 0.2120829],
 [51.6522994, -0.0807119],
 [51.458373, -0.1891356],
 [51.4933675, 0.0098214],
 [51.4611509, -0.0073177],
 [51.4933675, 0.0098214],
 [51.4609218, -0.373149],
 [51.406025, 0.013156],
 [51.4935082, -0.1178424],
 [51.5436387, -0.0553621],
 [51.6522994, -0.0807119],
 [51.4933675, 0.0098214],
 [51.4974948, -0.1356583],
 [51.5906113, -0.1109709],
 [51.5351832, -0.4481378],
 [51.5436387, -0.0553621],
 [51.4990805, -0.1938253],
 [51.6569225, -0.1949252],
 [51.461311, -0.303742],
 [51.6569225, -0.1949252],
 [51.5436387, -0.0553621],
 [51.4609218, -0.373149],
 [51.4990805, -0.1938253],
 [51.5886121, 0.0823982],
 [51.5351832, -0.4481378],
 [51.6522994, -0.0807119],
 [51.5255162, 0.0352163],
 [51.461311, -0.303742],
 [51.4974948, -0.1356583],
 [51.502781, -0.087738],
 [51.5436387, -0.0553621],
 [51.5906113, -0.1109709],
 [51.580559, -0.341995],
 [51.5202607, -0.0293396],
 [51.5886121, 0.0823982],
 [51.439933, 0.154327],
 [51.5436387, -0.0553621],
 [51.458373, -0.1891356],
 [51.5255162, 0.0352163],
 [51.6569225, -0.1949252],
 [51.4933675, 0.0098214],
 [51.580559, -0.341995],
 [51.4974948, -0.1356583],
 [51.406025, 0.013156],
 [51.4611509, -0.0073177],
 [51.4974948, -0.1356583],
 [51.502781, -0.087738],
 [51.5132537, -0.3043136],
 [51.5132537, -0.3043136],
 [51.5436387, -0.0553621],
 [51.580559, -0.341995],
 [51.439933, 0.154327],
 [51.4611509, -0.0073177],
 [51.4933675, 0.0098214],
 [51.458373, -0.1891356],
 [51.4097742, -0.2108084],
 [51.5351832, -0.4481378],
 [51.406025, 0.013156],
 [51.580559, -0.341995],
 [51.4935082, -0.1178424],
 [51.461311, -0.303742],
 [51.439933, 0.154327],
 [51.3769529, -0.0956895],
 [51.4935082, -0.1178424],
 [51.458373, -0.1891356],
 [51.439933, 0.154327],
 [51.580559, -0.341995],
 [51.4933675, 0.0098214],
 [51.461311, -0.303742],
 [51.3769529, -0.0956895],
 [51.577924, 0.2120829],
 [51.5436387, -0.0553621],
 [51.406025, 0.013156],
 [51.5132537, -0.3043136],
 [51.4974948, -0.1356583],
 [51.5886383, -0.0117625],
 [51.4990156, -0.22915],
 [51.406025, 0.013156],
 [51.3769529, -0.0956895],
 [51.4609218, -0.373149],
 [51.3769529, -0.0956895],
 [51.538621, -0.1028346],
 [51.5436387, -0.0553621],
 [51.538621, -0.1028346],
 [51.439933, 0.154327],
 [51.458373, -0.1891356],
 [51.3769529, -0.0956895],
 [51.5672808, -0.2710568],
 [51.4935082, -0.1178424],
 [51.41233, -0.300689],
 [51.577924, 0.2120829],
 [51.6569225, -0.1949252],
 [51.5202607, -0.0293396],
 [51.6569225, -0.1949252],
 [51.41233, -0.300689],
 [51.3769529, -0.0956895],
 [51.458373, -0.1891356],
 [51.5132537, -0.3043136],
 [51.5436387, -0.0553621],
 [51.5906113, -0.1109709],
 [51.4990805, -0.1938253],
 [51.4609218, -0.373149],
 [51.5886121, 0.0823982],
 [51.458373, -0.1891356],
 [51.5132537, -0.3043136],
 [51.458373, -0.1891356],
 [51.4611509, -0.0073177],
 [51.461311, -0.303742],
 [51.4933675, 0.0098214],
 [51.4097742, -0.2108084],
 [51.5436387, -0.0553621],
 [51.577924, 0.2120829],
 [51.4990156, -0.22915],
 [51.406025, 0.013156],
 [51.580559, -0.341995],
 [51.406025, 0.013156],
 [51.4990156, -0.22915],
 [51.4990156, -0.22915],
 [51.458373, -0.1891356],
 [51.5123443, -0.0909852],
 [51.4990156, -0.22915],
 [51.5906113, -0.1109709],
 [51.3769529, -0.0956895],
 [51.580559, -0.341995],
 [51.461311, -0.303742],
 [51.406025, 0.013156],
 [51.3769529, -0.0956895],
 [51.4097742, -0.2108084],
 [51.3769529, -0.0956895],
 [51.406025, 0.013156],
 [51.461311, -0.303742],
 [51.5886121, 0.0823982],
 [51.4609218, -0.373149],
 [51.3769529, -0.0956895],
 [51.538621, -0.1028346],
 [51.6569225, -0.1949252],
 [51.5351832, -0.4481378],
 [51.6569225, -0.1949252],
 [51.4933675, 0.0098214],
 [51.458373, -0.1891356],
 [51.4935082, -0.1178424],
 [51.4974948, -0.1356583],
 [51.4609218, -0.373149],
 [51.6569225, -0.1949252],
 [51.406025, 0.013156],
 [51.458373, -0.1891356],
 [51.4609218, -0.373149],
 [51.458373, -0.1891356],
 [51.461311, -0.303742],
 [51.461311, -0.303742],
 [51.5351832, -0.4481378],
 [51.5540666, 0.134017],
 [51.439933, 0.154327],
 [51.406025, 0.013156],
 [51.4611509, -0.0073177],
 [51.458373, -0.1891356],
 [51.5672808, -0.2710568],
 [51.458373, -0.1891356],
 [51.4974948, -0.1356583],
 [51.4974948, -0.1356583],
 [51.5906113, -0.1109709],
 [51.5906113, -0.1109709],
 [51.5540666, 0.134017],
 [51.5390261, -0.1425516],
 [51.4990805, -0.1938253],
 [51.5390261, -0.1425516],
 [51.4609218, -0.373149],
 [51.4990156, -0.22915],
 [51.6522994, -0.0807119],
 [51.5672808, -0.2710568],
 [51.5436387, -0.0553621],
 [51.502781, -0.087738],
 [51.461311, -0.303742],
 [51.5436387, -0.0553621],
 [51.5351832, -0.4481378],
 [51.5886121, 0.0823982],
 [51.580559, -0.341995],
 [51.4097742, -0.2108084],
 [51.5886383, -0.0117625],
 [51.4611509, -0.0073177],
 [51.577924, 0.2120829],
 [51.5132537, -0.3043136],
 [51.5390261, -0.1425516],
 [51.5132537, -0.3043136],
 [51.502781, -0.087738],
 [51.538621, -0.1028346],
 [51.4097742, -0.2108084],
 [51.5132537, -0.3043136],
 [51.461311, -0.303742],
 [51.5886383, -0.0117625],
 [51.6569225, -0.1949252],
 [51.41233, -0.300689],
 [51.577924, 0.2120829],
 [51.3769529, -0.0956895],
 [51.461311, -0.303742],
 [51.458373, -0.1891356],
 [51.5202607, -0.0293396],
 [51.502781, -0.087738],
 [51.577924, 0.2120829],
 [51.5906113, -0.1109709],
 [51.4609218, -0.373149],
 [51.4609218, -0.373149],
 [51.5540666, 0.134017],
 [51.577924, 0.2120829],
 [51.5132537, -0.3043136],
 [51.5436387, -0.0553621],
 [51.4933675, 0.0098214],
 [51.5886383, -0.0117625],
 [51.406025, 0.013156],
 [51.439933, 0.154327],
 [51.3769529, -0.0956895],
 [51.5202607, -0.0293396],
 [51.461311, -0.303742],
 [51.4974948, -0.1356583],
 [51.461311, -0.303742],
 [51.5886383, -0.0117625],
 [51.458373, -0.1891356],
 [51.577924, 0.2120829],
 [51.458373, -0.1891356],
 [51.5886121, 0.0823982],
 [51.4097742, -0.2108084],
 [51.458373, -0.1891356],
 [51.458373, -0.1891356],
 [51.4611509, -0.0073177],
 [51.458373, -0.1891356],
 [51.577924, 0.2120829],
 [51.4935082, -0.1178424],
 [51.406025, 0.013156],
 [51.458373, -0.1891356],
 [51.4990805, -0.1938253],
 [51.6569225, -0.1949252],
 [51.6569225, -0.1949252],
 [51.5672808, -0.2710568],
 [51.5132537, -0.3043136],
 [51.5886383, -0.0117625],
 [51.458373, -0.1891356],
 [51.3769529, -0.0956895],
 [51.5390261, -0.1425516],
 [51.4990156, -0.22915],
 [51.3769529, -0.0956895],
 [51.5886121, 0.0823982],
 [51.406025, 0.013156],
 [51.5202607, -0.0293396],
 [51.461311, -0.303742],
 [51.580559, -0.341995],
 [51.6569225, -0.1949252],
 [51.5390261, -0.1425516],
 [51.406025, 0.013156],
 [51.5886121, 0.0823982],
 [51.538621, -0.1028346],
 [51.461311, -0.303742],
 [51.461311, -0.303742],
 [51.5886121, 0.0823982],
 [51.6569225, -0.1949252],
 [51.580559, -0.341995],
 [51.5906113, -0.1109709],
 [51.4990805, -0.1938253],
 [51.461311, -0.303742],
 [51.580559, -0.341995],
 [51.4990156, -0.22915],
 [51.5351832, -0.4481378],
 [51.5132537, -0.3043136],
 [51.5255162, 0.0352163],
 [51.5255162, 0.0352163],
 [51.538621, -0.1028346],
 [51.4609218, -0.373149],
 [51.5390261, -0.1425516],
 [51.5202607, -0.0293396],
 [51.580559, -0.341995],
 [51.458373, -0.1891356],
 [51.4609218, -0.373149],
 [51.4990156, -0.22915],
 [51.4609218, -0.373149],
 [51.4935082, -0.1178424],
 [51.5390261, -0.1425516],
 [51.461311, -0.303742],
 [51.5132537, -0.3043136],
 [51.461311, -0.303742],
 [51.3769529, -0.0956895],
 [51.5540666, 0.134017],
 [51.4933675, 0.0098214],
 [51.4609218, -0.373149],
 [51.461311, -0.303742],
 [51.5906113, -0.1109709],
 [51.5255162, 0.0352163],
 [51.580559, -0.341995],
 [51.5886121, 0.0823982],
 [51.458373, -0.1891356],
 [51.5886121, 0.0823982],
 [51.4990156, -0.22915],
 [51.6569225, -0.1949252],
 [51.5672808, -0.2710568],
 [51.5672808, -0.2710568],
 [51.538621, -0.1028346],
 [51.5540666, 0.134017],
 [51.406025, 0.013156],
 [51.461311, -0.303742],
 [51.406025, 0.013156],
 [51.5132537, -0.3043136],
 [51.406025, 0.013156],
 [51.461311, -0.303742],
 [51.4933675, 0.0098214],
 [51.5436387, -0.0553621],
 [51.4609218, -0.373149],
 [51.4609218, -0.373149],
 [51.538621, -0.1028346],
 [51.461311, -0.303742],
 [51.538621, -0.1028346],
 [51.4990156, -0.22915],
 [51.4935082, -0.1178424],
 [51.3769529, -0.0956895],
 [51.4097742, -0.2108084],
 [51.41233, -0.300689],
 [51.406025, 0.013156],
 [51.5351832, -0.4481378],
 [51.5132537, -0.3043136],
 [51.538621, -0.1028346],
 [51.41233, -0.300689],
 [51.538621, -0.1028346],
 [51.4990805, -0.1938253],
 [51.5436387, -0.0553621],
 [51.4609218, -0.373149],
 [51.538621, -0.1028346],
 [51.4974948, -0.1356583],
 [51.4935082, -0.1178424],
 [51.4609218, -0.373149],
 [51.458373, -0.1891356],
 [51.5255162, 0.0352163],
 [51.3769529, -0.0956895],
 [51.4609218, -0.373149],
 [51.4974948, -0.1356583],
 [51.538621, -0.1028346],
 [51.538621, -0.1028346],
 [51.458373, -0.1891356],
 [51.4609218, -0.373149],
 [51.461311, -0.303742],
 [51.5202607, -0.0293396],
 [51.458373, -0.1891356],
 [51.4935082, -0.1178424],
 [51.4933675, 0.0098214],
 [51.3769529, -0.0956895],
 [51.4990156, -0.22915],
 [51.5390261, -0.1425516],
 [51.4097742, -0.2108084],
 [51.4990805, -0.1938253],
 [51.4097742, -0.2108084],
 [51.5202607, -0.0293396],
 [51.580559, -0.341995],
 [51.577924, 0.2120829],
 [51.4974948, -0.1356583],
 [51.406025, 0.013156],
 [51.5886121, 0.0823982],
 [51.5436387, -0.0553621],
 [51.5351832, -0.4481378],
 [51.580559, -0.341995],
 [51.4974948, -0.1356583],
 [51.502781, -0.087738],
 [51.4609218, -0.373149],
 [51.5255162, 0.0352163],
 [51.580559, -0.341995],
 [51.5886383, -0.0117625],
 [51.4990805, -0.1938253],
 [51.502781, -0.087738],
 [51.3769529, -0.0956895],
 [51.577924, 0.2120829],
 [51.577924, 0.2120829],
 [51.406025, 0.013156],
 [51.4990156, -0.22915],
 [51.538621, -0.1028346],
 [51.4609218, -0.373149],
 [51.406025, 0.013156],
 [51.5390261, -0.1425516],
 [51.4990805, -0.1938253],
 [51.4990805, -0.1938253],
 [51.4935082, -0.1178424],
 [51.406025, 0.013156],
 [51.4933675, 0.0098214],
 [51.458373, -0.1891356],
 [51.6569225, -0.1949252],
 [51.458373, -0.1891356],
 [51.577924, 0.2120829],
 [51.502781, -0.087738],
 [51.5886121, 0.0823982],
 [51.4990156, -0.22915],
 [51.5886121, 0.0823982],
 [51.577924, 0.2120829],
 [51.4611509, -0.0073177],
 [51.4611509, -0.0073177]]

In [1478]:
def nearest_outstanding_school(lat, lon):
    """Return the KM distance to the nearest outstanding school"""
    coords_1 = (lat, lon)
    lower_boundary, upper_boundary = -0.05, 0.05
    school_distances = []
    for i in outstanding_schools:
        if (i[0] - lat > upper_boundary) or i[0] - lat < lower_boundary:
            continue
        elif (i[1] - lon > upper_boundary) or i[1] - lon < lower_boundary:
            continue
        else:
            coords_2 = (i[0], i[1])
            school_distances.append(geopy.distance.geodesic(coords_1, coords_2).km)
            
    try:
        return min(school_distances)
    except ValueError:
        return "Na"

In [1482]:
houses

{'Flat 131, Lauderdale Tower, Barbican, London, Greater London EC2Y 8BY': {'property_type': 'Flat',
  'price': 1500000,
  'date': '14 Apr 2022',
  'url': 'https://www.rightmove.co.uk/house-prices/details/england-111198221-14949232?s=6d33d105e453da910edf69d474e18a926d96b6c59d64c2d3aa33091244ad5ad5',
  'features': ['Type 1A',
   'Contemporary Kitchen',
   'Utility Room',
   'Wraparound Balcony',
   'Use Of Communal Gardens',
   'West Facing Views'],
  'floorplan_url': 'https://media.rightmove.co.uk/72k/71134/111198221/71134_30837396_FLP_00_0002.jpeg',
  'latitude': 51.519886,
  'longitude': -0.096743,
  'station_proximities': {'Barbican Station': 0.05088188581023095,
   'Moorgate Station': 0.3460585418348957,
   "St. Paul's Station": 0.35797745876363984},
  'property_size': {},
  'bedrooms': 3,
  'bathrooms': 2,
  'new_build': False},
 'Flat 27, Bayer House, Golden Lane Estate, London, Greater London EC1Y 0RN': {'property_type': 'Flat',
  'price': 660000,
  'date': '8 Apr 2022',
  'url':

In [1480]:
# get all of the latitude and longitudes for all of the houses in the data set, and add this to the data.
import pgeocode
nomi = pgeocode.Nominatim('GB')
for x, y in houses.items():
    if 'latitude' in y.keys() and 'longitude' in y.keys():
        continue
    else:
        postcode = x[-8:]
        if postcode[0] == " ":
            postcode = x[-7:]
        postcode_split = postcode.split(" ")
        if len(postcode_split) == 2:
            postcode = postcode_split[0]+" "+postcode_split[1]
        elif len(postcode_split) == 3:
            postcode = postcode_split[1]+" "+postcode_split[2]
        lat = float(nomi.query_postal_code(postcode).latitude)
        lon = float(nomi.query_postal_code(postcode).longitude)
        
        y['latitude'] = lat
        y['longitude'] = lon

In [783]:

# for y in houses.values():
#     # Generate the feature of distance to the nearest outstanding school
#     if 'latitude' in y.keys() and 'longitude' in y.keys():
#         distance_km = nearest_outstanding_school(y['latitude'], y['longitude'])
#         y['nearest_outstanding_school'] = distance_km

    
#     # Attempt to generate the feature of the sq ft of the property
#     if 'property_size' in y.keys():
#         if (y['property_size'] == {}) and ('floorplan_url' in y.keys()):
#             try:
#                 ocr_size = size_floorplan_2(y['floorplan_url'])
#             except Exception:
#                 y['property_size'] = 'Na'
#             else:
#                 if ocr_size == None:
#                     y['property_size'] = 'Na'
#                 y['property_size'] = ocr_size
#     # If a property_size dictionary already exists, use it to find the sq ft of the property 
#     # code that handles when the property data has a dictionary with sqf and/or sq/f values in it
#     # Now you just need the f_values and m_values lists in the block of code
#         elif len(y['property_size']) >= 1:
#         # loop through the this dictionary
#         # if the dictionary contains a sq m value, set the value of property size to its sq f equivalent, 
#         # if the dictionary contains a sq f value, set it
#             for i, v in y['property_size'].items():
#                 p = i.strip().lower()
#                 if p in f_values:
#                     y['property_size'] = y['property_size'][v]
#                 elif p in m_values:
#                     y['property_size'] = y['property_size'][v]*10.764
                    
#         #elif (y['property_size'] == {}) and ('floorplan_url' not in y.keys()):
#          #   y['property_size'] = 'Na'
                
    
# # This still needs to be looked at
                
#     # if floorplan not in keys and property_size is {}
    
    
#     # This is where you could do an else statement, and set all of the prop_size dictionaries which end up in the data set 
#     # as ''
        

            
            
        
#  # need to handle for the situation when there is data in the property size dict, but its not related to sq sizes, 
# # and there is a floorplan URL
        
    
# #     else:
# #         if 'floorplan_url' in y.keys():
# #             ocr_size = size_floorplan_2(y['floorplan_url'])
# #             if ocr_size == None:
# #                 y['property_size'] = ''
# #             y['property_size'] = ocr_size
    
        

In [861]:
# loop through all of the houses, if they don't have a property size but do have a floorplan, apply the OCR, and then return 
# a value (note a nonetype may be returned so you'll have to deal with that). 

for y in houses.values():
    if 'property_size' in y.keys():
        if type(y['property_size']) == int or type(y['property_size']) == float:
            continue
        if (y['property_size'] == {} or type(y['property_size']) == str) and ('floorplan_url' in y.keys()):
            try:
                ocr_size = size_floorplan_2(y['floorplan_url'])
            except Exception:
                y['property_size'] = 'Na'
            else:
                if ocr_size == None or ocr_size == '':
                    y['property_size'] = 'Na'
                y['property_size'] = ocr_size
                # The line below this just sets the dictionary as the property size
                #y['property_size'] = ocr_size
                # This code actually sets the sq ft property size 
                ##for f in f_values:
                  ##  try:
                    ##    y['property_size'] = ocr_size[f]
                    ##except KeyError:
                      ##  pass
                    ##else:
                      ##  break
                            
    

CUDA not available - defaulting to CPU. Note: This module is much faster with a GPU.


Progress: |--------------------------------------------------| 0.0% CompleteProgress: |████----------------------------------------------| 8.3% CompleteProgress: |████████------------------------------------------| 16.6% CompleteProgress: |████████████--------------------------------------| 24.8% CompleteProgress: |████████████████----------------------------------| 33.1% CompleteProgress: |████████████████████------------------------------| 41.4% CompleteProgress: |████████████████████████--------------------------| 49.7% CompleteProgress: |████████████████████████████----------------------| 58.0% CompleteProgress: |█████████████████████████████████-----------------| 66.2% CompleteProgress: |█████████████████████████████████████-------------| 74.5% CompleteProgress: |█████████████████████████████████████████---------| 82.8% CompleteProgress: |█████████████████████████████████████████████-----| 91.1% CompleteProgress: |█████████████████████████████████████████████████-| 99

CUDA not available - defaulting to CPU. Note: This module is much faster with a GPU.


Progress: |--------------------------------------------------| 0.0% CompleteProgress: |███████-------------------------------------------| 15.7% CompleteProgress: |███████████████-----------------------------------| 31.4% CompleteProgress: |███████████████████████---------------------------| 47.0% CompleteProgress: |███████████████████████████████-------------------| 62.7% CompleteProgress: |███████████████████████████████████████-----------| 78.4% CompleteProgress: |███████████████████████████████████████████████---| 94.1% CompleteProgress: |██████████████████████████████████████████████████████| 109.7% Complete

CUDA not available - defaulting to CPU. Note: This module is much faster with a GPU.


Progress: |--------------------------------------------------| 0.0% CompleteProgress: |█-------------------------------------------------| 3.6% CompleteProgress: |███-----------------------------------------------| 7.3% CompleteProgress: |█████---------------------------------------------| 10.9% CompleteProgress: |███████-------------------------------------------| 14.5% CompleteProgress: |█████████-----------------------------------------| 18.2% CompleteProgress: |██████████----------------------------------------| 21.8% CompleteProgress: |████████████--------------------------------------| 25.5% CompleteProgress: |██████████████------------------------------------| 29.1% CompleteProgress: |████████████████----------------------------------| 32.7% CompleteProgress: |██████████████████--------------------------------| 36.4% CompleteProgress: |███████████████████-------------------------------| 40.0% CompleteProgress: |█████████████████████-----------------------------| 43.

KeyboardInterrupt: 

In [1483]:
# BROKEN THIS AWAY FROM THE ABOVE CODE AS THE OCR WAS TAKING TOO LONG # 
# # If they have a don't have a floorplan, but a property_size dictionary has been scraped, apply sq_foot_return()
# # Loop through all of the houses, if they have their lon and lat available, apply the nearest_oustanding_school()
# to return the distance to the nearest outstanding school
for y in houses.values():
    if 'property_size' in y.keys():
        try:
            if len(y['property_size']) >= 1 and type(y['property_size']) == dict:
                value = sq_foot_return(y['property_size'])
                y['property_size'] = value
        except TypeError:
            pass
        if 'latitude' in y.keys() and 'longitude' in y.keys():
            distance_km = nearest_outstanding_school(y['latitude'], y['longitude'])
            y['nearest_outstanding_school'] = distance_km

In [1484]:
houses

{'Flat 131, Lauderdale Tower, Barbican, London, Greater London EC2Y 8BY': {'property_type': 'Flat',
  'price': 1500000,
  'date': '14 Apr 2022',
  'url': 'https://www.rightmove.co.uk/house-prices/details/england-111198221-14949232?s=6d33d105e453da910edf69d474e18a926d96b6c59d64c2d3aa33091244ad5ad5',
  'features': ['Type 1A',
   'Contemporary Kitchen',
   'Utility Room',
   'Wraparound Balcony',
   'Use Of Communal Gardens',
   'West Facing Views'],
  'floorplan_url': 'https://media.rightmove.co.uk/72k/71134/111198221/71134_30837396_FLP_00_0002.jpeg',
  'latitude': 51.519886,
  'longitude': -0.096743,
  'station_proximities': {'Barbican Station': 0.05088188581023095,
   'Moorgate Station': 0.3460585418348957,
   "St. Paul's Station": 0.35797745876363984},
  'property_size': {},
  'bedrooms': 3,
  'bathrooms': 2,
  'new_build': False,
  'nearest_outstanding_school': 0.9294055612957386},
 'Flat 27, Bayer House, Golden Lane Estate, London, Greater London EC1Y 0RN': {'property_type': 'Flat',

In [1485]:
## NLP
## Code for converting the features of each house into a number

house_features = []
for x in houses.values():
    if 'features' in x.keys():
        house_features.append(x['features'])

In [1486]:
house_features

[['Type 1A',
  'Contemporary Kitchen',
  'Utility Room',
  'Wraparound Balcony',
  'Use Of Communal Gardens',
  'West Facing Views'],
 ['Two Bedrooms',
  'Duplex',
  'Waitrose Nearby',
  'City of London',
  'On-Site Leisure Centre',
  'Grade II Listed'],
 ['3 Bedrooms',
  "Views of St Paul's, London Eye &amp; Other Iconic Landmarks",
  'Bathroom &amp; Separate Shower Room',
  'Open Plan Living Area',
  'Modernised Kitchen',
  '24 Hour Concierge',
  'Extended Lease'],
 ['Two-bedroom, split-level apartment',
  'Large living room with adjoining kitchen',
  'Bathroom and separate WC',
  'Two private balconies',
  'Central London location',
  'Views over the Barbican lake and gardens to the City',
  'Exquisite original features',
  'Closest stations: Moorgate (0.2 miles) &amp; Barbican (0.8 miles)',
  'Leasehold (175 years remain)',
  '890 sq. ft / 82.68 sq. m'],
 [],
 ['garden',
  'sought-after-location',
  'balcony',
  'close-to-local-amenities',
  'double-glazed-windows',
  'period-featu

In [1487]:
from gensim.models import Word2Vec
from statistics import mean

tokenised_features = []
for f in house_features:
    for i in range(len(f)):
        tokenised_sentance = []
        words = f[i].split(" ")
        for word in words:
            tokenised_sentance.append(word.lower())
        tokenised_features.append(tokenised_sentance)
        
model = Word2Vec(tokenised_features, min_count=1)

# here I want to obtain a list of vector arrays for each house 
start = 0
vectors = []
# loop over all of the house descriptions
for description in house_features:
    # obtain the number of bullet points for the house we're iterating over
    num_bullets = len(description) 
    # Create a list to store the word vectors for each house
    arrays1 = []
    # loop over the number of bullet points for the houses
    for i in range(num_bullets):
        # loop over the individual words within each bullet point
        for j in tokenised_features[i+start]:
            # Find the vector for each word in the bullet point, append it to the list where we store each 
            arrays1.append(model.wv[j])
    vectors.append(arrays1)
            
    start += num_bullets
    
house_averages = []
for i in vectors:
    word_means = []
    num_words = len(i)
    for x in range(num_words):
        word_means.append(mean(i[x]))
    try:
        house_averages.append(mean(word_means))
    except Exception:
        house_averages.append("NA")

In [1488]:
house_averages

[0.001071774,
 0.011361502,
 0.007192984,
 0.0047333045,
 'NA',
 -0.00016100262,
 0.0072742715,
 0.0044176085,
 0.006176685,
 0.0039416663,
 0.007067024,
 0.0023649153,
 0.006587687,
 0.0033123945,
 0.004480877,
 'NA',
 0.006191069,
 0.002535373,
 0.0084286155,
 0.008003741,
 0.0026529168,
 0.0060980506,
 0.0041638534,
 0.007250199,
 0.0034175864,
 0.004091956,
 0.0072393278,
 0.00617186,
 'NA',
 0.006027163,
 0.0083266245,
 0.004509195,
 0.0016024605,
 0.009240103,
 -0.00019836795,
 0.003247421,
 'NA',
 0.006214857,
 0.0035376407,
 0.009678988,
 0.0054326607,
 0.004437551,
 'NA',
 0.007699368,
 0.0060201203,
 0.0071664043,
 0.0027759306,
 'NA',
 0.0049162004,
 0.009067943,
 0.0054920265,
 0.0018817153,
 0.004752064,
 0.005340451,
 0.0053125466,
 0.0036802413,
 0.0054276837,
 0.006698339,
 'NA',
 0.005277893,
 0.007593262,
 0.0029532353,
 0.0060429797,
 0.0054750764,
 0.0034376234,
 0.0035135883,
 0.0058607464,
 0.0040768594,
 0.008588116,
 0.0074766227,
 'NA',
 0.006589839,
 0.0055173

In [1489]:
# add these values to the 
for i in houses.values():
    if 'features' not in i.keys():
        i["NLP_Score"] = "Na"
    else:
        i["NLP_Score"] = house_averages.pop(0)

In [1490]:
houses

{'Flat 131, Lauderdale Tower, Barbican, London, Greater London EC2Y 8BY': {'property_type': 'Flat',
  'price': 1500000,
  'date': '14 Apr 2022',
  'url': 'https://www.rightmove.co.uk/house-prices/details/england-111198221-14949232?s=6d33d105e453da910edf69d474e18a926d96b6c59d64c2d3aa33091244ad5ad5',
  'features': ['Type 1A',
   'Contemporary Kitchen',
   'Utility Room',
   'Wraparound Balcony',
   'Use Of Communal Gardens',
   'West Facing Views'],
  'floorplan_url': 'https://media.rightmove.co.uk/72k/71134/111198221/71134_30837396_FLP_00_0002.jpeg',
  'latitude': 51.519886,
  'longitude': -0.096743,
  'station_proximities': {'Barbican Station': 0.05088188581023095,
   'Moorgate Station': 0.3460585418348957,
   "St. Paul's Station": 0.35797745876363984},
  'property_size': {},
  'bedrooms': 3,
  'bathrooms': 2,
  'new_build': False,
  'nearest_outstanding_school': 0.9294055612957386,
  'NLP_Score': 0.001071774},
 'Flat 27, Bayer House, Golden Lane Estate, London, Greater London EC1Y 0RN

In [1491]:
# Code for finding the average price of houses in the same postcode area with the same number of bedrooms of each property
# This column will be used in the first regression 

for i, v in houses.items():
    local_similar_prices = []
    # need to store the postcode, each house has a postcode so this will be easier
    postcode = i[-8:]
    if postcode[0] == " ":
        postcode = i[-7:]
    postcode_area = postcode.split(" ")[0]
    if postcode_area == 'n':
        postcode_area = postcode.split(" ")[1]


    try:
        prop_type = v['property_type']
        num_bedrooms = v["bedrooms"]
    except KeyError:
        continue
    else:
        for a, b in houses.items():
            postcode = i[-8:]
            if postcode[0] == " ":
                postcode = i[-7:]
            postcode_area1 = postcode.split(" ")[0]
            if postcode_area1 == 'n':
                postcode_area1 = postcode.split(" ")[1]
            if postcode_area == postcode_area1:
                try:
                    prop_type1 = b['property_type']
                    num_bedrooms1 = b['bedrooms']
                except KeyError:
                    pass
                else:
                    if prop_type1 == prop_type and num_bedrooms1 == num_bedrooms:
                        local_similar_prices.append(b['price'])
    
    avg_local_price = sum(local_similar_prices) / len(local_similar_prices)
    v["avg_local_price"] = avg_local_price

In [1492]:
houses

{'Flat 131, Lauderdale Tower, Barbican, London, Greater London EC2Y 8BY': {'property_type': 'Flat',
  'price': 1500000,
  'date': '14 Apr 2022',
  'url': 'https://www.rightmove.co.uk/house-prices/details/england-111198221-14949232?s=6d33d105e453da910edf69d474e18a926d96b6c59d64c2d3aa33091244ad5ad5',
  'features': ['Type 1A',
   'Contemporary Kitchen',
   'Utility Room',
   'Wraparound Balcony',
   'Use Of Communal Gardens',
   'West Facing Views'],
  'floorplan_url': 'https://media.rightmove.co.uk/72k/71134/111198221/71134_30837396_FLP_00_0002.jpeg',
  'latitude': 51.519886,
  'longitude': -0.096743,
  'station_proximities': {'Barbican Station': 0.05088188581023095,
   'Moorgate Station': 0.3460585418348957,
   "St. Paul's Station": 0.35797745876363984},
  'property_size': {},
  'bedrooms': 3,
  'bathrooms': 2,
  'new_build': False,
  'nearest_outstanding_school': 0.9294055612957386,
  'NLP_Score': 0.001071774,
  'avg_local_price': 1645641.4135977337},
 'Flat 27, Bayer House, Golden Lan

In [1387]:
# THIS HAS BEEN COMMENTED OUT AFTER SPEAKING TO ADAM 

# # Code that makes a unique number for each postcode area from our scraped data
# postcodes = []
# for i, v in houses.items():
#     postcode = i[-8:]
#     if postcode[0] == " ":
#         postcode = i[-7:]
#     postcode_area = postcode.split(" ")[0]
#     if postcode_area == 'n':
#         postcode_area = postcode.split(" ")[1]
#     postcodes.append(postcode_area)
# unique_postcodes = set(postcodes)
# unique_postcodes = list(unique_postcodes)

# # What I need to do is order the unique postcodes in terms of average sold price
# postcode_prices = {}
# for i in unique_postcodes:
#     prices = []
#     for a, b in houses.items():
#         if i in a:
#             prices.append(b['price'])
#     postcode_avg = sum(prices) / len(prices)
#     postcode_prices[i] = postcode_avg
    
# postcode_prices = sorted(postcode_prices.items(), key=lambda x: x[1])


# postcodes_ordered = [postcode_prices[i][0] for i in range(len(postcode_prices))]
# postcodes_ordered
# area_codes = {}
# for i in range(len(postcodes_ordered)):
#     area_codes[postcodes_ordered[i]] = i
# area_codes

In [1493]:
# Need to convert the data into a quantitative format, which regression algorithms can learn/predict from
# This is done for each data set
properties = {}
index = 0
# loop through all of the 
for h, i in houses.items():
    house_data = []
    
    # Area code has been commented out after speaking to Adam
    # area_code - Want to know which codes
#     postcode = h[-8:]
#     if postcode[0] == " ":
#         postcode = h[-7:]
#     postcode_area = postcode.split(" ")[0]
#     if postcode_area == 'n':
#         postcode_area = postcode.split(" ")[1]
    
#     area_code = area_codes[postcode_area]
#     house_data.append(area_code)
    
    house_data.append(i['latitude'])
    house_data.append(i['longitude'])
    
    
    if i['property_type'] == 'Flat':
        prop_type = 1
    elif i['property_type'] == 'Terraced':
        prop_type == 2
    elif i['property_type'] == 'Semi-Detached':
        prop_type = 3 
    elif i['property_type'] == 'Detached':
        prop_type = 4
    else:
        prop_type = 5
    house_data.append(prop_type)
    
    house_data.append(i["NLP_Score"])
    
    # Convert the data into the necessary string format
    date_obj = datetime.datetime.strptime(i['date'], "%d %b %Y")
    date_str = date_obj.strftime("%Y-%m-%d")
    house_data.append(date_str)
    
    if 'station_proximities' in i.keys() and i['station_proximities'] != {}:
        station_distance = min(i['station_proximities'].values())
        house_data.append(station_distance)
    else:
        house_data.append("NA")
        
    # You should have handled for the property_size difficulties by now
    if 'property_size' not in i.keys():
        house_data.append("NA")
    elif i['property_size'] == {} or i['property_size'] == 'Na' or i["property_size"] == None:
        house_data.append("NA")
    else:
        # This should return the sq foot of the property
        house_data.append(i["property_size"])
    
    # Add the bedrooms and bathrooms data   
    if "bedrooms" not in i.keys():
        house_data.append("NA")
    elif i['bedrooms'] == None or i['bedrooms'] == '':
        house_data.append("NA")
    else:
        house_data.append(i["bedrooms"])
        
    if 'bathrooms' not in i.keys():
        house_data.append("NA")
    elif i['bathrooms'] == None or i['bathrooms'] == '':
        house_data.append("NA")
    else:
        house_data.append(i["bathrooms"])
        
        
    # New build
    if 'new_build' not in i.keys():
        house_data.append("NA")
    elif i['new_build'] == '':
        house_data.append("NA")
    else:
        if i["new_build"]:
            house_data.append(1)
        else:
            house_data.append(0)
    
    if 'nearest_outstanding_school' not in i.keys():
        house_data.append("NA")
    else:
        house_data.append(i["nearest_outstanding_school"])
    
    if 'avg_local_price' not in i.keys():
        house_data.append("NA")
    else:
        house_data.append(i["avg_local_price"])
        
    if 'floorplan_url' in i.keys():
        house_data.append("Yes")
    else:
        house_data.append("NA")
        
    
    
    house_data.append(i["price"])
    
    
        
    properties[h] = house_data
    index += 1
    
    

In [1494]:
properties

{'Flat 131, Lauderdale Tower, Barbican, London, Greater London EC2Y 8BY': [51.519886,
  -0.096743,
  1,
  0.001071774,
  '2022-04-14',
  0.05088188581023095,
  'NA',
  3,
  2,
  0,
  0.9294055612957386,
  1645641.4135977337,
  'Yes',
  1500000],
 'Flat 27, Bayer House, Golden Lane Estate, London, Greater London EC1Y 0RN': [51.522525,
  -0.095795,
  1,
  0.011361502,
  '2022-04-08',
  0.18723210717210148,
  'NA',
  2,
  1,
  0,
  1.18086626671716,
  1008716.6797752809,
  'Yes',
  660000],
 'Flat 192, Cromwell Tower, Barbican, London, Greater London EC2Y 8DD': [51.520535,
  -0.09278,
  1,
  0.007192984,
  '2022-04-06',
  0.22041031108326287,
  1140.984,
  3,
  2,
  0,
  0.9197592059919366,
  1645641.4135977337,
  'Yes',
  1620000],
 'Flat 336, Willoughby House, Barbican, London, Greater London EC2Y 8BL': [51.519515,
  -0.090796,
  1,
  0.0047333045,
  '2022-04-01',
  0.11080892335977656,
  890,
  2,
  2,
  0,
  0.7979068234378649,
  1008716.6797752809,
  'Yes',
  1075000],
 '211, Bunyan 

In [1495]:
len(properties)

10660

# Now I build the Regression Model


In [1496]:
import pandas as pd
import numpy as np

In [1497]:
properties_df = pd.DataFrame(properties)

In [1498]:
properties_df

Unnamed: 0,"Flat 131, Lauderdale Tower, Barbican, London, Greater London EC2Y 8BY","Flat 27, Bayer House, Golden Lane Estate, London, Greater London EC1Y 0RN","Flat 192, Cromwell Tower, Barbican, London, Greater London EC2Y 8DD","Flat 336, Willoughby House, Barbican, London, Greater London EC2Y 8BL","211, Bunyan Court, Barbican, London, Greater London EC2Y 8DH","802, Frobisher Crescent, London, Greater London EC2Y 8HD","Flat 3, John Trundle Court, Barbican, London, Greater London EC2Y 8DJ","Flat 51, Breton House, Barbican, London, Greater London EC2Y 8DQ","Flat 246, Ben Jonson House, Barbican, London, Greater London EC2Y 8DL","Flat 16, Chequer Court, 3, Chequer Street, London, Greater London EC1Y 8PW",...,"First And Second Floor Flat, 254, Ashmore Road, London, Greater London W9 3DD","Basement Flat, 33a, Fernhead Road, London, Greater London W9 3EX","23b, Bravington Road, London, Greater London W9 3AB","Flat 83, Dibdin House, Maida Vale, London, Greater London W9 1QF","171a, Saltram Crescent, London, Greater London W9 3JU","Ground Floor Flat, 138, Portnall Road, London, Greater London W9 3BQ","22a, Saltram Crescent, London, Greater London W9 3HR","28b, Denholme Road, London, Greater London W9 3HX","First Floor Flat, 41, Hormead Road, London, Greater London W9 3NQ","Flat 1, 84a, Carlton Hill, London, Greater London NW8 0ER"
0,51.519886,51.522525,51.520535,51.519515,51.5198,51.52051,51.520673,51.52133,51.520886,51.5232,...,51.5244,51.52707,51.526718,51.533893,51.532039,51.5244,51.53037,51.53049,51.524103,51.5299
1,-0.096743,-0.095795,-0.09278,-0.090796,-0.0948,-0.09372,-0.097013,-0.09377,-0.094049,-0.0933,...,-0.18458,-0.20122,-0.204491,-0.188475,-0.201636,-0.18458,-0.19907,-0.20094,-0.203919,-0.1747
2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,0.001072,0.011362,0.007193,0.004733,Na,,-0.000161,0.007274,0.004418,Na,...,Na,0.004766,0.007543,0.005367,0.008689,Na,0.006883,0.006387,0.007074,Na
4,2022-04-14,2022-04-08,2022-04-06,2022-04-01,2022-04-01,2022-03-28,2022-03-25,2022-03-25,2022-03-18,2022-03-08,...,2019-10-04,2019-10-03,2019-10-03,2019-10-02,2019-10-02,2019-10-01,2019-09-27,2019-09-25,2019-09-23,2019-09-20
5,0.050882,0.187232,0.22041,0.110809,,0.179989,0.051595,0.194288,0.171891,,...,,0.429642,0.433827,0.250521,0.243525,,0.391038,0.334152,0.259534,
6,,,1140.984,890,,,,,729,,...,,,,667.368,,,,,,
7,3,2,3,2,,,,,1,,...,,1,2,2,2,,3,2,2,
8,2,1,2,2,,,1,1,1,,...,,,1,1,1,,1,1,1,
9,0,0,0,0,,0,0,0,0,,...,,0,0,0,0,,0,0,0,


In [1499]:
properties_df = properties_df.T

In [1500]:
properties_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
"Flat 131, Lauderdale Tower, Barbican, London, Greater London EC2Y 8BY",51.519886,-0.096743,1,0.001072,2022-04-14,0.050882,,3,2,0,0.929406,1645641.413598,Yes,1500000
"Flat 27, Bayer House, Golden Lane Estate, London, Greater London EC1Y 0RN",51.522525,-0.095795,1,0.011362,2022-04-08,0.187232,,2,1,0,1.180866,1008716.679775,Yes,660000
"Flat 192, Cromwell Tower, Barbican, London, Greater London EC2Y 8DD",51.520535,-0.09278,1,0.007193,2022-04-06,0.22041,1140.984,3,2,0,0.919759,1645641.413598,Yes,1620000
"Flat 336, Willoughby House, Barbican, London, Greater London EC2Y 8BL",51.519515,-0.090796,1,0.004733,2022-04-01,0.110809,890,2,2,0,0.797907,1008716.679775,Yes,1075000
"211, Bunyan Court, Barbican, London, Greater London EC2Y 8DH",51.5198,-0.0948,1,Na,2022-04-01,,,,,,,,,519500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Ground Floor Flat, 138, Portnall Road, London, Greater London W9 3BQ",51.5244,-0.18458,1,Na,2019-10-01,,,,,,,,,455000
"22a, Saltram Crescent, London, Greater London W9 3HR",51.53037,-0.19907,1,0.006883,2019-09-27,0.391038,,3,1,0,0.593113,1645641.413598,,630000
"28b, Denholme Road, London, Greater London W9 3HX",51.53049,-0.20094,1,0.006387,2019-09-25,0.334152,,2,1,0,0.510563,1008716.679775,Yes,365000
"First Floor Flat, 41, Hormead Road, London, Greater London W9 3NQ",51.524103,-0.203919,1,0.007074,2019-09-23,0.259534,,2,1,0,1.162188,1008716.679775,Yes,415000


In [1501]:
properties_df.columns = ["Latitude", "Longitude", "Property Type", "NLP Score","Sold_Date", "Closest Station", "Size", "Bedrooms", "Bathrooms", "New Build", "Nearest OS", "Local_Similar_Prices", "Floorplan?", "Price"]

In [1502]:
properties_df[0:25]

Unnamed: 0,Latitude,Longitude,Property Type,NLP Score,Sold_Date,Closest Station,Size,Bedrooms,Bathrooms,New Build,Nearest OS,Local_Similar_Prices,Floorplan?,Price
"Flat 131, Lauderdale Tower, Barbican, London, Greater London EC2Y 8BY",51.519886,-0.096743,1,0.001072,2022-04-14,0.050882,,3.0,2.0,0.0,0.929406,1645641.413598,Yes,1500000
"Flat 27, Bayer House, Golden Lane Estate, London, Greater London EC1Y 0RN",51.522525,-0.095795,1,0.011362,2022-04-08,0.187232,,2.0,1.0,0.0,1.180866,1008716.679775,Yes,660000
"Flat 192, Cromwell Tower, Barbican, London, Greater London EC2Y 8DD",51.520535,-0.09278,1,0.007193,2022-04-06,0.22041,1140.984,3.0,2.0,0.0,0.919759,1645641.413598,Yes,1620000
"Flat 336, Willoughby House, Barbican, London, Greater London EC2Y 8BL",51.519515,-0.090796,1,0.004733,2022-04-01,0.110809,890.0,2.0,2.0,0.0,0.797907,1008716.679775,Yes,1075000
"211, Bunyan Court, Barbican, London, Greater London EC2Y 8DH",51.5198,-0.0948,1,Na,2022-04-01,,,,,,,,,519500
"802, Frobisher Crescent, London, Greater London EC2Y 8HD",51.52051,-0.09372,1,,2022-03-28,0.179989,,,,0.0,0.928123,424087.029703,Yes,559000
"Flat 3, John Trundle Court, Barbican, London, Greater London EC2Y 8DJ",51.520673,-0.097013,1,-0.000161,2022-03-25,0.051595,,,1.0,0.0,1.016726,424087.029703,Yes,475000
"Flat 51, Breton House, Barbican, London, Greater London EC2Y 8DQ",51.52133,-0.09377,1,0.007274,2022-03-25,0.194288,,,1.0,0.0,1.01825,424087.029703,Yes,535000
"Flat 246, Ben Jonson House, Barbican, London, Greater London EC2Y 8DL",51.520886,-0.094049,1,0.004418,2022-03-18,0.171891,729.0,1.0,1.0,0.0,0.97384,713763.732628,Yes,670000
"Flat 16, Chequer Court, 3, Chequer Street, London, Greater London EC1Y 8PW",51.5232,-0.0933,1,Na,2022-03-08,,,,,,,,,885000


In [1503]:
# Set them to NaN values
properties_df = properties_df.replace("NA", np.NaN)
properties_df = properties_df.replace("Na", np.NaN)

In [1505]:
properties_df
del properties_df['Floorplan?']

In [1506]:
properties_df.isnull().sum(axis = 0)

Latitude                    0
Longitude                   0
Property Type               0
NLP Score                8408
Sold_Date                   0
Closest Station          7876
Size                    10057
Bedrooms                 8087
Bathrooms                8848
New Build                7875
Nearest OS               7877
Local_Similar_Prices     7875
Price                       0
dtype: int64

In [888]:
# calculate the median values 
mean_cs = properties_df['Closest Station'].mean()
mean_bedrooms = properties_df['Bedrooms'].mean()
mean_bathrooms = properties_df['Bathrooms'].mean()
median_build = properties_df['New Build'].median()
mean_os = properties_df['Nearest OS'].mean()
mean_sim_price = properties_df['Local_Similar_Prices'].mean()
mean_size = properties_df['Size'].mean()

mean_sim_price


863625.7422680412

In [889]:
prop_df1 = properties_df
prop_df1['Closest Station'] = prop_df1['Closest Station'].fillna(mean_cs)
prop_df1['Size'] = prop_df1['Size'].fillna(mean_size)
prop_df1['Bedrooms'] = prop_df1['Bedrooms'].fillna(mean_bedrooms)
prop_df1['Bathrooms'] = prop_df1['Bathrooms'].fillna(mean_bathrooms)
prop_df1['New Build'] = prop_df1['New Build'].fillna(median_build)
prop_df1['Nearest OS'] = prop_df1['Nearest OS'].fillna(mean_os)
prop_df1['Local_Similar_Prices'] = prop_df1['Local_Similar_Prices'].fillna(mean_sim_price)





In [1507]:
# Here we've dropped all of the rows without all of the data for the prop_df
prop_df = properties_df.dropna()

In [1508]:
len(prop_df)

267

In [1509]:
prop_df

Unnamed: 0,Latitude,Longitude,Property Type,NLP Score,Sold_Date,Closest Station,Size,Bedrooms,Bathrooms,New Build,Nearest OS,Local_Similar_Prices,Price
"Flat 192, Cromwell Tower, Barbican, London, Greater London EC2Y 8DD",51.520535,-0.092780,1,0.007193,2022-04-06,0.220410,1140.984,3.0,2.0,0.0,0.919759,1.645641e+06,1620000
"Flat 336, Willoughby House, Barbican, London, Greater London EC2Y 8BL",51.519515,-0.090796,1,0.004733,2022-04-01,0.110809,890.000,2.0,2.0,0.0,0.797907,1.008717e+06,1075000
"Flat 246, Ben Jonson House, Barbican, London, Greater London EC2Y 8DL",51.520886,-0.094049,1,0.004418,2022-03-18,0.171891,729.000,1.0,1.0,0.0,0.973840,7.137637e+05,670000
"Flat 144, Thomas More House, Barbican, London, Greater London EC2Y 8BU",51.519016,-0.096078,1,0.003312,2022-01-28,0.109014,779.000,2.0,1.0,0.0,0.822169,1.008717e+06,825000
"Flat 171, Defoe House, Barbican, London, Greater London EC2Y 8ND",51.518757,-0.092939,1,0.002653,2021-11-11,0.167040,764.244,1.0,1.0,0.0,0.726242,7.137637e+05,925000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Second Floor Flat, 102, Portnall Road, London, Greater London W9 3BE",51.527830,-0.203930,1,0.003264,2020-10-16,0.433353,527.000,1.0,1.0,0.0,0.747746,7.137637e+05,635000
"First Floor Flat, 103, Kilburn Park Road, London, Greater London NW6 5LB",51.530825,-0.193699,1,0.005397,2020-05-15,0.292400,495.000,1.0,1.0,0.0,0.849204,7.137637e+05,335000
"Flat A, 181, Ashmore Road, London, Greater London W9 3DB",51.531551,-0.203538,1,0.005082,2019-12-10,0.204789,928.000,3.0,2.0,0.0,0.338235,1.645641e+06,682000
"117a, Kilburn Park Road, London, Greater London NW6 5LB",51.530505,-0.194053,1,0.006621,2019-12-03,0.314292,75.348,2.0,1.0,0.0,0.846204,1.008717e+06,375000


In [1510]:
# Remove the sold date for now, will need to find a way to quantify this.
del prop_df['Sold_Date']

In [1406]:

del prop_df1['Sold_Date']
del prop_df1['Closest Station']

KeyError: 'Closest Station'

In [1511]:
prop_df

Unnamed: 0,Latitude,Longitude,Property Type,NLP Score,Closest Station,Size,Bedrooms,Bathrooms,New Build,Nearest OS,Local_Similar_Prices,Price
"Flat 192, Cromwell Tower, Barbican, London, Greater London EC2Y 8DD",51.520535,-0.092780,1,0.007193,0.220410,1140.984,3.0,2.0,0.0,0.919759,1.645641e+06,1620000
"Flat 336, Willoughby House, Barbican, London, Greater London EC2Y 8BL",51.519515,-0.090796,1,0.004733,0.110809,890.000,2.0,2.0,0.0,0.797907,1.008717e+06,1075000
"Flat 246, Ben Jonson House, Barbican, London, Greater London EC2Y 8DL",51.520886,-0.094049,1,0.004418,0.171891,729.000,1.0,1.0,0.0,0.973840,7.137637e+05,670000
"Flat 144, Thomas More House, Barbican, London, Greater London EC2Y 8BU",51.519016,-0.096078,1,0.003312,0.109014,779.000,2.0,1.0,0.0,0.822169,1.008717e+06,825000
"Flat 171, Defoe House, Barbican, London, Greater London EC2Y 8ND",51.518757,-0.092939,1,0.002653,0.167040,764.244,1.0,1.0,0.0,0.726242,7.137637e+05,925000
...,...,...,...,...,...,...,...,...,...,...,...,...
"Second Floor Flat, 102, Portnall Road, London, Greater London W9 3BE",51.527830,-0.203930,1,0.003264,0.433353,527.000,1.0,1.0,0.0,0.747746,7.137637e+05,635000
"First Floor Flat, 103, Kilburn Park Road, London, Greater London NW6 5LB",51.530825,-0.193699,1,0.005397,0.292400,495.000,1.0,1.0,0.0,0.849204,7.137637e+05,335000
"Flat A, 181, Ashmore Road, London, Greater London W9 3DB",51.531551,-0.203538,1,0.005082,0.204789,928.000,3.0,2.0,0.0,0.338235,1.645641e+06,682000
"117a, Kilburn Park Road, London, Greater London NW6 5LB",51.530505,-0.194053,1,0.006621,0.314292,75.348,2.0,1.0,0.0,0.846204,1.008717e+06,375000


In [989]:
#del prop_df['Local_Similar_Prices']

In [1512]:
x = prop_df.drop(['Price'], axis=1).values
y = prop_df['Price'].values

In [1513]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model

In [1514]:
# split the data set into a training and testing set
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=0)

In [1515]:
from sklearn.linear_model import LinearRegression
# Traing the model (on the training set)
ml = LinearRegression()
ml.fit(x_train,y_train)

In [1516]:
# Predict the test set results
y_pred = ml.predict(x_test)
print(y_pred)

[2882044.01898408  715029.30167961  842867.16699338  428387.87096047
 2757542.59779596 2892503.7067802  2529344.50333428 6798812.72709489
 5837886.49700069 2540666.03763771  116784.60731864 1013128.91369367
 6137296.64844751 2043421.86022329  929693.15356207  744241.59367609
 1336777.29313707 3162725.44082832 4398648.48311353  525963.65889001
 4451536.02466488 3993714.23633981  874705.09363008  651592.18764305
 2673093.64868665  217993.3729682  1928518.3491106  1957089.43555379
 6967895.66993308  167759.33892488  947025.31989622   72105.42873049
 2254588.30737138 2119187.08851409 1486585.00441456 3040299.03475308
 1456046.62470889 3526907.98222876 1572190.73273802  746405.50821996
 1828575.28974605  -15802.45864749 1905002.90538144  116375.78916264
 3172150.64538884  423828.19664621 1351497.41916347 3199292.58397198
   42875.93127799  683648.48219013  618238.92293072 2042208.06775522
  850552.26848626  330676.11156058 3041112.33251429 4305368.7768662
 3728146.6566906   423150.75372314 

In [1517]:
# evaluate the model 
from sklearn.metrics import r2_score, mean_absolute_error
print(r2_score(y_test, y_pred))

print(mean_absolute_error(y_test, y_pred))
# 80 % score with the r score
# 55495 MAE (this will of)

0.4995401562702644
890286.0958587682


In [None]:
# Now lets do the linear regression without the local avg prices, and replace this column with numbers based of the KNN 
# to find the average price of local similar houses

In [1518]:
prop_dfx = properties_df
del prop_dfx["Local_Similar_Prices"]
del prop_dfx["Sold_Date"] 


In [1519]:
prop_dfx = prop_dfx.dropna()

In [1520]:
prop_dfx

Unnamed: 0,Latitude,Longitude,Property Type,NLP Score,Closest Station,Size,Bedrooms,Bathrooms,New Build,Nearest OS,Price
"Flat 192, Cromwell Tower, Barbican, London, Greater London EC2Y 8DD",51.520535,-0.092780,1,0.007193,0.220410,1140.984,3.0,2.0,0.0,0.919759,1620000
"Flat 336, Willoughby House, Barbican, London, Greater London EC2Y 8BL",51.519515,-0.090796,1,0.004733,0.110809,890.000,2.0,2.0,0.0,0.797907,1075000
"Flat 246, Ben Jonson House, Barbican, London, Greater London EC2Y 8DL",51.520886,-0.094049,1,0.004418,0.171891,729.000,1.0,1.0,0.0,0.973840,670000
"Flat 144, Thomas More House, Barbican, London, Greater London EC2Y 8BU",51.519016,-0.096078,1,0.003312,0.109014,779.000,2.0,1.0,0.0,0.822169,825000
"Flat 171, Defoe House, Barbican, London, Greater London EC2Y 8ND",51.518757,-0.092939,1,0.002653,0.167040,764.244,1.0,1.0,0.0,0.726242,925000
...,...,...,...,...,...,...,...,...,...,...,...
"Second Floor Flat, 102, Portnall Road, London, Greater London W9 3BE",51.527830,-0.203930,1,0.003264,0.433353,527.000,1.0,1.0,0.0,0.747746,635000
"First Floor Flat, 103, Kilburn Park Road, London, Greater London NW6 5LB",51.530825,-0.193699,1,0.005397,0.292400,495.000,1.0,1.0,0.0,0.849204,335000
"Flat A, 181, Ashmore Road, London, Greater London W9 3DB",51.531551,-0.203538,1,0.005082,0.204789,928.000,3.0,2.0,0.0,0.338235,682000
"117a, Kilburn Park Road, London, Greater London NW6 5LB",51.530505,-0.194053,1,0.006621,0.314292,75.348,2.0,1.0,0.0,0.846204,375000


In [1521]:
# Lets try the KNN to get local similar prices
from sklearn.neighbors import NearestNeighbors, KNeighborsRegressor
from sklearn.model_selection import cross_val_predict

In [1522]:
x = prop_dfx[["Latitude", "Longitude", "Property Type", "Size", "Bedrooms", "Bathrooms", "New Build"]]
y = prop_dfx["Price"]
knn = KNeighborsRegressor(n_neighbors=5)
# Using a cross validation of 5 folds
y_pred = cross_val_predict(knn, x, y, cv=5)

In [1523]:
# Set the KNN predictions to the local_similar_prices column 
y_pred

array([ 1923000. ,  1005000. ,   787000. ,   734000. ,   724000. ,
         570000. ,  1982000. ,   538000. ,   949000. ,   910000. ,
         729000. ,   755000. ,  1131400. ,   713200. ,   755000. ,
         729000. ,   713200. ,   910000. ,   949000. ,   755000. ,
         729000. ,   724000. ,   849000. ,  1173700. ,  1135500. ,
         724000. ,   538000. ,   755000. ,   479000. ,   545000. ,
         479000. ,   966000. ,   479000. ,   479000. ,   822500. ,
        1130000. ,   729000. ,  5718250. ,   724000. ,   598000. ,
         737000. ,   538000. ,  3195000. ,   796500. ,   876500. ,
        2486000. ,  1459004.2,   755000. ,   697200. ,  1130000. ,
        1604000. ,  1334000. ,  2071000. ,  2486000. ,   547108. ,
         530308.6,  1965000. ,   530308.6,   511000. ,  1254000. ,
        2106000. ,  4012000. ,   606000. ,   765200. ,   771500. ,
        2997000. ,   547108. ,  2117000. ,   609200. ,   488200. ,
        4217500. ,  2484000. ,  9163000. ,   609200. ,  155600

In [1524]:
#for i in range(len(prop_dfx)):
 #   prop_dfx.iloc[i:"local_similar_price"] = y_pred[i] 
prop_dfx["Local Similar Price"] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prop_dfx["Local Similar Price"] = y_pred


In [1525]:
prop_dfx

Unnamed: 0,Latitude,Longitude,Property Type,NLP Score,Closest Station,Size,Bedrooms,Bathrooms,New Build,Nearest OS,Price,Local Similar Price
"Flat 192, Cromwell Tower, Barbican, London, Greater London EC2Y 8DD",51.520535,-0.092780,1,0.007193,0.220410,1140.984,3.0,2.0,0.0,0.919759,1620000,1923000.0
"Flat 336, Willoughby House, Barbican, London, Greater London EC2Y 8BL",51.519515,-0.090796,1,0.004733,0.110809,890.000,2.0,2.0,0.0,0.797907,1075000,1005000.0
"Flat 246, Ben Jonson House, Barbican, London, Greater London EC2Y 8DL",51.520886,-0.094049,1,0.004418,0.171891,729.000,1.0,1.0,0.0,0.973840,670000,787000.0
"Flat 144, Thomas More House, Barbican, London, Greater London EC2Y 8BU",51.519016,-0.096078,1,0.003312,0.109014,779.000,2.0,1.0,0.0,0.822169,825000,734000.0
"Flat 171, Defoe House, Barbican, London, Greater London EC2Y 8ND",51.518757,-0.092939,1,0.002653,0.167040,764.244,1.0,1.0,0.0,0.726242,925000,724000.0
...,...,...,...,...,...,...,...,...,...,...,...,...
"Second Floor Flat, 102, Portnall Road, London, Greater London W9 3BE",51.527830,-0.203930,1,0.003264,0.433353,527.000,1.0,1.0,0.0,0.747746,635000,1072700.0
"First Floor Flat, 103, Kilburn Park Road, London, Greater London NW6 5LB",51.530825,-0.193699,1,0.005397,0.292400,495.000,1.0,1.0,0.0,0.849204,335000,656736.8
"Flat A, 181, Ashmore Road, London, Greater London W9 3DB",51.531551,-0.203538,1,0.005082,0.204789,928.000,3.0,2.0,0.0,0.338235,682000,914920.0
"117a, Kilburn Park Road, London, Greater London NW6 5LB",51.530505,-0.194053,1,0.006621,0.314292,75.348,2.0,1.0,0.0,0.846204,375000,716000.0


In [1526]:
# Now let's do the regression again and see if the perfomance has improved ...

In [1527]:
x_1 = prop_dfx.drop(['Price'], axis=1).values
y_1 = prop_dfx['Price'].values

In [1528]:
# split the data set into a training and testing set
x_train,x_test,y_train,y_test = train_test_split(x_1,y_1,test_size=0.3,random_state=0)

In [1529]:
# Traing the model (on the training set)
ml = LinearRegression()
ml.fit(x_train,y_train)

In [1530]:
# Predict the test set results
y_pred_1 = ml.predict(x_test)
print(y_pred_1)

[2334622.32285929  741419.22704029 1530731.2645762   653794.31090283
 2677929.25855947 1454611.09952211 3695495.50045896 7541550.33843422
 6097920.81575608 2536487.47311473  378516.86788297 1026289.41633749
 6954174.23466706 2835948.81292486 1573612.93390775  144886.86798859
 1514178.46779823 2912025.85733581 4543922.26301718 -152492.89237714
 4745281.42208552 3688052.79380059 1566824.53777575  244393.66745353
 4140402.5341723   514543.28056765 1174291.88516903 1500289.40763378
 8147659.64521194  109589.96370745  952985.83931732  182085.5311842
 3767936.95929193 2446661.47982478 2171452.10884213 3998121.05081749
 2169188.98709869 5353778.5136373  2432306.53078485  217454.47632051
 1876431.41094851  -63377.96675444 -283361.15457368 -865372.71824431
 2955961.66888356  686332.17437768 1386846.1632669  2907452.3182478
  410436.18541646  295321.60776424  114301.87347937 3005771.83917904
 1052535.66315508  999337.63402414 4033862.56222844 4400894.5864985
 3781669.0038445  -113364.09948945 35

In [1531]:
# evaluate the model 
from sklearn.metrics import r2_score, mean_absolute_error
print(r2_score(y_test, y_pred_1))

print(mean_absolute_error(y_test, y_pred_1))

0.42485880559177536
1001550.2892484842


In [None]:
 # Essentially no change in the results

In [999]:
# NLP 


2554

In [1001]:
dataset_features = []
for i in houses.values():
    if 'features' in i.keys():
        if i['features'] == []:
            pass
        else:
            dataset_features.append(i['features'])

In [1002]:
dataset_features

[['Type 1A',
  'Contemporary Kitchen',
  'Utility Room',
  'Wraparound Balcony',
  'Use Of Communal Gardens',
  'West Facing Views'],
 ['Two Bedrooms',
  'Duplex',
  'Waitrose Nearby',
  'City of London',
  'On-Site Leisure Centre',
  'Grade II Listed'],
 ['3 Bedrooms',
  "Views of St Paul's, London Eye &amp; Other Iconic Landmarks",
  'Bathroom &amp; Separate Shower Room',
  'Open Plan Living Area',
  'Modernised Kitchen',
  '24 Hour Concierge',
  'Extended Lease'],
 ['Two-bedroom, split-level apartment',
  'Large living room with adjoining kitchen',
  'Bathroom and separate WC',
  'Two private balconies',
  'Central London location',
  'Views over the Barbican lake and gardens to the City',
  'Exquisite original features',
  'Closest stations: Moorgate (0.2 miles) &amp; Barbican (0.8 miles)',
  'Leasehold (175 years remain)',
  '890 sq. ft / 82.68 sq. m'],
 ['garden',
  'sought-after-location',
  'balcony',
  'close-to-local-amenities',
  'double-glazed-windows',
  'period-features-c

# KNN Approach


In [1041]:
houses

{'Flat 131, Lauderdale Tower, Barbican, London, Greater London EC2Y 8BY': {'property_type': 'Flat',
  'price': 1500000,
  'date': '14 Apr 2022',
  'url': 'https://www.rightmove.co.uk/house-prices/details/england-111198221-14949232?s=6d33d105e453da910edf69d474e18a926d96b6c59d64c2d3aa33091244ad5ad5',
  'features': ['Type 1A',
   'Contemporary Kitchen',
   'Utility Room',
   'Wraparound Balcony',
   'Use Of Communal Gardens',
   'West Facing Views'],
  'floorplan_url': 'https://media.rightmove.co.uk/72k/71134/111198221/71134_30837396_FLP_00_0002.jpeg',
  'latitude': 51.519886,
  'longitude': -0.096743,
  'station_proximities': {'Barbican Station': 0.05088188581023095,
   'Moorgate Station': 0.3460585418348957,
   "St. Paul's Station": 0.35797745876363984},
  'property_size': 1087.9,
  'bedrooms': 3,
  'bathrooms': 2,
  'new_build': False,
  'nearest_outstanding_school': 0.9294055612957386,
  'avg_local_price': 1681475.4385964912},
 'Flat 27, Bayer House, Golden Lane Estate, London, Greate

In [1044]:
knn_data = {}
for i,v in houses.items():
    knn_data[i] = [v['latitude'], v['longitude'], v['price']]

{'Flat 131, Lauderdale Tower, Barbican, London, Greater London EC2Y 8BY': [51.519886,
  -0.096743,
  1500000],
 'Flat 27, Bayer House, Golden Lane Estate, London, Greater London EC1Y 0RN': [51.522525,
  -0.095795,
  660000],
 'Flat 192, Cromwell Tower, Barbican, London, Greater London EC2Y 8DD': [51.520535,
  -0.09278,
  1620000],
 'Flat 336, Willoughby House, Barbican, London, Greater London EC2Y 8BL': [51.519515,
  -0.090796,
  1075000],
 '211, Bunyan Court, Barbican, London, Greater London EC2Y 8DH': [51.5198,
  -0.0948,
  519500],
 '802, Frobisher Crescent, London, Greater London EC2Y 8HD': [51.52051,
  -0.09372,
  559000],
 'Flat 3, John Trundle Court, Barbican, London, Greater London EC2Y 8DJ': [51.520673,
  -0.097013,
  475000],
 'Flat 51, Breton House, Barbican, London, Greater London EC2Y 8DQ': [51.52133,
  -0.09377,
  535000],
 'Flat 246, Ben Jonson House, Barbican, London, Greater London EC2Y 8DL': [51.520886,
  -0.094049,
  670000],
 'Flat 16, Chequer Court, 3, Chequer Stre

In [1052]:
knn_df = pd.DataFrame(knn_data)
knn_df = knn_df.T
knn_df.columns = ["Latitude", "Longitude", "Price"]

In [1053]:
knn_df

Unnamed: 0,Latitude,Longitude,Price
"Flat 131, Lauderdale Tower, Barbican, London, Greater London EC2Y 8BY",51.519886,-0.096743,1500000.0
"Flat 27, Bayer House, Golden Lane Estate, London, Greater London EC1Y 0RN",51.522525,-0.095795,660000.0
"Flat 192, Cromwell Tower, Barbican, London, Greater London EC2Y 8DD",51.520535,-0.092780,1620000.0
"Flat 336, Willoughby House, Barbican, London, Greater London EC2Y 8BL",51.519515,-0.090796,1075000.0
"211, Bunyan Court, Barbican, London, Greater London EC2Y 8DH",51.519800,-0.094800,519500.0
...,...,...,...
"10, Bolt Court, London, Greater London EC4A 3DQ",51.508500,-0.125700,1025000.0
"Flat 107, Clifford's Inn, Fetter Lane, London, Greater London EC4A 1BX",51.514670,-0.109900,89000.0
"Flat 68, Clifford's Inn, Fetter Lane, London, Greater London EC4A 1BX",51.514670,-0.109900,145000.0
"Flat 8, Pemberton House, 6, East Harding Street, London, Greater London EC4A 3AS",51.508500,-0.125700,165000.0


In [1064]:
knn_df.min() 

Latitude        51.5051
Longitude       -0.1257
Price        20000.0000
dtype: float64

In [1076]:
# establish the price categories 

    

In [1069]:
# You need to establish the feature_names () and the target_names (category names)
def categorise(row):
    if 20000<=row['Price']<100000:
        return 'A'
    if 100000<=row['Price']<200000:
        return 'B'
    if 200000<=row['Price']<300000:
        return 'C'
    if 300000<=row['Price']<400000:
        return 'D'
    if 400000<=row['Price']<500000:
        return 'E'
    if 500000<=row['Price']<600000:
        return 'F'
    if 600000<=row['Price']<700000:
        return 'G'
    if 700000<=row['Price']<800000:
        return 'H'
    if 800000<=row['Price']<900000:
        return 'I'
    if 900000<=row['Price']<1000000:
        return 'J'
    if 1000000<=row['Price']<1100000:
        return 'K'
    if 1100000<=row['Price']<1200000:
        return 'L'
    if 1200000<=row['Price']<1300000:
        return 'M'
    if 1300000<=row['Price']<1400000:
        return 'N'
    if 1400000<=row['Price']<1500000:
        return 'O'
    if 1500000<=row['Price']<1600000:
        return 'P'
    if 1600000<=row['Price']<1700000:
        return 'Q'
    if 1700000<=row['Price']<1800000:
        return 'R'
    if 1800000<=row['Price']<1900000:
        return 'S'
    if 1900000<=row['Price']<2000000:
        return 'T'
    if row['Price']>=2000000:
        return 'L'
    

In [1075]:
# apply the categorise function on the dataframe
knn_df['price_cat'] = knn_df.apply(lambda row: categorise(row), axis=1)

In [1074]:
knn_df

Unnamed: 0,Latitude,Longitude,Price,price_cat
"Flat 131, Lauderdale Tower, Barbican, London, Greater London EC2Y 8BY",51.519886,-0.096743,1500000.0,P
"Flat 27, Bayer House, Golden Lane Estate, London, Greater London EC1Y 0RN",51.522525,-0.095795,660000.0,G
"Flat 192, Cromwell Tower, Barbican, London, Greater London EC2Y 8DD",51.520535,-0.092780,1620000.0,Q
"Flat 336, Willoughby House, Barbican, London, Greater London EC2Y 8BL",51.519515,-0.090796,1075000.0,K
"211, Bunyan Court, Barbican, London, Greater London EC2Y 8DH",51.519800,-0.094800,519500.0,F
...,...,...,...,...
"10, Bolt Court, London, Greater London EC4A 3DQ",51.508500,-0.125700,1025000.0,K
"Flat 107, Clifford's Inn, Fetter Lane, London, Greater London EC4A 1BX",51.514670,-0.109900,89000.0,A
"Flat 68, Clifford's Inn, Fetter Lane, London, Greater London EC4A 1BX",51.514670,-0.109900,145000.0,B
"Flat 8, Pemberton House, 6, East Harding Street, London, Greater London EC4A 3AS",51.508500,-0.125700,165000.0,B


In [1110]:
# Now we can try to do our KNN
from sklearn.neighbors import NearestNeighbors, KNeighborsRegressor
from sklearn.model_selection import cross_val_predict

In [1111]:
x = knn_df[["Latitude", "Longitude"]]
y = knn_df["Price"]

In [1153]:
knn = KNeighborsRegressor(n_neighbors=5)
# Using a cross validation of 5 folds
y_pred = cross_val_predict(knn, x, y, cv=5)

In [1154]:
# Our predictions in an array 
y_pred

array([1494000. ,  606053.8, 1815600. , ...,  491000. ,  461200. ,
        461200. ])

In [1155]:
# Evaluation of the predictions
print(r2_score(y, y_pred))
print(mean_absolute_error(y, y_pred))

-0.39388112752444404
474608.72106499603


In [1158]:
knn_df['Price'].max()

11250000.0

In [1163]:
# I think a good idea is to add features to the house which makes the houses similar 
# and then apply the KNN. This include, bedrooms, bathrooms, new_build, size, lat & lon



In [1179]:
knn_df1 = properties_df 
del knn_df1["Postcode Area"]
del knn_df1["Sold_Date"]
del knn_df1["Local_Similar_Prices"]

In [1180]:
knn_df1 = knn_df1.dropna()

In [1181]:
knn_df1

Unnamed: 0,Latitude,Longitude,Property Type,Closest Station,Size,Bedrooms,Bathrooms,New Build,Nearest OS,Price
"Flat 131, Lauderdale Tower, Barbican, London, Greater London EC2Y 8BY",51.519886,-0.096743,1,0.050882,1087.900,3.0,2.0,0.0,0.929406,1500000
"Flat 27, Bayer House, Golden Lane Estate, London, Greater London EC1Y 0RN",51.522525,-0.095795,1,0.187232,618.800,2.0,1.0,0.0,1.180866,660000
"Flat 192, Cromwell Tower, Barbican, London, Greater London EC2Y 8DD",51.520535,-0.092780,1,0.220410,1140.984,3.0,2.0,0.0,0.919759,1620000
"Flat 336, Willoughby House, Barbican, London, Greater London EC2Y 8BL",51.519515,-0.090796,1,0.110809,890.000,2.0,2.0,0.0,0.797907,1075000
"Flat 246, Ben Jonson House, Barbican, London, Greater London EC2Y 8DL",51.520886,-0.094049,1,0.171891,729.000,1.0,1.0,0.0,0.973840,670000
...,...,...,...,...,...,...,...,...,...,...
"Flat 5, Amen Lodge, Warwick Lane, London, Greater London EC4M 7BY",51.514671,-0.100893,1,0.127141,441.000,1.0,1.0,0.0,0.734900,445000
"Flat 99, Clifford's Inn, Fetter Lane, London, Greater London EC4A 1BX",51.514670,-0.109900,1,0.251315,401.000,1.0,1.0,0.0,1.338317,475000
"Flat 141, Clifford's Inn, Fetter Lane, London, Greater London EC4A 1BY",51.514653,-0.110204,1,0.248524,418.000,1.0,1.0,0.0,1.358670,470000
"Flat 125, Clifford's Inn, Fetter Lane, London, Greater London EC4A 1BY",51.514434,-0.110411,1,0.260957,462.852,1.0,1.0,0.0,1.368437,620000


In [1192]:
# Lets do a KNN regression on this data to predict the price
x1 = knn_df1[["Latitude", "Longitude", "Property Type", "Closest Station", "Size", "Bedrooms", "Bathrooms", "New Build", "Nearest OS"]]
y1 = knn_df1["Price"]

In [1193]:
knn = KNeighborsRegressor(n_neighbors=5)
# Using a cross validation of 5 folds
y_pred1 = cross_val_predict(knn, x1, y1, cv=5)

In [1194]:
y_pred1

array([1259000.,  684999., 1703000.,  829500.,  847650.,  857400.,
        909990.,  892400.,  848800.,  921990.,  980100.,  958000.,
        908400.,  864600.,  980100.,  908400.,  847650.,  980100.,
        857000.,  492500.,  868000.,  843000.,  722600.,  868000.,
        980100.,  611199.,  796000.,  889000., 1651000.,  889000.,
       1027000.,  848800.,  894000.,  980100.,  654400., 1405000.,
        655400.,  857800., 1658600.,  894000.,  672500.,  439400.,
        721000.,  756000.,  689000.,  829000.,  862000., 1564000.,
       1564000., 1564000.,  587700., 1684800., 1624600.,  729999.,
        487500.,  961000.,  394400.,  839000.,  505700., 1695000.,
        814000., 1764800.,  862000.,  961000.,  889000.,  863000.,
        729999.,  897000.,  858000.,  961000.,  788100.,  729999.,
        706500., 1682000.,  803500.,  869150.,  845000.,  836000.,
        897000.,  609700.,  788100.,  925000., 1606600., 1184000.,
        709900.,  706499., 1623600.,  886990.,  697000.,  5976

In [1195]:
# Evaluation of the predictions
print(r2_score(y1, y_pred1))
print(mean_absolute_error(y1, y_pred1))

0.5331607027261885
184037.38604651162
