In [1]:
import os
import re
import webbrowser
import requests

import dash
import dash_core_components as dcc
import dash_html_components as html
import dash_bootstrap_components as dbc
from dash.dependencies import Input, Output, State

import plotly
import plotly.graph_objs as go
import plotly.express as px
from plotly.graph_objs import Scatter, Figure, Layout

import numpy as np
import pandas as pd
import geopandas as gpd

from copy import copy, deepcopy
from collections import OrderedDict

import matplotlib
from branca.element import Figure

import gdal
import json
from json import dumps

import pickle
import gc

from IPython.display import clear_output

from multiprocessing import Pool, cpu_count

pd.set_option('max_colwidth', None)

import warnings
warnings.filterwarnings("ignore")

### Configurations

In [2]:
# Central settings dictionary read by the cells below.
cfg = {
    'to save': True,      # when True, derived tables are written under cfg['appData dir']
    'start_year': 1995,
    'end_year': 2020,
}
# Every year from start_year to end_year, inclusive.
cfg['Years'] = list(range(cfg['start_year'], cfg['end_year'] + 1))

# Input folders, relative to the working directory.
cfg.update({
    'geodata dir': 'input/geoData',
    'distribution dir': 'input/Distribution',
    'houseprice dir': 'input/HousePriceData',
})

# Raw price-paid downloads and their cleaned per-year counterparts.
cfg['pp_raw dir'] = os.path.join(cfg['houseprice dir'], 'Raw')
cfg['pp_processed dir'] = os.path.join(cfg['houseprice dir'], 'Processed')

cfg.update({
    'appData dir': 'appData',
    'process_raw_pp': True,              # when True, the raw price files are (re)processed
    'raw price files': ['pp-2020.csv'],
    # Full-history alternative:
    # ['pp-1995-2017.csv', 'pp-2018.csv', 'pp-2019.csv', 'pp-2020.csv']
    'price_threshold': 10000,            # only transactions priced above this value are kept
})

# Region names found in the postcode/region table -> coarser region used in this notebook.
cfg['regions_lookup'] = {
    'North East': 'North England',
    'North West': 'North England',
    'East Midlands': 'Midlands',
    'West Midlands': 'Midlands',
    'Greater London': 'Greater London',
    'South East': 'South East',
    'South West': 'South West',
    'Wales': 'Wales',
    'Scotland': 'Scotland',
    'Northern Ireland': 'Northern Ireland',
}

# Per-region map settings.
# NOTE(review): not read anywhere in this notebook - presumably consumed by the
# plotting app ('centre' looks like [lat, lon]; 'maxp' like a percentile cap). Confirm.
cfg['plotly_config'] = {
    'All': {'centre': [53.2, -2.2], 'maxp': 95, 'zoom': 6},
    'North England': {'centre': [54.3, -2.0], 'maxp': 99, 'zoom': 7},
    'Wales': {'centre': [52.4, -3.3], 'maxp': 99, 'zoom': 7.3},
    'Midlands': {'centre': [52.8, -1.2], 'maxp': 99, 'zoom': 7.3},
    'South West': {'centre': [51.1, -3.7], 'maxp': 99, 'zoom': 7.2},
    'South East': {'centre': [51.5, -0.1], 'maxp': 90, 'zoom': 7.8},
    'Greater London': {'centre': [51.5, -0.1], 'maxp': 80, 'zoom': 9.5},
}

#### Directory structure

In [34]:
# Expected layout of the working directory (the bare string below is documentation only).
'''
- appData (dir.)
- Data Pre-processing.ipynb
- input
    |-- Distribution 
    |-- geoData
    |-- HousePriceData
            |-- Raw
            |-- Processed
'''

# Create the two output folders up front; exist_ok makes the cell safe to re-run.
os.makedirs(cfg['appData dir'], exist_ok=True)
os.makedirs(cfg['pp_processed dir'], exist_ok=True)

#### Download Data

In [4]:
# Download and unzip postcode shape files
# (IPython shell escapes: wget, unzip and rm must be available on the PATH)
!wget https://www.opendoorlogistics.com/wp-content/uploads/Data/UK-postcode-boundaries-Jan-2015.zip
!unzip UK-postcode-boundaries-Jan-2015.zip -d input
!rm UK-postcode-boundaries-Jan-2015.zip
clear_output()  # drop the download/unzip log from the cell output

In [17]:
# Download postcode data
# (unzipped into input/geoData, where the 'Post Code Data' cell reads ukpostcodes.csv)
!wget https://www.freemaptools.com/download/full-postcodes/ukpostcodes.zip
!unzip ukpostcodes.zip -d input/geoData
!rm ukpostcodes.zip
clear_output()  # drop the download/unzip log from the cell output

In [6]:
# Download the house price-paid files into the raw data folder.
# Only the file(s) named in cfg['raw price files'] are needed; un-comment the
# other lines to fetch more years.
!wget http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-2020.csv -P input/HousePriceData/Raw/
# !wget http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-2019.csv -P input/HousePriceData/Raw/
# !wget http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-2018.csv -P input/HousePriceData/Raw/
# !wget http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-complete.csv -P input/HousePriceData/Raw/
clear_output()  # drop the download log from the cell output

### Post Code Data

In [18]:
# Full postcode -> [latitude, longitude]
postcodes_df = pd.read_csv(os.path.join(cfg['geodata dir'], 'ukpostcodes.csv'))

postcodes = {
    postcode: [latitude, longitude]
    for postcode, latitude, longitude
    in postcodes_df[['postcode', 'latitude', 'longitude']].values
}

#-----------------------------------------------------#

# Postcode prefix -> coarse region name.
# A 'Region' value that is missing from cfg['regions_lookup'] raises KeyError here.
postcode_region_df = pd.read_csv(os.path.join(cfg['geodata dir'], 'PostCode Region.csv'))

postcode_region = {
    prefix: cfg['regions_lookup'][region]
    for prefix, region in postcode_region_df[['Prefix', 'Region']].values
}

### House Price Data

#### Processing raw data

In [19]:
def lookup_postcode(postcodes, x):
    """Return the entry stored for postcode ``x`` in ``postcodes``.

    An unknown postcode yields the empty string instead of raising.
    """
    return postcodes.get(x, '')

# Matches the first digit of a postcode; everything before it is the lookup prefix.
pattern = re.compile(r"\d")


def lookup_region(postcode_region, x, pattern=pattern):
    """Return the region for postcode (or sector) ``x``.

    The text before the first match of ``pattern`` is looked up in
    ``postcode_region``. The empty string is returned when ``x`` has no match
    or the prefix is not a key of ``postcode_region``.
    """
    first_digit = pattern.search(x)
    if first_digit is None:
        return ''

    prefix = x[:first_digit.start()]
    return postcode_region[prefix] if prefix in postcode_region else ''

In [20]:
def clean_pp_df(df, postcodes, postcode_region):
    """Clean a raw price-paid table and add the derived location/date columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table read with ``header=None`` (integer column labels). It is
        renamed, NaN-filled and stripped of its integer-labelled columns in place.
    postcodes : mapping
        Full postcode -> coordinates, passed to ``lookup_postcode``.
    postcode_region : mapping
        Postcode prefix -> region, passed to ``lookup_region``.

    Returns
    -------
    pandas.DataFrame
        A new frame, sorted by ``Date`` with a fresh index, holding only rows
        priced above ``cfg['price_threshold']`` whose property type is not
        ``'O'``, plus the columns ``Address``, ``Post Code Coords``,
        ``Year-Month``, ``Year``, ``Month``, ``Sector`` and ``Region``.

    Notes
    -----
    Reads the module-level ``cfg`` and the helpers ``lookup_postcode`` and
    ``lookup_region``.
    """
    named_columns = {1: 'Price', 2: 'Date', 3: 'Post Code', 4: 'Property Type', 5: 'Old/New', 6: 'Duration'}

    df.rename(columns=named_columns, inplace=True)
    df.fillna('', inplace=True)

    # Columns 7-13 are joined into one address; split()/join collapses the
    # extra blanks left behind by empty parts.
    df['Address'] = df[7] + ' ' + df[8] + ' ' + df[9] + ' ' + df[10] + ' ' + df[11] + ' ' + df[12] + ' ' + df[13]
    df['Address'] = df['Address'].apply(lambda x: ' '.join(x.split()))

    # Everything still carrying an integer label has served its purpose.
    cols_to_drop = [label for label in df.columns if isinstance(label, int)]
    df.drop(cols_to_drop, axis=1, inplace=True)

    # Drop suspiciously low prices and property type Other ('O').
    # The explicit copy means the sort and the column assignments below work on
    # an independent frame rather than on a slice of the input.
    keep = (df.Price > cfg['price_threshold']) & (df['Property Type'] != 'O')
    df = df.loc[keep].copy()

    # Sort by Date:
    df.sort_values(by=['Date'], inplace=True, ignore_index=True)

    # Get Latitude and Longitude by Post Code ('' when the postcode is unknown):
    df['Post Code Coords'] = df['Post Code'].apply(lambda x: lookup_postcode(postcodes, x))

    # Date parts are taken by position from the date string (YYYY-MM...).
    df['Year-Month'] = df['Date'].str[:7]
    df['Year']       = df['Date'].str[:4]
    df['Month']      = df['Date'].str[5:7]

    # Post code sector: everything up to and including the first character after the space.
    df['Sector'] = df['Post Code'].apply(lambda s: s[:s.find(' ')+2])

    # Get Region ('' when it cannot be resolved)
    df['Region'] = df['Post Code'].apply(lambda s: lookup_region(postcode_region, s))

    return df

In [35]:
%%time
def process_and_save_houseprice(infile, postcodes, postcode_region):
    """Clean one raw price-paid file and write one processed CSV per year.

    ``infile`` is a file name inside ``cfg['pp_raw dir']``. Each year found in
    the cleaned data is saved as ``pp-<year>.csv`` in ``cfg['pp_processed dir']``,
    replacing any file of that name already there.
    """
    print(f"Processing {infile}")

    raw_path = os.path.join(cfg['pp_raw dir'], infile)
    cleaned = clean_pp_df(pd.read_csv(raw_path, header=None), postcodes, postcode_region)
    print(f'Number of transactions in {infile}: {len(cleaned) :,}')

    out_dir = cfg['pp_processed dir']
    for year in cleaned.Year.unique():
        fname = f'pp-{year}.csv'
        rows_for_year = cleaned[cleaned.Year == year]
        rows_for_year.to_csv(os.path.join(out_dir, fname), index=False)
        print(f"{fname} saved")
    
#-------------------------------------------------------#
# (Re)process the raw downloads only when enabled in the configuration.
if cfg['process_raw_pp']:
    for infile in cfg['raw price files']:
        process_and_save_houseprice(infile, postcodes, postcode_region)

Processing pp-2020.csv
Number of transactions in pp-2020.csv: 304,022
pp-2020.csv saved
CPU times: user 6.01 s, sys: 256 ms, total: 6.27 s
Wall time: 6.27 s


#### Loading processed data

In [36]:
%%time
def load_processed_houseprice_data():
    """Load every processed per-year price file that exists into one frame.

    For each year in ``cfg['Years']`` the file ``pp-<year>.csv`` is read from
    ``cfg['pp_processed dir']`` when present; missing years are skipped.
    Per-year and total transaction counts are printed.

    Returns
    -------
    pandas.DataFrame
        All available years stacked with a fresh index, or an empty frame when
        no processed file was found.
    """
    yearly_frames = []
    for year in cfg['Years']:
        fname = os.path.join(cfg['pp_processed dir'], f'pp-{year}.csv')
        if os.path.isfile(fname):
            df = pd.read_csv(fname)
            yearly_frames.append(df)
            print(f'Transaction count in {year}: {len(df) :,}')

    # Concatenate once: growing a frame inside the loop copies all previously
    # loaded rows again for every additional year.
    if yearly_frames:
        house_price_df = pd.concat(yearly_frames, ignore_index=True)
    else:
        house_price_df = pd.DataFrame()

    print(f"Total transaction count: {len(house_price_df) :,}")
    return house_price_df

#----------------------------------------#
# Load every processed year that is available on disk into a single frame.
house_price_df = load_processed_houseprice_data()

Transaction count in 2020: 304,022
Total transaction count: 304,022
CPU times: user 981 ms, sys: 32 ms, total: 1.01 s
Wall time: 1.01 s


In [38]:
# Preview the first two processed transactions.
house_price_df.head(2)

Unnamed: 0,Price,Date,Post Code,Property Type,Old/New,Duration,Address,Post Code Coords,Year-Month,Year,Month,Sector,Region
0,39000,2020-01-01 00:00,B71 3PE,F,N,L,CAMBERLEY FLAT 28 BEACONVIEW ROAD WEST BROMWICH SANDWELL WEST MIDLANDS,"[52.54574652, -1.974342581]",2020-01,2020,1,B71 3,Midlands
1,115500,2020-01-01 00:00,BB5 3JG,T,N,L,71 WHITE ASH LANE OSWALDTWISTLE ACCRINGTON HYNDBURN LANCASHIRE,"[53.742662, -2.402434]",2020-01,2020,1,BB5 3,North England


In [39]:
def get_sector_df(house_price_df):
    """Aggregate transactions to one row per (Year, Sector).

    Parameters
    ----------
    house_price_df : pandas.DataFrame
        Needs the columns ``Year``, ``Sector`` and ``Price``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Year``, ``Sector``, ``Price`` (mean price rounded to the
        nearest 1000, as int), ``Volume`` (number of transactions), ``Region``
        and ``text`` (HTML label for map hover). Rows with an empty ``Sector``
        are left out.

    Notes
    -----
    The region is resolved through the module-level ``postcode_region`` mapping
    and ``lookup_region`` helper, so both must exist before this is called.
    """
    # One pass over the groups yields both the mean price and the sales volume.
    stats = (
        house_price_df[['Year', 'Sector', 'Price']]
        .groupby(by=['Year', 'Sector'])
        .agg(Price=('Price', 'mean'), Volume=('Price', 'count'))
        .reset_index()
    )

    # Copy after filtering so the assignments below never target a slice.
    stats = stats.loc[stats['Sector'] != ''].copy()

    # Get Region
    stats['Region'] = stats['Sector'].apply(lambda s: lookup_region(postcode_region, s))

    # Price in whole thousands: used both for the rounded price and for the label.
    thousands = (stats['Price'] / 1000).round().astype('int64')
    stats['Price'] = thousands * 1000
    stats['text'] = (
        stats['Sector']
        + '<br>' + 'Avg. Price: ' + thousands.map('{:,}K'.format)
        + '<br>' + 'Sales Volume: ' + stats['Volume'].astype(str)
    )

    return stats

In [40]:
%%time
# Average price and sales volume per (Year, Sector); show the first rows.
sector_df = get_sector_df(house_price_df) 
sector_df.head()

CPU times: user 368 ms, sys: 12 ms, total: 380 ms
Wall time: 379 ms


Unnamed: 0,Year,Sector,Price,Volume,Region,text
0,2020,AL1 1,469000,38,South East,AL1 1<br>Avg. Price: 469K<br>Sales Volume: 38
1,2020,AL1 2,552000,24,South East,AL1 2<br>Avg. Price: 552K<br>Sales Volume: 24
2,2020,AL1 3,419000,101,South East,AL1 3<br>Avg. Price: 419K<br>Sales Volume: 101
3,2020,AL1 4,659000,45,South East,AL1 4<br>Avg. Price: 659K<br>Sales Volume: 45
4,2020,AL1 5,450000,62,South East,AL1 5<br>Avg. Price: 450K<br>Sales Volume: 62


#### Getting and saving sector_price by year (for Choropleth)

In [41]:
%%time

# One frame per configured year; years without data yield an empty frame.
sector_by_year = {
    year: sector_df[sector_df.Year == year].reset_index(drop=True)
    for year in cfg['Years']
}

if cfg['to save']:
    for year, sectors_of_year in sector_by_year.items():
        fname = os.path.join(cfg['appData dir'], f'sector_price_{year}.csv')
        sectors_of_year.to_csv(fname, index=False)

CPU times: user 9.98 ms, sys: 4.15 ms, total: 14.1 ms
Wall time: 12.9 ms


#### Getting and saving sector_percentage_delta by year (for Choropleth)

In [42]:
%%time

# Building sector_price[year] {sector: [region, price]} dict for quick lookup
sector_price = dict()
for year in cfg['Years']:
    sector_price[year] = dict()
    for sector, region, price in sector_by_year[year][['Sector', 'Region', 'Price']].values:
        sector_price[year][sector] = [region, price]

#-------------------------------#
# sector_delta[year] maps sector -> [percentage change vs. the previous year, region]
sector_delta = dict()

# The first configured year has no predecessor, so its change is recorded as 0.
# Taken from cfg rather than hard-coded, so changing cfg['start_year'] keeps working.
first_year = cfg['Years'][0]
sector_delta[first_year] = dict()
for sector, [region, price] in sector_price[first_year].items():
    sector_delta[first_year][sector] = [0, region]

# y1 is the current year, y2 the year before it.
for y1, y2 in zip(cfg['Years'][1:], cfg['Years'][:-1]):
    sector_delta[y1] = dict()
    for sector, [region, price] in sector_price[y1].items():
        # Sectors without a price in the previous year get no entry at all.
        if sector in sector_price[y2]:
            last_year_price = sector_price[y2][sector][1]
            delta = int(np.round(100 * (price - last_year_price) / last_year_price))
            sector_delta[y1][sector] = [delta, region]

#----------------------------------------------#
# One table per year: Sector, Percentage Change, Region and an HTML hover label.
for year in cfg['Years']:
    delta_df = pd.DataFrame.from_dict(sector_delta[year], orient='index', columns=['Percentage Change', 'Region'])
    delta_df.reset_index(inplace=True)
    delta_df.rename(columns={'index': 'Sector'}, inplace=True)
    delta_df['text'] = delta_df['Sector'] + '<br>' + 'Price Change: ' + delta_df['Percentage Change'].astype(str) + '%'

    if cfg['to save']:
        fname = os.path.join(cfg['appData dir'], f'sector_percentage_delta_{year}.csv')
        delta_df.to_csv(fname, index=False)

CPU times: user 122 ms, sys: 4.44 ms, total: 127 ms
Wall time: 122 ms


#### Get Price and Volume by Year and Property Type df (For time-series)

In [43]:
%%time

def get_price_volume_df(house_price_df):
    """Count sales and average the price per (Year, Sector, Property Type).

    Parameters
    ----------
    house_price_df : pandas.DataFrame
        Needs the columns ``Year``, ``Sector``, ``Property Type`` and ``Price``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Year``, ``Sector``, ``Property Type``, ``Count`` and
        ``Average Price``; rows with an empty ``Sector`` are left out.
    """
    keys = ['Year', 'Sector', 'Property Type']

    # A single aggregation keeps count and mean aligned by construction,
    # instead of stitching two separate groupby results together by position.
    stats = (
        house_price_df[keys + ['Price']]
        .groupby(by=keys)
        .agg(**{'Count': ('Price', 'count'), 'Average Price': ('Price', 'mean')})
        .reset_index()
    )

    return stats.loc[stats['Sector'] != '']
    
#------------------------------------#
# Build the per-year / per-property-type table and, when saving is enabled,
# store it in the app data folder.
price_volume_df = get_price_volume_df(house_price_df)
if cfg['to save']:
    price_volume_df.to_csv(os.path.join(cfg['appData dir'], 'price_volume.csv'), index=False)

CPU times: user 187 ms, sys: 52 µs, total: 188 ms
Wall time: 186 ms


In [44]:
# Preview the first rows of the price/volume table.
price_volume_df.head()

Unnamed: 0,Year,Sector,Property Type,Count,Average Price
0,2020,AL1 1,F,15,376260.0
1,2020,AL1 1,S,6,654000.0
2,2020,AL1 1,T,17,486441.176471
3,2020,AL1 2,D,5,682550.0
4,2020,AL1 2,F,3,396166.666667


#### Regional Price data by year

### Geo Data

In [45]:
# Distinct region names present in the data; non-string entries (e.g. NaN) are skipped.
regions = [r for r in house_price_df.Region.unique() if isinstance(r, str)]
print(f"Regions: {regions}")

def get_regional_price_data(sector_df, regions):
    """Split ``sector_df`` into one frame per region.

    Returns a dict ``{region: rows of sector_df whose Region equals it}``.
    The ``'South East'`` entry additionally contains the ``'Greater London'`` rows.
    """
    def rows_for(region):
        mask = sector_df.Region == region
        if region == 'South East':  # Include Greater London in South East graph
            mask = mask | (sector_df.Region == 'Greater London')
        return sector_df[mask]

    return {region: rows_for(region) for region in regions}

Regions: ['Midlands', 'North England', 'South East', 'Wales', 'Greater London', 'South West', 'Scotland']


In [46]:
# Convert Shape file to Geojson
# Code modified from https://github.com/akkana/scripts/blob/master/mapping/polidistmap.py

infile  = os.path.join(cfg['distribution dir'], 'Sectors.shp')
outfile = os.path.join(cfg['geodata dir'], 'ukpostcode_geojson.json')

# The conversion is skipped when the GeoJSON file already exists.
if not os.path.isfile(outfile):
    options = gdal.VectorTranslateOptions(format="GeoJSON", dstSRS="EPSG:4326")
    translated = gdal.VectorTranslate(outfile, infile, options=options)

    # Without gdal.UseExceptions() a failed translation only returns None,
    # so check explicitly instead of reporting success unconditionally.
    if translated is None:
        raise RuntimeError(f"GDAL could not translate {infile} to {outfile}")

    translated = None  # release the dataset so the output is flushed and closed
    print("Translated GEOJSON file", outfile)

Translated GEOJSON file input/geoData/ukpostcode_geojson.json


In [47]:
# Breaking price/volume data up by region, for every configured year:
# regional_price_data[year][region] -> sector rows of that year and region.
regional_price_data = {
    year: get_regional_price_data(sector_by_year[year], regions)
    for year in cfg['Years']
}

In [48]:
def load_geo_data(infile):
    """Read the JSON document stored at ``infile`` and return the parsed object."""
    with open(infile, "r") as read_file:
        return json.load(read_file)

#---------------------------------------------#
# Load the GeoJSON file (same path the shapefile conversion writes to).
infile = os.path.join(cfg['geodata dir'], 'ukpostcode_geojson.json')
geo_data = load_geo_data(infile)

In [49]:
# Spot check. The frame is empty unless processed data for 2018 has been loaded,
# e.g. it stays empty when cfg['raw price files'] lists only 'pp-2020.csv'.
regional_price_data[2018]['Greater London'].head()

Unnamed: 0,Year,Sector,Price,Volume,Region,text


In [50]:
def get_regional_geo_data(geo_data, postcode_region, regions):
    """Split a GeoJSON-like dict into one dict per region.

    Parameters
    ----------
    geo_data : dict
        Holds a ``'features'`` list; each feature's ``properties['name']`` is
        a postcode sector. All other top-level keys are copied to every result.
    postcode_region : mapping
        Postcode prefix (the sector text before its first digit) -> region.
    regions : iterable of str
        Regions to build an entry for.

    Returns
    -------
    dict
        ``{region: {'features': [...], <other top-level keys>}}``. The
        ``'South East'`` entry also contains the ``'Greater London'`` features.
        Feature objects are shared with ``geo_data``, not copied.

    Notes
    -----
    Features whose sector has no digit, or whose prefix is not in
    ``postcode_region``, belong to no region and are skipped. This matches
    ``lookup_region``, which treats both cases as no region.
    """
    digit = re.compile(r"\d")

    def region_of(feature):
        # Region of a single feature, or None when it cannot be resolved.
        if 'properties' not in feature:
            return None
        sector = feature['properties']['name']
        first_digit = digit.search(sector)
        if first_digit is None:
            return None
        return postcode_region.get(sector[:first_digit.start()])

    features = geo_data.get('features', [])
    # Classify every feature once instead of once per requested region.
    feature_regions = [region_of(feature) for feature in features]
    metadata = {key: value for key, value in geo_data.items() if key != 'features'}

    def inner(region):
        if region == 'South East':  # the South East map also shows Greater London
            wanted = (region, 'Greater London')
        else:
            wanted = (region,)

        subset = {
            'features': [
                feature
                for feature, feature_region in zip(features, feature_regions)
                if feature_region is not None and feature_region in wanted
            ]
        }
        subset.update(metadata)
        return subset

    return {region: inner(region) for region in regions}
    

In [51]:
%%time
# Breaking geo_data up by region: regional_geo_data[region] is a GeoJSON-like
# dict holding only that region's sector features.
regional_geo_data = get_regional_geo_data(geo_data, postcode_region, regions)

CPU times: user 110 ms, sys: 99 µs, total: 110 ms
Wall time: 109 ms


In [52]:
%%time
# Write one JSON document per region into the app data folder.
# NOTE(review): the files get a .csv extension although json.dump writes JSON -
# confirm what the consuming app expects before renaming.
for region, value in regional_geo_data.items():
    fname = os.path.join(cfg['appData dir'], f'geodata_{region}.csv')
    with open(fname, "w") as f:
        json.dump(value, f)

CPU times: user 9.1 s, sys: 151 ms, total: 9.25 s
Wall time: 9.31 s
