# Geo-Data Downloading & Preparation

This notebook creates the relevant geo-data files by downloading data from the London Data Store. You should only need to run this notebook once, no matter how many times you wish to work with the rest of the model and its inputs.

In [None]:
# Plotting set-up: choose a matplotlib backend, then import pyplot.
# NOTE(review): 'TkAgg' is a GUI backend and is followed straight away by the
# inline magic below -- presumably a platform workaround; confirm it is still needed.
import matplotlib as mpl
mpl.use('TkAgg')
# Show figures in the notebook's own output cells.
%matplotlib inline
import matplotlib.pyplot as plt 

In [None]:
# Reproducibility: seed both Python's and NumPy's global random number
# generators from a single shared value.
import random
import numpy as np

r_state = 42
for seed_rng in (random.seed, np.random.seed):
    seed_rng(r_state)

In [None]:
import pandas as pd
import geopandas as gpd
import requests
import glob
import re
import os
import io
import zipfile

from io import BytesIO

# Output locations: lookup tables go to data/lkp, shapefiles to data/shp.
lkp = os.path.join('data','lkp')
shp = os.path.join('data','shp')
for d in (lkp, shp):
    # exist_ok makes this a no-op on re-runs and avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(d, exist_ok=True)

In [None]:
# Always run this cell: the borough selection further down filters on this list.
# The City of London comes first, followed by the 32 boroughs in alphabetical order.
boroughs = [
    'City of London',
    'Barking and Dagenham', 'Barnet', 'Bexley', 'Brent', 'Bromley',
    'Camden', 'Croydon',
    'Ealing', 'Enfield',
    'Greenwich',
    'Hackney', 'Hammersmith and Fulham', 'Haringey', 'Harrow', 'Havering',
    'Hillingdon', 'Hounslow',
    'Islington',
    'Kensington and Chelsea', 'Kingston upon Thames',
    'Lambeth', 'Lewisham',
    'Merton',
    'Newham',
    'Redbridge', 'Richmond upon Thames',
    'Southwark', 'Sutton',
    'Tower Hamlets',
    'Waltham Forest', 'Wandsworth', 'Westminster',
]

### Downloading geo-data

In [None]:
# Scratch directory that receives the unzipped downloads.
shpt = os.path.join(shp,'tmp')
os.makedirs(shpt, exist_ok=True)

# Zipped shapefile bundles: regions, and the GLA statistical boundary files.
regions2016 = ('https://opendata.arcgis.com/datasets/'
               'f99b145881724e15a04a8a113544dfc5_2.zip')
gla2015 = ('https://files.datapress.com/london/dataset/'
           'statistical-gis-boundary-files-london/2016-10-03T13:52:28/'
           'statistical-gis-boundaries-london.zip')

for f in [regions2016, gla2015]:
    print("Downloading " + f + "...")
    # The full body is read into memory for ZipFile, so streaming buys nothing;
    # the timeout stops a stalled connection from hanging the notebook.
    r = requests.get(f, timeout=60)
    # Fail here with the HTTP status instead of later with an opaque
    # BadZipFile when the server returns an error page.
    r.raise_for_status()
    with zipfile.ZipFile(BytesIO(r.content)) as z:
        z.extractall(shpt)

print("Done.")

### Selecting Regional Data

In [None]:
# Locate the extracted regions shapefile; fail with a clear message if the
# download step has not been run (or the archive layout changed).
region_files = glob.glob(os.path.join(shpt,'*Regions*.shp'))
if not region_files:
    raise FileNotFoundError("No '*Regions*.shp' file found in " + shpt)

print("Processing: " + region_files[0])
regions = gpd.read_file(region_files[0])

# Filter to the London row and renumber in one chained expression, so no
# in-place operation is performed on a slice of `regions`.
london = regions[regions.rgn16nm=='London'].reset_index(drop=True)
# Label the data as WGS84 lon/lat, then reproject to British National Grid.
london.crs = {'init':'epsg:4326'}
london = london.to_crs({'init':'epsg:27700'})

london.to_file(os.path.join(shp,'London.shp'))
print("Done.")

### Selecting Boroughs

In [None]:
# Find the borough boundary shapefile inside the extracted GLA bundle.
borough_pattern = os.path.join(shpt,'statistical-gis-boundaries-london','ESRI','*Borough*.shp')
counties = glob.glob(borough_pattern)[0]

print("Processing: " + counties)
borough_geo = gpd.read_file(counties)

# Keep only rows whose NAME appears in the `boroughs` list, renumbered from 0.
in_boroughs = borough_geo.NAME.isin(boroughs)
LAs = borough_geo.loc[in_boroughs].reset_index(drop=True)
# Only the CRS label is assigned here; the coordinates are not reprojected.
LAs.crs = {'init': u'epsg:27700'}

print("\tSaving to shapefile...")
borough_out = os.path.join(shp,'Boroughs.shp')
LAs.to_file(borough_out)

print("Done.")

### Selecting LSOAs

In [None]:
# Every LSOA shapefile in the GLA bundle (one per boundary year).
lsoas = glob.glob(os.path.join(shpt,'statistical-gis-boundaries-london','ESRI','*LSOA*.shp'))

for lsoa_path in lsoas:
    print("Processing: " + lsoa_path)
    lsoa_y = gpd.read_file(lsoa_path)

    # Extract the year as 4 digits. Only the file name is searched, so digits
    # in a parent directory name can never be mistaken for the year.
    m = re.search(r'\d{4}', os.path.basename(lsoa_path))
    if m is None:
        raise ValueError("No 4-digit year found in LSOA file name: " + lsoa_path)
    lyear = m.group(0)

    # Set projection
    lsoa_y.crs = {'init':'epsg:27700'}

    # Common name: copy the first column whose name contains 'LSOA' and either
    # 'CD' or 'CODE' into a leading 'lsoacd' column.
    code_cols = [x for x in lsoa_y.columns if 'LSOA' in x and ('CD' in x or 'CODE' in x)]
    lsoa_y.insert(0, 'lsoacd', lsoa_y[code_cols[0]])

    print("\tSaving to shapefile...")
    lsoa_y.to_file(os.path.join(shp,'LSOAs ' + str(lyear) + '.shp'))

    print("\tSaving to pickle...")
    lsoa_y.to_pickle(os.path.join(lkp,'LSOAs ' + str(lyear) + '.pkl'))

print("Done.")

### Selecting and Joining Wards

In [None]:
# Merged ward boundary shapefile from the extracted GLA bundle.
wards = glob.glob(os.path.join(shpt,'statistical-gis-boundaries-london','ESRI','*Ward*Merged.shp'))[0]

print("Processing wards...")
ward_geo = gpd.read_file(wards)
ward_geo.crs = {'init':'epsg:27700'}

print("\tSaving to shapefile...")
ward_geo.to_file(os.path.join(shp,'Wards.shp'))

# Create a mapping for LSOAs to Wards
lsoa = gpd.read_file(os.path.join(shp,'LSOAs 2011.shp'))
lsoa.crs = {'init':'epsg:27700'}

# Take a real copy before swapping in centroids: a plain assignment would
# alias `lsoa`, and its polygon geometry would be overwritten with points too.
lsoa_c = lsoa.copy()
lsoa_c.geometry = lsoa_c.centroid
lsoa_c.to_file(os.path.join(shp,'LSOAs 2011 Points.shp'))

print("\tJoining Wards to LSOAs...")
# Left join keeps every LSOA centroid row, matched or not.
t = gpd.sjoin(lsoa_c, ward_geo, how='left').rename(columns={
    'GSS_CODE':'gss_cd',
    'LB_GSS_CD':'lb_gss_cd'
})
t[['lsoacd','gss_cd','lb_gss_cd']].to_csv(os.path.join(lkp,'LSOA_WARD_JR.csv'), index=False)

print("Done.")

### Selecting and Joining Output Areas

In [None]:
# Output Area boundary shapefile from the extracted GLA bundle.
oas = glob.glob(os.path.join(shpt,'statistical-gis-boundaries-london','ESRI','OA_*.shp'))[0]

print("Processing Output Areas...")
oa_geo = gpd.read_file(oas)
oa_geo.crs = {'init':'epsg:27700'}

print("\tSaving to shapefile...")
oa_geo.to_file(os.path.join(shp,'OAs 2011.shp'))

# Build the centroid version on a copy: a plain assignment would alias
# `oa_geo`, leaving it with point geometry instead of polygons.
oa_c = oa_geo.copy()
oa_c.geometry = oa_c.centroid

print("\tSaving point OAs...")
oa_c.to_file(os.path.join(shp,'OAs 2011 Points.shp'))

print("\tOutput Areas to LSOAs...")

# The OA -> LSOA lookup comes straight from the attribute columns of the
# OA file, so no spatial join is involved here.
oa_geo = oa_geo.rename(columns={
        'LSOA11CD':'lsoacd',
        'OA11CD':'oacd'
    })
oa_geo[['lsoacd','oacd']].to_csv(os.path.join(lkp,'LSOA_OA_JR.csv'), index=False)

print("Done.")

## Tidying up

In [None]:
import shutil

# Delete the scratch download directory. The guard makes the cell safe to
# re-run: rmtree raises if the directory has already been removed.
if os.path.isdir(shpt):
    shutil.rmtree(shpt)
print("Done.")