In [33]:
#_author_jeffrey_t
#clean the OSM data for Sydney Metro Area:
#https://mapzen.com/data/metro-extracts/your-extracts/c98c29b17741

#import required libraries
import xml.etree.cElementTree as ET
from bs4 import BeautifulSoup
import os
import re
from collections import defaultdict
import pprint

#some global refs
osm_test_file = 'sydney_australia.osm'

In [34]:
#print some basic stats about our file
def convert_bytes(num):
    """
    this function will convert bytes to MB.... GB... etc
    from: http://stackoverflow.com/questions/2104080/how-to-check-file-size-in-python
    """
    for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
        if num < 1024.0:
            return "%3.1f %s" % (num, x)
        num /= 1024.0

file_info = os.stat(osm_test_file)
print convert_bytes(file_info.st_size)

def count_tags(filename):
    # YOUR CODE HERE
    p_evt = ('start',)
    tags = {}
    for _, elem in ET.iterparse(filename, events=p_evt):
        my_tag = elem.tag
        if my_tag in tags:
            tags[my_tag] += 1
        else:
            tags[my_tag] = 1
    return tags

print count_tags(osm_test_file)

65.9 MB
{'node': 282039, 'nd': 359303, 'bounds': 1, 'member': 25166, 'tag': 241467, 'osm': 1, 'way': 49541, 'relation': 2094}


In [3]:
#module to audit street names
st_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
st_types = set() #defaultdict(set) ???

def is_st_name(elem):
    return (elem.attrib['k'].rstrip() == 'addr:street')

def audit_st_type(street_types, street_name):
    m = st_type_re.search(street_name)
    if m:
        this_st_type = m.group()
        st_types.add(this_st_type)

def list_st_types(filename):
    p_evt = ('start',)
    n = 0
    for _, elem in ET.iterparse(filename, events=p_evt):
        if elem.tag == 'way':
            for tag in elem.iter('tag'):
                if is_st_name(tag):
                    audit_st_type(st_types, tag.attrib['v'])

list_st_types(osm_test_file)
print st_types

#We can see that there are a few inconsistencies in naming which we should clean

set(['Fitzroy', 'Rd', 'Way', 'Highway', 'Road', 'Jones', 'Lane', 'Drive', 'St', 'Place', 'Circuit', 'Gardens', 'South', 'Square', 'Parade', 'Point', 'Esplanade', 'Boulevarde', 'Street', 'Crescent', 'Broadway', 'Avenue'])


In [13]:
# audit postcodes - a field not covered in the case study
postcode_re = re.compile(r'^\d{4}$')
postcode_flds = set()
postcode_bad = set()

# according to the osm documentation, there are a few postcode fields:
# addr:postcode, boundary=postal_code

def is_postcode(elem):
    return (elem.attrib['k'].rstrip() == 'addr:postcode')

def audit_postcode_fld(elem):
    if elem.attrib['k'].find('code') != -1:
        postcode_flds.add(elem.attrib['k'])

postcode_types = defaultdict(set)
def audit_postcodes(filename):
    p_evt = ('start',)
    n = 0
    for _, elem in ET.iterparse(filename, events=p_evt):
        for tag in elem.iter('tag'):
            if is_postcode(tag):
                pc = postcode_re.search(tag.attrib['v'])
                if not pc:
                    postcode_types[tag.attrib['k']].add(tag.attrib['v'])
            else:
                if tag.attrib['k'].find('code') != -1:
                    postcode_types[tag.attrib['k']].add(tag.attrib['v'])

audit_postcodes(osm_test_file)
pprint.pprint(dict(postcode_types))

{'addr:postcode': set(['200',
                       '210',
                       'NSW 1460',
                       'NSW 2000',
                       'NSW 2010',
                       'NSW 2011',
                       'NSW 2015',
                       'NSW 2021',
                       'NSW 2022',
                       'NSW 2026',
                       'NSW 2034',
                       'NSW 2037']),
 'is_in:country_code': set(['AU']),
 'is_in:state_code': set(['NSW']),
 'postal_code': set(['1465;2033',
                     '2000',
                     '2005',
                     '2007',
                     '2008',
                     '2009',
                     '2010',
                     '2011',
                     '2015',
                     '2016',
                     '2017',
                     '2018',
                     '2019',
                     '2020',
                     '2021',
                     '2022',
                     '2023',
                   

Postcodes in NSW should all 4 numbers - here we can see that there are inconsistencies in our data with some postcodes having been prefixed with NSW, these will need to be cleaned. 

Some entries also seem to have postcodes stored under the field 'postal_code'. All but one of these entries (namely the one we have identified as '1465;2033') appear to be valide postcodes. For consistency, we will move these to 'addr:postcode'.

We now compare any elements where both are present and see if there are any differences.

In [14]:
def compare_postcodes(filename):
    p_evt = ('start',)
    n = 0
    test_flds = set(['addr:postcode', 'postal_code'])
    for _, elem in ET.iterparse(filename, events=p_evt):
        #setup test variables for each tag
        addr_postcode = False
        postal_code = False
        postcodes = {}
        for tag in elem.iter('tag'):
            tag_k = tag.attrib['k'].rstrip()
            if tag_k == 'addr:postcode':
                addr_postcode = True
                postcodes['addr:postcode'] = tag.attrib['v']
            elif tag_k == 'postal_code':
                postal_code = True
                postcodes['postalcode'] = tag.attrib['v']
        if (addr_postcode and postal_code):
            print postcodes

compare_postcodes(osm_test_file)

We can see here that there are no elements which contain both the 'addr:postcode' and 'postal_code' fields.

In [15]:
# testing tag types
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
lower_two_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

def test_tag_type(elem):
    tag_type = elem.attrib['k']
    if lower.match(tag_type) != None:
        keys['lower'] += 1
    elif lower_colon.match(tag_type) != None:
        keys['lower_colon'] += 1
    elif lower_two_colon.match(tag_type) != None:
        keys['lower_two_colon'] += 1
        two_colon_set.add(tag_type)
    elif problemchars.match(tag_type) != None:
        keys['problemchars'] += 1
    else:
        keys['other'] += 1
        others_set.add(tag_type)

keys = {
        'lower': 0,
        'lower_colon': 0,
        'lower_two_colon': 0,
        'problemchars': 0,
        'other': 0
        }
two_colon_set = set()
others_set = set()

def test_tag_types(filename):
    p_evt = ('start',)
    n = 0
    test_flds = set(['addr:postcode', 'postal_code'])
    for _, elem in ET.iterparse(filename, events=p_evt):
        if 'k'  in elem.attrib:
            test_tag_type(elem)

test_tag_types(osm_test_file)
print keys
print two_colon_set
print others_set

{'problemchars': 0, 'lower': 200191, 'other': 640, 'lower_colon': 38671, 'lower_two_colon': 1965}
set(['seamark:beacon_lateral:system', 'service:bicycle:velocio', 'lane:psv:conditional', 'access:backward:conditional', 'seamark:fog_signal:group', 'turn:lanes:forward', 'bicycle:lanes:backward', 'service:bicycle:rental', 'cycleway:right:lane', 'lanes:psv:bicycle', 'service:bicycle:orbea', 'service:bicycle:pinarello', 'service:bicycle:cleaning', 'scuba_diving:type:biology', 'cycleway:forward:lane', 'seamark:beacon_cardinal:colour_pattern', 'cycleway:left:lane', 'lane:psv:bicycle', 'bus:lanes:backward', 'seamark:light:sequence', 'sidewalk:left:surface', 'seamark:light:reference', 'seamark:beacon_cardinal:category', 'scuba_diving:type:training', 'width:lanes:forward', 'seamark:light:character', 'scuba_diving:type:sand', 'seamark:harbour:category', 'source:name:date', 'seamark:light:period', 'cycleway:lane:width', 'service:bicycle:second_hand', 'motorcycle:lanes:backward', 'building:source:le

The tag types seem to be quite well maintained - we do not see any problem chars and there do not appear to be any two colon fields relevant to what we are attempting to extract. We note there may be on problematic other type - namely: 'addr:city_1'. 

## In Depth Exploration of Issues
We know take a slightly deeper look at some of the problems in our data, namely we will print out a some of the problem elements in our data.

### Fixing problem postcodes
The first issue will look at are the postcodes which have problems other than being append with the state prefix - i.e. if "NSW 2000" we know that the problem is fairly simple - namely we will strip out the prefix. 

However we will look at the following issues identified above: 
* 3 digit poscodes
* '1465;2033' case

In [28]:
# Printing out problem postcode elements
# We know from above that postcodes are stored in two fields 
postcode_tags = ["addr:postcode", "postal_code"]


def examine_postcodes(filename, test_flds):
    p_evt = ("start",)
    for _, elem in ET.iterparse(filename, events=p_evt):
        print_elem = False
        for tag in elem.iter("tag"):
            if tag.attrib["k"] in test_flds:
                # If in fields test value
                pc_value = tag.attrib["v"]
                if not postcode_re.search(pc_value):
                    # Also exclude where starts with state and ends with 4 digits
                    if not ((pc_value[:3] == "NSW") and (postcode_re.search(pc_value[-4:]) is not None)):
                        print_elem = True
        if print_elem:
            #print BeautifulSoup(elem.iter("tag"), "xml").prettify()
            for tag in elem.iter("tag"):
                print ET.tostring(tag)
            print "---"

examine_postcodes(osm_test_file, postcode_tags)

<tag k="name" v="Big Boy Thai" />
		
<tag k="amenity" v="restaurant" />
		
<tag k="cuisine" v="thai" />
		
<tag k="addr:city" v="Darlinghurst" />
		
<tag k="addr:street" v="Stanley Street" />
		
<tag k="addr:postcode" v="210" />
		
<tag k="addr:housenumber" v="82" />
	
---
<tag k="addr:postcode" v="210" />
		
---
<tag k="name" v="Kensington POSTshop" />
		
<tag k="amenity" v="post_office" />
		
<tag k="building" v="yes" />
		
<tag k="addr:street" v="Anzac Parade" />
		
<tag k="postal_code" v="1465;2033" />
		
<tag k="addr:housenumber" v="168,170" />
	
---
<tag k="postal_code" v="1465;2033" />
		
---
<tag k="name" v="Queen Victoria Building" />
		
<tag k="shop" v="mall" />
		
<tag k="name:ko" v="&#53304; &#48709;&#53664;&#47532;&#50500; &#48716;&#46377;" />
		
<tag k="name:zh" v="&#32500;&#22810;&#21033;&#20122;&#22899;&#29579;&#22823;&#21414;" />
		
<tag k="tourism" v="attraction" />
		
<tag k="building" v="retail" />
		
<tag k="wikidata" v="Q54518" />
		
<tag k="wikipedia" v="en:Queen

Here we can see the non-obvious errors are a small enough sample for us to correct manually. And we can explain them:
* The first error is a mistake - the postcode for Darlinghurst is 2010 - not 210
* The second one is a post office - 2033 is the postcode for the suburb, while 1465 is the postcode for PO Boxes located there. Since we are interested in the location and suburb, we will amend this to 2033
* The last one is also another mistake, the Queen Victoria Building is a popular building located in Sydney - which has postcode 2000
These should be relatively simple fixes but which will need to account for in our fix postcode function.

### Examining Street Problems
From above we have collected all the endings in "addr:street" field. Of the ones that are not good street names, we can form them into two groups:
* Abbreviations - these we will form a dictionary to correct
* Miscellaneous unrecognised forms - we will print these out to get a better handle on these issues

Full list from above:
set(['Fitzroy', 'Rd', 'Way', 'Highway', 'Road', 'Jones', 'Lane', 'Drive', 'St', 'Place', 'Circuit', 'Gardens', 'South', 'Square', 'Parade', 'Point', 'Esplanade', 'Boulevarde', 'Street', 'Crescent', 'Broadway', 'Avenue'])

In [36]:
# Set of good names:
valid_st_types = set(["Way",
                     "Highway",
                     "Road",
                     "Lane",
                     "Drive",
                     "Place",
                     "Circuit",
                     "Square",
                     "Parade",
                     "Point",
                     "Esplanade",
                     "Boulevarde",
                     "Street",
                     "Crescent",
                     "Broadway",
                     "Avenue"])

# Dictionary to unify abbreviations
abbr_dict = {"Rd": "Road",
            "St": "Street"}

# Print out things that don't conform
def examine_street(filename):
    p_evt = ("start",)
    # Create a set of good street names and solved problems
    solved_set = valid_st_types
    solved_set.update(set(abbr_dict.keys()))
    
    for _, elem in ET.iterparse(filename, events=p_evt):
        print_elem = False
        for tag in elem.iter("tag"):
            if elem.tag == "way":
                if tag.attrib["k"] == "addr:street":
                    # If in fields test value
                    st_value = tag.attrib["v"]
                    m = st_type_re.search(st_value)
                    st_end = m.group()
                    if not st_end in solved_set:
                        #print tag.attrib["v"]
                        print_elem = True
        if print_elem:
            for tag in elem.iter("tag"):
                print ET.tostring(tag)
            print "---"

examine_street(osm_test_file)

<tag k="layer" v="1" />
		
<tag k="source" v="nearmap" />
		
<tag k="building" v="yes" />
		
<tag k="addr:city" v="Sydney" />
		
<tag k="addr:state" v="New South Wales" />
		
<tag k="addr:street" v="Bardsley Gardens" />
		
<tag k="addr:suburb" v="Cammeray" />
		
<tag k="addr:country" v="AU" />
		
<tag k="addr:postcode" v="2062" />
		
<tag k="addr:housenumber" v="1" />
	
---
<tag k="layer" v="1" />
		
<tag k="source" v="nearmap" />
		
<tag k="building" v="yes" />
		
<tag k="addr:city" v="Sydney" />
		
<tag k="addr:state" v="New South Wales" />
		
<tag k="addr:street" v="Bardsley Gardens" />
		
<tag k="addr:suburb" v="Cammeray" />
		
<tag k="addr:country" v="AU" />
		
<tag k="addr:postcode" v="2062" />
		
<tag k="addr:housenumber" v="2" />
	
---
<tag k="layer" v="1" />
		
<tag k="source" v="nearmap" />
		
<tag k="building" v="yes" />
		
<tag k="addr:city" v="Sydney" />
		
<tag k="addr:state" v="New South Wales" />
		
<tag k="addr:street" v="Bardsley Gardens" />
		
<tag k="addr:suburb" v=

AttributeError: 'NoneType' object has no attribute 'group'

We manually check the entries and resolve them the following way:
* Bardsley Gardens and Roslyn Gardens are valid street names - in fact I have learnt something new here in that Gardens is a valid street ending.
* Fitroy is a data entry problem - the proper name is "Fitzroy Street", an easy correction.
* The last one is slightly tricky - the official street name does turn out to be "Alfred Street South". We so the entry is correctly entered. If we were taking statistics on street types and that was important, we may consider changing it to "Alfred (South) Street" or something similar. However I have decided to leave it be as it stands.

## Functions to fix the identified gaps in our data
Our goals are to:
* Ensure that all the street names are uniform

In [None]:
# set(['Court', 'Place', 'Way', 'Revesby', 'Walk', 'Highway', 'Ogilve', 
#'Promenade', 'Corination', 'Lane', 'Drive', 'St', 'Hilma', 'Circuit', 'Road', 
#'Square', 'Parade', 'Point', 'st', 'Street', 'Crescent', 'Ave', 'Avenue'])

# From the sample above create a set of good street styles to check against.
good_st_types = set(["Court",
                    "Place",
                    "Way",
                    "Walk",
                    "Highway",
                    "Promenade",
                    "Corination",
                    "Lane",
                    "Drive",
                    "Street",
                    "Circuit",
                    "Road",
                    "Square",
                    "Parade",
                    "Point",
                    "Crescent",
                    "Avenue"
                    ])
fix_st_dict = {"St": "Street",
              "st": "Street",
              "Ave": "Avenue",
              "ave": "Avenue"
              }