In [8]:
#_author_jeffrey_t
#clean the OSM data for Sydney Metro Area:
#https://mapzen.com/data/metro-extracts/your-extracts/c98c29b17741

#import required libraries
import xml.etree.cElementTree as ET
import os
import re
from collections import defaultdict
import pprint

#some global refs
#path of the OSM XML extract read by the cells below (relative path, no directory component)
osm_test_file = 'sydney_australia.osm'

In [2]:
#print some basic stats about our file
def convert_bytes(num):
    """
    Convert a size in bytes to a human-readable string such as "65.9 MB".

    adapted from: http://stackoverflow.com/questions/2104080/how-to-check-file-size-in-python

    num: size in bytes (int or float).

    Returns the size formatted with one decimal place followed by its unit.
    Sizes of 1024 TB and above are reported in TB instead of running off the
    end of the unit list (which used to make the function return None).
    """
    units = ['bytes', 'KB', 'MB', 'GB', 'TB']
    for x in units[:-1]:
        if num < 1024.0:
            return "%3.1f %s" % (num, x)
        num /= 1024.0
    # whatever is left is expressed in the largest unit we know about
    return "%3.1f %s" % (num, units[-1])

# report how large the extract is on disk
file_info = os.stat(osm_test_file)
print(convert_bytes(file_info.st_size))

def count_tags(filename):
    """
    Tally how often each element name occurs in an XML file.

    filename: passed straight to ET.iterparse.

    Returns a plain dict mapping element tag name -> number of occurrences.
    """
    counts = {}
    # only the element name is needed, so 'start' events are sufficient
    for _event, node in ET.iterparse(filename, events=('start',)):
        counts[node.tag] = counts.get(node.tag, 0) + 1
    return counts

# element name -> occurrence count for the whole extract
print(count_tags(osm_test_file))

65.9 MB
{'node': 282039, 'nd': 359303, 'bounds': 1, 'member': 25166, 'tag': 241467, 'osm': 1, 'way': 49541, 'relation': 2094}


In [3]:
#module to audit street names
#matches the trailing run of non-whitespace characters of a string (starting at a
#word boundary, optionally ending in '.'); the audit treats this as the street type
st_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
#the distinct street types found by the audit are collected in this set
st_types = set() #a plain set is used: only the types are kept, not the names they came from

def is_st_name(elem):
    """
    Tell whether an element is a street-name tag.

    Returns True when the element's 'k' attribute, with trailing whitespace
    removed, is exactly 'addr:street'.
    """
    key = elem.attrib['k']
    return key.rstrip() == 'addr:street'

def audit_st_type(street_types, street_name):
    """
    Record the street type found at the end of a street name.

    street_types: set collecting the street types seen so far; mutated in place.
    street_name: the street name to inspect.

    Names in which st_type_re finds no match are ignored. Nothing is returned.
    """
    m = st_type_re.search(street_name)
    if m:
        # add to the collection the caller handed in, not to the module-level set
        street_types.add(m.group())

def list_st_types(filename):
    """
    Audit the street type of every addr:street tag attached to a way.

    filename: path of the OSM XML file to scan.

    The street types are accumulated in the module-level st_types set;
    nothing is returned.
    """
    # 'end' events are required here: when the 'start' event of a way is
    # delivered its child <tag> elements may not have been parsed yet, so
    # street names could silently be missed.
    for _, elem in ET.iterparse(filename, events=('end',)):
        if elem.tag == 'way':
            for tag in elem.iter('tag'):
                if is_st_name(tag):
                    audit_st_type(st_types, tag.attrib['v'])

# collect the street types of the extract, then show the distinct values
list_st_types(osm_test_file)
print(st_types)

#We can see that there are a few inconsistencies in naming which we should clean

set(['Fitzroy', 'Rd', 'Way', 'Highway', 'Road', 'Jones', 'Lane', 'Drive', 'St', 'Place', 'Circuit', 'Gardens', 'South', 'Square', 'Parade', 'Point', 'Esplanade', 'Boulevarde', 'Street', 'Crescent', 'Broadway', 'Avenue'])


In [13]:
# audit postcodes - a field not covered in the case study
# a well-formed postcode value: four digits and nothing else
postcode_re = re.compile(r'^\d{4}$')
# filled by audit_postcode_fld with every tag key that contains 'code'
postcode_flds = set()
# NOTE(review): not referenced by any code visible in this part of the notebook - confirm it is still needed
postcode_bad = set()

# according to the osm documentation, there are a few postcode fields:
# addr:postcode, boundary=postal_code

def is_postcode(elem):
    """
    Tell whether an element is an address-postcode tag.

    Returns True when the element's 'k' attribute, with trailing whitespace
    removed, is exactly 'addr:postcode'.
    """
    key = elem.attrib['k']
    return key.rstrip() == 'addr:postcode'

def audit_postcode_fld(elem):
    """
    Remember the key of an element if it looks like a code field.

    The element's 'k' attribute is added to the module-level postcode_flds
    set whenever it contains the text 'code'. Nothing is returned.
    """
    key = elem.attrib['k']
    if 'code' in key:
        postcode_flds.add(key)

# tag key -> set of values collected by audit_postcodes
postcode_types = defaultdict(set)
def audit_postcodes(filename):
    """
    Collect suspicious postcode values and the values of other 'code' fields.

    filename: path of the OSM XML file to scan.

    Fills the module-level postcode_types mapping (tag key -> set of values):
      * for addr:postcode tags, only the values that do not match postcode_re;
      * for any other tag whose key contains 'code', every value.
    Nothing is returned.
    """
    # Every <tag> element gets a 'start' event of its own and its attributes
    # are already available then, so inspecting the element itself visits each
    # tag exactly once instead of re-walking the children of every parent.
    for _, elem in ET.iterparse(filename, events=('start',)):
        if elem.tag != 'tag':
            continue
        key = elem.attrib['k']
        if is_postcode(elem):
            value = elem.attrib['v']
            if not postcode_re.search(value):
                postcode_types[key].add(value)
        elif 'code' in key:
            postcode_types[key].add(elem.attrib['v'])

# run the audit, then pretty-print the collected values as a plain dict
audit_postcodes(osm_test_file)
pprint.pprint(dict(postcode_types))

{'addr:postcode': set(['200',
                       '210',
                       'NSW 1460',
                       'NSW 2000',
                       'NSW 2010',
                       'NSW 2011',
                       'NSW 2015',
                       'NSW 2021',
                       'NSW 2022',
                       'NSW 2026',
                       'NSW 2034',
                       'NSW 2037']),
 'is_in:country_code': set(['AU']),
 'is_in:state_code': set(['NSW']),
 'postal_code': set(['1465;2033',
                     '2000',
                     '2005',
                     '2007',
                     '2008',
                     '2009',
                     '2010',
                     '2011',
                     '2015',
                     '2016',
                     '2017',
                     '2018',
                     '2019',
                     '2020',
                     '2021',
                     '2022',
                     '2023',
                   

Postcodes in NSW should all be 4 digits. Here we can see that there are inconsistencies in our data: some postcodes have been prefixed with 'NSW' and a couple ('200', '210') have only three digits. These will need to be cleaned.

Some entries also seem to have postcodes stored under the field 'postal_code'. All but one of these entries (namely the one we have identified as '1465;2033') appear to be valid postcodes. For consistency, we will move these to 'addr:postcode'.

We now compare any elements where both are present and see if there are any differences.

In [None]:
def compare_postcodes(filename):
    """
    Print the postcodes of every element that carries both postcode fields.

    filename: path of the OSM XML file to scan.

    For each node, way or relation that has an 'addr:postcode' tag as well as
    a 'postal_code' tag, a dict holding both values (keyed by field name) is
    printed so the two can be compared. Nothing is returned.
    """
    test_flds = set(['addr:postcode', 'postal_code'])
    # 'end' events are used because the child <tag> elements are only
    # guaranteed to be present once the parent has been closed.
    for _, elem in ET.iterparse(filename, events=('end',)):
        # Only look at the elements that own tags; the enclosing <osm> root
        # would otherwise mix up the tags of unrelated elements.
        if elem.tag not in ('node', 'way', 'relation'):
            continue
        postcodes = {}
        for tag in elem.findall('tag'):
            tag_k = tag.attrib['k'].rstrip()
            if tag_k in test_flds:
                postcodes[tag_k] = tag.attrib['v']
        if len(postcodes) == len(test_flds):
            print(postcodes)

# prints the postcodes of any element found carrying both fields
compare_postcodes(osm_test_file)

We can see here that there are no elements which contain both the 'addr:postcode' and 'postal_code' fields.

In [None]:
# testing tag types
# key made up solely of lowercase letters and underscores
lower = re.compile(r'^([a-z]|_)*$')
# two such parts separated by a single colon
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# three such parts separated by two colons
lower_two_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*:([a-z]|_)*$')
# a single character from the set treated as problematic in a key (punctuation and whitespace)
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

def test_tag_type(elem):
    tag_type = elem.attrib['k']
    if lower.match(tag_type) != None:
        keys['lower'] += 1
    elif lower_colon.match(tag_type) != None:
        keys['lower_colon'] += 1
    elif lower_two_colon.match(tag_type) != None:
        keys['lower_two_colon'] += 1
        two_colon_set.add(tag_type)
    elif problemchars.match(tag_type) != None:
        keys['problemchars'] += 1
    else:
        keys['other'] += 1
        others_set.add(tag_type)

# running tally of how many tag keys fell into each category (updated by test_tag_type)
keys = {
        'lower': 0,
        'lower_colon': 0,
        'lower_two_colon': 0,
        'problemchars': 0,
        'other': 0
        }
# every key that test_tag_type counted as 'lower_two_colon'
two_colon_set = set()
# every key that test_tag_type counted as 'other'
others_set = set()

def test_tag_types(filename):
    p_evt = ('start',)
    n = 0
    test_flds = set(['addr:postcode', 'postal_code'])
    for _, elem in ET.iterparse(filename, events=p_evt):
        if 'k'  in elem.attrib:
            test_tag_type(elem)

# classify every tag key, then show the tallies and the keys worth a closer look
test_tag_types(osm_test_file)
print(keys)
print(two_colon_set)
print(others_set)

The tag types seem to be quite well maintained - we do not see any problem chars and there do not appear to be any two colon fields relevant to what we are attempting to extract. We note there may be one problematic other type - namely: 'addr:city_1'.

## Functions to fix the identified gaps in our data
Our goals are to:
* Ensure that all the street names are uniform

In [None]:
# set(['Court', 'Place', 'Way', 'Revesby', 'Walk', 'Highway', 'Ogilve', 
#'Promenade', 'Corination', 'Lane', 'Drive', 'St', 'Hilma', 'Circuit', 'Road', 
#'Square', 'Parade', 'Point', 'st', 'Street', 'Crescent', 'Ave', 'Avenue'])

# From the sample above create a set of good street styles to check against.
# street types accepted as already being in their full form
# NOTE(review): 'Corination' looks like (part of) a street name rather than a street type - confirm
good_st_types = {
    "Avenue",
    "Circuit",
    "Corination",
    "Court",
    "Crescent",
    "Drive",
    "Highway",
    "Lane",
    "Parade",
    "Place",
    "Point",
    "Promenade",
    "Road",
    "Square",
    "Street",
    "Walk",
    "Way",
}
# abbreviated street type -> the full form it should be rewritten to
# ('Rd' shows up in the street type audit of this extract, so it is mapped too)
fix_st_dict = {"St": "Street",
              "st": "Street",
              "Ave": "Avenue",
              "ave": "Avenue",
              "Rd": "Road",
              "rd": "Road"
              }