# OpenStreetMap Wrangling Case Study

In [6]:
# %load python/1_count_tags.py
#!/usr/bin/env python
"""
This file counts how many of each tag is present in the given XML file.
"""
import xml.etree.cElementTree as ET
import pprint

from collections import defaultdict
from util import logging_itr

def count_tags(filename):
    '''
    Tallies the element names found directly under the root of an XML file.

    The whole document is parsed into memory first. Only the root's immediate
    children are visited; elements nested deeper are not counted.

    Returns a plain dict mapping each tag name to its number of occurrences.
    '''
    counts = {}
    for element in ET.parse(filename).getroot():
        counts[element.tag] = counts.get(element.tag, 0) + 1
    return counts

if __name__ == "__main__":
    # Script entry point: tally the top-level tags of data/mountainview.osm
    # and pretty-print the resulting dict.
    tags = count_tags('data/mountainview.osm')
    pprint.pprint(tags)
    

{'meta': 1, 'node': 411790, 'note': 1, 'relation': 445, 'way': 56355}


In [10]:
# %load python/2_count_key_types.py
#!/usr/bin/env python
import xml.etree.cElementTree as ET
import pprint

from collections import Counter
from util import logging_itr, lower, alpha, word_plus_colon, lower_colon, problemchars

"""
This file verifies whether any keys will be problematic by classifying them into
various types and printing out the counts of each type.
"""



def key_type(element):
    '''
    Classifies the key ('k' attribute) of a <tag> element.

    The key is tested against the module's patterns in a fixed order and the
    label of the first pattern that matches is returned: 'lower',
    'lower_colon', 'alpha_with_upper', 'word_plus_colon' or 'problem_chars'.
    If nothing matches, a notice is printed and 'other' is returned.

    Returns None for any element that is not a <tag>. Raises KeyError if a
    <tag> element has no 'k' attribute.
    '''
    if element.tag != "tag":
        return None

    key = element.attrib['k']

    # Order matters: the first matching pattern decides the label.
    ordered_patterns = (
        (lower, 'lower'),
        (lower_colon, 'lower_colon'),
        (alpha, 'alpha_with_upper'),
        (word_plus_colon, 'word_plus_colon'),
        (problemchars, 'problem_chars'),
    )
    for pattern, label in ordered_patterns:
        if pattern.search(key):
            return label

    # Single-argument call form: prints the same text on Python 2 and also
    # parses on Python 3.
    print('Unidentified type for key "{}"'.format(key))
    return 'other'

def process_map(filename):
    '''
    Streams through the given input file and tallies, per key type, how many
    elements key_type assigned to that type. Elements for which key_type
    returns None are skipped.

    Returns a Counter mapping type label to count.
    '''
    type_counts = Counter()
    parsed = logging_itr(ET.iterparse(filename))
    for _event, elem in parsed:
        label = key_type(elem)
        if label is None:
            continue
        type_counts[label] += 1

    return type_counts

if __name__ == "__main__":
    # Script entry point: classify every tag key in data/mountainview.osm and
    # print the (type, count) pairs, most frequent first.
    keys = process_map('data/mountainview.osm')
    pprint.pprint(keys.most_common())


Finished 500000 items
Finished 1000000 items
[('lower_colon', 128696),
 ('lower', 126270),
 ('word_plus_colon', 2791),
 ('alpha_with_upper', 458),
 ('problem_chars', 25)]


In [12]:
# %load python/3_find_users.py
#!/usr/bin/env python
import xml.etree.cElementTree as ET
import pprint
import re

from util import logging_itr

"""
This file finds the number of unique users who have contributed to this map.
"""

def get_user(element):
    '''
    Looks up the 'uid' attribute of the given element.

    Returns the attribute value, or None when the element has no 'uid'.
    '''
    try:
        return element.attrib['uid']
    except KeyError:
        return None


def process_map(filename):
    '''
    Streams through the given XML file and gathers every distinct 'uid'
    attribute value seen on any element.

    Returns a set of uid strings; elements without a 'uid' contribute nothing.
    '''
    return {
        elem.attrib['uid']
        for _event, elem in ET.iterparse(filename)
        if 'uid' in elem.attrib
    }


if __name__ == "__main__":
    # Script entry point: report how many distinct uids occur in
    # data/mountainview.osm.
    users = process_map('data/mountainview.osm')
    # Single-argument call form: prints the same text on Python 2 and also
    # parses on Python 3.
    print('{} users contributed'.format(len(users)))


0 users contributed


In [16]:
# %load python/4_audit_street_names.py
"""
This file audits the OSMFILE by checking whether there are any unexpected street
suffixes, or whether there are any street names which would change upon
normalization. In both cases, it prints examples of any possibly messy street
names.
"""
import xml.etree.cElementTree as ET
import regex
import pprint

from util import defaultdict, logging_itr, split_street, normalize_name, street_type_re

OSMFILE = "data/mountainview.osm"

expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road",
            "Trail", "Parkway", "Commons"]



def get_street_type(street_name):
    '''
    Returns the type (suffix) part of the given street name, i.e. the second
    item of the pair produced by split_street, or None when split_street
    reports no type.
    '''
    _name_part, suffix = split_street(street_name)
    return suffix

def get_street_name(elem):
    '''
    Extracts the street name from a tag element.

    Returns the element's 'v' attribute when its 'k' attribute is exactly
    'addr:street'; returns None for every other element.
    '''
    attributes = elem.attrib
    if attributes.get('k') != 'addr:street':
        return None
    return attributes['v']


def get_street_name_and_type(elem):
    '''
    Finds the first "tag" element under elem that carries a street name and
    returns the tuple (name, type), where type is whatever get_street_type
    yields for that name (possibly None). When no tag specifies a street
    name, returns (None, None).
    '''
    for tag in elem.iter("tag"):
        name = get_street_name(tag)
        if name is None:
            continue
        # Only the first street-name tag is considered.
        return name, get_street_type(name)

    return None, None

def audit(osmfile):
    '''
    Audits the street names of every node and way in the given file.

    Returns a tuple (street_types, unnormalized_street_names):
    street_types maps each street type that is not listed in `expected` to
    the set of street names seen with that type, and
    unnormalized_street_names is the set of street names that differ from
    their normalize_name form.
    '''
    odd_types = defaultdict(set)
    messy_names = set()

    for _event, element in logging_itr(ET.iterparse(osmfile)):
        if element.tag in ("node", "way"):
            name, suffix = get_street_name_and_type(element)

            # Street type present but not one of the expected suffixes
            if not (suffix is None or suffix in expected):
                odd_types[suffix].add(name)

            # Street name that normalization would alter
            if name is not None:
                if name != normalize_name(name):
                    messy_names.add(name)

        # Release processed elements. 'tag' elements are left intact so that
        # the enclosing node/way can still read them when it is handled.
        if element.tag != 'tag':
            element.clear()

    return odd_types, messy_names

if __name__ == '__main__':
    # Script entry point: audit OSMFILE, then pretty-print the unexpected
    # street types followed by the set of unnormalized street names.
    st_types, unnorm_sts = audit(OSMFILE)
    pprint.pprint(st_types)
    pprint.pprint(unnorm_sts)


Finished 500000 items
Finished 1000000 items
{'#114': set(['West Evelyn Avenue Suite #114']),
 '#C': set(['Plymouth Street #C']),
 '2': set(['Showers Drive STE 2']),
 '7': set(['Showers Drive STE 7']),
 'AA': set(['Showers Drive BLDG AA']),
 'Ave': set(['California Ave',
             'El Monte Ave',
             'N Blaney Ave',
             'Portage Ave',
             'S California Ave']),
 'Calle': set(['La Calle']),
 'Central': set(['Plaza Central']),
 'Circle': set(['Bobolink Circle',
                'Carlson Circle',
                'Continental Circle',
                'Distel Circle',
                'Duluth Circle',
                'East Meadow Circle',
                'Los Palos Circle',
                'Redwood Circle',
                'Rincon Circle',
                'Roosevelt Circle',
                'San Antonio Circle',
                'Starr King Circle',
                'Van Buren Circle']),
 'Corte': set(['Bella Corte']),
 'Ct': set(['Stierlin Ct']),
 'Ct.': set(['Stie

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json


# Matches a string consisting only of lowercase ASCII letters and underscores
# (the empty string matches too).
lower = re.compile(r'^([a-z]|_)*$')
# Matches two such lowercase/underscore runs separated by exactly one colon.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Matches any single character from the listed set of punctuation and
# whitespace characters, anywhere in the string.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# NOTE(review): presumably the element attributes to be grouped under a
# "created" entry; nothing in the visible code uses this list yet -- confirm.
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]


def shape_element(element):
    '''
    Template stub: returns a new, still empty dict for "node" and "way"
    elements and None for every other element.
    '''
    if element.tag not in ("node", "way"):
        return None

    shaped = {}
    # YOUR CODE HERE

    return shaped


def process_map(file_in, pretty = False):
    '''
    Unfinished template: the steps listed in the comments below are not
    implemented.

    NOTE(review): the body only returns `data`, which is not assigned anywhere
    in the visible part of this cell, so a call raises NameError unless a
    global `data` is defined elsewhere. Neither `file_in` nor `pretty` is
    used yet.
    '''
    # You do not need to change this file
    #Read in XML data
    #Update XML data
    #Write XML data
    return data

if __name__ == "__main__":
    # NOTE(review): no `test` function is defined in the visible part of this
    # cell; confirm it exists elsewhere before running this as a script.
    test()