In [4]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
import operator
from collections import defaultdict

# Path of the OpenStreetMap XML extract that test() analyses.
OSMFILE = "albuquerque.osm"
# Trailing run of non-whitespace characters in a street name, i.e. its last
# token such as "St.", "Ave" or "NE". Used to pick out the street type.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
# Trailing two-letter compass direction (e.g. "NE", "sw."). Not referenced by
# any function visible in this section.
street_dir_re = re.compile(r'\b[NESW][NESW]\.?$', re.IGNORECASE)
# Tag-key classifiers used by key_type() and shape_element():
#   lower           - only lowercase letters and underscores
#   lower_colon     - two such runs separated by exactly one colon
#   addr_two_colons - "addr:" followed by two colon-separated runs
#   addr_one_colon  - "addr:" followed by a single run
#   problemchars    - contains any character from the listed punctuation/whitespace set
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
addr_two_colons = re.compile(r'^addr:([a-z]|_)*:([a-z]|_)*$')
addr_one_colon = re.compile(r'^addr:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Street-name endings that are already in canonical form; the audit ignores them.
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons", "Circle", "Northeast", "Northwest", "Southeast", "Southwest"]
# Element attributes that shape_element() groups under the 'created' sub-dict.
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]
# Abbreviated or misspelled street-name ending -> canonical replacement.
# Lookups are plain dict lookups, so keys are case-sensitive and each spelling
# variant needs its own entry.
mapping = { "St": "Street",
            "St.": "Street",
            "ST": "Street",
            "Ave": "Avenue",
            "Ave.": "Avenue",
            "AVE": "Avenue",
            "Rd": "Road",
            "Rd.": "Road",
            "RD": "Road",
            "Blvd": "Boulevard",
            "Blvd.": "Boulevard",
            "BLVD": "Boulevard",
            "Dr": "Drive",
            "Dr.": "Drive",
            "DR": "Drive",
            "Ct": "Court",
            "Ct.": "Court",
            "CT": "Court",
            "Pl": "Place",
            "Pl.": "Place",
            "PL": "Place",
            "Sq": "Square",
            "Sq.": "Square",
            "SQ": "Square",
            "Ln": "Lane",
            "Ln.": "Lane",
            "LN": "Lane",
            "Tr": "Trail",
            "Tr.": "Trail",
            "TR": "Trail",
            "Pkwy": "Parkway",
            "Pkwy.": "Parkway",
            "PKWY": "Parkway",
            "Cmns": "Commons",
            "Cmns.": "Commons",
            "CMNS": "Commons",
            "Cir": "Circle",
            "Cir.": "Circle",
            "CIR": "Circle",
           # Compass-direction suffixes.
           "NE": "Northeast",
           "NE.": "Northeast",
           "NW": "Northwest",
           # Presumably a misspelling observed in the data -- TODO confirm.
           "Norhteast": "Northeast",
           "SE": "Southeast",
           "SW": "Southwest",
           "SouthWest": "Southwest"
            }

#count_tags is used to count all of the different types of tags in the data set--------------------------

def count_tags(filename):
    """Tally how often each element tag name occurs in an XML document.

    Args:
        filename: source handed unchanged to ``ET.iterparse``.

    Returns:
        A plain dict mapping each tag name to its number of occurrences.
    """
    tally = {}
    for _event, node in ET.iterparse(filename):
        name = node.tag
        # First sighting starts from zero, later ones add to the running total.
        tally[name] = tally.get(name, 0) + 1
    return tally

#key_type and key_map functions are used to classify & count tag keys by their text characteristics-----

def key_type(element, keys):
    """Classify the ``k`` attribute of a ``tag`` element and bump its counter.

    Args:
        element: a parsed XML element; anything other than a ``tag``
            element leaves the counters untouched.
        keys: dict with integer counters under 'lower', 'lower_colon',
            'problemchars' and 'other'. It is updated in place.

    Returns:
        The same ``keys`` dict that was passed in.
    """
    if element.tag != "tag":
        return keys

    # Tried in this order; the first pattern that matches decides the bucket.
    classifiers = (
        ('lower', lower),
        ('lower_colon', lower_colon),
        ('problemchars', problemchars),
    )
    for tag in element.iter("tag"):
        key = tag.attrib['k']
        bucket = 'other'
        for label, pattern in classifiers:
            if pattern.search(key):
                bucket = label
                break
        keys[bucket] += 1
    return keys

def key_map(filename):
    """Count the tag keys of an XML file by the category key_type assigns.

    Args:
        filename: source handed unchanged to ``ET.iterparse``.

    Returns:
        Dict with the counters 'lower', 'lower_colon', 'problemchars'
        and 'other'.
    """
    counts = dict.fromkeys(("lower", "lower_colon", "problemchars", "other"), 0)
    for _event, elem in ET.iterparse(filename):
        counts = key_type(elem, counts)
    return counts

#user_contributions function counts the number of times each username is cited-------------------------

def user_contributions(filename):
    """Count how many elements each user name is attached to.

    Only elements that carry both a ``uid`` and a ``user`` attribute are
    counted. An element with a ``uid`` but no ``user`` is skipped instead
    of raising ``KeyError``.

    Args:
        filename: source handed unchanged to ``ET.iterparse``.

    Returns:
        A plain dict mapping user name to the number of elements found.
    """
    users = {}
    for _, element in ET.iterparse(filename):
        attrs = element.attrib
        # Check the attribute that is actually read ('user') as well as the
        # 'uid' marker, so a uid-only element cannot blow up the lookup.
        if 'uid' in attrs and 'user' in attrs:
            name = attrs['user']
            users[name] = users.get(name, 0) + 1
    return users

#audit_street_type audits the street name against the list of expected values--------------------------

def audit_street_type(street_types, street_name):
    """Record ``street_name`` under its ending when that ending is unexpected.

    Args:
        street_types: mapping from street-name ending to a set of full
            street names; it must create the set on first access (the
            caller passes a ``defaultdict(set)``). Updated in place.
        street_name: the street name to inspect.

    Names whose ending is listed in ``expected``, and names in which
    ``street_type_re`` finds nothing, are ignored.
    """
    found = street_type_re.search(street_name)
    if found is None:
        return
    ending = found.group()
    if ending in expected:
        return
    street_types[ending].add(street_name)

#is_street_name reports whether an element is a street name or not-------------------------------------
            
def is_street_name(elem):
    """Tell whether a ``tag`` element describes a street address.

    Returns True when the element's ``k`` attribute is exactly
    "addr:street", otherwise False. Raises KeyError when the element has
    no ``k`` attribute.
    """
    key = elem.attrib['k']
    return key == "addr:street"

#audit uses the audit_street_type function to audit the street names in the data against a list of expected values-----

def audit(osmfile):
    """Collect street names whose ending is not in the expected list.

    Every ``addr:street`` tag found under a ``node`` or ``way`` element is
    passed to ``audit_street_type``.

    Args:
        osmfile: path of the OSM XML file to read.

    Returns:
        ``defaultdict(set)`` mapping each unexpected street-name ending to
        the set of full street names that use it.
    """
    street_types = defaultdict(set)
    # Binary mode lets the XML parser apply the encoding declared in the
    # document; the with-block closes the file even if parsing fails.
    with open(osmfile, "rb") as osm_file:
        # Elements are inspected on "end" events: at "start" only the
        # opening tag is guaranteed to have been read, so child <tag>
        # elements may not be attached yet and would be silently missed.
        for _event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter('tag'):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
    return street_types


def update_name(name, mapping):
    """Expand an abbreviated street-name ending using ``mapping``.

    Only the trailing token located by ``street_type_re`` is rewritten;
    identical text earlier in the name is left alone (so "St Stephens St"
    becomes "St Stephens Street").

    Args:
        name: the street name to clean.
        mapping: dict from abbreviated ending to its replacement.

    Returns:
        The cleaned name, or ``name`` unchanged when no ending is found
        or the ending has no entry in ``mapping``.
    """
    match = street_type_re.search(name)
    if match is None:
        return name
    better_type = mapping.get(match.group())
    if better_type is None:
        return name
    # Splice by position instead of str.replace so that only the matched
    # ending changes, never an earlier occurrence of the same characters.
    return name[:match.start()] + better_type + name[match.end():]

#shape_element function converts elements to JSON format-----------------------------------------------

def _expand_street_suffix(street):
    """Return ``street`` with a known abbreviated ending expanded, else unchanged."""
    match = street_type_re.search(street)
    if match is None:
        # Empty values or values ending in whitespace have no ending to fix.
        return street
    suffix = match.group()
    if suffix not in expected and suffix in mapping:
        return update_name(street, mapping)
    return street


def shape_element(element):
    """Convert a ``node`` or ``way`` element into a JSON-serialisable dict.

    Layout of the result:
      * 'type'      - the element tag ("node" or "way")
      * 'created'   - sub-dict of the attributes listed in ``CREATED``
      * 'pos'       - [lat, lon] as floats, present when the element has a
                      ``lat`` or ``lon`` attribute
      * 'node_refs' - list of ``ref`` values of child ``nd`` elements
      * 'address'   - sub-dict built from ``addr:<field>`` tags; the street
                      value has known abbreviated endings expanded
      * every other attribute and tag is stored under its own name

    Empty 'created', 'node_refs' and 'address' entries are removed.
    Tags whose key contains a problem character, and ``addr:`` keys with a
    second colon, are dropped.

    NOTE: ordinary tags are written straight into the top-level dict, so a
    tag whose key equals an existing field (for example "type") replaces
    that field.

    Args:
        element: a parsed XML element.

    Returns:
        The dict described above, or None for any other element type.
    """
    if element.tag != "node" and element.tag != "way":
        return None

    node = {}
    node['created'] = {}
    node['pos'] = [0, 0]
    node['node_refs'] = []
    node['address'] = {}
    node['type'] = element.tag
    # Tracks whether a coordinate attribute was seen, so that a genuine
    # position of (0, 0) is not mistaken for the placeholder and deleted.
    has_pos = False

    for attr_name, attr_value in element.attrib.items():
        if attr_name in CREATED:
            node['created'][attr_name] = attr_value
        elif attr_name == "lat":
            node['pos'][0] = float(attr_value)
            has_pos = True
        elif attr_name == "lon":
            node['pos'][1] = float(attr_value)
            has_pos = True
        else:
            # Store the whole value (id, visible and any other attribute),
            # not just its first character.
            node[attr_name] = attr_value

    for child in element.iter():
        if child.tag == "tag":
            key = child.attrib['k']
            if problemchars.search(key):
                continue
            if addr_one_colon.search(key):
                value = child.attrib['v']
                if key == "addr:street":
                    value = _expand_street_suffix(value)
                # Strip the leading "addr:" to get the address field name.
                node['address'][key[5:]] = value
            elif addr_two_colons.search(key):
                continue
            else:
                node[key] = child.attrib['v']
        elif child.tag == "nd":
            node["node_refs"].append(child.attrib['ref'])

    if node['address'] == {}:
        del node['address']
    if node['created'] == {}:
        del node['created']
    if not has_pos and node['pos'] == [0, 0]:
        del node['pos']
    if node['node_refs'] == []:
        del node['node_refs']
    return node


def process_map(file_in, pretty=False):
    """Shape every element of an OSM file and write the results as JSON lines.

    The output goes to ``<file_in>.json``, one JSON document per shaped
    element, each followed by a newline.

    Args:
        file_in: path of the OSM XML file; also the stem of the output name.
        pretty: when true, each document is indented by two spaces.

    Returns:
        List of all shaped elements, in the order they were written.
    """
    out_path = "{0}.json".format(file_in)
    # indent=None is json.dumps' default, i.e. the compact form.
    indent = 2 if pretty else None
    shaped = []
    with codecs.open(out_path, "w") as sink:
        for _event, elem in ET.iterparse(file_in):
            record = shape_element(elem)
            if not record:
                continue
            shaped.append(record)
            sink.write(json.dumps(record, indent=indent) + "\n")
    return shaped

def test():
    """Print a summary report for ``OSMFILE``.

    The report has three parts: element tag counts (most frequent first),
    the ten user names attached to the most elements, and the street names
    whose ending is not in ``expected``, grouped by ending.
    """
    # NOTE: if you are running this code on your computer, with a larger dataset, 
    # call the process_map procedure with pretty=False. The pretty=True option adds 
    # additional spaces to the output, making it significantly larger.
    tags = count_tags(OSMFILE)
    sorted_tags = sorted(tags.items(), key=operator.itemgetter(1), reverse=True)
    users = user_contributions(OSMFILE)
    sorted_users = sorted(users.items(), key=operator.itemgetter(1), reverse=True)
    # Single-argument print(...) produces the same output under the
    # Python 2 print statement and the Python 3 print function.
    print("Tags contained:")
    pprint.pprint(sorted_tags)
    print("Top 10 contributing users:")
    pprint.pprint(sorted_users[:10])
    st_types = audit(OSMFILE)
    pprint.pprint(dict(st_types))

    #data = process_map(OSMFILE, False)
    #pprint.pprint(data)

# Run the report only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    test()

Tags contained:
[('nd', 367230),
 ('node', 292869),
 ('tag', 237428),
 ('way', 44966),
 ('member', 3178),
 ('relation', 644),
 ('bounds', 1),
 ('osm', 1)]
Top 10 contributing users:
[('EdHillsman', 113327),
 ('anjbe', 68935),
 ('jackbus', 30624),
 ('woodpeck_fixbot', 27784),
 ('beweta', 12422),
 ('greenv505', 10223),
 ('polarsleuth', 9296),
 ('Dilys', 4126),
 ('balrog-kun', 3884),
 ('derricknehrenberg', 3071)]
{'11115': set(['11115']),
 '250': set(['4th Street NW Suite 250']),
 '2nd': set(['Gold Between 1st and 2nd']),
 '5th': set(['5th']),
 '87102': set(['1833 8th Street NorthwestAlbuquerque, NM 87102']),
 'A': set(['3301 Menaul Blvd. NE Suite A', 'Juan Tabo NE, Suite A']),
 'Albuquerque': set(['UNM Hospitals, Albuquerque']),
 'Ave': set(['8700 Central Ave']),
 'AvenueSW': set(['Cental AvenueSW']),
 'Basehart': set(['1401 Basehart']),
 'Central': set(['8th and Central']),
 'E-18': set(['Eubank Northeast Ste E-18']),
 'East': set(['Redondo East']),
 'Felipe': set(['San Felipe']),
 'Fre