In [22]:
from collections import defaultdict
import pprint
import re
import pandas as pd
import numpy as np

In [1]:
import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

# Source OSM extract that the sampling loop below reads from.
OSM_FILE = "cambridge.osm"  # Replace this with your osm file
# Destination of the reduced sample; opened for binary writing further down.
SAMPLE_FILE = "sample.osm"

# Sampling stride used by the `i % k == 0` check in the writing loop.
k = 10 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Lazily yield each fully parsed element of ``osm_file`` whose tag is in ``tags``.

    The document is parsed incrementally. After the consumer has finished
    with a yielded element, the root is cleared so processed children do not
    accumulate in memory.

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    parse_events = iter(ET.iterparse(osm_file, events=('start', 'end')))
    # The very first event is the 'start' of the document root; keep a handle
    # on that element so it can be emptied as parsing proceeds.
    root = next(parse_events)[1]
    for kind, node in parse_events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()


# Build SAMPLE_FILE: a small <osm> document that contains every k-th
# top-level element of OSM_FILE, in original order.
with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n'.encode('utf-8'))
    output.write('<osm>\n  '.encode('utf-8'))

    for position, top_level in enumerate(get_element(OSM_FILE)):
        # Skip everything except positions 0, k, 2k, ...
        if position % k != 0:
            continue
        output.write(ET.tostring(top_level, encoding='utf-8'))

    output.write('</osm>'.encode('utf-8'))

In [24]:
def count_tags(filename):
    """Tally how many elements of each tag name occur in the XML file ``filename``.

    Returns a ``defaultdict(int)`` mapping tag name -> element count.
    """
    tally = defaultdict(int)
    # iterparse's default is to report only 'end' events, so every element
    # is visited exactly once.
    for _, node in ET.iterparse(filename):
        tally[node.tag] = tally[node.tag] + 1
    return tally

In [25]:
# Tally the element tags in the sample to gauge its size and composition.
count_tags(SAMPLE_FILE)

defaultdict(int,
            {'member': 1476,
             'nd': 107245,
             'node': 87458,
             'osm': 1,
             'relation': 159,
             'tag': 46116,
             'way': 15021})

In [26]:
# Alias for the sampled extract. NOTE(review): the cells shown here pass
# SAMPLE_FILE directly and never read OSMFILE -- confirm whether it is still needed.
OSMFILE = SAMPLE_FILE
# Matches the last whitespace-delimited token of a street name, e.g. "St." in
# "Albion St." -- this token is treated as the street type.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


# Street types that are already in the desired form; names ending in one of
# these are not reported by the audit.
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road",
            "Trail", "Parkway", "Commons","Terrace"]

#initial mapping
mapping = { "St": "Street",
            "St.": "Street",
            "Ave":"Avenue",
            "Rd.":"Road"
            }


# Abbreviations that have a known replacement, in insertion order.
# Built with list() rather than a `for k, v in mapping.items()` loop: that loop
# left `k` bound to a string at notebook level, clobbering the sampling stride
# `k` defined earlier and breaking any re-run of the sampling cell.
mapping_keys = list(mapping)


def audit_street_type(street_types, street_name):
    """Record ``street_name`` under its street type when that type is unexpected.

    street_types: mapping of street type -> set of names; updated in place.
    street_name:  the full street name to inspect.

    Nothing is recorded when the name has no recognisable trailing token or
    when that token is already listed in ``expected``.
    """
    found = street_type_re.search(street_name)
    if not found:
        return
    suffix = found.group()
    if suffix in expected:
        return
    street_types[suffix].add(street_name)


def is_street_name(elem):
    """Return True when ``elem`` carries the key ``addr:street``.

    ``attrib.get`` is used so that an element without a ``k`` attribute is
    reported as "not a street name" instead of raising ``KeyError``.
    """
    return elem.attrib.get('k') == "addr:street"


def audit(osmfile):
    """Collect the unexpected street types found in the OSM file ``osmfile``.

    Every ``tag`` child of a ``node`` or ``way`` element that
    ``is_street_name`` accepts has its ``v`` attribute handed to
    ``audit_street_type``.

    Returns the ``defaultdict(set)`` populated by those calls.
    """
    street_types = defaultdict(set)
    # `with` closes the handle even if parsing raises part-way through.
    with open(osmfile, "r", encoding='utf8') as osm_file:
        # Listen for "end" events: at "start" the parser has only seen the
        # opening tag, so the element's children may not be attached yet and
        # <tag> entries could be silently missed.
        for _, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
    return street_types


def update_name(name, mapping):
    """Return ``name`` with its trailing street-type abbreviation expanded.

    name:    the street name to normalise.
    mapping: dict of abbreviation -> full street type.

    Only the trailing token matched by ``street_type_re`` is replaced. The
    name is returned unchanged when it has no such token or the token is not
    a key of ``mapping``.
    """
    m = street_type_re.search(name)
    if not m:
        # Previously fell through and returned None for names with no match.
        return name
    bad_suffix = m.group()
    if bad_suffix not in mapping:
        return name
    # Splice by position instead of re.sub(bad_suffix, ...): the suffix is not
    # a safe regex ("St." would also match "Stu" in "Stuart") and re.sub
    # rewrites every occurrence, not just the trailing one.
    return name[:m.start()] + mapping[bad_suffix] + name[m.end():]
        
def st_types(file):
    """Audit ``file`` and pretty-print the unexpected street types it contains."""
    pprint.pprint(dict(audit(file)))

def test_w_update(file):
    """Print the audit of ``file``, then each flagged name beside its ``update_name`` result."""
    audited = audit(file)
    pprint.pprint(dict(audited))
    for names in audited.values():
        for original in names:
            print(original, "=>", update_name(original, mapping))
                  
def test_w_update_two(file):
    """Print the audit of ``file``, then each flagged name beside its ``update_street_name`` result."""
    audited = audit(file)
    pprint.pprint(dict(audited))
    for names in audited.values():
        for original in names:
            print(original, "=>", update_street_name(original))

In [27]:
# Print the unexpected street types found in the sample file.
st_types(SAMPLE_FILE)

{'3': {'Kendall Square - 3'},
 '303': {'First Street, Suite 303'},
 'Ave': {'Commonwealth Ave',
         'Francesca Ave',
         'Josephine Ave',
         'Massachusetts Ave',
         'Somerville Ave',
         'Western Ave',
         'Willow Ave'},
 'Ave.': {'Somerville Ave.'},
 'Broadway': {'Broadway'},
 'Center': {'Cambridge Center'},
 'Floor': {'Boylston Street, 5th Floor'},
 'Highway': {"Monsignor O'Brien Highway"},
 'Mall': {'Cummington Mall'},
 'Park': {'Acorn Park', 'Giles Park', 'Austin Park'},
 'Plaza': {'Park Plaza'},
 'Rd': {'Soldiers Field Rd', 'Abby Rd'},
 'Row': {'Assembly Row'},
 'St': {'Antwerp St',
        'Athol St',
        'Brentwood St',
        'Duval St',
        'Everett St',
        'Holton St',
        'Kirkland St',
        'Litchfield St',
        'Lothrop St',
        'Mackin St',
        'Merrill St',
        'N Beacon St',
        'Norfolk St',
        'Portsmouth St',
        'Richardson St',
        'Waverly St'},
 'St.': {'Albion St.', 'Stuart St.'

There seem to be many formatting issues, especially for Street (St, St.), Road (Rd, Rd.), and Avenue (Ave, Ave.). Let's fix that by extending our mapping table.

In [28]:
# Extended mapping: covers both the dotted and undotted abbreviations.
mapping = { "St": "Street",
            "St.": "Street",
            "Ave":"Avenue",
            "Ave.":"Avenue",
            "Rd.":"Road",
            "Rd":"Road",
            "Ct":"Court",
            "Ct.":"Court",
            "Pkwy":"Parkway"
            }

# Keys of the mapping, in insertion order. list() avoids a
# `for k, v in mapping.items()` loop, which would rebind the notebook-level
# sampling stride `k` to a string.
mapping_keys = list(mapping)

In [29]:
# Re-run the audit and print each flagged name next to its update_name() result.
test_w_update(SAMPLE_FILE)

{'3': {'Kendall Square - 3'},
 '303': {'First Street, Suite 303'},
 'Ave': {'Commonwealth Ave',
         'Francesca Ave',
         'Josephine Ave',
         'Massachusetts Ave',
         'Somerville Ave',
         'Western Ave',
         'Willow Ave'},
 'Ave.': {'Somerville Ave.'},
 'Broadway': {'Broadway'},
 'Center': {'Cambridge Center'},
 'Floor': {'Boylston Street, 5th Floor'},
 'Highway': {"Monsignor O'Brien Highway"},
 'Mall': {'Cummington Mall'},
 'Park': {'Acorn Park', 'Giles Park', 'Austin Park'},
 'Plaza': {'Park Plaza'},
 'Rd': {'Soldiers Field Rd', 'Abby Rd'},
 'Row': {'Assembly Row'},
 'St': {'Antwerp St',
        'Athol St',
        'Brentwood St',
        'Duval St',
        'Everett St',
        'Holton St',
        'Kirkland St',
        'Litchfield St',
        'Lothrop St',
        'Mackin St',
        'Merrill St',
        'N Beacon St',
        'Norfolk St',
        'Portsmouth St',
        'Richardson St',
        'Waverly St'},
 'St.': {'Albion St.', 'Stuart St.'

That seems to have taken care of the bulk of the issues. There are, however, a few more potential areas of concern. Most notably, a few values include floor and/or suite numbers. Given that some of these records are likely businesses, this is not unusual. Three other results do stand out, though: (1) 'Kendall Square - 3', which ends in a number but has no floor or suite designation, (2) 'argus place', which is lowercase, and (3) 'Broadway' and 'Windsor', which appear to have no street type associated with them.

Let's update our function to account for these edge cases.

In [32]:
def update_street_name(name):
    """Return a cleaned-up version of the street name ``name``.

    Two specific values are special-cased:
      * 'argus place'        -> title-cased
      * 'Kendall Square - 3' -> 'Kendall Square'

    Otherwise, if the trailing token matched by ``street_type_re`` is a key
    of the module-level ``mapping``, that token alone is replaced by its
    mapped value. Any other name is returned unchanged.
    """
    # One-off corrections identified during the audit.
    if name == 'argus place':
        return name.title()
    if name == 'Kendall Square - 3':
        return 'Kendall Square'

    m = street_type_re.search(name)
    if not m:
        return name
    bad_suffix = m.group()
    # Look the suffix up in `mapping` itself rather than the derived
    # `mapping_keys` list, so the two can never disagree.
    if bad_suffix not in mapping:
        return name
    # Splice by position instead of re.sub(bad_suffix, ...): the suffix is not
    # a safe regex ("St." would also match "Stu" in "Stuart") and re.sub
    # rewrites every occurrence, not just the trailing one.
    return name[:m.start()] + mapping[bad_suffix] + name[m.end():]

In [33]:
# Preview the corrections again, this time produced by update_street_name().
test_w_update_two(SAMPLE_FILE)

{'3': {'Kendall Square - 3'},
 '303': {'First Street, Suite 303'},
 'Ave': {'Commonwealth Ave',
         'Francesca Ave',
         'Josephine Ave',
         'Massachusetts Ave',
         'Somerville Ave',
         'Western Ave',
         'Willow Ave'},
 'Ave.': {'Somerville Ave.'},
 'Broadway': {'Broadway'},
 'Center': {'Cambridge Center'},
 'Floor': {'Boylston Street, 5th Floor'},
 'Highway': {"Monsignor O'Brien Highway"},
 'Mall': {'Cummington Mall'},
 'Park': {'Acorn Park', 'Giles Park', 'Austin Park'},
 'Plaza': {'Park Plaza'},
 'Rd': {'Soldiers Field Rd', 'Abby Rd'},
 'Row': {'Assembly Row'},
 'St': {'Antwerp St',
        'Athol St',
        'Brentwood St',
        'Duval St',
        'Everett St',
        'Holton St',
        'Kirkland St',
        'Litchfield St',
        'Lothrop St',
        'Mackin St',
        'Merrill St',
        'N Beacon St',
        'Norfolk St',
        'Portsmouth St',
        'Richardson St',
        'Waverly St'},
 'St.': {'Albion St.', 'Stuart St.'

Great! The results seem to be much more acceptable now. 

In [11]:
# Count how often each key (the `k` attribute) occurs across the sample file.
distinct_keys = defaultdict(int)
for _, elem in ET.iterparse(SAMPLE_FILE):
    tag_key = elem.attrib.get('k')
    # Elements without a `k` attribute are skipped explicitly; this replaces a
    # bare `except: pass`, which would also have hidden unrelated errors.
    if tag_key is not None:
        distinct_keys[tag_key] += 1

# One row per key with a single count column (labelled 0); show the 50 most frequent.
k_df = pd.DataFrame.from_dict(distinct_keys,orient='index')
print(k_df.sort_values(by=0,ascending=False).head(50))

                      0
building          11530
source             3361
highway            3131
attribution        2888
name               2557
massgis:way_id     1375
lanes              1215
condition          1192
width              1170
addr:housenumber    890
oneway              767
amenity             764
addr:street         641
created_by          538
ref                 448
addr:city           409
operator            373
addr:postcode       370
surface             347
building:levels     337
man_made            329
addr:state          316
service             309
network             252
railway             227
public_transport    223
bus                 222
leisure             220
access              213
foot                208
building:part       202
ele                 195
gnis:feature_id     192
bicycle             183
type                183
lit                 167
sidewalk            160
cycleway            139
website             138
shop                135
layer           

Based on an overview of the other items in the XML, I will also explore addr:state and addr:postcode for any issues. 