In [77]:
import xml.etree.cElementTree as ET
import pprint
import re
from collections import defaultdict

In [86]:
OSM_FILE = "sample-k-1000.osm"

street_type_re = re.compile(r'(\b\S+\.?)$', re.IGNORECASE) # "300 Bernal Ave." or "900 Cy Ranch Drive"
street_type_num_re = re.compile(r'(\b\S+\.?) #?[0-9]+$', re.IGNORECASE) # "20 Cal Avenue #32" or "15 Stevens Creek Hwy 2"
re.compile(r'\b\S+\.? \b[0-9]+$', re.IGNORECASE)

re.compile(r'\b\S+\.? \b[0-9]+$', re.IGNORECASE)

In [79]:
# Examples of using street_type_num_re
m = street_type_num_re.search("2030 Hwy 1")
print(m.group(1))

m = street_type_num_re.search("Old Bernal Ave #1")
print(m.group(1))

Hwy
Ave


In [94]:
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()

In [98]:
def count_tags(filename):
    tags = defaultdict(int)
    
    #for i, elem in enumerate(get_element(OSM_FILE)):
    for event, elem in ET.iterparse(filename, events=('start', 'end')):
        tags[elem.tag] += 1
        
        # TODO(hkwik): I know I ought to clear the root, but not sure about elem.tag in tags.
        # Once I understand, I'll put this in.
        # if event == 'end' and elem.tag in tags: 
        #    root.clear()
    
    return tags

count_tags(OSM_FILE)

defaultdict(int,
            {'member': 206,
             'nd': 28832,
             'node': 24542,
             'osm': 2,
             'relation': 22,
             'tag': 10976,
             'way': 2596})

In [109]:
expected = ["Alley", "Avenue", "Boulevard", "Center", "Circle", "Common", "Commons", 
            "Corte", "Court", "Courtyard", "Drive", "Expressway", 
            "Highway", "Lane", "Loop", "Mall", "Path", "Park", "Parkway", "Place", "Plaza",
            "Real", "Road", "Square", "Street", "Terrace", "Trail", "Walk",
            "Way"]

mapping = { "Aly": "Alley",
            "AVE": "Avenue",
            "Ave": "Avenue",
            "Aveenue": "Avenue",
            "Avenie": "Avenue",
            "Ave.": "Avenue",
            "blvd": "Boulevard",
            "BLVD.": "Boulevard",
            "BLVD": "Boulevard",
            "Blvd": "Boulevard",
            "Blvd.": "Boulevard",
            "Boulvevard": "Boulevard",
            "Boulevar": "Boulevard",
            "Cir": "Circle",
            "Circle:": "Circle",
            "court": "Court",
            "Ct": "Court",
            "Ct.": "Court",
            "Ctr": "Center",
            "Dr": "Drive",
            "Expwy": "Expressway",
            "Hwy": "Highway",
            "Hwy.": "Highway",
            "Ln": "Lane",
            "Ln.": "Lane",
            "parkway": "Parkway",
            "PKWY": "Parkway",
            "PL": "Place",
            "Pl": "Place",
            "PT": "Point",
            "road": "Road",
            "Rd": "Road",
            "Rd.": "Road",
            "st": "Street",
            "St": "Street",
            "St.": "Street",
            "street": "Street",
            "terrace": "Terrace",
            "way": "Way",
            "WAy": "Way"
            }

In [107]:
def audit_street_type_regex(street_types, regex, street_name):
    # Assume group(1) contains the street type
    m = regex.search(street_name)
    if m:
        street_type = m.group(1)
        if street_type not in expected and street_type not in mapping:
            street_types[street_type].add(street_name)

def audit_street_type(street_types, street_name):
    # audit_street_type_regex(street_types, street_type_re, street_name)
    
    # TODO: not sure if this is working
    audit_street_type_regex(street_types, street_type_num_re, street_name)

    
def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    
    for i, elem in enumerate(get_element(osm_file)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
                    
    osm_file.close()
    return street_types


def update_name(name, mapping):
    m = street_type_re.search(name)
    if m:
        street_type = m.group()
        new_street_type = mapping[street_type]
        name = re.sub(street_type_re, new_street_type, name)
    return name

In [108]:
audit(OSMFILE)

defaultdict(set,
            {'APT': {'East Charleston Road APT 9'},
             'Ave': {'Old Bernal Ave #1',
              'University Ave #155',
              'Watt Ave #200'},
             'Blvd': {'Mission Blvd #302',
              'N California Blvd #105',
              'Northgate Blvd #100',
              'San Ramon Valley Blvd #2'},
             'Box': {'PO Box 5259'},
             'Ca': {'Pera Dr at Escuela Drive, Rancho Murieta, Ca 95683'},
             'Ct': {'Las Positas Ct #165'},
             'Dr': {'Concourse Dr #81'},
             'HWY': {'N HWY 88'},
             'Hwy': {'10795 Hwy 1',
              '2030 Hwy 1',
              'E. Hwy 120',
              'Hwy 116',
              'Hwy 9',
              'State Hwy 1'},
             'Hwy.': {'Hwy. 9'},
             'No.': {'California Highway 16, House No. 1'},
             'PM': {'San Mateo 35 PM 10'},
             'Padre': {'Via Padre #23',
              'Via Padre #25',
              'Via Padre #27',
              'Via