In [176]:
import xml.etree.cElementTree as ET
import pprint
import re
from collections import defaultdict
from IPython.display import display
import operator
import codecs
import json

In [177]:
#OSM_FILE = "sample-k-25.osm"
OSM_FILE = "sample-k-100.osm"

street_type_re = re.compile(r'(\b\S+\.?)$', re.IGNORECASE) # "300 Bernal Ave." or "900 Cy Ranch Drive"
street_type_num_re = re.compile(r'(\b\S+\.?)(?= #?[0-9]+$)', re.IGNORECASE) # "20 Cal Avenue #32" or "15 Stevens Creek Hwy 2"

In [178]:
# Examples of using street_type_num_re
m = street_type_num_re.search("2030 Hwy 1")
print(m.group(1))

m = street_type_num_re.search("Old Bernal Ave #1")
print(m.group(1))

Hwy
Ave


In [179]:
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()

In [180]:
def count_high_level_tags(filename):
    tags = defaultdict(int)
    
    #for i, elem in enumerate(get_element(OSM_FILE)):
    for event, elem in ET.iterparse(filename, events=('start', 'end')):
        tags[elem.tag] += 1
        
        # TODO(hkwik): I know I ought to clear the root, but not sure about elem.tag in tags.
        # Once I understand, I'll put this in.
        # if event == 'end' and elem.tag in tags: 
        #    root.clear()
    
    return tags

count_high_level_tags(OSM_FILE)

defaultdict(int,
            {'member': 2866,
             'nd': 290588,
             'node': 245418,
             'osm': 2,
             'relation': 220,
             'tag': 110246,
             'way': 25960})

# Audit Keys

In [181]:
def count_keys(tag, keys):
    k = tag.attrib['k']
    keys[k] += 1
    pass

def audit_keys(osm_file):
    keys = defaultdict(int)
    for i, elem in enumerate(get_element(osm_file, tags=['node', 'way'])):
        for tag in elem.iter('tag'):
            count_keys(tag, keys)
    
    #osm_file.close()
    return keys

In [182]:
keys = audit_keys(OSM_FILE)

sorted_keys = sorted(keys.items(), key=operator.itemgetter(1))
sorted_keys[::-1]

[('building', 7698),
 ('highway', 4440),
 ('name', 2948),
 ('addr:housenumber', 2265),
 ('addr:street', 2157),
 ('tiger:county', 2087),
 ('tiger:cfcc', 1990),
 ('addr:city', 1873),
 ('tiger:name_base', 1806),
 ('source', 1747),
 ('tiger:name_type', 1719),
 ('tiger:reviewed', 1461),
 ('tiger:zip_left', 1391),
 ('tiger:zip_right', 1322),
 ('tiger:tlid', 973),
 ('tiger:source', 965),
 ('addr:state', 921),
 ('tiger:separated', 919),
 ('addr:postcode', 754),
 ('oneway', 662),
 ('height', 626),
 ('amenity', 520),
 ('service', 483),
 ('lanes', 445),
 ('created_by', 441),
 ('landuse', 404),
 ('waterway', 390),
 ('power', 298),
 ('surface', 297),
 ('tiger:upload_uuid', 285),
 ('maxspeed', 279),
 ('bicycle', 274),
 ('paloalto_ca:id', 252),
 ('natural', 244),
 ('access', 203),
 ('leisure', 201),
 ('cycleway', 198),
 ('foot', 185),
 ('redwood_city_ca:bld_gid', 185),
 ('nhd:reach_code', 177),
 ('ref', 176),
 ('layer', 175),
 ('gnis:fcode', 171),
 ('nhd:fdate', 171),
 ('redwood_city_ca:addr_id', 171

# Audit Addresses

In [183]:
expected = ["Alley", "Avenue", "Boulevard", "Center", "Circle", "Common", "Commons", 
            "Corte", "Court", "Courtyard", "Drive", "Expressway", 
            "Highway", "Lane", "Loop", "Mall", "Path", "Park", "Parkway", "Place", "Plaza",
            "Real", "Road", "Square", "Street", "Terrace", "Trail", "Walk",
            "Way"]

mapping = { "Aly": "Alley",
            "AVE": "Avenue",
            "Ave": "Avenue",
            "Aveenue": "Avenue",
            "Avenie": "Avenue",
            "Ave.": "Avenue",
            "blvd": "Boulevard",
            "BLVD.": "Boulevard",
            "BLVD": "Boulevard",
            "Blvd": "Boulevard",
            "Blvd.": "Boulevard",
            "Boulvevard": "Boulevard",
            "Boulevar": "Boulevard",
            "Cir": "Circle",
            "Circle:": "Circle",
            "court": "Court",
            "Ct": "Court",
            "Ct.": "Court",
            "Ctr": "Center",
            "Dr": "Drive",
            "Expwy": "Expressway",
            "Hwy": "Highway",
            "Hwy.": "Highway",
            "Ln": "Lane",
            "Ln.": "Lane",
            "parkway": "Parkway",
            "PKWY": "Parkway",
            "PL": "Place",
            "Pl": "Place",
            "PT": "Point",
            "road": "Road",
            "Rd": "Road",
            "Rd.": "Road",
            "st": "Street",
            "St": "Street",
            "St.": "Street",
            "street": "Street",
            "terrace": "Terrace",
            "way": "Way",
            "WAy": "Way"
            }

In [184]:
def audit_street_type_regex(street_types, regex, street_name):
    # Assume group(1) contains the street type
    m = regex.search(street_name)
    if m:
        street_type = m.group(1)
        if street_type not in expected and street_type not in mapping:
            street_types[street_type].add(street_name)

def audit_street_type(street_types, street_name):
    audit_street_type_regex(street_types, street_type_re, street_name)
    audit_street_type_regex(street_types, street_type_num_re, street_name)
    
def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")

def is_postcode(elem):
    return (elem.attrib['k'] == "addr:postcode")

def is_county(elem):
    return (elem.attrib['k'] == "addr:county")

def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    postcodes = defaultdict(int)
    counties = defaultdict(int)
    
    for i, elem in enumerate(get_element(osm_file)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                k = tag.attrib['k']
                v = tag.attrib['v']
                
                if is_street_name(tag):
                    audit_street_type(street_types, v)
                
                if is_postcode(tag):
                    postcodes[v] += 1
                    
                if is_county(tag):
                    counties[v] += 1
                    
    osm_file.close()
    return street_types, postcodes, counties


def update_name(name, mapping):
    m = street_type_re.search(name)
    if m:
        street_type = m.group()
        new_street_type = mapping[street_type]
        name = re.sub(street_type_re, new_street_type, name)
    return name

In [185]:
street_types, post_codes, counties = audit(OSM_FILE)

In [186]:
street_types

defaultdict(set,
            {'100': {'Northgate Blvd #100'},
             '140': {'West Highway 140'},
             '2': {'Showers Drive STE 2'},
             '88': {'East State Route 88'},
             'Alameda': {'Alameda'},
             'Axis': {'North-South Axis'},
             'Barcelona': {'Calle de Barcelona'},
             'Broadway': {'Broadway'},
             'Bruno': {'Serra San Bruno'},
             'Cay': {'Cavalla Cay'},
             'D': {'El Camino Real #D'},
             'East': {'Park Circle East'},
             'H': {'Avenue H'},
             'Key': {'Montego Key'},
             'Lagoon': {'Del Oro Lagoon'},
             'PZ': {'Portage PZ'},
             'PlaceZ': {'Town Center PlaceZ'},
             'Presada': {'Paseo Presada'},
             'Reef': {'Bahama Reef'},
             'Route': {'East State Route 88'},
             'Row': {'Alvarado Row'},
             'STE': {'Showers Drive STE 2'},
             'Verda': {'Valle Verda'},
             'Village': {'Town a

In [187]:
[k for k in post_codes if len(k) > 10]

['1 Donner St, San Juan Bautista, CA 95045']

In [188]:
counties

defaultdict(int,
            {'Alameda': 5,
             'Contra Costa': 7,
             'Marin': 8,
             'Merced': 1,
             'Monterey': 2,
             'Napa': 4,
             'Sacramento': 11,
             'San Benito': 1,
             'San Benito County': 67,
             'San Joaquin': 13,
             'San Mateo': 2,
             'Santa Clara': 15,
             'Solano': 8,
             'Sonoma': 1,
             'Stanislaus': 2,
             'Yolo': 9})

# Cleaning

In [237]:
def update_street_name_regex(regex, name, mapping):
    m = regex.search(name)
    if m:
        street_type = m.group(1)
        new_street_type = mapping.get(street_type)
        if new_street_type:
            name = re.sub(regex, new_street_type, name)
            return name
    
    return name

def update_street_name(name, mapping):
    new_name = update_street_name_regex(street_type_re, name, mapping)
    if new_name != name:
        return new_name
    
    new_name = update_street_name_regex(street_type_num_re, name, mapping)
    if new_name != name:
        return new_name
    
    return name

postcode_re = re.compile(r'[0-9]{5,5}$', re.IGNORECASE)
postcode_dash_re = re.compile(r'[0-9]{5,5}-[0-9]{4,4}$', re.IGNORECASE)

def has_valid_postcode(name):
    return postcode_re.search(name) is not None or postcode_dash_re.search(name) is not None

def update_postcode(name):
    m = postcode_re.search(name)
    if m is None:
        m = postcode_dash_re.search(name)
    return m.group()

In [220]:
postcode1 = "94588"
postcode2 = "94588-1234"
address = "1 Donner St, San Juan Bautista, CA 95045"
address2 = "950 San Felipe Rd, San Benito County, CA, -"

postcode_re = re.compile(r'[0-9]{5,5}$', re.IGNORECASE)
postcode_dash_re = re.compile(r'[0-9]{5,5}-[0-9]{4,4}$', re.IGNORECASE)
#postcode_re = re.compile(r'(\b\S+\.?)(?= #?[0-9]+$)', re.IGNORECASE)

In [200]:
postcode_re.search(postcode1).group()

'94588'

In [206]:
postcode_dash_re.search(postcode2).group()

'94588-1234'

In [225]:
postcode_re.search(address).group() and len(address) > 5

'95045'

In [224]:
display(has_valid_postcode(postcode1))
display(has_valid_postcode(postcode2))
display(has_valid_postcode(address))
display(has_valid_postcode(address2))

True

True

True

False

In [235]:
m = postcode_re.search(address)
m

<_sre.SRE_Match at 0x1314f5850>

In [236]:
update_postcode(address)

'95045'

In [193]:
display(update_street_name("2nd st", mapping))
display(update_street_name("Northgate Blvd #100", mapping))

'2nd Street'

'Northgate Boulevard #100'

In [238]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

CREATED = ["version", "changeset", "timestamp", "user", "uid"]
POSITION = ["lat", "lon"]

def has_lat_lon(element):
    return 'lat' in element.attrib and 'lon' in element.attrib

def shape_attributes(element, node):
    for attr in element.attrib:
        if attr in CREATED or attr in POSITION:
            continue
        node[attr] = element.attrib[attr]
        #print attr, element.attrib[attr]
    
    created = {}
    for attr in CREATED:
        if attr in element.attrib:
            created[attr] = element.attrib[attr]
    node['created'] = created
    
    if has_lat_lon(element):
        node['pos'] = [float(element.attrib['lat']), float(element.attrib['lon'])]

def shape_tags(element, node):
    address = {}
    for tag in element.iter("tag"):
        key = tag.attrib['k']
        value = tag.attrib['v']
        
        if is_street_name(tag):
            value = update_street_name(value, mapping)
        
        if is_postcode(tag):
            if not has_valid_postcode(value):
                continue
            value = update_postcode(value)
                    
        #print key, value
        if (re.search(problemchars, key) or 
            (re.search(lower, key) is None and 
            re.search(lower_colon, key) is None)):
            continue
        
        if re.search(lower_colon, key):
            key_arr = key.split(':')
            prefix = key_arr[0]
            suffix = key_arr[1]
            
            if prefix == 'addr':
                address[suffix] = value
            else:
                key = re.sub(':', "-", key)
                node[key] = value
                
            continue
        
        node[key] = value
    
    if len(address) > 0:
        #print node
        node['address'] = address    

def shape_node_refs(element, way):
    refs = []
    if element.tag == "way":
        for nd in element.iter("nd"):
            refs.append(nd.attrib['ref'])
        way['node_refs'] = refs
    
def shape_element(element):
    node = {}
    if element.tag == "node" or element.tag == "way" :
        # Handle top-level element
        node['type'] = element.tag
        
        shape_attributes(element, node)   
        
        # Handle tags
        shape_tags(element, node)
        
        # Handle node refs for way
        shape_node_refs(element, node)
        
        #print node
        return node
    else:
        return None

In [239]:
def process_map(file_in, pretty = False):
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data

In [240]:
process_map(OSM_FILE, True)

[{'created': {'changeset': '22950379',
   'timestamp': '2014-06-15T18:55:15Z',
   'uid': '1660455',
   'user': 'juergenb22',
   'version': '6'},
  'id': '281266',
  'pos': [37.5601845, -122.3025783],
  'type': 'node'},
 {'created': {'changeset': '6777070',
   'timestamp': '2010-12-27T12:28:53Z',
   'uid': '14293',
   'user': 'KindredCoda',
   'version': '18'},
  'id': '26027702',
  'pos': [37.3673202, -122.0315503],
  'type': 'node'},
 {'created': {'changeset': '26582399',
   'timestamp': '2014-11-05T22:50:24Z',
   'uid': '318696',
   'user': 'n76',
   'version': '22'},
  'id': '26029621',
  'pos': [37.3703601, -122.042248],
  'type': 'node'},
 {'created': {'changeset': '16795110',
   'timestamp': '2013-07-02T17:40:18Z',
   'uid': '14293',
   'user': 'KindredCoda',
   'version': '4'},
  'id': '26050461',
  'pos': [37.4086331, -122.1069576],
  'type': 'node'},
 {'created': {'changeset': '26243141',
   'timestamp': '2014-10-21T21:10:13Z',
   'uid': '595221',
   'user': 'matthieun',
   'v