In [1]:
# Path of the OSM XML input read by count_tags, audit and process_map.
# Alternative input (presumably the full Raleigh extract -- confirm); uncomment to use it instead:
#datafile = "../raleigh_north-carolina.osm"
datafile = "../Submission/04-sample.osm"

In [2]:
import json
import xml.etree.cElementTree as ET
import re
from collections import defaultdict

# Audit selected fields of the OSM data

## Count Tags

In [3]:
def count_tags(filename):
    """Count how often each element tag occurs in an XML file.

    The file is parsed incrementally and every element is released as soon
    as it has been counted, so memory use stays flat even for large inputs.

    Args:
        filename: Path of the XML file, or a binary file object, as accepted
            by ``ET.iterparse``.

    Returns:
        dict mapping each tag name to its number of occurrences.
    """
    tags = {}
    # iterparse yields only "end" events by default, so each element is
    # visited exactly once and no event filtering is needed.
    for _, elem in ET.iterparse(filename):
        tags[elem.tag] = tags.get(elem.tag, 0) + 1
        # The element (and its already-counted children) is no longer needed.
        elem.clear()
    return tags

In [4]:
# Count every element type in the input file and print the totals.
tags = count_tags(datafile)
print(tags)

{'member': 723, 'nd': 281437, 'relation': 74, 'osm': 1, 'node': 252427, 'way': 21146, 'tag': 81286}


## Check Street Names

In [5]:
# Matches the last whitespace-delimited token of a street name (e.g. "St."
# in "Main St."); that token is treated as the street type.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

# Street types that are already spelled out and need no correction.
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons", "Loop", "Way", "Run", "Circle"]

# Known abbreviations and the full word each one stands for.
#TODO: update with S, W, E, etc.
mapping_street_name = { "St": "Street", "St.": "Street", "Rd": "Road", "Rd.": "Road", "N.":"North",
            "Ave":"Avenue", "Ave.":"Avenue", "Blvd":"Boulevard", "Blvd.":"Boulevard",
            "Pkwy":"Parkway", "Dr":"Drive", "Ln":"Lane", "Ct":"Court"}

# Same mappings, but an unknown key yields "" instead of raising KeyError.
# The defaultdict is seeded directly from the dict, so no copy loop is needed
# and no loop variables are left behind in the notebook's global namespace.
mapping_street_name_def = defaultdict(lambda: "", mapping_street_name)

def audit_street_type(street_types, street_mapping, street_name):
    """Record `street_name` if its street type is neither expected nor correctable.

    The street type is the last token of the name as matched by
    ``street_type_re``. A name is recorded when that token is not in
    ``expected`` and `street_mapping` does not translate it to a value in
    ``expected``.

    Args:
        street_types: Mapping of street type -> set of street names; updated
            in place (must create the set on first access, e.g. defaultdict(set)).
        street_mapping: Mapping of abbreviation -> full street type. It is
            only read, never modified.
        street_name: Full street name to audit.
    """
    m = street_type_re.search(street_name)
    if not m:
        return
    street_type = m.group()
    if street_type in expected:
        return
    # Use .get() rather than []: indexing a defaultdict would insert an empty
    # entry for every unknown street type and silently grow the mapping.
    if street_mapping.get(street_type) not in expected:
        street_types[street_type].add(street_name)

def audit(osmfile):
    """Collect unexpected street types found in ``addr:street`` tags.

    Only ``node`` and ``way`` elements are inspected.

    Args:
        osmfile: Path of the OSM XML file to scan.

    Returns:
        defaultdict(set) mapping each unrecognised street type to the set of
        full street names in which it was seen.
    """
    street_types = defaultdict(set)
    # Binary mode lets the XML parser apply the encoding declared in the file
    # itself; the context manager closes the handle even if parsing fails.
    with open(osmfile, "rb") as osm_file:
        # Listen for "end" events: at "start" the children of an element are
        # not guaranteed to have been parsed yet, so <tag> children could be missed.
        for _, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if tag.attrib['k'] == "addr:street":
                        audit_street_type(street_types, mapping_street_name_def, tag.attrib['v'])
                # Fully processed; drop its children to keep memory bounded.
                elem.clear()

    return street_types

In [6]:
# Run the street audit, then print each unexpected street type together with
# the set of street names that end in it.
street_types = audit(datafile)
for k,v in street_types.items():
    print(k,v)

Crescent {'West Acres Crescent'}
West {'Highway 54 West'}
501 {'US 15;US 501'}
Fork {'Dry Fork'}
Crossing {'Oldham Forest Crossing'}
Plaza {'City Hall Plaza'}
LaurelcherryStreet {'LaurelcherryStreet'}
55 {'NC Highway 55', 'Highway 55'}
751 {'NC Highway 751'}


### Correction Functions for specific strings

In [7]:
def update_name(name, mapping):
    """Return `name` with each space-separated word replaced via `mapping`.

    Words that are not keys of `mapping` are kept as they are. The name is
    split on single spaces, so runs of spaces are preserved in the result.
    """
    words = name.split(' ')
    corrected = [mapping[word] if word in mapping else word for word in words]
    return " ".join(corrected)

### Define specific formatting functions for each type of data

In [8]:
# Patterns used by process_node, compiled once instead of on every call.
_PROBLEMCHARS_RE = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
_ADDRESS_RE = re.compile(r'^addr:([a-z]|_)+$')  # 'addr', one colon, one sub-key
_GNIS_RE = re.compile(r'^gnis:([a-z]|_)+$')  # same structure as address keys
_LOWER_RE = re.compile(r'^([a-z]|_)*$')
_LOWER_COLON_RE = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Attributes that are grouped under the 'created' sub-dict.
_CREATED_ATTRIBS = ("version", "changeset", "timestamp", "user", "uid")


def process_node(element):
    """Convert a node element and its <tag> children into a plain dict.

    Layout of the result:
      * 'type'    -- always 'node'
      * 'id'      -- value of the id attribute
      * 'pos'     -- [lat, lon] as floats, only if both attributes are present
      * 'created' -- dict of the version/changeset/timestamp/user/uid
                     attributes that are present
      * any other attribute whose name has no problematic character
      * 'address' -- dict built from 'addr:<sub>' tags; the 'street' value is
                     normalised with update_name / mapping_street_name
      * 'gnis'    -- dict built from 'gnis:<sub>' tags
      * every other tag whose key is lower-case (optionally with one colon)

    Tags or attributes whose key contains a problematic character are
    skipped, as are tags whose key matches none of the patterns above.

    NOTE(review): a tag whose key equals one of the fields above (e.g.
    'type' or 'id') overwrites that field -- confirm whether this is wanted.

    Args:
        element: Element exposing ``attrib`` and ``findall('tag')``.

    Returns:
        dict describing the node.

    Raises:
        KeyError: if the element has no 'id' attribute or a tag lacks 'k'/'v'.
        ValueError: if 'lat' or 'lon' is not a valid float.
    """
    node = {'type': 'node'}
    # Work on a copy; attributes are removed from it as they are consumed so
    # that whatever is left can be copied over generically.
    attributes = dict(element.attrib)
    # List of tuples rather than a dict because tag keys may repeat.
    tags = [(t.attrib['k'], t.attrib['v']) for t in element.findall('tag')]

    node['id'] = attributes.pop('id')

    if 'lat' in attributes and 'lon' in attributes:
        node['pos'] = [float(attributes.pop('lat')), float(attributes.pop('lon'))]

    created = {a: attributes.pop(a) for a in list(attributes) if a in _CREATED_ATTRIBS}
    if created:
        node['created'] = created

    # Remaining attributes. search() (not match()) so that a problematic
    # character anywhere in the name is detected, not only in first position.
    for a, value in attributes.items():
        if _PROBLEMCHARS_RE.search(a):
            continue
        node[a] = value

    for k, v in tags:
        if _PROBLEMCHARS_RE.search(k):
            continue
        if _ADDRESS_RE.match(k):
            subnode = k.split(':')[1]
            if subnode == 'street':
                v = update_name(v, mapping_street_name)
            node.setdefault('address', {})[subnode] = v
        elif _GNIS_RE.match(k):
            node.setdefault('gnis', {})[k.split(':')[1]] = v
        elif _LOWER_RE.match(k) or _LOWER_COLON_RE.match(k):
            node[k] = v

    return node
    
def process_way(element):
    """Placeholder formatter for a way element.

    Only the type marker is returned for now; `element` is not inspected.
    """
    # TODO: copy changeset, id, timestamp, uid, user, version
    # TODO: collect the ref of every nd child
    # TODO: collect the tags
    return {'type': 'way'}

### Iterate through XML file.  Process elements and write to JSON one at a time.

In [9]:
def shape_element(element):
    """Dispatch an element to the formatter for its tag.

    Returns the dict built by process_node for "node" elements and by
    process_way for "way" elements; every other tag yields None.
    """
    kind = element.tag
    if kind == "node":
        return process_node(element)
    if kind == 'way':
        return process_way(element)
    return None
        

def process_map(file_in):
    """Convert an OSM XML file into JSON documents written to ``<file_in>.json``.

    Every element for which shape_element returns a non-empty result is dumped
    with ``indent=2`` followed by a newline, so the output file is a sequence
    of concatenated JSON objects rather than a single JSON array.

    Args:
        file_in: Path of the OSM XML file to convert.
    """
    file_out = "{0}.json".format(file_in)
    with open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                fo.write(json.dumps(el, indent=2)+"\n")
            # Release finished top-level elements so memory does not grow with
            # file size. Child elements (tag, nd, member) must NOT be cleared
            # on their own end event: the parent still needs their attributes
            # when it is shaped.
            if element.tag in ("node", "way", "relation"):
                element.clear()

In [10]:
# Convert the input file; the result is written to "<datafile>.json".
process_map(datafile)