In [1]:
# Path of the OSM XML file that the later cells count, audit and convert.
datafile = "../raleigh_north-carolina.osm"
# Alternative input (presumably a smaller sample extract -- confirm); uncomment to use it instead.
#datafile = "../Submission/04-sample.osm"

In [2]:
import json
import xml.etree.cElementTree as ET
import re
from collections import defaultdict

# Audit Data for some fields

## Count Tags

In [3]:
def count_tags(filename):
    """Count how often each XML tag name occurs in a file.

    Args:
        filename: Path of the XML file, or a binary file object, as accepted
            by ``ET.iterparse``.

    Returns:
        dict mapping each tag name to the number of elements carrying it.
    """
    tags = {}
    # iterparse reports only "end" events by default, so every element is
    # counted exactly once, when its closing tag has been read.
    for _, elem in ET.iterparse(filename):
        tags[elem.tag] = tags.get(elem.tag, 0) + 1
        # Release the element's attributes and children once it is counted;
        # otherwise the complete document tree piles up in memory.
        elem.clear()
    return tags

In [4]:
# Tally every element type, then report how many nodes and ways there are together.
tags = count_tags(datafile)
print(tags)
print('Total Nodes and Ways = ' + format(tags['node'] + tags['way'], ','))

{'osm': 1, 'node': 2524263, 'relation': 732, 'way': 211467, 'tag': 813843, 'nd': 2784745, 'member': 7647, 'bounds': 1}
Total Nodes and Ways = 2,735,730


## Check Street Names

In [5]:
# Trailing run of non-whitespace characters (starting at a word boundary),
# optionally followed by a period: the "street type" part of a street name.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

# Street types that are already spelled out and are accepted as they are.
expected = [
    "Street", "Avenue", "Boulevard", "Drive",
    "Court", "Place", "Square", "Lane",
    "Road", "Trail", "Parkway", "Commons",
    "Loop", "Way", "Run", "Circle",
]

# Abbreviation -> full word, for street types and compass directions.
mapping_street_name = {
    "St": "Street",
    "St.": "Street",
    "Rd": "Road",
    "Rd.": "Road",
    "Ave": "Avenue",
    "Ave.": "Avenue",
    "Blvd": "Boulevard",
    "Blvd.": "Boulevard",
    "Pkwy": "Parkway",
    "Pky": "Parkway",
    "Dr": "Drive",
    "Ln": "Lane",
    "Ct": "Court",
    "Pl": "Place",
    "Cir": "Circle",
    "N": "North",
    "E": "East",
    "S": "South",
    "W": "West",
}

def audit_street_type(street_types, street_mapping, street_name):
    """Record a street name whose type is neither expected nor mappable.

    The street type is whatever ``street_type_re`` finds at the end of
    ``street_name``. When that type is not in ``expected`` and is not a key
    of ``street_mapping``, the full name is added to
    ``street_types[street_type]``; otherwise nothing is recorded.
    """
    found = street_type_re.search(street_name)
    if not found:
        return
    suffix = found.group()
    if suffix in expected or suffix in street_mapping:
        return
    street_types[suffix].add(street_name)

def audit(osmfile):
    """Collect the unrecognised street types found in an OSM file.

    Every ``addr:street`` tag that belongs to a ``node`` or ``way`` element
    is passed to ``audit_street_type``.

    Args:
        osmfile: Path of the OSM XML file.

    Returns:
        defaultdict(set) mapping each unrecognised street type to the set of
        street names in which it occurred.
    """
    street_types = defaultdict(set)
    # Binary mode leaves decoding to the XML parser instead of the platform's
    # default text encoding; the with-block guarantees the file is closed.
    with open(osmfile, "rb") as osm_file:
        # "end" events are required here: at a "start" event the children of
        # an element (its <tag> entries) are not guaranteed to be parsed yet.
        for _, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if tag.attrib['k'] == "addr:street":
                        audit_street_type(street_types, mapping_street_name, tag.attrib['v'])
                # The element has been fully inspected; free its contents so
                # memory use does not grow with the size of the file.
                elem.clear()

    return street_types

In [6]:
street_types = audit(datafile)
# One line per unrecognised street type, followed by the names it was seen in.
for street_type, street_names in street_types.items():
    print(street_type, street_names)

Suite {'N Duke St Suite'}
Plaza {'Park Forty Plaza', 'City Hall Plaza', 'Exchange Plaza'}
Crescent {'West Acres Crescent'}
Ext {'New Hope Commons Boulevard Ext'}
Hills {'The Circle at North Hills'}
100 {'100'}
LaurelcherryStreet {'LaurelcherryStreet'}
PI {'Alexander Promenade PI'}
1000 {'Six Forks Road #1000'}
ST {'W EDENTON ST'}
West {'NC Highway 55 West', 'Highway 54 West', 'Highway West', 'Highway 55 West'}
55 {'NC Highway 55', 'Highway 55', 'US 55'}
Practice {'Triangle Family Practice'}
Grove {'Newton Grove'}
Extension {'Weaver Dairy Road Extension'}
Highway {'Wake Forest Highway', 'Apex Highway'}
Crossing {'Oldham Forest Crossing'}
Bypass {'US 15 501 Bypass'}
751 {'NC Highway 751'}
70 {'US 70'}
17 {'US Highway 17'}
Point {'Rocky Point'}
St, {'Morris St,'}
CIrcle {'Meadowmont Village CIrcle'}
54 {'West Highway 54', 'State Highway 54', 'West NC Highway 54', 'Highway 54'}
Hill {'Chapel Hill'}
Fork {'Dry Fork'}
Terrace {'Stonebrook Terrace'}
East {'US Highway 70 East'}
Driver {'Garret

# Correct and Convert to JSON

## Correction Functions for specific strings

In [7]:
def update_name(name, mapping):
    """Return ``name`` with each space-separated word found in ``mapping`` replaced.

    The name is split on single spaces; a word that is a key of ``mapping``
    is swapped for its mapped value, every other word is kept unchanged, and
    the words are joined with single spaces again.
    """
    words = [mapping[word] if word in mapping else word for word in name.split(' ')]
    return " ".join(words)

## Formatting Function for Node

In [8]:
def process_node(element):
    """Convert an OSM ``node`` element into a JSON-serialisable dict.

    Layout of the result:
      * ``type``    -- always ``'node'``
      * ``id``      -- the element's ``id`` attribute
      * ``pos``     -- ``[lat, lon]`` as floats, when both attributes exist
      * ``created`` -- dict of the version/changeset/timestamp/user/uid
                       attributes that are present
      * any other attribute whose name has no problematic character
      * ``address`` -- dict built from ``addr:*`` tags (street names are
                       expanded with ``update_name``)
      * ``gnis``    -- dict built from ``gnis:*`` tags
      * every other tag whose key is lower case, optionally with one colon

    Args:
        element: ElementTree element whose tag is ``node``.

    Returns:
        dict describing the node.

    Raises:
        KeyError: if the element has no ``id`` attribute.
    """
    CREATED = ["version", "changeset", "timestamp", "user", "uid"]

    # All patterns are built before their first use. problemchars used to be
    # assigned after the attribute loop that reads it, which raised
    # UnboundLocalError for any node carrying an extra attribute.
    problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
    address = re.compile(r'^addr:([a-z]|_)+$')#match 'addr', one colon, some tag
    gnis = re.compile(r'^gnis:([a-z]|_)+$')#same structure as address, for gnis keys
    lower = re.compile(r'^([a-z]|_)*$')
    lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')

    node = {'type':'node'}
    #Extract raw information for this node; attributes are removed as they are processed
    attributes = dict(element.attrib)
    #tags is a list of tuples instead of a dict because of potential duplicate keys
    tags = [(t.attrib['k'],t.attrib['v']) for t in element.findall('tag')]

    #Add ID
    node['id'] = attributes.pop('id')

    #Add Position
    if 'lat' in attributes and 'lon' in attributes:
        node['pos'] = [float(attributes.pop('lat')), float(attributes.pop('lon'))]

    #Created
    created_attrib = [a for a in attributes if a in CREATED] #Which of the created attr are present
    if created_attrib:
        node['created'] = {a: attributes.pop(a) for a in created_attrib}

    #Remaining Attributes
    for a, value in attributes.items():
        # search() rather than match(): a problematic character anywhere in
        # the name disqualifies it, not only one in first position.
        if problemchars.search(a):
            continue
        node[a] = value

    #Process tags
    # NOTE: plain lower-case tag keys are written at the top level, so a tag
    # named e.g. 'type' or 'id' replaces the value set above.
    for k,v in tags:
        if problemchars.search(k):
            continue
        elif address.match(k):
            #Add address dict if not present
            if 'address' not in node:
                node['address'] = dict()
            #Get subnode and corrected (if possible) value
            subnode = k.split(':')[1]
            if subnode == 'street':
                v = update_name(v, mapping_street_name)
            #Add value
            node['address'][subnode] = v
        elif gnis.match(k):
            #Add gnis dict if not present
            if 'gnis' not in node:
                node['gnis'] = dict()
            node['gnis'][k.split(':')[1]] = v
        elif lower.match(k) or lower_colon.match(k):
            node[k] = v

    return node

## Formatting function for Way

In [9]:
def process_way(element):
    """Convert an OSM ``way`` element into a JSON-serialisable dict.

    Layout of the result:
      * ``type``      -- always ``'way'``
      * ``id``        -- the element's ``id`` attribute
      * ``created``   -- dict of the version/changeset/timestamp/user/uid
                         attributes that are present
      * any other attribute whose name has no problematic character
      * ``node_refs`` -- list of the ``ref`` values of the ``nd`` children,
                         when there is at least one
      * ``tiger``     -- dict built from ``tiger:*`` tags
      * every other tag whose key is lower case, optionally with one colon

    Args:
        element: ElementTree element whose tag is ``way``.

    Returns:
        dict describing the way.

    Raises:
        KeyError: if the element has no ``id`` attribute.
    """
    CREATED = ["version", "changeset", "timestamp", "user", "uid"]

    # All patterns are built before their first use. problemchars used to be
    # assigned after the attribute loop that reads it, which raised
    # UnboundLocalError for any way carrying an extra attribute.
    problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
    tiger = re.compile(r'^tiger:([a-z]|_)+$') #Import from US Census Data for roadways, etc
    lower = re.compile(r'^([a-z]|_)*$')
    lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')

    way = {'type':'way'}

    #Extract raw information for this way; attributes are removed as they are processed
    attributes = dict(element.attrib)
    noderefs = [n.attrib['ref'] for n in element.findall('nd')]
    tags = [(t.attrib['k'],t.attrib['v']) for t in element.findall('tag')]

    #Add ID
    way['id'] = attributes.pop('id')

    #Created
    created_attrib = [a for a in attributes if a in CREATED] #Which of the created attr are present
    if created_attrib:
        way['created'] = {a: attributes.pop(a) for a in created_attrib}

    #Remaining Attributes
    for a, value in attributes.items():
        # search() rather than match(): a problematic character anywhere in
        # the name disqualifies it, not only one in first position.
        if problemchars.search(a):
            continue
        way[a] = value

    #Node Refs
    if noderefs:
        way['node_refs'] = noderefs

    #Process tags
    # NOTE: plain lower-case tag keys are written at the top level, so a tag
    # named e.g. 'type' or 'id' replaces the value set above.
    for k,v in tags:
        if problemchars.search(k):
            continue
        elif tiger.match(k):
            #Add tiger dict if not present
            if 'tiger' not in way:
                way['tiger'] = dict()
            way['tiger'][k.split(':')[1]] = v
        elif lower.match(k) or lower_colon.match(k):
            way[k] = v

    return way

## Iterate through XML file.

In [10]:
def shape_element(element):
    """Shape a ``node`` or ``way`` element into a dict; return None for any other tag."""
    kind = element.tag
    if kind == "node":
        return process_node(element)
    if kind == 'way':
        return process_way(element)
    return None
        

def process_map(file_in):
    """Write every node and way of an OSM file as one JSON document per line.

    The output is written to ``"<file_in>.json"``; elements for which
    ``shape_element`` returns nothing are skipped.

    Args:
        file_in: Path of the OSM XML file to convert.
    """
    file_out = "{0}.json".format(file_in)
    with open(file_out, "w") as fo:
        # Default "end" events: an element is complete, children included,
        # by the time it is handed to shape_element.
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                fo.write(json.dumps(el)+"\n")
                # The element is written out and no longer needed; clearing
                # it keeps memory from growing with the size of the input.
                # Child elements (tag/nd) are never cleared on their own
                # event because their parent still has to read them.
                element.clear()

In [11]:
# Convert datafile; the JSON lines are written next to it as "<datafile>.json".
process_map(datafile)

# Import into MongoDB

In [12]:
#Using mongoimport
!mongoimport -d osm -c raleigh --drop --file="../raleigh_north-carolina.osm.json"

2015-07-05T18:39:29.710-0400	connected to: localhost
2015-07-05T18:39:29.710-0400	dropping: osm.raleigh
2015-07-05T18:39:32.708-0400	[#.......................] osm.raleigh	25.6 MB/562.6 MB (4.6%)
2015-07-05T18:39:35.709-0400	[##......................] osm.raleigh	53.7 MB/562.6 MB (9.5%)
2015-07-05T18:39:38.708-0400	[###.....................] osm.raleigh	82.0 MB/562.6 MB (14.6%)
2015-07-05T18:39:41.709-0400	[####....................] osm.raleigh	110.4 MB/562.6 MB (19.6%)
2015-07-05T18:39:44.708-0400	[#####...................] osm.raleigh	139.1 MB/562.6 MB (24.7%)
2015-07-05T18:39:47.708-0400	[#######.................] osm.raleigh	166.7 MB/562.6 MB (29.6%)
2015-07-05T18:39:50.708-0400	[########................] osm.raleigh	195.2 MB/562.6 MB (34.7%)
2015-07-05T18:39:53.708-0400	[#########...............] osm.raleigh	224.1 MB/562.6 MB (39.8%)
2015-07-05T18:39:56.708-0400	[##########..............] osm.raleigh	253.0 MB/562.6 MB (45.0%)
2015-07-05T18:39:59.708-0400	[############............]