In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
from collections import defaultdict
"""
Your task is to wrangle the data and transform the shape of the data
into the model we mentioned earlier. The output should be a list of dictionaries
that look like this:

{
"id": "2406124091",
"type: "node",
"visible":"true",
"created": {
          "version":"2",
          "changeset":"17206049",
          "timestamp":"2013-08-03T16:43:42Z",
          "user":"linuxUser16",
          "uid":"1219059"
        },
"pos": [41.9757030, -87.6921867],
"address": {
          "housenumber": "5157",
          "postcode": "60625",
          "street": "North Lincoln Ave"
        },
"amenity": "restaurant",
"cuisine": "mexican",
"name": "La Cabana De Don Luis",
"phone": "1 (773)-271-5176"
}

You have to complete the function 'shape_element'.
We have provided a function that will parse the map file, and call the function with the element
as an argument. You should return a dictionary, containing the shaped data for that element.
We have also provided a way to save the data in a file, so that you could use
mongoimport later on to import the shaped data into MongoDB. 

Note that in this exercise we do not use the 'update street name' procedures
you worked on in the previous exercise. If you are using this code in your final
project, you are strongly encouraged to use the code from previous exercise to 
update the street names before you save them to JSON. 

In particular the following things should be done:
- you should process only 2 types of top level tags: "node" and "way"
- all attributes of "node" and "way" should be turned into regular key/value pairs, except:
    - attributes in the CREATED array should be added under a key "created"
    - attributes for latitude and longitude should be added to a "pos" array,
      for use in geospacial indexing. Make sure the values inside "pos" array are floats
      and not strings. 
- if second level tag "k" value contains problematic characters, it should be ignored
- if second level tag "k" value starts with "addr:", it should be added to a dictionary "address"
- if second level tag "k" value does not start with "addr:", but contains ":", you can process it
  same as any other tag.
- if there is a second ":" that separates the type/direction of a street,
  the tag should be ignored, for example:

<tag k="addr:housenumber" v="5158"/>
<tag k="addr:street" v="North Lincoln Avenue"/>
<tag k="addr:street:name" v="Lincoln"/>
<tag k="addr:street:prefix" v="North"/>
<tag k="addr:street:type" v="Avenue"/>
<tag k="amenity" v="pharmacy"/>

  should be turned into:

{...
"address": {
    "housenumber": 5158,
    "street": "North Lincoln Avenue"
}
"amenity": "pharmacy",
...
}

- for "way" specifically:

  <nd ref="305896090"/>
  <nd ref="1719825889"/>

should be turned into
"node_refs": ["305896090", "1719825889"]
"""

'\nYour task is to wrangle the data and transform the shape of the data\ninto the model we mentioned earlier. The output should be a list of dictionaries\nthat look like this:\n\n{\n"id": "2406124091",\n"type: "node",\n"visible":"true",\n"created": {\n          "version":"2",\n          "changeset":"17206049",\n          "timestamp":"2013-08-03T16:43:42Z",\n          "user":"linuxUser16",\n          "uid":"1219059"\n        },\n"pos": [41.9757030, -87.6921867],\n"address": {\n          "housenumber": "5157",\n          "postcode": "60625",\n          "street": "North Lincoln Ave"\n        },\n"amenity": "restaurant",\n"cuisine": "mexican",\n"name": "La Cabana De Don Luis",\n"phone": "1 (773)-271-5176"\n}\n\nYou have to complete the function \'shape_element\'.\nWe have provided a function that will parse the map file, and call the function with the element\nas an argument. You should return a dictionary, containing the shaped data for that element.\nWe have also provided a way to save t

In [2]:
OSMFILE = 'everett.osm'
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]

## Street names cleaning

In [3]:
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]

mapping = { "St": "Street",
            "St.": "Street",
            "Ave": "Avenue",
            "Rd.": "Road"
            }

In [4]:
def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])

    return street_types


def update_name(name, mapping):
    st = street_type_re.search(name)
    if st:
        wrong = st.group(0)
        name = street_type_re.sub(mapping[wrong], name)
    return name


In [5]:
st_types = audit(OSMFILE)
pprint.pprint(dict(st_types))

for st_type, ways in st_types.iteritems():
    for name in ways:
        better_name = update_name(name, mapping)
        print name, "=>", better_name

{'1': set(['228th St SE Suite 1']),
 '2': set(['Highway 2', 'U.S. Hwy 2']),
 '200': set(['4th Ave W, Suite 200']),
 '98204': set(['98204']),
 '98296': set(['196th Street Southeast, Snohomish, WA 98296']),
 '99': set(['Highway 99', 'State Highway 99']),
 'Ave': set(['Rucker Ave', 'South 2nd Ave']),
 'Blvd': set(['Alderwood Mall Blvd', 'Mill Creek Blvd']),
 'Broadway': set(['Broadway']),
 'C': set(['228th Street SE  Suite C']),
 'D': set(['Avenue D']),
 'Dr': set(['Harbour Reach Dr']),
 'E': set(['Martin Way E']),
 'H': set(['Hwy 99, Ste H']),
 'Highway': set([' Bothell-Everett Highway',
                 'Bothell Everett Highway',
                 'Bothell Evertt Highway',
                 'Bothell Highway',
                 'Bothell-Everett Highway',
                 'Pacific Highway',
                 'Stevens Pass Highway']),
 'Hwy': set(['Bothell-Everett Hwy']),
 'J': set(['Bothell Everett Highway, Suite J']),
 'Loop': set(['Northwest Cherry Loop']),
 'N': set(['5th Ave N', 'Aurora A

KeyError: '98204'

## Data preparation for mongodb

In [6]:
def shape_element(element):
    node = {}
    if element.tag == "node" or element.tag == "way" :
        node["created"] = {}
        node['type'] = element.tag
        ats = element.attrib 
        for k in ats:
            if problemchars.search(k):
                continue
            if k in CREATED:
                node["created"][k] = ats[k]
                continue
            if k in ['lat', 'lon']:
                if not node.get('pos'):
                    node['pos'] = [None] * 2
                if k == "lon":
                    node["pos"][1] = float(ats[k])
                else: 
                    node["pos"][0] = float(ats[k])
                continue
            for tag in element:
                if tag.tag == 'nd':
                    if not node.get('node_refs'):
                        node['node_refs'] = []
                    node["node_refs"].append(tag.attrib['ref'])
                    continue
                if tag.tag == 'tag':
                    tats = tag.attrib
                    lc = lower_colon.search(tats['k'])
                    if lc:
                        if not node.get('address'):
                            node['address'] = {}
                        addr = tats['k'].split(':')
                        if addr[0] == 'addr' and len(addr) == 2:
                            node["address"][addr[1]] = tats['v']
                        continue
            node[k] = ats[k]
        pprint.pprint(node)
        return node
    else:
        return None

In [7]:
def process_map(file_in, pretty = False):
    # You do not need to change this file
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data

In [None]:
%%capture
data = process_map(OSMFILE, False);

{'created': {'changeset': '640330',
             'timestamp': '2008-10-31T17:06:17Z',
             'uid': '34124',
             'user': 'Sunny',
             'version': '3'},
 'id': '29937735',
 'pos': [47.7120118, -122.276973],
 'type': 'node'}
{'created': {'changeset': '640330',
             'timestamp': '2008-10-31T17:06:18Z',
             'uid': '34124',
             'user': 'Sunny',
             'version': '3'},
 'id': '29937736',
 'pos': [47.7125604, -122.2770416],
 'type': 'node'}
{'created': {'changeset': '640330',
             'timestamp': '2008-10-31T17:06:18Z',
             'uid': '34124',
             'user': 'Sunny',
             'version': '3'},
 'id': '29937737',
 'pos': [47.7136057, -122.2773764],
 'type': 'node'}
{'created': {'changeset': '640330',
             'timestamp': '2008-10-31T17:06:18Z',
             'uid': '34124',
             'user': 'Sunny',
             'version': '3'},
 'id': '29937738',
 'pos': [47.7147434, -122.277754],
 'type': 'node'}
{'created': {'