Skip to content
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
228 lines (197 sloc) 7.64 KB
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
"""
Your task is to wrangle the data and transform the shape of the data
into the model we mentioned earlier. The output should be a list of dictionaries
that look like this:

{
    "id": "2406124091",
    "type": "node",
    "created": { ... },
    "pos": [41.9757030, -87.6921867],
    "address": {
        "housenumber": "5157",
        "postcode": "60625",
        "street": "North Lincoln Ave"
    },
    "amenity": "restaurant",
    "cuisine": "mexican",
    "name": "La Cabana De Don Luis",
    "phone": "1 (773)-271-5176"
}

You have to complete the function 'shape_element'.
We have provided a function that will parse the map file, and call the function with the element
as an argument. You should return a dictionary, containing the shaped data for that element.
We have also provided a way to save the data in a file, so that you could use
mongoimport later on to import the shaped data into MongoDB.

Note that in this exercise we do not use the 'update street name' procedures
you worked on in the previous exercise. If you are using this code in your final
project, you are strongly encouraged to use the code from the previous exercise to
update the street names before you save them to JSON.

In particular the following things should be done:
- you should process only 2 types of top level tags: "node" and "way"
- all attributes of "node" and "way" should be turned into regular key/value pairs, except:
    - attributes in the CREATED array should be added under a key "created"
    - attributes for latitude and longitude should be added to a "pos" array,
      for use in geospatial indexing. Make sure the values inside the "pos" array are floats
      and not strings.
- if a second level tag "k" value contains problematic characters, it should be ignored
- if a second level tag "k" value starts with "addr:", it should be added to a dictionary "address"
- if a second level tag "k" value does not start with "addr:", but contains ":", you can process it
  the same as any other tag.
- if there is a second ":" that separates the type/direction of a street,
  the tag should be ignored, for example:

<tag k="addr:housenumber" v="5158"/>
<tag k="addr:street" v="North Lincoln Avenue"/>
<tag k="addr:street:name" v="Lincoln"/>
<tag k="addr:street:prefix" v="North"/>
<tag k="addr:street:type" v="Avenue"/>
<tag k="amenity" v="pharmacy"/>

  should be turned into:

{...
"address": {
    "housenumber": "5158",
    "street": "North Lincoln Avenue"
},
"amenity": "pharmacy",
...
}

- for "way" specifically:

  <nd ref="305896090"/>
  <nd ref="1719825889"/>

should be turned into
"node_refs": ["305896090", "1719825889"]
"""
from collections import OrderedDict
# Patterns for classifying the "k" attribute of second-level <tag> elements.
# Keys made up only of lowercase letters and underscores.
lower = re.compile(r'^([a-z]|_)*$')
# Same alphabet, with exactly one ":" separator.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Any character that makes a key unsuitable for use as a field name.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
# Keys in the "addr:" namespace (e.g. "addr:street").
address_re = re.compile(r'^addr\:')
# Keys starting with "street" (address keys after the "addr:" prefix is removed).
street_re = re.compile(r'^street')

# Element attributes that are grouped under the "created" sub-document.
CREATED = ["version", "changeset", "timestamp", "user", "uid"]
# Element attributes that are combined into the "pos" [lat, lon] array.
position_attributes = ['lat', 'lon']
def shape_element(element):
    """Convert an OSM "node" or "way" element into a JSON-ready dictionary.

    Args:
        element: an ElementTree element produced while parsing the map file.

    Returns:
        None when the element is neither a "node" nor a "way". Otherwise a
        dict with:
          * "type": the element's tag name;
          * "created": the attributes named in CREATED;
          * "pos": [lat, lon] as floats, when both attributes are present;
          * every other attribute as a plain key/value pair;
          * "node_refs": the "ref" values of child <nd> elements;
          * "address": the child <tag> entries whose key starts with "addr:";
          * every other child <tag> as a plain key/value pair, unless its key
            contains a problematic character.

    Raises:
        ValueError: if "lat" or "lon" cannot be converted to float.
    """
    if element.tag not in ("node", "way"):
        return None

    node = {'type': element.tag}
    address = {}

    # Top-level attributes.
    for attribute, value in element.attrib.items():
        if attribute in CREATED:
            node.setdefault('created', {})[attribute] = value
        elif attribute in position_attributes:
            # lat/lon are emitted only through the "pos" array below.
            continue
        else:
            node[attribute] = value

    if 'lat' in element.attrib and 'lon' in element.attrib:
        node['pos'] = [float(element.get('lat')), float(element.get('lon'))]

    # Second-level elements: <nd ref="..."/> and <tag k="..." v="..."/>.
    for child in element:
        if child.tag == 'nd':
            refs = node.setdefault('node_refs', [])
            if 'ref' in child.attrib:
                refs.append(child.get('ref'))
            continue

        if child.tag != 'tag':
            continue
        k = child.get('k')
        v = child.get('v')
        if k is None or v is None:
            continue

        if problemchars.search(k):
            # Keys with problematic characters are dropped entirely.
            continue
        if address_re.search(k):
            # Strip only the leading "addr:" prefix; the rest of the key
            # (e.g. "street:name") is interpreted by _build_address.
            address[k[len('addr:'):]] = v
        else:
            # NOTE: a tag whose key is "type" overwrites the element type
            # stored above.
            node[k] = v

    if address:
        node['address'] = _build_address(address)

    return node


def _build_address(address):
    """Build the output "address" dict from "addr:" keys with the prefix removed.

    Keys containing "street:" (street name components) are not copied to the
    output. They are used only to assemble a "street" value when no explicit
    "street" key is present.
    """
    result = {}
    street = None
    street_parts = {}

    for key, val in address.items():
        if key == 'street':
            street = val
        elif 'street:' in key:
            street_parts[key.replace('street:', '')] = val
        else:
            result[key] = val

    if street:
        result['street'] = street
    elif street_parts:
        # Join whichever components are present, in reading order, instead of
        # raising KeyError when one of them is missing.
        joined = ' '.join(
            street_parts[part]
            for part in ('prefix', 'name', 'type')
            if part in street_parts
        )
        if joined:
            result['street'] = joined

    return result
def process_map(file_in, pretty=False):
    """Shape every element of an OSM XML file and write the results as JSON.

    Each element accepted by shape_element is written to "<file_in>.json",
    one JSON document per element followed by a newline.

    Args:
        file_in: path of the OSM XML file to parse.
        pretty: when true, indent each JSON document by 2 spaces; otherwise
            write the compact form.

    Returns:
        The list of shaped dictionaries, in the order they were written.
    """
    file_out = "{0}.json".format(file_in)
    data = []
    indent = 2 if pretty else None
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                # Exactly one write per element, in the requested format.
                fo.write(json.dumps(el, indent=indent) + "\n")
    return data
def test():
    """Run process_map on 'example.osm' and check the first and last results."""
    # NOTE: if you are running this code on your computer, with a larger dataset,
    # call the process_map procedure with pretty=False. The pretty=True option adds
    # additional spaces to the output, making it significantly larger.
    data = process_map('example.osm', True)

    correct_first_elem = {
        "id": "261114295",
        "visible": "true",
        "type": "node",
        "pos": [41.9730791, -87.6866303],
        "created": {
            "changeset": "11129782",
            "user": "bbmiller",
            "version": "7",
            "uid": "451048",
            "timestamp": "2012-03-28T18:31:23Z",
        },
    }
    assert data[0] == correct_first_elem
    assert data[-1]["address"] == {
        "street": "West Lexington St.",
        "housenumber": "1412",
    }
    assert data[-1]["node_refs"] == [
        "2199822281", "2199822390", "2199822392", "2199822369",
        "2199822370", "2199822284", "2199822281",
    ]
# Run the self-check when the file is executed as a script.
if __name__ == "__main__":
    test()