In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Your task is to use the iterative parsing to process the map file and
find out not only what tags are there, but also how many, to get the
feeling on how much of which data you can expect to have in the map.
Fill out the count_tags function. It should return a dictionary with the
tag name as the key and number of times this tag can be encountered in
the map as value.

Note that your code will be tested with a different data file than the 'example.osm'
"""
import xml.etree.cElementTree as ET
import pprint


def count_tags(filename):
    d = {}
    for event, elem in ET.iterparse(filename):
        # print elem.tag
        if elem.tag in d:
            d[elem.tag] += 1
        else:
            d[elem.tag] = 1
    # print pprint.pprint(d)
    return d


def test():
    osm_file = open("/Users/jpowell/Downloads/data/miami_florida.osm", "r")

    tags = count_tags(osm_file)
    pprint.pprint(tags)

if __name__ == "__main__":
    test()


{'bounds': 1,
 'member': 42426,
 'nd': 1807642,
 'node': 1516787,
 'osm': 1,
 'relation': 1523,
 'tag': 1477841,
 'way': 198244}


In [7]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
"""
Your task is to explore the data a bit more.
Before you process the data and add it into MongoDB, you should
check the "k" value for each "<tag>" and see if they can be valid keys in MongoDB,
as well as see if there are any other potential problems.

We have provided you with 3 regular expressions to check for certain patterns
in the tags. As we saw in the quiz earlier, we would like to change the data model
and expand the "addr:street" type of keys to a dictionary like this:
{"address": {"street": "Some value"}}
So, we have to see if we have such tags, and if we have any tags with problematic characters.
Please complete the function 'key_type'.
"""


lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    if element.tag == "tag":
        for tag in element.iter("tag"):
            value = tag.attrib['k'] 
            if lower.search(value):
                keys['lower'] += 1
#                 print 'lower'
#                 print value
            elif lower_colon.search(value):
                keys['lower_colon'] += 1
#                 print 'lower_colon'
#                 print value
            elif problemchars.search(value):
                keys['problemchars'] += 1
                print 'problemchars'
                print value
            else:
                keys['other'] += 1
#                 print 'other'
#                 print value

    return keys


def count_tags(element, dictionary):
    if element.tag == "tag":
        for tag in element.iter("tag"):
            value = tag.attrib["k"]
            # print elem.tag
            if value in dictionary:
                dictionary[value] += 1
            else:
                dictionary[value] = 1
        # print pprint.pprint(d)
    return dictionary


def process_map(filename):
    d = {}
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)
        d = count_tags(element, d)

    return keys, d


def test():
    # You can use another testfile 'map.osm' to look at your solution
    # Note that the assertions will be incorrect then.
    osm_file = open("/Users/jpowell/Downloads/data/miami_florida.osm", "r")

    keys = process_map(osm_file)
    pprint.pprint(keys)
    

if __name__ == "__main__":
    test()

problemchars
Gourmet Foods
problemchars
Moving From
({'lower': 602295, 'lower_colon': 829057, 'other': 46487, 'problemchars': 2},
 {'Company': 1,
  'FCC_ASR_Reg': 2,
  'FIXME': 1528,
  'FIXME:aeroway': 1,
  'FIXME:amenity': 10,
  'FIXME:building': 2,
  'FIXME:construction': 7,
  'FIXME:cycleway': 23,
  'FIXME:depth': 4,
  'FIXME:hgv': 39,
  'FIXME:highway': 6,
  'FIXME:lanes': 9,
  'FIXME:lcn_ref': 1,
  'FIXME:maxspeed': 193,
  'FIXME:name': 38,
  'FIXME:old_ref:1945': 40,
  'FIXME:old_ref:pre_1945': 4,
  'Gourmet Foods': 1,
  'HFCS': 11196,
  'ISO3166-1': 1,
  'ISO3166-1:alpha2': 1,
  'ISO3166-1:alpha3': 1,
  'ISO3166-1:numeric': 1,
  'ISO3166-2': 1,
  'Moving From': 1,
  'NHD:ComID': 103,
  'NHD:FCode': 103,
  'NHD:FDate': 12,
  'NHD:FTYPE': 12,
  'NHD:FType': 91,
  'NHD:Permanent_': 12,
  'NHD:RESOLUTION': 103,
  'NHD:ReachCode': 88,
  'NHD:way_id': 91,
  'NHS': 2473,
  'PKSCHOOL': 1,
  'abandoned:building': 1,
  'access': 4409,
  'addr:city': 20011,
  'addr:country': 18304,
  'addr

In [6]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
"""
Your task is to explore the data a bit more.
The first task is a fun one - find out how many unique users
have contributed to the map in this particular area!

The function process_map should return a set of unique user IDs ("uid")
"""

def get_user(element):
	if 'uid' in element.attrib.keys():
		user = element.attrib['uid']

		return user


def process_map(filename):
    users = set()
    user_list = []
    for _, element in ET.iterparse(filename):
	  	if 'uid' in element.attrib.keys():
			user = element.attrib['uid']
  			user_list.append(user)

    users = set(user_list)
    return users


def test():
    osm_file = open("/Users/jpowell/Downloads/data/miami_florida_sample.osm", "r")

    users = process_map(osm_file)
    print len(users)
    pprint.pprint(users)


if __name__ == "__main__":
    test()

668
set(['1007528',
     '1012362',
     '1018224',
     '102691',
     '103107',
     '103253',
     '1043009',
     '104519',
     '104962',
     '1051550',
     '105255',
     '10557',
     '1058308',
     '10786',
     '1087647',
     '108775',
     '1090081',
     '10927',
     '1093154',
     '109570',
     '1096309',
     '110263',
     '110489',
     '110639',
     '111159',
     '11154',
     '1119200',
     '11374',
     '115102',
     '11547',
     '115918',
     '1163648',
     '1163952',
     '1171517',
     '118021',
     '118048',
     '119748',
     '1198074',
     '1198303',
     '119881',
     '121264',
     '1214881',
     '12290',
     '123633',
     '1240849',
     '1244855',
     '1246157',
     '124815',
     '1249205',
     '1251419',
     '1260280',
     '128470',
     '12919',
     '129232',
     '1298694',
     '1306',
     '131059',
     '131200',
     '1318885',
     '1330847',
     '1339815',
     '13413',
     '1342842',
     '1342868',
     '1346164',
  

In [62]:
"""
Your task in this exercise has two steps:

- audit the OSMFILE and change the variable 'mapping' to reflect the changes needed to fix
    the unexpected street types to the appropriate ones in the expected list.
    You have to add mappings only for the actual problems you find in this OSMFILE,
    not a generalized solution, since that may and will depend on the particular area you are auditing.
- write the update_name function, to actually fix the street name.
    The function takes a string with street name as an argument and should return the fixed name
    We have provided a simple test so that you see what exactly is expected
"""
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint


street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
street_direction_re = re.compile(r'^\w+', re.IGNORECASE)

expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road",
            "Trail", "Parkway", "Commons", "Circle", "Crescent", "Highway", "Manor", "Terrace",
            "Trace", "Way", "Causeway", "Point", "State Road"]

expected_direction = ["North", "South", "East", "West", "Northeast", "Northwest", "Southeast", "Southwest"]

# UPDATE THIS VARIABLE
mapping = { "AVE": "Avenue",
            "Ave": "Avenue",
            "Ave.": "Avenue",
            "ave": "Avenue",
            "BLVD": "Boulevard",
            "Blvd": "Boulevard",
            "Blvd.": "Boulevard",
            "Cir": "Circle",
            "Cirlce": "Circle",
            "Cres": "Crescent",
            "Ct": "Court",
            "Dr": "Drive",
            "Druve": "Drive",
            "HWY": "Highway",
            "Hwy": "Highway",
            "Ln": "Lane",
            "Mnr": "Manor",
            "Pkwy": "Parkway",
            "Pl": "Place",
            "Pt": "Point",
            "ST": "Street",
            "st": "Street",
            "St": "Street",
            "Sr": "State Road",
            "street": "Street",
            "St.": "Street",
            "RD": "Road",
            "Rd": "Road",
            "Rd.": "Road",
            "Ter": "Terrace",
            "Trce": "Trace",
            "Trl": "Trail", 
            "NW": "Northwest",
            "Cv": "Cove",
            "Hwy-1": "US Highway 1",
            "Ste": "Suite",
            "Spgs" : ""
            }

mapping_direction = {
            "N": "North",
            "S": "South",
            "E": "East",
            "W": "West",
            "NE": "Northeast",
            "Norhwest": "Northwest",
            "NW": "Northwest",
            "SE": "Southeast",
            "SW": "Southwest",
            "sw": "Southwest",
            "southwest": "Southwest", 
            "St": "Saint",
            "Hwy": "Highway", 
            "royal": "Royal",
            "wellington": "Wellington"
}

def audit_street_type(street_types, street_name):
    m = street_type_re.search(update_name(street_name, mapping))
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


def audit_street_direction(street_directions, street_name):
    m = street_direction_re.search(update_name(street_name, mapping_direction))
    if m:
        street_direction = m.group()
        if street_direction not in expected_direction:
            street_directions[street_direction].add(street_name)

            
def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def is_zip_code(elem):
    return (elem.attrib['k'] == "addr:postcode")


def audit_type(osmfile):
    osm_file = osmfile
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])

    return street_types


def audit_direction(osmfile):
    osm_file = osmfile
    street_directions = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_direction(street_directions, tag.attrib['v'])
                if is_zip_code(tag):                  
                    zip_code = re.sub(r"\D", "", tag.attrib['v'])
                    print zip_code[0:5]
  
    return street_directions


def update_name(name, mapping):
    m = street_type_re.search(name)
    if m:
        street_type = m.group()
        if street_type in mapping:
            name = name.replace(street_type, mapping[street_type])

    a = street_direction_re.search(name)
    if a: 
        street_direction = a.group()
        if street_direction in mapping:
            name = name.replace(street_direction, mapping[street_direction])
            
    return name


def test():
    osmfile = open("/Users/jpowell/Downloads/data/miami_florida_sample.osm", "r")
    
#     st_types = audit_type(osmfile)
#     print len(st_types)
#     pprint.pprint(dict(st_types))


    st_directions = audit_direction(osmfile)
    print len(st_directions)
#     pprint.pprint(dict(st_directions))
    
    
#     for st_type, ways in st_types.iteritems():
#         for name in ways:
#             better_name = update_name(name, mapping)
#             print name, "=>", better_name

#     for st_direction, ways in st_directions.iteritems():
#         for name in ways:
#             print name
#             better_name = update_name(name, mapping_direction)
#             print better_name


if __name__ == '__main__':
    test()

33144
33444
33308
33130
33187
33143
33134
33060
33060
33316
33060
33132
33139
33139
33139
33139
33172
33029
33316
33065
33460
33133
33325
33189
33157
33062
33186
33012
33138
33155
33401
33155
33132
33156
33139
33461
33127
33144
33019
33180
33138
33004
33326
33326
33327
33327
33327
33327
33327
33327
33326
33327
33327
33326
33327
33327
33327
33326
33327
33327
33326
33327
33326
33326
33326
33326
33326
33327
33326
33327
33327
33326
33327
33327
33326
33327
33326
33326
33327
33326
33326
33326
33327
33327
33326
33326
33326
33326
33327
33327
33327
33327
33326
33326
33326
33327
33326
33327
33327
33327
33327
33327
33326
33327
33327
33326
33326
33326
33327
33327
33326
33326
33327
33327
33327
33327
33326
33327
33326
33326
33327
33327
33327
33327
33326
33326
33326
33326
33327
33327
33326
33326
33327
33327
33326
33326
33327
33326
33327
33327
33326
33326
33327
33326
33327
33327
33327
33327
33327
33326
33326
33326
33327
33327
33327
33326
33327
33327
33327
33327
33326
33327
33327
33326
33327
33327
3332

In [25]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
from collections import defaultdict
"""
Your task is to wrangle the data and transform the shape of the data
into the model we mentioned earlier. The output should be a list of dictionaries
that look like this:

{
"id": "2406124091",
"type: "node",
"visible":"true",
"created": {
          "version":"2",
          "changeset":"17206049",
          "timestamp":"2013-08-03T16:43:42Z",
          "user":"linuxUser16",
          "uid":"1219059"
        },
"pos": [41.9757030, -87.6921867],
"address": {
          "housenumber": "5157",
          "postcode": "60625",
          "street": "North Lincoln Ave"
        },
"amenity": "restaurant",
"cuisine": "mexican",
"name": "La Cabana De Don Luis",
"phone": "1 (773)-271-5176"
}

You have to complete the function 'shape_element'.
We have provided a function that will parse the map file, and call the function with the element
as an argument. You should return a dictionary, containing the shaped data for that element.
We have also provided a way to save the data in a file, so that you could use
mongoimport later on to import the shaped data into MongoDB.

Note that in this exercise we do not use the 'update street name' procedures
you worked on in the previous exercise. If you are using this code in your final
project, you are strongly encouraged to use the code from previous exercise to
update the street names before you save them to JSON.

In particular the following things should be done:
- you should process only 2 types of top level tags: "node" and "way"
- all attributes of "node" and "way" should be turned into regular key/value pairs, except:
    - attributes in the CREATED array should be added under a key "created"
    - attributes for latitude and longitude should be added to a "pos" array,
      for use in geospacial indexing. Make sure the values inside "pos" array are floats
      and not strings.
- if second level tag "k" value contains problematic characters, it should be ignored
- if second level tag "k" value starts with "addr:", it should be added to a dictionary "address"
- if second level tag "k" value does not start with "addr:", but contains ":", you can process it
  same as any other tag.
- if there is a second ":" that separates the type/direction of a street,
  the tag should be ignored, for example:

<tag k="addr:housenumber" v="5158"/>
<tag k="addr:street" v="North Lincoln Avenue"/>
<tag k="addr:street:name" v="Lincoln"/>
<tag k="addr:street:prefix" v="North"/>
<tag k="addr:street:type" v="Avenue"/>
<tag k="amenity" v="pharmacy"/>

  should be turned into:

{...
"address": {
    "housenumber": 5158,
    "street": "North Lincoln Avenue"
}
"amenity": "pharmacy",
...
}

- for "way" specifically:

  <nd ref="305896090"/>
  <nd ref="1719825889"/>

should be turned into
"node_refs": ["305896090", "1719825889"]
"""


lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

CREATED = ["version", "changeset", "timestamp", "user", "uid"]

MAPPING = { "AVE": "Avenue",
            "Ave": "Avenue",
            "Ave.": "Avenue",
            "ave": "Avenue",
            "BLVD": "Boulevard",
            "Blvd": "Boulevard",
            "Blvd.": "Boulevard",
            "Cir": "Circle",
            "Cirlce": "Circle",
            "Cres": "Crescent",
            "Ct": "Court",
            "Dr": "Drive",
            "Druve": "Drive",
            "HWY": "Highway",
            "Hwy": "Highway",
            "Ln": "Lane",
            "Mnr": "Manor",
            "Pkwy": "Parkway",
            "Pl": "Place",
            "Pt": "Point",
            "ST": "Street",
            "st": "Street",
            "St": "Street",
            "Sr": "State Road",
            "street": "Street",
            "St.": "Street",
            "RD": "Road",
            "Rd": "Road",
            "Rd.": "Road",
            "Ter": "Terrace",
            "Trce": "Trace",
            "Trl": "Trail", 
            "Spgs" : ""
            }



def update_name(name, mapping):
    m = street_type_re.search(name)
    if m:
        street_type = m.group()
        if street_type in mapping:
            name = name.replace(street_type, mapping[street_type])

    return name


def shape_element(element):
    node = {"created": {}, "type": 'node', "address": {}, "node_refs": []}
    if element.tag == "node" or element.tag == "way":
        node["type"] = element.tag
        # print element.tag
        # print element.attrib.keys()
        for key in element.attrib.keys():
            if key in CREATED:
                node["created"][key] = element.attrib[key]
            elif key == "lat" or key == "lon":
                node["pos"] = []
                lat = float(element.attrib["lat"])
                lon = float(element.attrib["lon"])
                node["pos"].extend((lat, lon))
            else:
                node[key] = element.attrib[key]

        for child in element:
            if child.tag == "tag":
                if problemchars.search(child.attrib["k"]):
                    continue
                elif child.attrib["k"].startswith("addr:"):
                    addr_attribute = child.attrib["k"].replace("addr:", "")
                    if not lower_colon.search(addr_attribute):
                        if addr_attribute == "full":
                            continue
                        elif addr_attribute == "street":
#                             print addr_attribute
#                             print child.attrib["v"]
#                             print update_name(child.attrib["v"], MAPPING)
                            node["address"][addr_attribute] = update_name(child.attrib["v"], MAPPING)
#                             print node["address"]
                        else:
                            node["address"][addr_attribute] = child.attrib["v"]
#                             print node
                    print node["address"]
                else: 
                    node[child.attrib["k"]] = child.attrib["v"]
            elif child.tag == "nd":
                node["node_refs"].append(child.attrib["ref"])
            else:
                print child.tag, child.attrib

        if not bool(node["address"]):
            node.pop("address", None)
        if not bool(node["node_refs"]):
            node.pop("node_refs", None)

#         print node
        return node
    else:
        return None


def process_map(file_in, pretty=False):
    # You do not need to change this file
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data


def test():
    # NOTE: if you are running this code on your computer, with a larger dataset,
    # call the process_map procedure with pretty=False. The pretty=True option adds
    # additional spaces to the output, making it significantly larger.
    osmfile = "/Users/jpowell/Downloads/data/miami_florida_sample.osm"
    data = process_map(osmfile, False)
    #pprint.pprint(data)


if __name__ == "__main__":
    test()


{'city': 'Miami'}
{'city': 'Miami', 'street': 'Southwest 78th Place'}
{'city': 'Miami', 'street': 'Southwest 78th Place', 'postcode': '33144'}
{'city': 'Delray Beach'}
{'city': 'Delray Beach', 'state': 'FL'}
{'city': 'Delray Beach', 'state': 'FL', 'street': 'North Swinton Avenue'}
{'city': 'Delray Beach', 'state': 'FL', 'street': 'North Swinton Avenue', 'postcode': '33444'}
{'city': 'Delray Beach', 'state': 'FL', 'street': 'North Swinton Avenue', 'housenumber': '51', 'postcode': '33444'}
{'city': 'Fort Lauderdale'}
{'city': 'Fort Lauderdale', 'state': 'FL'}
{'city': 'Fort Lauderdale', 'state': 'FL', 'street': 'Federal Highway'}
{'city': 'Fort Lauderdale', 'state': 'FL', 'street': 'Federal Highway', 'postcode': '33308'}
{'city': 'Fort Lauderdale', 'state': 'FL', 'street': 'Federal Highway', 'housenumber': '4725', 'postcode': '33308'}
{'state': 'FL'}
{'state': 'FL'}
{'state': 'FL'}
{'state': 'FL'}
{'state': 'FL'}
{'state': 'FL'}
{'state': 'FL'}
{'state': 'FL'}
{'state': 'FL'}
{'state': '

In [2]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

# OSM_FILE = "osm_file.osm"  # Replace this with your osm file
OSM_FILE = "/Users/jpowell/Downloads/data/miami_florida.osm"
SAMPLE_FILE = "/Users/jpowell/Downloads/data/miami_florida_sample.osm"


def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    # Write every 10th top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % 10 == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')