In [48]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import pprint
import re
import codecs
import json
import os
from pymongo import MongoClient

In [38]:
OSMFILE = "orangecounty.osm"

In [7]:
def count_tags(filename):
    tags = {}
    for event, elem in ET.iterparse(filename):
        if elem.tag in tags:
            tags[elem.tag] += 1
        else: 
            tags[elem.tag] = 1        
    return tags

In [4]:
count_tags(OSMFILE)

{'bounds': 1,
 'member': 14494,
 'meta': 1,
 'nd': 971913,
 'node': 795527,
 'note': 1,
 'osm': 1,
 'relation': 1524,
 'tag': 755451,
 'way': 108197}

In [6]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

In [8]:
def key_type(element, keys):
    if element.tag == "tag":       
        if lower.search(element.attrib['k']):
            keys['lower'] += 1
        elif lower_colon.search(element.attrib['k']):
            keys['lower_colon'] += 1 
        elif problemchars.search(element.attrib['k']):
            keys['problemchars'] += 1
        else:
            keys['other'] += 1        
    return keys

In [15]:
def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)
    return keys

In [10]:
process_map(OSMFILE)

{'lower': 325600, 'lower_colon': 419432, 'other': 10408, 'problemchars': 11}

In [16]:
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons", "Circle", "Way"]

In [20]:
def audit_street_type(street_types, street_name):
    match = street_type_re.search(street_name)
    if match:
        street_type = match.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)

In [21]:
def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")

In [22]:
def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    return street_types

In [24]:
pprint.pprint(dict(audit(OSMFILE)))

{'100': set(['3347 Michelson Dr Suite 100']),
 '1725': set(['1725']),
 '203A': set(['Bolsa Avenue Suite 203A']),
 '26': set(['Vista Del Lago # 26']),
 '26705': set(['26705']),
 '44': set(['44']),
 '503': set(['503']),
 '630': set(['Irvine Center Drive, #630']),
 '66': set(['Route 66']),
 '7': set(['7']),
 '92627': set(['Newport Blvd, Costa Mesa, CA 92627']),
 'A': set(['Via Estrada, Unit A']),
 'Aberdeen': set(['Aberdeen']),
 'Abeto': set(['Paseo Abeto']),
 'Acres': set(['Glen Acres']),
 'Ada': set(['Ada']),
 'Aims': set(['Aims']),
 'Alameda': set(['La Alameda']),
 'Alamitos': set(['Alamitos']),
 'Albercon': set(['Avenida Albercon']),
 'Alder': set(['Speckled Alder']),
 'Alderwood': set(['Alderwood']),
 'Alhambra': set(['Alhambra']),
 'Allium': set(['Allium']),
 'Almondwood': set(['Almondwood']),
 'Alondra': set(['Alondra', 'Calle Alondra']),
 'Altazano': set(['Camino Altazano']),
 'Alter': set(['Alter']),
 'Amapola': set(['Avenida Amapola']),
 'Amberhill': set(['Amberhill']),
 'Ambien

In [25]:
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons", "Circle", "Way"]

mapping = { "St": "Street",
            "St.": "Street",
            "Rd." : "Road",
            "Ave" : "Avenue",
            "Dr" : "Drive",
            "Dr." : "Drive",
            "Pkwy" : "Parkway",
            "Pkwy." : "Parkway",
            "Rd" : "Road",
            "Av" : "Avenue",
            "Ave." : "Avenue",
            "Blvd" : "Boulevard",
            "Blvd." : "Boulevard",
            "Ln." : "Lane",
            "WAY" : "Way"
            }

In [26]:
pprint.pprint(dict(audit(OSMFILE)))

{'100': set(['3347 Michelson Dr Suite 100']),
 '1725': set(['1725']),
 '203A': set(['Bolsa Avenue Suite 203A']),
 '26': set(['Vista Del Lago # 26']),
 '26705': set(['26705']),
 '44': set(['44']),
 '503': set(['503']),
 '630': set(['Irvine Center Drive, #630']),
 '66': set(['Route 66']),
 '7': set(['7']),
 '92627': set(['Newport Blvd, Costa Mesa, CA 92627']),
 'A': set(['Via Estrada, Unit A']),
 'Aberdeen': set(['Aberdeen']),
 'Abeto': set(['Paseo Abeto']),
 'Acres': set(['Glen Acres']),
 'Ada': set(['Ada']),
 'Aims': set(['Aims']),
 'Alameda': set(['La Alameda']),
 'Alamitos': set(['Alamitos']),
 'Albercon': set(['Avenida Albercon']),
 'Alder': set(['Speckled Alder']),
 'Alderwood': set(['Alderwood']),
 'Alhambra': set(['Alhambra']),
 'Allium': set(['Allium']),
 'Almondwood': set(['Almondwood']),
 'Alondra': set(['Alondra', 'Calle Alondra']),
 'Altazano': set(['Camino Altazano']),
 'Alter': set(['Alter']),
 'Amapola': set(['Avenida Amapola']),
 'Amberhill': set(['Amberhill']),
 'Ambien

In [27]:
def update_name(name, mapping):
    street = street_type_re.search(name)
    if street:
        street_type = street.group()
        if street_type not in expected:
            name = re.sub(street_type_re, mapping[street_type], name)
    return name

In [81]:
def update_zip(zipcode):
    if len(zipcode) > 5:
        zipcode_re = re.compile(r'\d\-')
        if "CA" in zipcode:
            zipcode = re.sub('[CA]|\s', '', zipcode)
        if zipcode_re.search(zipcode):
            zipcode = zipcode.split('-', 1)[0]
    return zipcode

In [39]:
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]

In [40]:
def shape_element(element):
    node = {}
    if element.tag == "node" or element.tag == "way" :
        # YOUR CODE HERE
        node['type'] = element.tag
        for key in element.attrib:
            if key in CREATED:
                if 'created' not in node:
                    node['created'] = {}
                node['created'][key] = element.attrib[key]
            elif key in ['lat', 'lon']:
                if 'pos' not in node:
                    node['pos'] = [None, None]
                if key == 'lat':
                    node['pos'][0] = float(element.attrib[key])
                else:
                    node['pos'][1] = float(element.attrib[key])
            else:
                node[key] = element.attrib[key]
                
        for tag in element.iter('tag'):
            if not problemchars.search(tag.attrib['k']):
                if lower_colon.search(tag.attrib['k']):
                    if tag.attrib['k'].find('addr') == 0:
                        if 'address' not in node:
                            node['address'] = {}
                        sub_attr = tag.attrib['k'].split(':', 1)
                        node['address'][sub_attr[1]] = tag.attrib['v']
                    else:
                        node[tag.attrib['k']] = tag.attrib['v']
                elif tag.attrib['k'].find(':') == -1:
                    node[tag.attrib['k']] = tag.attrib['v']
        for nd in element.iter('nd'):
            if 'node_refs' not in node:
                node['node_refs'] = []
            node['node_refs'].append(nd.attrib['ref'])
        return node
    else:
        return None

In [41]:
def process_map(file_in, pretty = False):
    # You do not need to change this file
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data

In [42]:
process_map(OSMFILE)

[{'created': {'changeset': '195194',
   'timestamp': '2008-10-09T01:26:47Z',
   'uid': '28923',
   'user': 'California Bear',
   'version': '28'},
  'id': '10565814',
  'pos': [33.7832108, -118.0912621],
  'type': 'node'},
 {'created': {'changeset': '195194',
   'timestamp': '2008-10-09T01:26:48Z',
   'uid': '28923',
   'user': 'California Bear',
   'version': '28'},
  'id': '10565817',
  'pos': [33.7745522, -118.0855069],
  'type': 'node'},
 {'created': {'changeset': '180400',
   'timestamp': '2008-10-08T23:21:07Z',
   'uid': '28923',
   'user': 'California Bear',
   'version': '3'},
  'id': '10565824',
  'pos': [33.7983548, -118.0856233],
  'type': 'node'},
 {'created': {'changeset': '180400',
   'timestamp': '2008-10-08T23:21:06Z',
   'uid': '28923',
   'user': 'California Bear',
   'version': '4'},
  'id': '10565825',
  'pos': [33.801864, -118.082499],
  'type': 'node'},
 {'created': {'changeset': '666836',
   'timestamp': '2008-09-19T16:57:00Z',
   'uid': '28923',
   'user': 'Cali

Get the size of each of the files:

In [55]:
print format(os.path.getsize(OSMFILE)/1.0e6)
print format(os.path.getsize("orangecounty.osm.json")/1.0e6)

197.805094
212.607897


In [None]:
#This line was excecuted in the terminal
mongoimport --db osm --file orangecounty.osm.json

In [49]:
db = MongoClient('localhost:27017')['osm']['orangecounty.osm']

In [50]:
db

Collection(Database(MongoClient('localhost', 27017), u'osm'), u'orangecounty.osm')

How many documents?

In [51]:
db.find().count()

903724

How many unique users?

In [72]:
len(db.distinct("created.user"))

869

How many nodes?

In [62]:
db.find({"type":"node"}).count()

795519

How many ways?

In [61]:
db.find({"type":"way"}).count()

108177

Make a list of zip codes and sort by frequency

In [67]:
pprint.pprint(list(db.aggregate([{"$match" :  {"address.postcode" : {"$exists" : 1}}}, \
                           {"$group" : {"_id" : "$address.postcode", 
                                        "count" : {"$sum" : 1}}}, \
                           {"$sort" : {"count" : -1}}])))

[{u'_id': u'92630', u'count': 11378},
 {u'_id': u'92618', u'count': 5500},
 {u'_id': u'92620', u'count': 5144},
 {u'_id': u'92679', u'count': 1583},
 {u'_id': u'92602', u'count': 734},
 {u'_id': u'92782', u'count': 699},
 {u'_id': u'92610', u'count': 257},
 {u'_id': u'93630', u'count': 243},
 {u'_id': u'92692', u'count': 102},
 {u'_id': u'92866', u'count': 75},
 {u'_id': u'92626', u'count': 64},
 {u'_id': u'92614', u'count': 61},
 {u'_id': u'90620', u'count': 61},
 {u'_id': u'92780', u'count': 60},
 {u'_id': u'92802', u'count': 54},
 {u'_id': u'92606', u'count': 53},
 {u'_id': u'92675', u'count': 50},
 {u'_id': u'92663', u'count': 50},
 {u'_id': u'92677', u'count': 42},
 {u'_id': u'92881', u'count': 36},
 {u'_id': u'92691', u'count': 35},
 {u'_id': u'92660', u'count': 31},
 {u'_id': u'92676', u'count': 28},
 {u'_id': u'92672', u'count': 24},
 {u'_id': u'92612', u'count': 21},
 {u'_id': u'92648', u'count': 21},
 {u'_id': u'92653', u'count': 17},
 {u'_id': u'92708', u'count': 16},
 {u'_i

How many exist of each ammenity

In [82]:
pprint.pprint(list(db.aggregate([{"$match" :  {"amenity" : {"$exists" : 1}}}, \
                           {"$group" : {"_id" : "$amenity", "count" : {"$sum" : 1}}}, \
                           {"$sort" : {"count" : -1}}, \
                           {"$limit" : 10}])))

[{u'_id': u'parking', u'count': 1061},
 {u'_id': u'school', u'count': 640},
 {u'_id': u'restaurant', u'count': 522},
 {u'_id': u'fountain', u'count': 410},
 {u'_id': u'fast_food', u'count': 361},
 {u'_id': u'bench', u'count': 289},
 {u'_id': u'place_of_worship', u'count': 275},
 {u'_id': u'toilets', u'count': 271},
 {u'_id': u'fuel', u'count': 189},
 {u'_id': u'drinking_water', u'count': 188}]


How many of each leisure?

In [84]:
pprint.pprint(list(db.aggregate([{"$match" : {"leisure" : {"$exists" : 1}}}, \
                                {"$group" : {"_id" : "$leisure", "count" : {"$sum" : 1}}}, \
                                {"$sort" : {"count" : -1}}, \
                                {"$limit" : 10}])))

[{u'_id': u'garden', u'count': 1745},
 {u'_id': u'park', u'count': 864},
 {u'_id': u'pitch', u'count': 839},
 {u'_id': u'swimming_pool', u'count': 273},
 {u'_id': u'playground', u'count': 144},
 {u'_id': u'golf_course', u'count': 33},
 {u'_id': u'picnic_table', u'count': 30},
 {u'_id': u'sports_centre', u'count': 29},
 {u'_id': u'track', u'count': 21},
 {u'_id': u'court', u'count': 15}]


How many of each office type in the building:commercial tag?

In [93]:
pprint.pprint(list(db.aggregate([{"$match" : {"building" : {"$exists" : 1}, 
                                              "building" : "commercial"}}, \
                                {"$group" : {"_id" : "$office", "count" : {"$sum" : 1}}}, \
                                {"$sort" : {"count" : -1}}, \
                                {"$limit" : 10}])))

[{u'_id': None, u'count': 1744}, {u'_id': u'company', u'count': 1}]


How many of each sports area?

In [96]:
pprint.pprint(list(db.aggregate([{"$match" : {"sport" : {"$exists" : 1}}}, \
                                {"$group" : {"_id" : "$sport", "count" : {"$sum" : 1}}}, \
                                {"$sort" : {"count" : -1}}])))

[{u'_id': u'baseball', u'count': 360},
 {u'_id': u'tennis', u'count': 258},
 {u'_id': u'basketball', u'count': 114},
 {u'_id': u'swimming', u'count': 72},
 {u'_id': u'soccer', u'count': 52},
 {u'_id': u'american_football', u'count': 20},
 {u'_id': u'volleyball', u'count': 19},
 {u'_id': u'skateboard', u'count': 9},
 {u'_id': u'equestrian', u'count': 8},
 {u'_id': u'beachvolleyball', u'count': 6},
 {u'_id': u'athletics', u'count': 5},
 {u'_id': u'golf', u'count': 5},
 {u'_id': u'multi', u'count': 5},
 {u'_id': u'billiards', u'count': 5},
 {u'_id': u'football', u'count': 4},
 {u'_id': u'hockey', u'count': 4},
 {u'_id': u'bowls', u'count': 3},
 {u'_id': u'archery', u'count': 3},
 {u'_id': u'10pin', u'count': 2},
 {u'_id': u'canoe', u'count': 2},
 {u'_id': u'tenns', u'count': 1},
 {u'_id': u'horseshoe', u'count': 1},
 {u'_id': u'bowling', u'count': 1},
 {u'_id': u'horse_racing', u'count': 1},
 {u'_id': u'diving', u'count': 1},
 {u'_id': u'skating;soccer;basketball', u'count': 1},
 {u'_id':