In [4]:
#This cell reads the OSM file and looks for anomalies in the data.
#In particular, it analyses postcodes and how contact information is
#stored in the file


#!/usr/bin/env python
# -*- coding: utf-8 -*-

import xml.etree.cElementTree as ET
from collections import defaultdict
import re

# Input OSM extract, opened once at module level; audit() below iterates over
# this handle with ET.iterparse. Nothing in this cell closes it explicitly.
osm_file = open("milan_italy.osm", "r")

# Cleaning plan: the keys phone, fax, website and url should all become "contact:X".
# Tags whose elem.attrib['k'] is "url" or "website" should become "contact:website".

# TODO: move the gym, lotto, nightclub and picnic_table values from amenity to leisure
# TODO: fix the 14 restaurants that are tagged under building

#regex matching postal codes that are too short (only 1 to 4 digits)
postcode_type_re = re.compile(r'^\d{1,4}$', re.IGNORECASE)

#tallies (value -> number of occurrences) for short postcodes and for contact
#info stored outside the "contact:" namespace
wrong_mails = defaultdict(int)
wrong_faxes = defaultdict(int)
wrong_phones = defaultdict(int)
wrong_websites = defaultdict(int)
wrong_postcodes = defaultdict(int)
wrong_postcodes_cities = []

#this function adds a wrong postcode (<5 digits) to the given tally
def audit_postcode_type(wrong_postcode, postcode):
    """Count `postcode` in the tally `wrong_postcode` if it has fewer than 5 digits.

    Args:
        wrong_postcode: mutable mapping of postcode -> count (e.g. a
            defaultdict(int)); it is updated in place.
        postcode: the postcode string to check.

    The tally passed by the caller is the one that gets updated. Previously
    the argument was overwritten and the module-level `wrong_postcodes` was
    always used, which only worked because the caller happened to pass it.
    """
    match = postcode_type_re.search(postcode)
    if match:
        wrong_postcode[match.group()] += 1

#this function tallies one contact value that is stored under a non-"contact:" key
def audit_contact_type(contact_types, name):
    """Increase by one the count kept for `name` in the `contact_types` tally."""
    contact_types[name] = contact_types[name] + 1

def print_sorted_dict(d):
    """Print one "key: count" line per entry of `d`, ordered case-insensitively by key."""
    for key in sorted(d, key=lambda s: s.lower()):
        print("%s: %d" % (key, d[key]))

#helper functions to check types of tag
def is_mail_contact(elem):
    """Return True when `elem` is a <tag> element whose 'k' attribute is "mail"."""
    if elem.tag != "tag":
        return False
    return elem.attrib['k'] == "mail"

def is_fax_contact(elem):
    """Return True when `elem` is a <tag> element whose 'k' attribute is "fax"."""
    if elem.tag != "tag":
        return False
    return elem.attrib['k'] == "fax"

def is_phone_contact(elem):
    """Return True when `elem` is a <tag> element whose 'k' attribute is "phone"."""
    if elem.tag != "tag":
        return False
    return elem.attrib['k'] == "phone"

def is_website_contact(elem):
    """Return True when `elem` is a <tag> element whose 'k' attribute is "website"."""
    if elem.tag != "tag":
        return False
    return elem.attrib['k'] == "website"

def is_url_contact(elem):
    """Return True when `elem` is a <tag> element whose 'k' attribute is "url"."""
    if elem.tag != "tag":
        return False
    return elem.attrib['k'] == "url"

def is_postcode_name(elem):
    """Return True when `elem` is a <tag> element whose 'k' attribute is "addr:postcode"."""
    if elem.tag != "tag":
        return False
    return elem.attrib['k'] == "addr:postcode"


#audit function: tallies short postcodes and contact info (mail, fax, phone,
#website/url) stored outside the "contact:" namespace, then prints every tally
def audit():
    """Scan the OSM file once and print the audit tallies.

    Each parsed element is checked against the is_* helpers; the 'v' value of
    a matching tag is counted in the corresponding module-level tally. After
    the scan the tallies are printed in this order: postcodes, mails, faxes,
    phones, websites.
    """
    for _, elem in ET.iterparse(osm_file):
        # A tag has exactly one key, so at most one of these branches applies.
        if is_mail_contact(elem):
            audit_contact_type(wrong_mails, elem.attrib['v'])
        elif is_fax_contact(elem):
            audit_contact_type(wrong_faxes, elem.attrib['v'])
        elif is_phone_contact(elem):
            audit_contact_type(wrong_phones, elem.attrib['v'])
        elif is_website_contact(elem) or is_url_contact(elem):
            audit_contact_type(wrong_websites, elem.attrib['v'])
        elif is_postcode_name(elem):
            audit_postcode_type(wrong_postcodes, elem.attrib['v'])
        # The element is not needed once inspected: drop its attributes and
        # children so the parsed data of the whole file is not kept in memory.
        elem.clear()

    for tally in (wrong_postcodes, wrong_mails, wrong_faxes,
                  wrong_phones, wrong_websites):
        print_sorted_dict(tally)


if __name__ == '__main__':
    audit()


+39 02 22476566: 1
+39 02 3311032: 1
+39 02 88462541: 1
+39 02 88462822: 1
+39 02 22478383: 1
+39 02 3311859: 1
+39 02 6416331: 1
+39 02 6597732: 1
+39 02 6706063: 1
+39 02 6884305: 1
+39 02 6886449: 1
+39 02 76367311: 1
+39 02 7636901: 1
+39 03 9793944: 1
http://farmaciasangiovannisnc.lafarmacia.biz/: 1
http://www.bethshlomo.it/: 1
http://www.comune.cormano.mi.it: 1
http://www.comune.rodano.mi.it/: 1
http://www.eataly.net/: 1
http://www.epn.lloydsfarmacia.it/stores/farmacie-milano/milano-64: 1
http://www.farmaciacenisio.it/: 1
http://www.ilgigante.it/: 1
http://www.milanosport.it/impianto/16/de-marchi/28/: 1
http://www.teatromanzoni.it/manzoni/: 1
www.comune.bussero.mi.it: 1


In [5]:
# Correction step: apply the fixes to the problems listed by the audit cell.

# Maps each 4-digit postcode to the 5-digit value it is replaced with.
postcodes_tofix = {
    "2090":"20090",
    "2121":"20121",
    "2043":"20143",
    "2014":"20124",
    "2009":"20092",
    "2003":"20030"
}

# No imports in this cell: it uses ET and the is_* helpers that the audit
# cell defines, so that cell has to be run first.

# Reopen the input file; correct() below iterates over this handle.
osm_file = open("milan_italy.osm", "r")

def correct_postcode(postcode, corrections=None):
    """Return the corrected value for `postcode`, or None if no fix is known.

    Args:
        postcode: the postcode string read from the file.
        corrections: optional mapping of wrong -> corrected postcode; defaults
            to the module-level `postcodes_tofix`.

    Returns:
        The corrected postcode, or None when `postcode` is not a key of the
        mapping (callers check membership before calling).
    """
    if corrections is None:
        corrections = postcodes_tofix
    # Direct lookup instead of scanning every (wrong, corrected) pair.
    return corrections.get(postcode)

def correct_contact_type(contact_type):
    """Return `contact_type` prefixed with "contact:" (e.g. "phone" -> "contact:phone")."""
    prefix = "contact:"
    return "".join((prefix, contact_type))


def correct():
    """Walk the OSM file, apply the contact-key and postcode fixes, print fixed postcodes.

    Tags keyed mail, fax, phone, website or url get the "contact:" prefix, and
    postcodes listed in `postcodes_tofix` are replaced and printed. The
    corrected elements are not written anywhere by this function.
    """
    for _, elem in ET.iterparse(osm_file):
        # A tag has exactly one key, so the contact and postcode cases are exclusive.
        if (is_mail_contact(elem) or is_fax_contact(elem)
                or is_phone_contact(elem) or is_website_contact(elem)
                or is_url_contact(elem)):
            elem.attrib['k'] = correct_contact_type(elem.attrib['k'])
        elif is_postcode_name(elem) and elem.attrib['v'] in postcodes_tofix:
            elem.attrib['v'] = correct_postcode(elem.attrib['v'])
            print(elem.attrib['v'])
        # Nothing reads the element afterwards; release its data so the whole
        # file's content does not pile up in memory.
        elem.clear()


if __name__ == '__main__':
    correct()


In [10]:
#This cell reads the complete OSM file,
#applies the corrections identified in the audit part and creates a JSON file
#that can be loaded into the DB


#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.cElementTree as ET
import pprint
import re
import codecs
import json
"""
Your task is to wrangle the data and transform the shape of the data
into the model we mentioned earlier. The output should be a list of dictionaries
that look like this:

{
"id": "2406124091",
"type": "node",
"visible":"true",
"created": {
          "version":"2",
          "changeset":"17206049",
          "timestamp":"2013-08-03T16:43:42Z",
          "user":"linuxUser16",
          "uid":"1219059"
        },
"pos": [41.9757030, -87.6921867],
"address": {
          "housenumber": "5157",
          "postcode": "60625",
          "street": "North Lincoln Ave"
        },
"amenity": "restaurant",
"cuisine": "mexican",
"name": "La Cabana De Don Luis",
"phone": "1 (773)-271-5176"
}

"""
# Input OSM extract, opened at module level; test() below uses it as the
# input for process_map().
osm_file = open("milan_italy.osm", "r")

# Dictionary used to fix the wrong postcodes found in the audit part:
# each 4-digit key is replaced by its 5-digit value.
postcodes_tofix = {
    "2090":"20090",
    "2121":"20121",
    "2043":"20143",
    "2014":"20124",
    "2009":"20092",
    "2003":"20030"
}

#function to correct postcodes
def correct_postcode(postcode, corrections=None):
    """Look up the corrected value for `postcode`.

    Args:
        postcode: the postcode string read from the file.
        corrections: optional mapping of wrong -> corrected postcode; when
            omitted, the module-level `postcodes_tofix` is used.

    Returns:
        The corrected postcode, or None when the mapping has no entry for
        `postcode` (callers check membership before calling).
    """
    mapping = postcodes_tofix if corrections is None else corrections
    # Single dictionary lookup rather than a loop over all pairs.
    return mapping.get(postcode)

#function to standardize contact information so the tag key is "contact:X" instead of just X;
#with that correspondence in place, a dictionary holding only the contact
#information can be built later
def correct_contact_type(contact_type):
    """Return the namespaced key for `contact_type`, i.e. "contact:" followed by it."""
    namespaced = "".join(("contact:", contact_type))
    return namespaced

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'\"\?%#$@\,\. \t\r\n]')

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]

NODE = ["id", "version", "visible", "changeset", "timestamp", "user", "uid", "lat", "lon"]

WAY = ["id", "version", "visible", "changeset", "timestamp", "user", "uid"]

#Shape function that reads nodes and ways and creates dictionaries as explained above
def shape_element(element):
    """Convert a <node> or <way> element into a JSON-ready dictionary.

    Args:
        element: an ElementTree element.

    Returns:
        None for any element that is not a node or a way. Otherwise a dict with:
          * "type": the element tag ("node" or "way");
          * "created": the attributes listed in CREATED;
          * "pos": [lat, lon] as floats, only when both attributes are present
            (ways carry no coordinates, so they get no "pos");
          * every other attribute under its own name;
          * "address": the "addr:X" tags (exactly one colon), keyed by X;
          * "contact": the "contact:X" tags, keyed by X;
          * every other tag, keyed by the part of its key before the first colon;
          * "node_refs": the 'ref' of each <nd> child, when there is any.

    Tags whose key matches `problemchars` are skipped. Attributes and the
    structural fields are written after the tags so that a tag such as
    k="type" cannot replace them.
    """
    if element.tag not in ("node", "way"):
        return None

    node = {}
    address = {}
    contact = {}
    for child in element.iter("tag"):
        key = child.attrib['k']
        value = child.attrib['v']
        # Keys with problematic characters are not valid field names: skip them.
        if problemchars.search(key):
            continue
        if key.startswith('addr:'):
            # Only plain "addr:X" keys are kept; deeper ones ("addr:a:b") are dropped.
            if key.count(':') == 1:
                address[key[5:]] = value
        elif key.startswith('contact:'):
            contact[key[8:]] = value
        else:
            # NOTE: namespaced keys collapse onto their prefix ("name:en" -> "name"),
            # so the last such tag wins.
            node[key.split(':')[0]] = value

    created = {}
    lat = lon = None
    for name, value in element.attrib.items():
        if name in CREATED:
            created[name] = value
        elif name == "lat":
            lat = float(value)
        elif name == "lon":
            lon = float(value)
        else:
            node[name] = value

    node['type'] = element.tag
    node['created'] = created
    if lat is not None and lon is not None:
        node['pos'] = [lat, lon]
    if address:
        node["address"] = address
    if contact:
        node["contact"] = contact

    node_refs = [nd.attrib['ref'] for nd in element.iter("nd")]
    if node_refs:
        node['node_refs'] = node_refs

    return node


def process_map(file_in, pretty = False):
    """Clean the OSM data in `file_in` and write it as JSON, one document per element.

    Args:
        file_in: path of the OSM file, or an open file object for it.
        pretty: when True, each document is written indented (larger output).

    Returns:
        The list of shaped node/way dictionaries that were written.

    The output goes to "<input name>.json". For a file object its `name`
    attribute is used, so the output is named after the input file rather
    than after the object's repr.
    """
    source_name = getattr(file_in, "name", file_in)
    file_out = "{0}.json".format(source_name)
    indent = 2 if pretty else None
    data = []
    # The with-block closes the output file; no explicit close() is needed.
    with codecs.open(file_out, "w") as fo:
        for _, elem in ET.iterparse(file_in):
            # Child <tag> elements are corrected as they are parsed, i.e.
            # before their parent node/way is shaped below.
            if (is_mail_contact(elem) or is_fax_contact(elem)
                    or is_phone_contact(elem) or is_website_contact(elem)
                    or is_url_contact(elem)):
                elem.attrib['k'] = correct_contact_type(elem.attrib['k'])
            elif is_postcode_name(elem) and elem.attrib['v'] in postcodes_tofix:
                elem.attrib['v'] = correct_postcode(elem.attrib['v'])

            el = shape_element(elem)
            if el:
                data.append(el)
                fo.write(json.dumps(el, indent=indent) + "\n")
                # The shaped dict holds everything needed from this node/way;
                # release the element and its children to limit memory use.
                elem.clear()
    return data

def test():
    """Run process_map on the input OSM file with pretty-printed output."""
    # NOTE: if you are running this code on your computer, with a larger dataset,
    # call the process_map procedure with pretty=False. The pretty=True option adds
    # additional spaces to the output, making it significantly larger.
    #
    # process_map derives the output file name from its first argument, so it
    # is given the file's name rather than the open file object itself.
    source = getattr(osm_file, "name", osm_file)
    process_map(source, True)


if __name__ == "__main__":
    test()

In [14]:
#this code runs some queries over the OpenStreet data loaded in the MongoDB database


from pymongo import MongoClient
from pprint import pprint
import pymongo

client = MongoClient("localhost", 27017)

# Use shortcut to access the database
db = client.OpenStreet

# Use shortcut to access the collection
coll = db.Milan

# Number of documents
print("Number of documents")
print(coll.find().count())

# Number of nodes
print("Number of nodes")
print(coll.find({"type": "node"}).count())

# Number of ways
print("Number of ways")
print(coll.find({"type": "way"}).count())

# Number of unique users
print("Number of unique users")
print(len(coll.distinct("created.user")))

# Top 1 contributing user
print("Most contributing user")
aggr = coll.aggregate([
    {"$group": {"_id": "$created.user", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
    {"$limit": 1},
])
pprint(list(aggr))

# Average number of documents per contributing user.
# The label used to repeat "Most contributing user" from the query above.
print("Average number of documents per user")
aggr = coll.aggregate([
    {"$group": {"_id": "$created.user", "count": {"$sum": 1}}},
    {"$group": {"_id": None, "avg_count": {"$avg": "$count"}}},
])
pprint(list(aggr))

# Count documents per amenity value, most frequent first
# (documents without an amenity are grouped under None)
print("Count of amenities in the data")
aggr = coll.aggregate([
    {"$group": {"_id": "$amenity", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
])
pprint(list(aggr))

# Count restaurants in the city of Milan per cuisine, most frequent first
print("Count of different cuisines in resto in Milan city")
aggr = coll.aggregate([
    {"$match": {"address.city": "Milano", "amenity": "restaurant"}},
    {"$group": {"_id": "$cuisine", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
])
pprint(list(aggr))

# Number of distinct opening_hours values present in the collection
print("Check how many times opening hours are there")
print(len(coll.distinct("opening_hours")))
    

Number of documents
4016658
Number of nodes
3484193
Number of ways
532392
Number of unique users
2410
Most contributing user
[{u'_id': u'Alecs01', u'count': 594237}]
Most contributing user
[{u'_id': None, u'avg_count': 1666.663070539419}]
Count of amenities in the data
[{u'_id': None, u'count': 3973821},
 {u'_id': u'parking', u'count': 13640},
 {u'_id': u'bench', u'count': 4633},
 {u'_id': u'waste_basket', u'count': 4034},
 {u'_id': u'restaurant', u'count': 2281},
 {u'_id': u'cafe', u'count': 1576},
 {u'_id': u'place_of_worship', u'count': 1179},
 {u'_id': u'school', u'count': 1179},
 {u'_id': u'drinking_water', u'count': 1137},
 {u'_id': u'bank', u'count': 1025},
 {u'_id': u'bicycle_parking', u'count': 1020},
 {u'_id': u'fuel', u'count': 1018},
 {u'_id': u'bar', u'count': 986},
 {u'_id': u'pharmacy', u'count': 737},
 {u'_id': u'vending_machine', u'count': 577},
 {u'_id': u'fast_food', u'count': 512},
 {u'_id': u'telephone', u'count': 503},
 {u'_id': u'post_box', u'count': 498},
 {u'_i

1561
