## P3: Wrangling with OpenStreetMap Data

OpenStreetMap is an open source project, contains a large number of city maps where users can add, remove and edit local places and streets. It is directly connected with Google Maps which would help improving maps.
The challange is to insure the correctency and constincy of the provided dataset.

In this project, Southampton street map is analysed which it's size is over 45 MB as requested. I have chose this city because I have been there and I loved it.

Southamptom dataset is available to download from: https://mapzen.com/data/metro-extracts/metro/southampton_england/

#this tag to load py into Jupyter and run against the dataset: %load mapparser.py

In [2]:
#%load mapparser.py
"""""
1- find documentatin of osm xml format in wiki
    https://wiki.openstreetmap.org/wiki/Main_Page
    http://wiki.openstreetmap.org/wiki/OSM_XML
"""""
import xml.etree.ElementTree as ET
import pprint

OSM_FILE='southampton.osm'

def count_tags(filename):
    tag_count = {}
    for _, element in ET.iterparse(filename, events=("start",)):
        add_tag(element.tag, tag_count)
    return tag_count

def add_tag(tag, tag_count):
    if tag in tag_count:
        tag_count[tag] += 1
    else:
        tag_count[tag] = 1

def test():

    tags = count_tags(OSM_FILE)
    pprint.pprint(tags)
    """
    assert tags == {'bounds': 1,
                     'member': 3,
                     'nd': 4,
                     'node': 20,
                     'osm': 1,
                     'relation': 1,
                     'tag': 7,
                     'way': 1}

    """

if __name__ == "__main__":
    print ("Conut tags in southampton.osm")
    test()

Conut tags in southampton.osm
{'bounds': 1,
 'member': 12072,
 'nd': 375357,
 'node': 272642,
 'osm': 1,
 'relation': 1175,
 'tag': 204363,
 'way': 51358}


In [3]:
#%load users.py

import xml.etree.ElementTree as ET
import pprint
import re
"""
Your task is to explore the data a bit more.
The first task is a fun one - find out how many unique users
have contributed to the map in this particular area!

The function process_map should return a set of unique user IDs ("uid") and user
"""

def get_user(element):
    return


def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        if 'uid' in element.attrib:
            users.add(element.get('user'))
            users.add(element.get('uid'))

    return users


def test():
    users = process_map('southampton.osm')
    pprint.pprint(users)
    #assert len(users) == 6



if __name__ == "__main__":
    test()

{'0123456789',
 '100597',
 '101043',
 '101150',
 '1030',
 '103253',
 '104459',
 '1051550',
 '1061999',
 '106914',
 '107237',
 '107617',
 '10845',
 '10983',
 '110263',
 '111135',
 '11126',
 '113818',
 '1150946',
 '1158381',
 '115894',
 '1164',
 '1165720',
 '116727',
 '116979',
 '117013',
 '118021',
 '1185',
 '120664',
 '1213696',
 '1214803',
 '1217467',
 '1221633',
 '1233572',
 '124414',
 '1244489',
 '124550',
 '1246157',
 '1246869',
 '1249205',
 '12660',
 '12671',
 '127060',
 '128186',
 '128322',
 '1293386',
 '12935',
 '1295',
 '12992',
 '13013',
 '1302643',
 '131236',
 '13203',
 '13241',
 '132439',
 '133272',
 '13334',
 '134778',
 '1362674',
 '1362708',
 '1362719',
 '1362734',
 '1362779',
 '1365690',
 '1366222',
 '1369743',
 '14020',
 '1404136',
 '141442',
 '1425497',
 '1428769',
 '145231',
 '1473',
 '1474063',
 '1479299',
 '14846',
 '1497116',
 '1518807',
 '152289',
 '152305',
 '1534261',
 '1540938',
 '154584',
 '156284',
 '1566',
 '1566481',
 '1569426',
 '1589260',
 '159575',
 '1616

In [4]:
# %load tags.py

import xml.etree.ElementTree as ET
import pprint
import re


lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    """ 
    Count the criteria in dictionary for the content of the tag.
    """
    if element.tag == "tag":
        if lower.search(element.attrib['k']):
            keys['lower'] +=1
        elif lower_colon.search(element.attrib['k']):
            keys['lower_colon']+=1
        elif problemchars.search(element.attrib['k']):
            keys['problemchars']+=1
        else:
            keys['other']+=1
        
    return keys



def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys



def test():
    # You can use another testfile 'map.osm' to look at your solution
    # Note that the assertions will be incorrect then.
    keys = process_map('southampton.osm')
    pprint.pprint(keys)
#     assert keys == {'lower': 5, 'lower_colon': 0, 'other': 2, 'problemchars': 0}


if __name__ == "__main__":
    test()


{'lower': 110892, 'lower_colon': 90941, 'other': 2528, 'problemchars': 2}


In [5]:
#%load audit_street.py
"""
2- AUDIT DATA - Street names

"""	

import xml.etree.cElementTree as ET
from collections import defaultdict
# python regular expression module
import re 
import pprint
OSM_FILE = open("southampton.osm", "r")

# \S: matches any non-whitespace character
# \.: matches any character except a newline
# $: Occur at the end of string.

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
street_types = defaultdict(set)

expected =["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road","Trail", "Parkway", "Commons", "Circle", "Way"]

def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        #street_types[street_type] += 1
        if street_type not in expected:
            #tracking unusual street type
            street_types[street_type].add(street_name)
            #return True if need to be updated
            return True
def print_sorted_dict(d):
    keys = d.keys()
    keys = sorted(keys, key=lambda s: s.lower())
    for k in keys:
        v = d[k]
        print ("%s: %d" % (k, v) )
# checking k attribute in same tag.
def is_street_name(elem):
    #return (elem.tag == "tag") and (elem.attrib['k'] == "addr:street")
    return (elem.attrib['k'] == "addr:street") or (elem.attrib['k'] == "addr:full")

#create a records of all street types that we find in .osm dataset
def audit():
    # start means start tag. so generate next tag (elem "object type")
    for event, elem in ET.iterparse(OSM_FILE, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            #return iteration all of sub tags ( nested in elem object) named tag only.
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                # v is an attribute in tag
                    audit_street_type(street_types, tag.attrib['v'])    
    #print_sorted_dict(street_types) # uncomment if wanted to count the frequancey of each name.   
    pprint.pprint(dict(street_types))

if __name__ == '__main__':
    audit()

{'2HW': {'11-13 and 19-22 Market Buildings, High Road, Swaythling, '
         'Southampton, SO16 2HW',
         '2-3 Market Buildings, High Road, Swaythling, Southampton, SO16 2HW',
         '5 Market Buildings, High Road, Swaythling, Southampton, SO16 2HW',
         'Market Buildings, High Road, Swaythling, Southampton, SO16 2HW'},
 '387': {'387'},
 'Bridge': {'Horseshoe Bridge'},
 'Broadway': {'Midanbury Broadway', 'The Broadway'},
 'Buildings': {'Hanover Buildings', 'Gordon Buildings', 'Market Buildings'},
 'Centre': {'The Marlands Shopping Centre', 'Townhill Farm District Centre'},
 'Cloisters': {'The Cloisters'},
 'Close': {'Abbotsfield Close',
           'Aldermoor Close',
           'Andes Close',
           'Arnheim Close',
           'Ashridge Close',
           'Atlantic Close',
           'Bassett Close',
           'Bealing Close',
           'Beaumont Close',
           'Bembridge Close',
           'Berkeley Close',
           'Berwick Close',
           'Bevan Close',
  


##### Develop plan for cleaning.
I guess 2HW is an area in southampton so I googled the address
where the result met my expectation so I will convert it to High Road which is the correct strret name.
No idea about the meaning of S and re, so I will ignore them.

mapping = { "2HW": "HighRoad",
            "Raod": "Road",
            "Rd": "Road",
            "Street)":"Street",
            "road":"Road",
            "Road Westal":"Road West"
          }

In [6]:
# %load improve_street.py

"""""
4- Improving Street Names
"""""
import xml.etree.cElementTree as ET
from collections import defaultdict
from __future__ import division, absolute_import, print_function, unicode_literals
import re
import pprint

OSMFILE = "southampton.osm"
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)


expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]

mapping = { "2HW": "HighRoad",
            "Raod": "Road",
            "Rd": "Road",
            "Street)":"Street",
            "road":"Road",
            "Road Westal":"Road West"
          }


def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
                    tag.attrib['v'] = update_name(tag.attrib['v'],mapping)

    osm_file.close()
    return street_types


def update_name(name, mapping):

    dictionary_map = sorted(mapping.keys(), key=len, reverse=True)
    for key in dictionary_map:
        
        if name.find(key) != -1:          
            name = name.replace(key,mapping[key])

    return name


def test():
    st_types = audit(OSMFILE)
    #assert len(st_types) == 6
    #pprint.pprint(dict(st_types))
    # in python 3: iteritems =>>> items 
    for st_type, ways in st_types.items():
        for name in ways:
            better_name = update_name(name, mapping)
            print (name, "=>", better_name)
            if name == "Bluebell Raod":
                assert better_name == "Bluebell Road"
            if name == "Hythe Rd":
                assert better_name == "Hythe Road"


if __name__ == '__main__':
    test()

Albert Road South => Albert Road South
Osborne Road South => Osborne Road South
Asturias Way => Asturias Way
Redwood Way => Redwood Way
Redhill Way => Redhill Way
Normandy Way => Normandy Way
Rudd Way => Rudd Way
Channel Way => Channel Way
Ocean Way => Ocean Way
Thomas Lewis Way => Thomas Lewis Way
Woodlands Way => Woodlands Way
International Way => International Way
Pentire Way => Pentire Way
Pine Way => Pine Way
Teboura Way => Teboura Way
leaside Way => leaside Way
Hadrian Way => Hadrian Way
Leaside Way => Leaside Way
Glen Eyre Way => Glen Eyre Way
Lord's Hill Way => Lord's Hill Way
Oriana Way => Oriana Way
Lower Brownhill Way => Lower Brownhill Way
Cranford Way => Cranford Way
Monks Way => Monks Way
Uplands Way => Uplands Way
Abbotts Way => Abbotts Way
Marvin Way => Marvin Way
Castle Way => Castle Way
Brookside Way => Brookside Way
Orchards Way => Orchards Way
Oaklands Way => Oaklands Way
Glencarron Way => Glencarron Way
Cumbrian Way => Cumbrian Way
Vernon Walk => Vernon Walk
Vincen

In [20]:
#%load data.py
"""""
4- Preparing for Database - MongoDB
"""""

import xml.etree.ElementTree as ET 
import pprint 
import re 
import codecs 
import json 
import sys 

OSM_FILE = 'southampton.osm'  
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


CREATED = [ "version", "changeset", "timestamp", "user", "uid"] 
 
 
 
def shape_element(element): 
    node = {} 
     # you should process only 2 types of top level tags: "node" and "way" 
    if element.tag == "node" or element.tag == "way" : 
        #create the dictionary based on exaclty the value in element attribute.
        for key in element.attrib.keys(): 
            val = element.attrib[key] 
            node["type"] = element.tag 
            if key in CREATED: 
                if not "created" in node.keys(): 
                    node["created"] = {} 
                node["created"][key] = val 
            elif key == "lat" or key == "lon": 
                if not "pos" in node.keys(): 
                    node["pos"] = [0.0, 0.0] 
                old_pos = node["pos"] 
                if key == "lat": 
                    new_pos = [float(val), old_pos[1]] 
                else: 
                    new_pos = [old_pos[0], float(val)] 
                node["pos"] = new_pos 
            else: 
                node[key] = val 
            for tag in element.iter("tag"): 
                tag_key = tag.attrib['k'] 
                tag_val = tag.attrib['v'] 
                if problemchars.match(tag_key): 
                    continue 
                elif tag_key.startswith("addr:"): 
                    if not "address" in node.keys(): 
                        node["address"] = {} 
                    addr_key = tag.attrib['k'][len("addr:") : ] 
                    if lower_colon.match(addr_key): 
                        continue 
                    else: 
                        node["address"][addr_key] = tag_val 
                elif lower_colon.match(tag_key): 
                    node[tag_key] = tag_val 
                else: 
                    node[tag_key] = tag_val 
        for tag in element.iter("nd"): 
            if not "node_refs" in node.keys(): 
                node["node_refs"] = [] 
            node_refs = node["node_refs"] 
            node_refs.append(tag.attrib["ref"]) 
            node["node_refs"] = node_refs 
 
 
        return node 
    else: 
        return None 
 
 
 
 # Process the osm file to json file to be prepared for input file to monggo

def process_map(file_in, pretty = False): 
    # You do not need to change this file 
    file_out = "{0}.json".format(file_in) 
    data = [] 
    with codecs.open(file_out, "w") as fo: 
        for _, element in ET.iterparse(file_in): 
            el = shape_element(element) 
            if el: 
                data.append(el) 
                if pretty: 
                    fo.write(json.dumps(el, indent=2)+"\n") 
                else: 
                    fo.write(json.dumps(el) + "\n") 
    return data 

 
def run(): 
    data = process_map(OSM_FILE, False) 
    pprint.pprint(data[2]) # uncomment for checking data format before transfar it into MongoDB

    
if __name__ == "__main__": 
    run ()
    
    
#print (run(osm_file))    
#  if __name__ == "__main__": 
#     if not len(sys.argv) == 2: 
#          print "Usage: python maps/data.py input-file" 
#     else: 
#          run(sys.argv[1]) 


{'created': {'changeset': '153019',
             'timestamp': '2006-11-11T17:21:36Z',
             'uid': '2231',
             'user': 'Deanna Earley',
             'version': '1'},
 'id': '132709',
 'pos': [50.9507933, -1.4643494],
 'type': 'node'}


In [23]:
from data import *
data = process_map(OSM_FILE, False) 
pprint.pprint(data[:4]) # checking data format before transfar it into MongoDB


[{'created': {'changeset': '8139974',
              'timestamp': '2011-05-14T11:45:29Z',
              'uid': '260682',
              'user': 'monxton',
              'version': '5'},
  'id': '132707',
  'pos': [50.9454657, -1.4775675],
  'type': 'node'},
 {'created': {'changeset': '153019',
              'timestamp': '2006-11-11T17:58:16Z',
              'uid': '2231',
              'user': 'Deanna Earley',
              'version': '1'},
  'id': '132708',
  'pos': [50.9474216, -1.4709162],
  'type': 'node'},
 {'created': {'changeset': '153019',
              'timestamp': '2006-11-11T17:21:36Z',
              'uid': '2231',
              'user': 'Deanna Earley',
              'version': '1'},
  'id': '132709',
  'pos': [50.9507933, -1.4643494],
  'type': 'node'},
 {'created': {'changeset': '153019',
              'timestamp': '2006-11-11T17:24:45Z',
              'uid': '2231',
              'user': 'Deanna Earley',
              'version': '1'},
  'id': '132710',
  'pos': [50.9533581,

In [27]:
"""""
4- ACCESSING MongoDB
>> Point your command prompt to C:\Program Files\MongoDB\Server\3.4\bin
>> mongo.exe
http://stackoverflow.com/questions/23726684/mongodb-on-a-windows-7-machine-no-connection-could-be-made
"""""

from pymongo import MongoClient

#client  = MongoClient('mongodb://localhost:27017')
client  = MongoClient()

# Access Database Objects
db = client.southampton


In [28]:
# Insert a document into a collection named southampton in MongoDB.
[db.southampton.insert(e) for e in data]

  from ipykernel import kernelapp as app


ServerSelectionTimeoutError: localhost:27017: [WinError 10061] No connection could be made because the target machine actively refused it

In [29]:
# Thoroughness and Succinctness of Submission

def find():
    x={"highway": "residential"}
    resdient = db.southampton.find(x)
    for r in resdient:
        pprint.pprint(r)
        
if __name__ == '__main__':
    find()

ServerSelectionTimeoutError: localhost:27017: [WinError 10061] No connection could be made because the target machine actively refused it

# 2. Data Overview

In [41]:
"""
FILE SIZE.

"""

import os
os.path.getsize('southampton.osm')


# OR:
# statinfo = os.stat(OSM_FILE)
# #print (statinfo)
# statinfo.st_size



66903798

In [88]:
""""
Number of documents

""""

# count() is equivalent to the db.collection.find(query).count() construct.
db.southampton.find().count()  

324000

In [93]:
"""
Number of nodes

"""

db.southampton.find({"type":"node"}).count() 


272641

In [94]:
"""
Number of ways

"""

db.southampton.find({"type":"way"}).count() 

51354

In [95]:
"""
Number of unique users

"""
                                                
db.southampton.distinct({"created.user"}).length


TypeError: key must be an instance of str

In [32]:
import os
statinfo = os.stat(OSM_FILE)
#print (statinfo)
statinfo.st_size

66903798

In [None]:
db.stats
function (scale){
    return this.runCommand({dbstats:1, scale:scale});
}