# Udacity Data Analyst Project 3:  Wrangling Open Street Data

_ by Jens Laufer _

## Introduction

http://overpass-api.de/api/map?bbox=6.6309,53.4302,7.8291,53.8227

![](img/map.png)

In [23]:
# python imports
from collections import defaultdict
import pprint
import re
import codecs
import json
import os
import extractor
import file_size_humanize as humanize
import os.path as path
# this is the import of the code from the case study
import audit

In [24]:
# some constants I am using 
OSM_URL = "http://overpass-api.de/api/map?bbox=6.6309,53.4302,7.8291,53.8227"
OSM_FILE = "ostfriesland.osm" 
OSM_EXTRACT_FILE = "{0}_extract.osm".format(OSM_FILE[:OSM_FILE.find('.osm')])

In [25]:
# I stream the osm data with the overpass url for the box we want to use into a local file in case the file 
# does not exist already
from urllib2 import urlopen

if not path.exists(OSM_FILE):
    response = urlopen(OSM_URL)
    CHUNK = 16 * 1024
    with open(OSM_FILE, 'wb') as f:
        while True:
            chunk = response.read(CHUNK)
            if not chunk:
                break
            f.write(chunk)

In [26]:
# For first evaluation of the data I am extracting every 30th node in case the extract does not exist already
# I use the provided function, which moved to an own module to keep the code clean
if not path.exists(OSM_EXTRACT_FILE):
    extractor.extract(OSM_FILE, OSM_EXTRACT_FILE, 50)

In [27]:
# File size of the full osm file
info = os.stat(OSM_FILE)
"Filesize of {0} {1}".format(OSM_FILE, humanize.humansize(info.st_size))

'Filesize of ostfriesland.osm 155.29 MB'

In [28]:
# File size of the extrected osm file
info = os.stat(OSM_EXTRACT_FILE)
"Filesize of {0} {1}".format(OSM_EXTRACT_FILE, humanize.humansize(info.st_size))

'Filesize of ostfriesland_extract.osm 5.29 MB'

In [29]:
audit.count_tags(OSM_EXTRACT_FILE)

{'member': 2493,
 'nd': 29879,
 'node': 24167,
 'osm': 1,
 'relation': 29,
 'tag': 11403,
 'way': 3442}

In [30]:
audit.audit_k_value(OSM_EXTRACT_FILE)

{'LOWER': 7295, 'LOWER_COLON': 3908, 'PROBLEMCHARS': 0, 'other': 200}

I am auditing the streetnames with a modified version for Germany (encapsulated in the audit module). This is a little more comp
The streetnames which are not expected look good to me.

In [31]:
audit.audit_streets(OSM_EXTRACT_FILE)

defaultdict(set,
            {'Aalring': {'Aalring'},
             'Aalweg': {'Aalweg'},
             'Abens': {'Abens'},
             'Achterum': {'Achterum'},
             'Ackerpad': {'Ackerpad'},
             u'Ahornstra\xdfe': {u'Ahornstra\xdfe'},
             u'Albatrosstra\xdfe': {u'Albatrosstra\xdfe'},
             u'Alleestra\xdfe': {u'Alleestra\xdfe'},
             'Altendeich': {'Grimersumer Altendeich', 'Wirdumer Altendeich'},
             'Altengroden': {'Funnixer Altengroden'},
             'Ampferweg': {'Ampferweg'},
             'Amt': {'Am Alten Amt'},
             'Amtmannskamp': {'Amtmannskamp'},
             u'Andreaestra\xdfe': {u'Andreaestra\xdfe'},
             'Anker': {'Vor dem Anker'},
             u'Ansgaristra\xdfe': {u'Ansgaristra\xdfe'},
             'Asternweg': {'Asternweg'},
             'Austernweg': {'Austernweg'},
             'Baantjebur': {'Baantjebur'},
             'Bahndamm': {'Am Bahndamm'},
             u'Bahnhofstra\xdfe': {u'Bahnhofstra\xdfe

In [32]:
len(audit.contributing_users(OSM_EXTRACT_FILE))

605

In [33]:
audit.process_map(OSM_FILE)

[{'address': {},
  'created': {'changeset': '13610919',
   'timestamp': '2012-10-23T22:13:06Z',
   'uid': '414856',
   'user': 'tempi',
   'version': '8'},
  'id': '17491341',
  'pos': ['53.4459044', '7.2641691'],
  'type': 'node'},
 {'address': {},
  'created': {'changeset': '44198619',
   'timestamp': '2016-12-06T04:19:50Z',
   'uid': '52533',
   'user': 'imehl',
   'version': '8'},
  'id': '17491343',
  'pos': ['53.4504953', '7.2708211'],
  'type': 'node'},
 {'address': {},
  'created': {'changeset': '840512',
   'timestamp': '2009-03-21T10:42:29Z',
   'uid': '36745',
   'user': 'Gluko',
   'version': '6'},
  'id': '17491347',
  'pos': ['53.4586228', '7.2826398'],
  'type': 'node'},
 {'address': {},
  'created': {'changeset': '840512',
   'timestamp': '2009-03-21T10:42:29Z',
   'uid': '36745',
   'user': 'Gluko',
   'version': '7'},
  'id': '17491355',
  'pos': ['53.4723660', '7.3181724'],
  'type': 'node'},
 {'address': {},
  'created': {'changeset': '30963774',
   'timestamp': '20