In [128]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

#Imports
import xml.etree.cElementTree as ET  # Use cElementTree or lxml if too slow
from collections import defaultdict
import re
import pprint
import os
import pandas as pd
import urllib, json
from IPython.core.display import display, HTML
# Stretch the notebook container so wide tables are not clipped.
display(HTML("<style>.container { width:98% !important; }</style>"))

# pandas display settings shared by the cells below, applied in this order.
for _option, _value in (
    ('display.width', 500),
    ('display.max_columns', 100),
    ('display.max_rows', 500),
    ('display.notebook_repr_html', True),
):
    pd.set_option(_option, _value)
del _option, _value  # do not leave loop helpers in the notebook namespace

### Create a smaller version of the map (Warsaw, Poland)

In [26]:
# Take a smaller sample of the map for auditing purposes.

# Parameter: take every k-th top level element
k = 2

# Replace this with your osm file
OSM_FILE = "warsaw_poland.osm"

# The sample's file name records the sampling step that produced it.
SAMPLE_FILE = "warsaw_poland_sample_k=" + str(k) + ".osm"


def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Stream the elements of an XML file whose tag is listed in ``tags``.

    Each matching element is yielded once it has been completely parsed
    (on its 'end' event). When the consumer asks for the next element, the
    document root is cleared, so the yielded element should be used before
    advancing the generator.

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    events = iter(ET.iterparse(osm_file, events=('start', 'end')))
    # The very first event is the 'start' of the document root.
    _, root = next(events)
    for kind, node in events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()


# Write the sample file: an <osm> root wrapping every k-th top-level element.
# The file is opened in binary mode, so only bytes are written. The b''
# literals are plain str on Python 2 and bytes on Python 3, and ET.tostring
# with an explicit encoding returns bytes as well.
with open(SAMPLE_FILE, 'wb') as output:
    output.write(b'<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write(b'<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write(b'</osm>')

# Measure the sizes only after the `with` block has closed (and therefore
# flushed) the sample file; measuring earlier misses the buffered tail.
# os.path.getsize returns bytes, so divide by 1000 to report kilobytes.
Original_OSM_size = os.path.getsize(OSM_FILE) / 1000.0
Sample_OSM_size = os.path.getsize(SAMPLE_FILE) / 1000.0

print(' Done \n Original_OSM_size: {:,} KBytes\n Sample_OSM_size: {:,} KBytes'.format(Original_OSM_size, Sample_OSM_size))

 Done 
 Original_OSM_size: 1,169,168.705 KBytes
 Sample_OSM_size: 592,097.28 KBytes


### Find out what kind of tags exist in your file

In [None]:
def count_tags(filename):
    """Count how many times each element tag occurs in an XML map file.

    The file is streamed with ``ET.iterparse`` instead of being loaded as a
    whole tree, so large OSM extracts can be processed without keeping every
    element in memory.

    Args:
        filename: path (or binary file object) of the XML file to scan.

    Returns:
        A ``defaultdict(int)`` with the tag name as the key and the number of
        times this tag is encountered in the map as the value.
    """
    elem_dic = defaultdict(int)

    root = None
    depth = 0  # number of currently open elements
    for event, elem in ET.iterparse(filename, events=('start', 'end')):
        if event == 'start':
            if root is None:
                # The first element opened is the document root.
                root = elem
            depth += 1
            continue

        # 'end' event: the element is complete, so count it exactly once.
        elem_dic[elem.tag] += 1
        depth -= 1
        if depth == 1:
            # A direct child of the root has just closed; drop it (and its
            # subtree) from the root so memory does not grow with file size.
            root.clear()

    return elem_dic

In [6]:
# Count the element tags of the k=5 sample and show them, most frequent first.
# NOTE(review): the sampling cell of this notebook is configured with k = 2,
# so this k=5 file presumably comes from an earlier run -- confirm it exists
# before a Restart & Run All.
osm_file = 'warsaw_poland_sample_k=5.osm'
tags = count_tags(osm_file)
pd.Series(tags).sort_values(ascending=False)

nd          1174493
node         983267
tag          897948
way          126661
member        36939
relation       1669
osm               1
dtype: int64

### Explore attributes

In [7]:
def count_attribs(filename):
    """Count how often each tag key (the ``k`` attribute) occurs in the map.

    The file is parsed iteratively, and only ``<tag>`` elements are
    inspected.

    Args:
        filename: path (or binary file object) of the XML file to scan.

    Returns:
        A ``defaultdict(int)`` with the ``k`` attribute value as the key and
        the number of ``<tag>`` elements carrying that key as the value.

    Raises:
        KeyError: if a ``<tag>`` element has no ``k`` attribute.
    """
    attrib_dic_tag = defaultdict(int)

    for _, element in ET.iterparse(filename):
        if element.tag == "tag":
            attrib_dic_tag[element.attrib['k']] += 1
        # The element is complete and already counted; clear it so its
        # attributes and children are not retained while the scan continues.
        element.clear()

    return attrib_dic_tag

In [None]:
# Tag-key frequencies of the sample file, as a frame sorted by descending count.
attribs = count_attribs(osm_file)

attribs_df = pd.DataFrame(
    {
        'attribs': list(attribs.keys()),
        'counts': list(attribs.values()),
    }
).sort_values(by='counts', ascending=False)

In [16]:
attribs_df.head(500)

Unnamed: 0,attribs,counts
289,addr:housenumber,105903
851,addr:city,93634
48,addr:street,92431
596,addr:postcode,92050
513,source:addr,75741
1040,building,65054
116,addr:city:simc,58661
374,addr:street:sym_ul,47211
377,highway,41257
827,source,39508


In [15]:
attribs_df[attribs_df.attribs.str.contains('amenity',case=False)]

Unnamed: 0,attribs,counts
143,amenity,5690
877,disused:amenity,3
399,old_amenity,3
1003,ata_delete:amenity,1


In [18]:
#Find Function
def find(filename, attrib_type):
    '''Count the values recorded for one tag key in an OSM file.

    Args:
        filename: path (or binary file object) of the OSM XML file.
        attrib_type: the key to look for, i.e. the value of the ``k``
            attribute (for example 'phone' or 'amenity').

    Returns:
        A defaultdict(int) whose keys are the ``v`` attribute values found
        for that key and whose values are their counts of occurrence.
        Elements lacking a ``k`` or a ``v`` attribute are skipped.
    '''
    types = defaultdict(int)
    for _, element in ET.iterparse(filename):
        attrib = element.attrib
        # Explicit membership checks replace a bare ``except`` so that only
        # the expected "attribute missing" case is skipped and real errors
        # still surface.
        if 'k' in attrib and 'v' in attrib and attrib['k'] == attrib_type:
            types[attrib['v']] += 1
        # The element has been fully handled; clear it so its attributes and
        # children are not retained while the scan continues.
        element.clear()
    return types

In [21]:
# Collect every value recorded under the 'phone' key, most frequent first.
osm_file = 'warsaw_poland_sample_k=5.osm'
types = find(osm_file, 'phone')

phone = pd.DataFrame(
    {
        'val': list(types.keys()),
        'counts': list(types.values()),
    },
    columns=['val', 'counts'],
).sort_values(by='counts', ascending=False)

In [23]:
phone[phone.val.str.contains('',case=False)]

Unnamed: 0,val,counts
463,22 664 21 00,2
310,501139027,2
305,+48 22 418 00 00,1
317,+48 517 127 167,1
316,+48 22 641 93 03,1
315,+48 22 3532266,1
314,+48226486630,1
313,"226211646, 601243466",1
312,+48 25 759 51 00,1
311,+48 22 627 11 00,1


In [26]:
phone[phone.val.str.startswith('+48')]

Unnamed: 0,val,counts
305,+48 22 418 00 00,1
317,+48 517 127 167,1
316,+48 22 641 93 03,1
315,+48 22 3532266,1
314,+48226486630,1
312,+48 25 759 51 00,1
311,+48 22 627 11 00,1
309,+48227221088,1
307,+48 22 678 17 17,1
304,+48 261 855 690,1


After looking into the attributes with the most counts I did not find any problems with the usual street and postcode.

The phone numbers can use some cleaning and I will do that.

Looking into the amenity 'key', I noticed that there are many custom entries that do not appear in the wiki for this 'key'.

The idea is to use the JSON provided by the [API](https://taginfo.openstreetmap.org/taginfo/apidoc#api_4_key_values) with the most common values, and to filter the values of my map based on their number of occurrences or the existence of a wiki entry:

In [42]:
# Collect every value recorded under the 'amenity' key, most frequent first.
osm_file = 'warsaw_poland_sample_k=2.osm'
types = find(osm_file, 'amenity')

vals_amenity = pd.DataFrame(
    {
        'val': list(types.keys()),
        'counts': list(types.values()),
    },
    columns=['val', 'counts'],
).sort_values(by='counts', ascending=False)

In [43]:
vals_amenity

Unnamed: 0,val,counts
44,parking,4277
84,bench,1015
24,waste_basket,752
70,restaurant,747
72,atm,517
59,parking_space,506
22,school,486
15,pharmacy,422
56,fast_food,388
25,bank,385


In [107]:
vals_amenity.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 138 entries, 44 to 69
Data columns (total 2 columns):
val       138 non-null object
counts    138 non-null int64
dtypes: int64(1), object(1)
memory usage: 3.2+ KB


Get the json from the taginfo API for commonly used amenity values

In [32]:
#https://stackoverflow.com/questions/12965203/how-to-get-json-from-webpage-into-python-script
# urlopen lives in different modules on Python 2 and Python 3.
try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2
from contextlib import closing

url = "https://taginfo.openstreetmap.org/api/4/key/values?key=amenity&in_wiki=true&filter=all&lang=en&lang=pl&sortname=count&sortorder=desc&rp=14&qtype=value&format=json_pretty"

# closing() makes sure the HTTP response is released even if reading or
# decoding fails; the payload is decoded explicitly before JSON parsing.
with closing(urlopen(url)) as response:
    amenity = pd.DataFrame(json.loads(response.read().decode('utf-8')))

# Each entry of the 'data' column is one record describing an amenity value;
# expand those records into their own frame.
data_amenity = pd.DataFrame(list(amenity.data))

In [None]:
data_amenity

In [37]:
#https://stackoverflow.com/questions/15325182/how-to-filter-rows-in-pandas-by-regex
data_amenity[(data_amenity.in_wiki==True) & (data_amenity.value.str.contains('hotel'))]

Unnamed: 0,count,description,fraction,in_wiki,value
124,862,,0.0001,True,love_hotel
1358,5,,0.0,True,hotel


In [92]:
def audit_type(val_to_audit, expected):
    '''Return the set of audited values that are missing from ``expected``.

    val_to_audit: iterable of values to check.
    expected: container of accepted values (membership is tested with ``in``).
    '''
    return {candidate for candidate in val_to_audit if candidate not in expected}

In [115]:
# Audit amenity values against the taginfo values seen at least `occ` times in the OSM db
occ = 1

val_to_audit = vals_amenity.val.tolist()
expected = data_amenity.loc[data_amenity['count'] >= occ, 'value'].tolist()
audit_amenity = audit_type(val_to_audit, expected)

In [116]:
audit_amenity,len(audit_amenity)

(set(), 0)

There are 0 entries that do not exist in the OSM Database

Let's try this with entries that have at least 5 occurrences in the OSM Database

In [113]:
# Audit amenity values against the taginfo values seen at least `occ` times in the OSM db
occ = 5

val_to_audit = vals_amenity.val.tolist()
expected = data_amenity.loc[data_amenity['count'] >= occ, 'value'].tolist()
audit_amenity = audit_type(val_to_audit, expected)

In [114]:
audit_amenity,len(audit_amenity)

({'Cities Lighting Consultants',
  'City Information System',
  'Janczewice',
  'Kancelaria_Notarialna',
  'Zwrotnica',
  'air_compressor',
  'aviation_school',
  'cityhall',
  'fence',
  'film_studio',
  'laundry_box',
  u'lokal_do_wynaj\u0119cia',
  'movie_studio',
  'municipal_police',
  'vehicle_control'},
 15)

15 entries.

What happens if we keep only the entries that have a wiki entry and at least 10 occurrences?

In [121]:
# Audit amenity values against the taginfo values that are documented in the
# wiki and seen at least `occ` times in the OSM db
occ = 10

val_to_audit = vals_amenity.val.tolist()
expected = data_amenity.loc[
    (data_amenity['count'] >= occ) & (data_amenity.in_wiki == True),
    'value',
].tolist()
audit_amenity = audit_type(val_to_audit, expected)

In [122]:
audit_amenity, len(audit_amenity)

({'Cities Lighting Consultants',
  'City Information System',
  'Janczewice',
  'Kancelaria_Notarialna',
  'Zwrotnica',
  'air_compressor',
  'atm;bank',
  'aviation_school',
  'baby_care',
  'beater',
  'billboard',
  'border_control',
  'check_in',
  'cityhall',
  'cooking_school',
  'employment_agency',
  'enclosing',
  'estate_agent',
  'factory',
  'fence',
  'film_studio',
  'forester',
  'funeral_home',
  'funerary_services',
  'grave',
  'house',
  'insurance',
  'laundry_box',
  'lawyer',
  u'lokal_do_wynaj\u0119cia',
  'luggage_locker',
  'martial_arts',
  'ministry',
  'movie_studio',
  'municipal_police',
  'notary',
  'parcel_box',
  'photo',
  'publisher',
  'security',
  'stock_exchange',
  'vehicle_control',
  'wifi',
  'yes'},
 44)

44 - How to handle this?

Let's see if we can find equivalent meanings in the OSM db and map them.

In [131]:
# Looking for baby_care

data_amenity[(data_amenity.in_wiki==True) & ((data_amenity.value.str.contains('baby')) 
                                          | (data_amenity.value.str.contains('child'))
                                          | (data_amenity.value.str.contains('kinder')))]

Unnamed: 0,count,description,fraction,in_wiki,value
14,179214,A place for looking after preschool children a...,0.016,True,kindergarten
68,9232,Describes a place where children of different ...,0.0008,True,childcare
151,398,pl=Okno życia,0.0,True,baby_hatch


It does not really make sense to change that.