In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

#Imports
import xml.etree.cElementTree as ET  # Use cElementTree or lxml if too slow
from collections import defaultdict
import re
import pprint
import os
import pandas as pd
import urllib, json
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 500)
pd.set_option('display.notebook_repr_html', True)

### Create a smaller version of the map (Warsaw-Poland))

In [26]:
'''Take a smaller sample of the map for auditing purposes '''

k = 2 # Parameter: take every k-th top level element

OSM_FILE = "warsaw_poland.osm"  # Replace this with your osm file
SAMPLE_FILE = "warsaw_poland_sample_k="+ str(k) + ".osm"


def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')
    
    #Get the file sizes in Kilobytes
    Original_OSM_size = os.path.getsize(OSM_FILE)/0.001
    Sample_OSM_size = os.path.getsize(SAMPLE_FILE)/0.001
    
    print(' Done \n Original_OSM_size: {:,} KBytes\n Sample_OSM_size: {:,} KBytes'.format(Original_OSM_size, Sample_OSM_size))

 Done 
 Original_OSM_size: 1,169,168.705 KBytes
 Sample_OSM_size: 592,097.28 KBytes


### Find out what kind of tags exist in your file

In [None]:
def count_tags(filename):
    """
    Use the iterative parsing to process the map file and
    find out what tags are there, but also how many and to get the
    feeling on how much of which data you can expect to have in the map.
    Returns a dictionary with the tag name as the key and number of times this tag can be encountered in 
    the map as value.

    """  
    #init empty dic
    elem_dic = defaultdict(int)
    
    #parse the xml file
    tree = ET.parse(filename)
    
    #get the elements, convert to string and extract the tags
    for elem in tree.iter():
        elem_dic[elem.tag]+=1      
        
    return elem_dic

In [6]:
osm_file = 'warsaw_poland_sample_k=5.osm'
tags = count_tags(osm_file)
pd.Series(tags).sort_values(ascending=False)

nd          1174493
node         983267
tag          897948
way          126661
member        36939
relation       1669
osm               1
dtype: int64

### Find out how many unique users contributed

In [163]:
"""
How many unique users have contributed to the map in this particular area.

The function process_map should return a set of unique user IDs ("uid")
"""

def get_user(element):
    return


def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        try:
            if element.attrib['uid']:
                users.add(element.attrib['uid'])
        except:
            continue

    return users

users = process_map(osm_file)
len(users)

2546

### Explore attributes

In [7]:
def count_attribs(filename):
    """
    Use the iterative parsing to process the map file and
    find out what attribs are there, but also how many and to get the
    feeling on how much of which data you can expect to have in the map.
    Returns a pandas dataseries with the tag name as the key and number of times this tag can be encountered in 
    the map as value.

    """  
    #init empty dic
    attrib_dic_tag = defaultdict(int)
    
    #parse the xml file
    # tree =  etree.parse(filename)
   
    #get the elements, convert to string and extract the tags
    for _, element in ET.iterparse(filename):
        if element.tag == "tag":
                attrib_dic_tag[element.attrib['k']]+=1      
        
    return attrib_dic_tag

In [None]:
attribs = count_attribs(osm_file)

attribs_df = pd.DataFrame({'attribs':attribs.keys(), 'counts': attribs.values()}).sort_values(by='counts', ascending=False)

In [16]:
attribs_df.head(500)

Unnamed: 0,attribs,counts
289,addr:housenumber,105903
851,addr:city,93634
48,addr:street,92431
596,addr:postcode,92050
513,source:addr,75741
1040,building,65054
116,addr:city:simc,58661
374,addr:street:sym_ul,47211
377,highway,41257
827,source,39508


In [15]:
attribs_df[attribs_df.attribs.str.contains('amenity',case=False)]

Unnamed: 0,attribs,counts
143,amenity,5690
877,disused:amenity,3
399,old_amenity,3
1003,ata_delete:amenity,1


In [3]:
#Find Function
def find(filename, attrib_type):
    '''Receives OSM filename and the attribute type the k element
    and returns a dictionary with keys the attribute types and values their count of occurence'''
    types = defaultdict(int)
    for _, element in ET.iterparse(filename):
        try:
            if element.attrib['k'] == attrib_type:
                types[element.attrib['v']]+=1
        except:
            continue
    return types

After looking into the attributes with the most counts I did not find any problems with the usual street and postcode.

The phone numbers can use some cleaning and I will do that.

Looking into the amenity 'key' I noticed that there are many custom entries that do not appear at the wiky for this 'key'.

The idea is to use the json provided by the [API](https://taginfo.openstreetmap.org/taginfo/apidoc#api_4_key_values) with the most common values and filter the values of my map based on their number of occurences or the existence of a wiki:

In [42]:
#Find amenity
osm_file = 'warsaw_poland_sample_k=2.osm'
types = find(osm_file, 'amenity')

vals_amenity = pd.DataFrame({'val':types.keys(), 'counts': types.values()}, columns=['val','counts']).sort_values(by='counts', ascending=False)

In [134]:
vals_amenity.head(10)

Unnamed: 0,val,counts
44,parking,4277
84,bench,1015
24,waste_basket,752
70,restaurant,747
72,atm,517
59,parking_space,506
22,school,486
15,pharmacy,422
56,fast_food,388
25,bank,385


In [107]:
vals_amenity.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 138 entries, 44 to 69
Data columns (total 2 columns):
val       138 non-null object
counts    138 non-null int64
dtypes: int64(1), object(1)
memory usage: 3.2+ KB


Get the json from the taginfo API for commonly used amenity values

In [32]:
#https://stackoverflow.com/questions/12965203/how-to-get-json-from-webpage-into-python-script
url = "https://taginfo.openstreetmap.org/api/4/key/values?key=amenity&in_wiki=true&filter=all&lang=en&lang=pl&sortname=count&sortorder=desc&rp=14&qtype=value&format=json_pretty"
response = urllib.urlopen(url)
amenity = pd.DataFrame(json.loads(response.read()))

data_amenity = pd.DataFrame(list(amenity.data))

In [34]:
data_amenity.head()

Unnamed: 0,count,description,fraction,in_wiki,value
0,2368503,Miejsca na zaparkowanie pojazdu,0.2118,True,parking
1,872614,[[wikipedia:pl:Świątynia,0.078,True,place_of_worship
2,830890,Instytucja zaprojektowana do nauczania pod nad...,0.0743,True,school
3,737098,Ławka.,0.0659,True,bench
4,722496,"A place selling full meals served by waiters, ...",0.0646,True,restaurant


In [37]:
#https://stackoverflow.com/infoquestions/15325182/how-to-filter-rows-in-pandas-by-regex
data_amenity[(data_amenity.in_wiki==True) & (data_amenity.value.str.contains('hotel'))]

Unnamed: 0,count,description,fraction,in_wiki,value
124,862,,0.0001,True,love_hotel
1358,5,,0.0,True,hotel


In [9]:
def audit_type(val_to_audit, expected):
    '''Receives series of values to audit and the list of expected values and returns set
       of values not in expected'''
    types = set()
    for val in val_to_audit:
        if val not in expected:
            types.add(val)
    return types

In [115]:
#Audit amenity with occ occurences in OSM db
occ=1

val_to_audit= list(vals_amenity.val)
expected = list((data_amenity[(data_amenity['count'] >= occ)]).value)
audit_amenity = audit_type(val_to_audit, expected)

In [116]:
audit_amenity,len(audit_amenity)

(set(), 0)

There are 0 entries that do not exist in the OSM Database

Let's try this with entries that have more than 5 occurences in the OSM Database

In [113]:
#Audit amenity with k occurences in OSM db
occ=5

val_to_audit= list(vals_amenity.val)
expected = list((data_amenity[(data_amenity['count'] >= occ)]).value)
audit_amenity = audit_type(val_to_audit, expected)

In [114]:
audit_amenity,len(audit_amenity)

({'Cities Lighting Consultants',
  'City Information System',
  'Janczewice',
  'Kancelaria_Notarialna',
  'Zwrotnica',
  'air_compressor',
  'aviation_school',
  'cityhall',
  'fence',
  'film_studio',
  'laundry_box',
  u'lokal_do_wynaj\u0119cia',
  'movie_studio',
  'municipal_police',
  'vehicle_control'},
 15)

15 entries.

What will happen only the entries with wiki_entries and 10 occurences

In [121]:
#Audit amenity with k occurences in OSM db
occ=10

val_to_audit= list(vals_amenity.val)
expected = list((data_amenity[(data_amenity['count'] >= occ) & (data_amenity.in_wiki==True) ]).value)
audit_amenity = audit_type(val_to_audit, expected)

In [150]:
len(audit_amenity)

44

44 - How to handle this?

Let's see if we can find equivalent meanings in the OSM db and map them.

In [131]:
# Looking for baby_care
data_amenity[(data_amenity.in_wiki==True) & ((data_amenity.value.str.contains('baby')) 
                                          | (data_amenity.value.str.contains('child'))
                                          | (data_amenity.value.str.contains('kinder')))]

Unnamed: 0,count,description,fraction,in_wiki,value
14,179214,A place for looking after preschool children a...,0.016,True,kindergarten
68,9232,Describes a place where children of different ...,0.0008,True,childcare
151,398,pl=Okno życia,0.0,True,baby_hatch


Does not really make any sense to change that..

### Take a look at City

In [136]:
#Find city
osm_file = 'warsaw_poland_sample_k=2.osm'
types = find(osm_file, 'addr:city')

city = pd.DataFrame({'val':types.keys(), 'counts': types.values()}, columns=['val','counts']).sort_values(by='counts', ascending=False)

In [139]:
city[city.val.str.contains('',case=False)].head(10)

Unnamed: 0,val,counts
977,Warszawa,58050
920,Otwock,4497
563,Marki,3770
43,Piaseczno,3714
44,Józefów,3298
201,Żyrardów,3258
1064,Kobyłka,3223
1005,Legionowo,2992
19,Mińsk Mazowiecki,2908
1011,Grodzisk Mazowiecki,2896


In [140]:
city[city.val.str.contains('',case=False)].tail(10)

Unnamed: 0,val,counts
442,Mienia,1
783,Klusek,1
433,Rybitew,1
772,Strugi Krzywickie,1
134,Urszulin,1
61,Ilinko,1
63,warszawa,1
273,Linin,1
274,Michrówek,1
166,Mały Stanisławów,1


Everything looks nice and clean here

### Take a look at `Building`

In [4]:
#Find building
osm_file = 'warsaw_poland_sample_k=2.osm'
types = find(osm_file, 'building')

building = pd.DataFrame({'val':types.keys(), 'counts': types.values()}, columns=['val','counts']).sort_values(by='counts', ascending=False)

In [10]:
building[building.val.str.contains('',case=False)].head(10)

Unnamed: 0,val,counts
6,yes,138781
36,house,8228
8,residential,7274
35,apartments,1781
32,terrace,835
27,garage,780
51,commercial,544
39,farm_auxiliary,465
60,industrial,464
78,garages,429


In [149]:
building[building.val.str.contains('',case=False)].tail(10)

Unnamed: 0,val,counts
18,residential; retail,1
82,storage_tank,1
20,cage,1
84,ruins; house,1
85,civ,1
86,monument,1
54,convent,1
64,boat,1
89,tribunes,1
88,bridge,1


In [147]:
building.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113 entries, 6 to 88
Data columns (total 2 columns):
val       113 non-null object
counts    113 non-null int64
dtypes: int64(1), object(1)
memory usage: 2.6+ KB


Let's run it against the OSM db

In [7]:
#https://stackoverflow.com/questions/12965203/how-to-get-json-from-webpage-into-python-script
url = "https://taginfo.openstreetmap.org/api/4/key/values?key=building&filter=all&lang=en&lang=pl&sortname=count&sortorder=desc&rp=14&qtype=value&format=json_pretty"
response = urllib.urlopen(url)
building_raw = pd.DataFrame(json.loads(response.read()))

data_building = pd.DataFrame(list(building_raw.data))

In [160]:
data_building.head(10)

Unnamed: 0,count,description,fraction,in_wiki,value
0,187374178,,0.8114,True,yes
1,22252140,Dom jednorodzinny,0.0964,True,house
2,6836488,,0.0296,True,residential
3,2595308,Pojedynczy garaż.,0.0112,True,garage
4,2094589,Budynek z wieloma pomieszczeniami mieszkalnymi...,0.0091,True,apartments
5,1221624,"Chata, chałupa, barak, buda, szałas, itd.",0.0053,True,hut
6,1090985,Obiekt przemysłowy.,0.0047,True,industrial
7,903231,Pojedyńczy dom wolnostojący.,0.0039,True,detached
8,829981,"Szopa, wiata.",0.0036,True,shed
9,805745,Zadaszenie,0.0035,True,roof


In [159]:
data_building.tail(10)

Unnamed: 0,count,description,fraction,in_wiki,value
11364,1,,0.0,False,Meeting
11365,1,,0.0,False,LEA
11366,1,,0.0,False,LEC
11367,1,,0.0,False,garge
11368,1,,0.0,False,trasportation
11369,1,,0.0,False,Pastry Chef(Bakery)
11370,1,,0.0,False,hous
11371,1,,0.0,False,LGU
11372,1,,0.0,False,circus
11373,1,,0.0,False,armory


There seems to be some potential in cleaning the `building` up

In [20]:
#Audit building with occ occurences in OSM db
occ=1

val_to_audit= list(building.val)
expected = list((data_building[(data_building['count'] >= occ)]).value)
audit_building = audit_type(val_to_audit, expected)

In [21]:
audit_building,len(audit_building)

(set(), 0)



Let's try this with entries that have more than 5 occurences in the OSM Database

If we filter for  entries only with a wiki 

In [11]:
#Audit building with occ occurences in OSM db
occ=1

val_to_audit= list(building.val)
expected = list((data_building[(data_building['count'] >= occ) & (data_building.in_wiki==True)]).value)
audit_building = audit_type(val_to_audit, expected)

In [89]:
audit_building, len(audit_building)

({'Basen',
  u'Plutonowego J\xf3zefa Cie\u0107wierza',
  'ambassadors_residence',
  'aviary',
  'basilica',
  'belfry',
  'boat',
  'bunker;museum',
  'cage',
  'castle',
  'cheap_motel',
  'city_hall',
  'civ',
  'classrooms',
  'clinic',
  'collapsed',
  'convent',
  'corridor',
  'day_care',
  'doctors',
  'empty',
  'enclosing',
  'factory',
  'family_house',
  'fast_food',
  'fuel',
  'glasshouse',
  'hall',
  'library',
  'military',
  'monastery',
  'monument',
  'no',
  'office;residential',
  'office;retail;residential',
  'offices',
  'palace',
  'part',
  'passage',
  'power',
  'prison',
  'residential; retail',
  'residential;office',
  'retail;office',
  'ruin',
  'ruins; house',
  'semidetached_house',
  'serv',
  'shop',
  'silo',
  'sports_centre',
  'storage_tank',
  'store',
  'tent',
  'terminal',
  'terraces',
  'tower',
  'tribunes',
  'wayside_shrine'},
 59)

Ok, let's start with these ones and try mapping them to one of the more popular entries.

In [54]:
#Initiate a mapping dictionary for building
building_mapping = {}

In [53]:
# Looking for Basen
data_building[((data_building.value.str.contains('pool')))]
#                                           | (data_building.value.str.contains(''))
#                                           | (data_building.value.str.contains('')))]

Unnamed: 0,count,description,fraction,in_wiki,value
404,64,,0.0,False,swimming_pool
477,46,,0.0,False,indoor_swimming_pool
923,13,,0.0,False,poolhouse
1091,9,,0.0,False,pool
2552,2,,0.0,False,pool_house
4612,1,,0.0,False,"Sports,_exercise_area_and_swimming_pool"
5693,1,,0.0,False,community_harbour_pool
6239,1,,0.0,False,swimming-pool
6831,1,,0.0,False,swimmingpool
7138,1,,0.0,False,terrace with pool


In [78]:
building_mapping['Basen'] = 'swimming_pool'
building_mapping['ambassadors_residence'] ='embassy'

In [61]:
# Looking for day_care in OSM db
data_building[((data_building.value.str.contains('care')))]
#                                           | (data_building.value.str.contains(''))
#                                           | (data_building.value.str.contains('')))]

Unnamed: 0,count,description,fraction,in_wiki,value
153,535,,0.0,False,healthcare
892,13,,0.0,False,care_home
1248,7,,0.0,False,childcare
1446,6,,0.0,False,daycare
3193,2,,0.0,False,social care
3471,2,,0.0,False,day care center
4728,1,,0.0,False,care clinic
5172,1,,0.0,False,"day_care,_community_space,_etc."
6098,1,,0.0,False,Daycare Centre
6317,1,,0.0,False,day_care_centre


In [68]:
# Looking for day_care in Warsaw map
building[((building.val.str.contains('day')))]
#                                           | (data_building.value.str.contains(''))
#                                           | (data_building.value.str.contains('')))]

Unnamed: 0,val,counts
70,day_care,1


In [76]:
building_mapping['day_care'] = 'daycare'

In [71]:
# Looking for 'office;retail;residential' in OSM db
# Looking for day_care in Warsaw map
building[((building.val.str.contains('apartments')))]
#                                           | (data_building.value.str.contains(''))
#                                           | (data_building.value.str.contains('')))]

Unnamed: 0,val,counts
35,apartments,1781


In [87]:
# Looking for day_care in Warsaw map
building[((building.val.str.contains('serv')))]
#                                           | (data_building.value.str.contains(''))
#                                           | (data_building.value.str.contains('')))]

Unnamed: 0,val,counts
66,service,193
81,serv,1


We can build a smalll rule here according to the wiki entry: http://wiki.openstreetmap.org/wiki/Tag:building%3Dapartments

In [82]:
for val in list(building.val):
    if 'residential' in val:
        building_mapping[val] = 'apartments'

One more and generalize as much as I can using this as reference: http://wiki.openstreetmap.org/wiki/Key:building

In [12]:
building_mapping={}

building_mapping['day_care'] = 'daycare'
building_mapping['Basen'] = 'swimming_pool'
building_mapping['family_house'] = 'house'
building_mapping['classrooms'] = 'school'
building_mapping['family_house'] = 'house'

for val in audit_building:
    if  'shop' in val or 'store' in val or 'retail' in val or 'food' in val:
        building_mapping[val] = 'retail'
    if 'office' in val or 'commercial' in val:
        building_mapping[val] = 'commercial'
    if 'residential' in val:
        building_mapping[val] = 'apartments'
    if 'museum' in val:
        building_mapping[val] = 'museum'
    if 'ruin' in val or 'collapsed' in val:
        building_mapping[val] = 'ruins'
    if 'terrac' in val:
        building_mapping[val] = 'terrace'    
    if 'serv' in val or 'power' in val:
        building_mapping[val] = 'service'  
    if 'avia' in val:
        building_mapping[val] = 'hangar' 
    if 'mbass' in val:
        building_mapping[val] = 'embassy'
    if 'boat' in val:
        building_mapping[val] = 'houseboat'
    if 'otel' in val:
        building_mapping[val] = 'hotel'
    if 'detache' in val:
        building_mapping[val] = 'detached'  
    if  'clini' in val or 'docto' in val:
        building_mapping[val] = 'hospital'
    if  'convent' in val or 'basilica' in val or 'monastery' in val :
        building_mapping[val] = 'church'
    if  'factory' in val :
        building_mapping[val] = 'industrial'
    if  'shrine' in val :
        building_mapping[val] = 'shrine'
    if  'glass' in val :
        building_mapping[val] = 'conservatory'
    if  'enter' in val or 'hall' in val:
        building_mapping[val] = 'civic'

building_mapping

{'Basen': 'swimming_pool',
 'ambassadors_residence': 'embassy',
 'aviary': 'hangar',
 'basilica': 'church',
 'boat': 'houseboat',
 'bunker;museum': 'museum',
 'cheap_motel': 'hotel',
 'city_hall': 'civic',
 'classrooms': 'school',
 'clinic': 'hospital',
 'collapsed': 'ruins',
 'convent': 'church',
 'day_care': 'daycare',
 'doctors': 'hospital',
 'factory': 'industrial',
 'family_house': 'house',
 'fast_food': 'retail',
 'glasshouse': 'conservatory',
 'hall': 'civic',
 'monastery': 'church',
 'office;residential': 'apartments',
 'office;retail;residential': 'apartments',
 'offices': 'commercial',
 'power': 'service',
 'residential; retail': 'apartments',
 'residential;office': 'apartments',
 'retail;office': 'commercial',
 'ruin': 'ruins',
 'ruins; house': 'ruins',
 'semidetached_house': 'detached',
 'serv': 'service',
 'shop': 'retail',
 'store': 'retail',
 'terraces': 'terrace',
 'wayside_shrine': 'shrine'}

I could not find an adequate category for these without being in danger of deleting precious information 

In [None]:
('belfry', 'cage','castle', 'corridor', 'empty',
  'enclosing','fast_food','fuel','library','military',
  'monument','no','palace','part','passage','prison',
  'silo','storage_tank','tent','terminal','tower', 'tribunes')

### Audit the phone numbers

Phone numbers should have the following format:

`phone=+<country code> <area code> <local number>`, following the ITU-T E.123 and the DIN 5008 pattern

Ref: http://wiki.openstreetmap.org/wiki/Key:phone

In [13]:
#Find phone
osm_file = 'warsaw_poland_sample_k=2.osm'
types = find(osm_file, 'phone')

phone = pd.DataFrame({'val':types.keys(), 'counts': types.values()}, columns=['val','counts']).sort_values(by='counts', ascending=False)

In [25]:
phone[phone.val.str.contains('',case=False)].head(100)

Unnamed: 0,val,counts
250,+48227284262,3
900,+48468578260,2
280,22 636 47 74,2
521,22 886 00 88,2
922,536 221 878,2
802,+48 22 581 92 62,1
807,+48 22 833 15 85,1
806,+48508946514,1
805,+48227564247,1
804,22 638 97 46,1


In [17]:
phone[phone.val.str.startswith('+48')].head()

Unnamed: 0,val,counts
250,+48227284262,3
900,+48468578260,2
802,+48 22 581 92 62,1
807,+48 22 833 15 85,1
806,+48508946514,1


In [None]:
val=list(phone.val)[:]


def insert_spaces(number):
    '''Insert spaces acc to the format'''
        number = number[:3] + ' ' + number[3:5] + ' ' + number[5:]
        return number
    
def get_raw_local(number):
    
    '''Clean the number and get raw local num'''
        #Delete special characters an spaces
        char = [' ', '(', ')', '-', '/',';', ',', '_']  
        number = filter(lambda x: not (x in char), number)
        
        #get the local number
        number = number.lstrip('+')
        number = number.lstrip('00')
        number = number.lstrip('48')

        # ditch the second number or extra digits over 9
        if len(number)>=9:
            number = number[:9]
            
        return number
        
def is_foreign(number):
    '''Check if foreign number'''
    
    #Delete special characters an spaces
    char = [' ', '(', ')', '-', '/',';']  
    number = filter(lambda x: not (x in char), number)
    
    #Check if foreign
    if number.startswith('00') or number.startswith('+'):
        number = number.lstrip('+')
        number = number.lstrip('00')
        if not (number.startswith('48') or number.startswith('22')):
            return True
 

In [124]:
#Test the is_foreign function

val=list(phone.val)[:]
for i, number in enumerate(val):
    if is_foreign(number):
            print(number)

+46222521852
0000231936


### Make the new Phone Mapping

In [125]:
for i, number in enumerate(val):
        if not is_foreign(number):
            number =  get_raw_local(number)
            
            #add the country prefix
            number = ('+48'+ number)
            
        number = insert_spaces(number)   
        val[i] = number        

In [130]:
#Testing
val[:20]

['+48 22 7284262',
 '+48 68 578260',
 '+48 22 6364774',
 '+48 22 8860088',
 '+48 53 6221878',
 '+48 22 5819262',
 '+48 22 8331585',
 '+48 50 8946514',
 '+48 22 7564247',
 '+48 22 6389746',
 '+48 51 7921541',
 '+48 22 8261232',
 '+48 22 6322146',
 '+48 22 6200480',
 '+48 22 8877737',
 '+48 57 1232',
 '+48 26 1842075',
 '+48 22 7861026',
 '+48 22 7841947',
 '+48 22 6661769']

In [129]:
#Test for foreign
for v in val:
    if v.startswith('+46'):
        print (v)

+46 22 2521852


In [131]:
ls

audit-for-appointment.ipynb      warsaw_poland_sample_k=7.osm
audit.ipynb                      warsaw_poland_sample.osm
berlin_germany.osm               Warsaw_Small_nodes.csv
[0m[01;32mchinook.db[0m*                      Warsaw_Small_nodes_tags.csv
example4.osm                     Warsaw_Small.osm
[01;35mMetro-Extracts_Warsaw.png[0m        Warsaw_Small_ways.csv
OpenStreet-CaseStudy-Code.ipynb  Warsaw_Small_ways_nodes.csv
schema.py                        Warsaw_Small_ways_tags.csv
schema.pyc                       Wrangle-OpenStreetMap-Data-SQL.html
warsaw_poland.osm                Wrangle-OpenStreetMap-Data-SQL.ipynb
warsaw_poland_sample_k=2.osm     Wrangle-OpenStreetMap-Data-SQL.pdf
warsaw_poland_sample_k=5.osm     [01;34mWrangle-OpenStreetMaps-Data-with-SQL[0m/


In [154]:
%%bash 
sqlite3
.mode csv
.import Warsaw_Small_ways_tags.csv ways_tags
.schema

CREATE TABLE ways_tags(
  "id" TEXT,
  "key" TEXT,
  "value" TEXT,
  "type" TEXT
);


In [163]:
%%bash
sqlite3
.mode csv
.import Warsaw_Small_ways_tags.csv ways_tags
.mode columns
.headers on
SELECT DISTINCT type, count(type) as ct FROM ways_tags GROUP BY type ORDER BY ct DESC; 

type        ct        
----------  ----------
regular     136572    
addr        68204     
source      12948     
building    8822      
roof        5222      
area        1785      
name        837       
turn        765       
maxspeed    625       
buildingpa  416       
lanes       281       
note        272       
step        270       
maxgcweigh  191       
capacity    97        
railway     95        
cycleway    66        
fuel        55        
psv         52        
watch       43        
contact     42        
highway     40        
handrail    36        
oneway      36        
destinatio  35        
disused     34        
routing     33        
electrifie  32        
bus         17        
wikipedia   17        
toll        14        
ztm         14        
removed     13        
3dr         12        
animal      12        
ref         11        
tactile_pa  11        
access      10        
heritage    10        
maxgvweigh  8         
maxweight   8         
social_fac 