# OpenStreetMap Case Study: Mestre Venice


In this project, I apply data munging techniques like assessing data quality for validity, accuracy, completeness, consistency and uniformity, to clean the OpenStreetMap data for my hometown [Venice Mestre, Italy](https://www.openstreetmap.org/node/29997772#map=10/45.4953/12.2415)

The following image is a satellite picture of the Venice area taken from Google maps. [Mestre](https://en.wikipedia.org/wiki/Mestre) is part of the city of Venice and it is denoted with the red flag A. 
![alt text](https://7crooks.files.wordpress.com/2011/04/screen-shot-2011-04-20-at-11-23-56-am.png "")

The dataset is about **112 MB** and is used entirely in this analysis. 

In this project, the following information of the dataset are audited:

1. street names 
2. postal code between 30121 and 30176
3. names of city suburbs
4. province information
5. telephone number in the format +39 XXX XXXXXX


## PART 1: find errors in the dataset and define functions to correct them

In this part of the project, we check the OSM dataset and we check the fields listed in the introduction to find errors. Given the list of errors, functions that correct these error are created and tested.

### Street names

The XML file of Mestre OpenStreetMap is imported using python's cElementTree. We access the street name using the tags of the XML file. Using Python's regular expression library, we extract street names like raod, avenue, etc, and we compare this name with a list of correct names. If the name is not in the list, we save it in a dictionary of wrong names where the key is the wrong street name and value the list of full wrong names. We create a new dictionary that maps the wrong names (key) to the correct name (value). Using this map, we correct the street names.

In this anaylsis, numerous typos error were found (for example, we found Dorsorduro instead of Dorsoduro). Moreover, several names without first capital letter were found. We would like to keep the format with capital letter. 

In [1]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint


OSMFILE = "C:/Users/jacopo/Desktop/Deep Learning/Udacity/Projects/DataWrangling/Final project/mestre.osm"

# String pattern for checking street name anomalies
street_type_re = re.compile(r'(\S*)+\.?', re.I)
# +: at least one match of the previous symbol 
# *: at least 0 match of the previous symbol 
# ?: 0 or 1 occurrences of the previous symbol


#street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
num_line_street_re = re.compile(r'\d0?(st|nd|rd|th|)\s(Line)$', re.IGNORECASE) # Spell lines ten and under

expected = ["Via", "Corso", "Viale", "Vicolo", "Piazza","Piazzetta", 
            "Piazzale", "Calle","Dorsoduro","Rio","Giudecca","Sotoportego",
            "Campo","Campiello","Fondamenta","Crosera","Salizada","Rotonda",
            "Riviera","Ponte","Corte","Cannaregio","San","Santa","Piscina","Borgo"]


mapping_street = { "Campazzo": "Campo",
                   "Dorsorduro": "Dorsoduro",
                   "Fondamente": "Fondamenta",
                   "Gallion": "Calle Gallion",
                   "salizada": "Salizada",
                   "via": "Via",
                   "cannaregio": "Cannaregio",
                   "santa": "Santa",
                   "Sestiere": "",
                   "Carbonera": "",
                   "Forte": "Via Forte"}


def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit_street(osmfile):
    osm_file = open(osmfile, encoding="utf8")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types


def correct_street_name(name, mapping):
    
    name_correct = name
    
    m = street_type_re.search(name)
    if m:
        street_type = m.group()
        if street_type =='Isola':
            name_correct = re.sub("Nuova", "Nova",name)
        elif street_type =='Lista':
            name_correct = 'Rio Terà ' + name
            name_correct = re.sub(r'(\n)+', "", name_correct)
        elif street_type =='Stazione':
            name_correct = 'Cannaregio'
        elif street_type == 'Dorsoduro,':
            name_correct = re.sub(r'(\,+\D+)', "", name)
        elif street_type == 'La':
            name_correct = 'Fondamenta Sant Eufemia'
        elif street_type in mapping:
            name_correct = street_type_re.sub(mapping[street_type], name, count = 1)
            
            if street_type == 'Sestiere' or street_type == 'Carbonera':
                name_correct = re.sub(r'\s+',"", name_correct, count = 1)
            elif street_type == 'salizada':
                name_correct = re.sub(r'(samuele)+\D+[0-9]+','Samuele', name_correct, count = 1)

    return name_correct



# Run codes
st_types = audit_street(OSMFILE)
print('The following errors were found:\n')  
pprint.pprint(dict(st_types))
print()

print('Correction of street names:\n')  
for st_type, ways in st_types.items():
    for name in ways:
        better_name = correct_street_name(name, mapping_street)
        print(name + '-->' + better_name)
        print()
        

The following errors were found:

{'Campazzo': {'Campazzo dei Tolentini'},
 'Carbonera': {'Carbonera Corte Nuova'},
 'Dorsoduro,': {'Dorsoduro, San Trovaso'},
 'Dorsorduro': {'Dorsorduro'},
 'Fondamente': {'Fondamente delle Scuole'},
 'Forte': {'Forte Marghera'},
 'Gallion': {'Gallion'},
 'Isola': {'Isola Nuova del Tronchetto'},
 'La': {'La Palanca Fondamenta Sant´Eufemia'},
 'Lista': {'Lista di Spagna\n'},
 'Sestiere': {'Sestiere Dorsoduro', 'Sestiere Cannaregio'},
 'Stazione': {'Stazione Santa Lucia'},
 'cannaregio': {'cannaregio'},
 'salizada': {'salizada San samuele, san Marco 3358'},
 'santa': {'santa croce'},
 'via': {'via Gino Allegri', 'via fratelli Bandiera'}}

Correction of street names:

Sestiere Dorsoduro-->Dorsoduro

Sestiere Cannaregio-->Cannaregio

Dorsoduro, San Trovaso-->Dorsoduro

Gallion-->Calle Gallion

Stazione Santa Lucia-->Cannaregio

Forte Marghera-->Via Forte Marghera

La Palanca Fondamenta Sant´Eufemia-->Fondamenta Sant Eufemia

salizada San samuele, san Marco

### Postal code

We check if the postal code lies between 30121 and 30176. We found that two tags contain the city or street name, whereas several postal codes were outside the expected range. We found that the map of Mestre contains several location outside the city!

In [2]:
# Audit post code
mapping_postal_code = { "PontedeiPugni": "30123",
                        "Ponte dei Pugni": "30123",
                        "Venice30123": "30123",
                        "Venice 30123": "30123"}


POSTCODE = re.compile(r'[A-z]\d[A-z]\s?\d[A-z]\d')
min_code = 30121
max_code = 30176
def audit_postcode(osmfile):
    postal_code_wrong = defaultdict(set)
    post_file = open(osmfile, encoding="utf8")
    for event, elem in ET.iterparse(post_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if tag.attrib['k'] == 'addr:postcode':
                    post_code = re.sub(" ", "", tag.attrib['v'].strip())
                    m = POSTCODE.match(post_code)
                    if m is None:
                        aa = [int(s) for s in post_code.split() if s.isdigit()]
                        if aa:
                            if aa[0]<min_code or aa[0]>max_code:
                                #if post_code not in postal_code_wrong:
                                print('wrong postal code: %d' %aa[0])
                        else:
                            #if post_code not in postal_code_wrong:
                            postal_code_wrong[post_code].add(post_code)
    post_file.close()
    return postal_code_wrong

def correct_postal_code(name, mapping):
    
    code_correct = name
    
    m = re.search(r'(\D+\d*)+',name)
    if m:
        code = m.group()
        code_correct = re.sub(r'(\D+\d*)+',mapping[code], name, count = 1)
        return code_correct
    
    return code_correct



# Run codes
print('The following errors in the postal code were found:\n')  
wrong_postal_codes = audit_postcode(OSMFILE)
pprint.pprint(dict(wrong_postal_codes))
print()
print('Correction of postal codes:\n')  
for code_type, ways in wrong_postal_codes.items():
    for name in ways:
        new_postal_code = correct_postal_code(name, mapping_postal_code)
        print(name + '-->' + new_postal_code)
        print()

The following errors in the postal code were found:

wrong postal code: 30100
wrong postal code: 30034
wrong postal code: 30020
wrong postal code: 30020
wrong postal code: 30030
wrong postal code: 30030
wrong postal code: 30330
wrong postal code: 30012
wrong postal code: 30030
wrong postal code: 30100
wrong postal code: 30034
wrong postal code: 30034
wrong postal code: 30034
wrong postal code: 30100
wrong postal code: 30034
wrong postal code: 30100
wrong postal code: 30100
wrong postal code: 30100
wrong postal code: 30100
wrong postal code: 30030
wrong postal code: 30030
wrong postal code: 30030
wrong postal code: 30034
wrong postal code: 30034
wrong postal code: 30034
wrong postal code: 30034
wrong postal code: 30034
wrong postal code: 30030
wrong postal code: 30034
wrong postal code: 30034
{'PontedeiPugni': {'PontedeiPugni'}, 'Venice30123': {'Venice30123'}}

Correction of postal codes:

PontedeiPugni-->30123

Venice30123-->30123



### Names of city suburbs

Here we check if the names of suburbs were correct. Also in this case, we found several locations outside the city. Moreover, we correct missplelled names as in part 1. In two cases, we found the postal code instead of the suburb name.

In [3]:
# Audit city information

mapping_city = {"Venice": "Venezia",
                "Marghera VE": "Marghera",
                "Venezia Mestre": "Mestre",
                "30173": "Tessera",
                "3073": "Tessera",
                "Martellago": "Martellago",
                "Mira": "Mira",
                "Olmo": "Olmo",
                "Spinea": "Spinea"}

expected_suburb = ['Venezia', 'Oriago', 'Mestre', 'Favaro Veneto','Tessera',
                   'Marghera', 'Zelarino', 'Campalto', 'Malcontenta','Marcon']

def audit_city(osmfile):
    suburb_list = defaultdict(set)
    city_file = open(osmfile, encoding="utf8")
    for event, elem in ET.iterparse(city_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if tag.attrib['k'] == 'addr:city':
                    city = tag.attrib['v']
                    # province = re.sub(" ", "", tag.attrib['v'].strip())
                    if city not in expected_suburb:
                        suburb_list[city].add(city)
                    
    city_file.close()
    return suburb_list
    
    
def correct_city_sub(name, mapping):
    
    name_correct = name
    
    m = re.search(r'(\D*\d*)+',name)
    if m:
        suburb = m.group()
        if suburb not in expected_suburb:
            name_correct = mapping[suburb]
            
    return name_correct



# Run codes
city_sub_wrong = audit_city(OSMFILE)
print()
print('Correction of suburb names:\n')  
for code_type, ways in city_sub_wrong.items():
    for name in ways:
        new_suburb = correct_city_sub(name, mapping_city)
        print(name + '-->' + new_suburb)
        print()


Correction of suburb names:

Venice-->Venezia

Martellago-->Martellago

Olmo-->Olmo

Marghera VE-->Marghera

Mira-->Mira

30173-->Tessera

3073-->Tessera

Spinea-->Spinea

Venezia Mestre-->Mestre



### Province information

We make sure that province is "Venezia" in every node.

In [4]:
# Audit province information
def correct_province(name):
    
    if name=='VE':
        name = 'Venezia'
        
    return name


    
def audit_province(osmfile):
    province_list = []
    province_file = open(osmfile, encoding="utf8")
    for event, elem in ET.iterparse(province_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if tag.attrib['k'] == 'addr:province':
                    province = correct_province(re.sub(" ", "", tag.attrib['v'].strip()))
                    if province not in province_list:
                        province_list.append(province)
                    
                    
    province_file.close()
    print(province_list)
    
audit_province(OSMFILE)

['Venezia']


### Telephone number

We check telephone numbers. Using the same approach as part 1, we ensures that all the phone numbers have the same format +39 XXX XXXXX.

In [5]:
# Audit phone number    
PHONENUM = re.compile(r'\+39\s\d{3}\s\d{6,7}')

def correct_phone_num(phone_num):

    # Check for valid phone number format
    m = PHONENUM.match(phone_num)
    if m is None:
        
        # remove postal code 
        if re.match(r'\d{5}', phone_num) is not None:
            if re.match(r'\d{6}', phone_num) is None:
                return None
        
        # Convert all dashes to spaces
        if "-" in phone_num:
            phone_num = re.sub("-", " ", phone_num)
         
        # remove space between + and 39
        if re.match(r'\+ 3',phone_num[:3]) is not None:
            phone_num = re.sub(" ", "", phone_num, count=1)
            
        # Substitute 00 with +
        if phone_num[:2]=='00':
            phone_num = '+' + phone_num[2:]
            
        # Add coutry code
        if re.match(r'\+39|39',phone_num) is None:
            phone_num = "+39" + phone_num
            
        # Remove whitespaces
        if " " in phone_num:
            phone_num = re.sub(" ", "", phone_num)
            
        # Add + in country code
        if re.match(r'39\d{3,}', phone_num) is not None:
            phone_num = "+" + phone_num
            
        # Check number in the 4th position
        if phone_num[3]=='4':
            phone_num = phone_num[:3]+'0'+phone_num[3:]
            
        if phone_num[3:5]=='03':
            phone_num = phone_num[:3]+phone_num[5:]
            
      
        # Space phone number
        phone_num = phone_num[:3] + " " + phone_num[3:6] + " " + phone_num[6:]

    return phone_num


def audit_phone(osmfile):
    phone_list = []
    phone_file = open(osmfile, encoding="utf8")
    counter = 0
    counter_max = 10
    for event, elem in ET.iterparse(phone_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if tag.attrib['k'] == 'phone':
                    phone_num = tag.attrib['v']
                    # province = re.sub(" ", "", tag.attrib['v'].strip())
                    new_phone = correct_phone_num(phone_num)
                    if counter<=counter_max:     
                        print(new_phone)
                    counter = counter + 1
                   # if phone_num not in phone_list:
                    #    phone_list.append(phone_num)
                    
    phone_file.close()
    
    
print('Corrected phone numbers (first 10):\n')  
audit_phone(OSMFILE)

Corrected phone numbers (first 10):

+39 041 715359
+39 041 2750218
+39 041 5244332
+39 041 2776142
+39 041 5351288
+39 041 2749227
+39 041 721901
+39 041 5240016
+39 041 5442945
+39 041 8221044
+39 041 5239825


## PART 2: prepare the data to be inserted into a SQL database

The OSM dataset is now parsed and saved as a set of dictionaries, being "node" and "way".
Each element of the OSM XML file is analyzed and corrected using the functions defined in step 1. The document is transformed to tabular format using python's dictionaries and saved into .csv files. These csv files can then easily be imported to a SQL database as tables.
This process consists of the following steps:
- iteratively step through each top level element in the OSM XML 
- analyze each node by correcting wrong values and saving them into dictionaries
- use a schema and validation library to ensure the transformed data is in the correct format
- write each data structure to the appropriate .csv files

In [6]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
After auditing is complete the next step is to prepare the data to be inserted into a SQL database.
To do so you will parse the elements in the OSM XML file, transforming them from document format to
tabular format, thus making it possible to write to .csv files.  These csv files can then easily be
imported to a SQL database as tables.
The process for this transformation is as follows:
- Use iterparse to iteratively step through each top level element in the XML
- Shape each element into several data structures using a custom function
- Utilize a schema and validation library to ensure the transformed data is in the correct format
- Write each data structure to the appropriate .csv files

## Shape Element Function
The function should take as input an iterparse Element object and return a dictionary.

### If the element top level tag is "node":
The dictionary returned should have the format {"node": .., "node_tags": ...}
The "node" field should hold a dictionary of the following top level node attributes:
- id
- user
- uid
- version
- lat
- lon
- timestamp
- changeset
All other attributes can be ignored
The "node_tags" field should hold a list of dictionaries, one per secondary tag. Secondary tags are
child tags of node which have the tag name/type: "tag". Each dictionary should have the following
fields from the secondary tag attributes:
- id: the top level node id attribute value
- key: the full tag "k" attribute value if no colon is present or the characters after the colon if one is.
- value: the tag "v" attribute value
- type: either the characters before the colon in the tag "k" value or "regular" if a colon
        is not present.
Additionally,
- if the tag "k" value contains problematic characters, the tag should be ignored
- if the tag "k" value contains a ":" the characters before the ":" should be set as the tag type
  and characters after the ":" should be set as the tag key
- if there are additional ":" in the "k" value they and they should be ignored and kept as part of
  the tag key. For example:
  <tag k="addr:street:name" v="Lincoln"/>
  should be turned into
  {'id': 12345, 'key': 'street:name', 'value': 'Lincoln', 'type': 'addr'}
- If a node has no secondary tags then the "node_tags" field should just contain an empty list.
The final return value for a "node" element should look something like:
{'node': {'id': 757860928,
          'user': 'uboot',
          'uid': 26299,
       'version': '2',
          'lat': 41.9747374,
          'lon': -87.6920102,
          'timestamp': '2010-07-22T16:16:51Z',
      'changeset': 5288876},
 'node_tags': [{'id': 757860928,
                'key': 'amenity',
                'value': 'fast_food',
                'type': 'regular'},
               {'id': 757860928,
                'key': 'cuisine',
                'value': 'sausage',
                'type': 'regular'},
               {'id': 757860928,
                'key': 'name',
                'value': "Shelly's Tasty Freeze",
                'type': 'regular'}]}
                
### If the element top level tag is "way":
The dictionary should have the format {"way": ..., "way_tags": ..., "way_nodes": ...}
The "way" field should hold a dictionary of the following top level way attributes:
- id
- user
- uid
- version
- timestamp
- changeset
All other attributes can be ignored
The "way_tags" field should again hold a list of dictionaries, following the exact same rules as
for "node_tags".
Additionally, the dictionary should have a field "way_nodes". "way_nodes" should hold a list of
dictionaries, one for each nd child tag.  Each dictionary should have the fields:
- id: the top level element (way) id
- node_id: the ref attribute value of the nd tag
- position: the index starting at 0 of the nd tag i.e. what order the nd tag appears within
            the way element
The final return value for a "way" element should look something like:
{'way': {'id': 209809850,
         'user': 'chicago-buildings',
         'uid': 674454,
         'version': '1',
         'timestamp': '2013-03-13T15:58:04Z',
         'changeset': 15353317},
 'way_nodes': [{'id': 209809850, 'node_id': 2199822281, 'position': 0},
               {'id': 209809850, 'node_id': 2199822390, 'position': 1},
               {'id': 209809850, 'node_id': 2199822392, 'position': 2},
               {'id': 209809850, 'node_id': 2199822369, 'position': 3},
               {'id': 209809850, 'node_id': 2199822370, 'position': 4},
               {'id': 209809850, 'node_id': 2199822284, 'position': 5},
               {'id': 209809850, 'node_id': 2199822281, 'position': 6}],
 'way_tags': [{'id': 209809850,
               'key': 'housenumber',
               'type': 'addr',
               'value': '1412'},
              {'id': 209809850,
               'key': 'street',
               'type': 'addr',
               'value': 'West Lexington St.'},
              {'id': 209809850,
               'key': 'street:name',
               'type': 'addr',
               'value': 'Lexington'},
              {'id': '209809850',
               'key': 'street:prefix',
               'type': 'addr',
               'value': 'West'},
              {'id': 209809850,
               'key': 'street:type',
               'type': 'addr',
               'value': 'Street'},
              {'id': 209809850,
               'key': 'building',
               'type': 'regular',
               'value': 'yes'},
              {'id': 209809850,
               'key': 'levels',
               'type': 'building',
               'value': '1'},
              {'id': 209809850,
               'key': 'building_id',
               'type': 'chicago',
               'value': '366409'}]}
"""

import csv
import codecs
import re
import xml.etree.cElementTree as ET

import cerberus

import schema

# from audit import correct_name, is_street_name

OSM_PATH = OSMFILE

NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
PHONENUM = re.compile(r'\+1\s\d{3}\s\d{3}\s\d{4}')
POSTCODE = re.compile(r'[A-z]\d[A-z]\s?\d[A-z]\d')

SCHEMA = schema.schema

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']

In [7]:
# function to analyze secondary tags. This analysis is the same for node and way (primary) tags
def analyze_subtag(element, second_tag):
    
    # create dictionary for the secondary tag
    dict_second_tag = {}
    
    # if a secondary tag is present, it is parsed
    if second_tag is not None:
        dict_second_tag['id'] = element.attrib['id']

        # check is column is present
        if ":" not in second_tag.attrib['k']:

            # no column
            dict_second_tag['key'] = second_tag.attrib['k']
            dict_second_tag['value'] = second_tag.attrib['v']
            dict_second_tag['type'] = 'regular'
        else:

            # column found
            column_separator_position = second_tag.attrib['k'].index(':')
            dict_second_tag['key'] = second_tag.attrib['k'][(column_separator_position+1):]
            dict_second_tag['type'] = second_tag.attrib['k'][:column_separator_position]

            
        ### Find and correct problems in the values of the OSM map data        
        # correct street name
        if dict_second_tag['key'] == "street":
            dict_second_tag['value'] = correct_street_name(second_tag.attrib['v'],mapping_street)
    
        # correct postal code
        elif dict_second_tag['key'] == 'postcode':
            dict_second_tag['value'] = correct_postal_code(second_tag.attrib['v'], mapping_postal_code)
            
        # correct suburb name
        elif dict_second_tag['key'] == 'city':
            dict_second_tag['value'] = correct_city_sub(second_tag.attrib['v'],mapping_city)
    
        # correct phone number
        elif dict_second_tag['key'] == 'phone':
            dict_second_tag['value'] = correct_phone_num(second_tag.attrib['v'])
            if dict_second_tag['value'] is None:
                return None
            
    
        # correct province 
        elif dict_second_tag['key'] == 'province':
            dict_second_tag['value'] = correct_province(second_tag.attrib['v'])

        else:
            dict_second_tag['value'] = second_tag.attrib['v']

    return dict_second_tag
    

def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict"""

    node_attribs = {}
    way_attribs = {}
    node_dict = {}
    way_nodes = []
    tags = []  # Handle secondary tags the same way for both node and way elements


    if element.tag == 'node':
        # analyze primary tag
        for attrib in element.attrib.items():
            node_attribs[attrib[0]] = attrib[1]
            
        # check secondary tags
        for secondary_tag in element.iter():
            if secondary_tag.tag == 'tag':
                dict_subtag = analyze_subtag(element, secondary_tag)
                if dict_subtag is not None:
                    tags.append(dict_subtag)
                    
        return {'node': node_attribs, 'node_tags': tags}
         
    
    elif element.tag == 'way':
        for attrib in element.attrib.items():
            way_attribs[attrib[0]] = attrib[1]
            
        # check secondary tags
        counter = 0
        for secondary_tag in element.iter():
            if secondary_tag.tag == 'tag':
                dict_subtag = analyze_subtag(element, secondary_tag)
                if dict_subtag is not None:
                    tags.append(dict_subtag)
                    
            elif secondary_tag.tag == 'nd':
                dict_subtag_nd = {}
                dict_subtag_nd['id'] = element.attrib['id']
                dict_subtag_nd['node_id'] = secondary_tag.attrib['ref']
                dict_subtag_nd['position'] = counter
                counter = counter + 1
                way_nodes.append(dict_subtag_nd)
  
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}
    
    

# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag"""

    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise ValidationError if element does not match schema"""
    if validator.validate(element, schema) is not True:
        print()
        print(element)
        field, errors = next(validator.errors.iteritems())
        message_string = "\nElement of type '{0}' has the following errors:\n{1}"
        error_strings = (
            "{0}: {1}".format(k, v if isinstance(v, str) else ", ".join(v))
            for k, v in errors.items()
        )
        raise cerberus.ValidationError(
            message_string.format(field, "\n".join(error_strings))
        )


class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input"""

    def writerow(self, row):
        super(UnicodeDictWriter, self).writerow({
            k: (v.encode('utf-8') if isinstance(v, str) else v) for k, v in row.items()
        })

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s)"""

    with codecs.open(NODES_PATH, 'w') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'w') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        nodes_writer.writeheader()
        node_tags_writer.writeheader()
        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            if el:
                if validate is True:
                    validate_element(el, validator)

                if element.tag == 'node':
                    nodes_writer.writerow(el['node'])
                    node_tags_writer.writerows(el['node_tags'])
                elif element.tag == 'way':
                    ways_writer.writerow(el['way'])
                    way_nodes_writer.writerows(el['way_nodes'])
                    way_tags_writer.writerows(el['way_tags'])


if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    process_map(OSM_PATH, validate=True)