# Cleaning and Passing data to csv files

In this notebook we document the process of cleaning and passing the data to csv files for the project analyzing open street map data of the city of Lima, Peru.

In the repository, it is possible to find the file "lima_sample.osm" which contains a small subset of the XML tree.

In [55]:
### Helper functions. This functions help us to take data from the XML document.

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import xml.etree.cElementTree as ET  # Use cElementTree or lxml if too slow
from collections import Counter
import os.path
import re

OSM_FILE = "lima_peru.osm"  # Replace this with your osm file
SAMPLE_FILE = "lima_sample.osm"

k = 50 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()

In [56]:
if not os.path.exists("lima_sample.osm"):
    with open(SAMPLE_FILE, 'wb') as output:
        output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        output.write('<osm>\n  ')
    
        # Write every kth top level element
        for i, element in enumerate(get_element(OSM_FILE)):
            if i % k == 0:
                output.write(ET.tostring(element, encoding='utf-8'))
    
        output.write('</osm>')

In [57]:
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag"""

    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()

We start the process of analizing our data by checking the street names. This is useful to see if the data present some irregularities and to implement solutions of the observed problems. 

In [58]:
## Getting street names
def get_streets(element):
    """Helper function to get street names from data """
    for tag in element.findall('tag'):
        if tag.attrib['k'].startswith("addr:street"):
            street_name = tag.attrib['v']
            return street_name
        else:
            continue

In [59]:
street_sample = []
for element in get_element(SAMPLE_FILE, tags=('node', 'way', 'relation')):
    street = get_streets(element)
    if  street is not None:
        street_sample.append(street)

In [60]:
[item.split()[0] for item in street_sample[:10]]

[u'Malec\xf3n',
 u'\xd3valo',
 u'Avenida',
 'Avenida',
 'Los',
 'Avenida',
 'Coricancha',
 'Las',
 'Satelite',
 'Jorge']

The first thing we can notice from this is that we might have problems trying to aggregate the data because some entities are with accents and some not. We decide to uniform the data by removing all accents. 

In [61]:
import unicodedata
def strip_accents(s):
    """Strips all the accents from unicode data"""
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                  if unicodedata.category(c) != 'Mn')

def normalizing_attributes(encode_attrib):
    """Pass the data to the appropriate unicode format and then accents are removed"""
    if encode_attrib is not None:
        atrib_encode = encode_attrib.encode("utf-8")
        attr_cont = atrib_encode.decode("utf-8")
        return strip_accents(attr_cont)

In [62]:
#Checking if function works appropriately
#Checking first 20 entries
map(normalizing_attributes, street_sample)[:10]

[u'Malecon de la Reserva',
 u'Ovalo Gutierrez',
 u'Avenida Nicolas de Pierola',
 u'Avenida Proceres de la Independencia',
 u'Los Cedros',
 u'Avenida Circunvalacion',
 u'Coricancha',
 u'Las Lausonias',
 u'Satelite',
 u'Jorge Chavez']

Now that we have uniformly treated the names of the streets, we want to check further problems with the data. Note that the first name of the street names gives us its type (e.g. Avenida, Jiron, Pasaje). With this information, we want to chek if there are certain problems with these entries that we can correct. In what follows we generate a function to get the first word of the street entry. We then count common entries by using the counter module.

In [63]:
def getting_norm_and_type_street_names(street_name):
    """Gets fisrt word of street - For checking type"""
    norm_street = None
    street_type = None
    #street_name = get_streets(element)
    #
    norm_street = normalizing_attributes(street_name)
    if street_name:
        street_type = norm_street.split()[0].strip()
        norm_street = norm_street
    return street_type, norm_street

In [64]:
type_street_counter = Counter(getting_norm_and_type_street_names(get_streets(element))[0] for element in get_element(SAMPLE_FILE, tags=('node', 'way', 'relation')))

In [65]:
type_street_counter

Counter({None: 13273,
         u'13': 1,
         u'300': 1,
         u'7': 1,
         u'Alameda': 4,
         u'Alfredo': 2,
         u'Ampliacion': 1,
         u'Arequipa': 2,
         u'Auxiliar': 3,
         u'Av.': 6,
         u'Avenida': 288,
         u'Callao': 1,
         u'Calle': 40,
         u'Carretera': 5,
         u'Casma': 1,
         u'Colcabamba': 1,
         u'Coricancha': 1,
         u'Federico': 1,
         u'Gallesi': 1,
         u'General': 1,
         u'Gonzales': 1,
         u'Gozzoli': 1,
         u'Gran': 1,
         u'Guardia': 1,
         u'Guillermo': 1,
         u'Hernando': 1,
         u'Huancarama': 1,
         u'Inca': 1,
         u'Iquique': 1,
         u'Jesus': 1,
         u'Jiron': 48,
         u'Jorge': 3,
         u'Jose': 2,
         u'Jr.': 1,
         u'La': 2,
         u'Las': 9,
         u'Leoncio': 1,
         u'Los': 10,
         u'Malecon': 1,
         u'Manuel': 2,
         u'Maria': 1,
         u'Mercedes': 1,
         u'Monterrosa': 1,

From the list, we identify four problems:
1. Av., Avenida and Avienda: They all refer to the same stret type.
2. Defensores and Defendores: This might be a typing issue.
3. Jr., Jr, Jiron: They all refere to the same street type.
4. Calle and calle: difference because of capitalization.
We correct this discrepancies in the data below.

In [66]:
def corrections_street_type(street_type):
    """changing types according to corrections"""
    if street_type in ["Av", "Av.", "Avienda"]:
        correct_type = "Avenida"
    elif street_type in ["Defendores"]:
        correct_type = "Defensores"
    elif street_type in ["Jr", "Jr."]:
        correct_type = "Jiron"
    elif street_type in ["calle"]:
        correct_type = "Calle"
    else:
        correct_type = None
    return correct_type

In [67]:
def correcting_street_names(street_name):
    """"Corrects problematic entries according to specified rule in function
    corrections_street_type"""
    street_name, street_type = getting_norm_and_type_street_names(street_name)
    corrected_name = corrections_street_type(street_type)
    if corrected_name:
        street_name = street_name.replace(street_type, corrected_name, 1)
    return street_name

Below we print only the street types for our data set to see if changes were correctly applied. As it is possible to see, all changes specified by the function yield the expected result.

In [68]:
street_type_checker = set()
for element in get_element(SAMPLE_FILE, tags=('node', 'way', 'relation')):
    street_name = get_streets(element)
    if street_name:
        street_type_checker.add(correcting_street_names(street_name).split()[0])

In [71]:
street_type_checker

{u'13',
 u'300',
 u'7',
 u'Alameda',
 u'Alfredo',
 u'Ampliacion',
 u'Arequipa',
 u'Auxiliar',
 u'Av.',
 u'Avenida',
 u'Callao',
 u'Calle',
 u'Carretera',
 u'Casma',
 u'Colcabamba',
 u'Coricancha',
 u'Federico',
 u'Gallesi',
 u'General',
 u'Gonzales',
 u'Gozzoli',
 u'Gran',
 u'Guardia',
 u'Guillermo',
 u'Hernando',
 u'Huancarama',
 u'Inca',
 u'Iquique',
 u'Jesus',
 u'Jiron',
 u'Jorge',
 u'Jose',
 u'Jr.',
 u'La',
 u'Las',
 u'Leoncio',
 u'Los',
 u'Malecon',
 u'Manuel',
 u'Maria',
 u'Mercedes',
 u'Monterrosa',
 u'Ovalo',
 u'Pacasmayo',
 u'Pasaje',
 u'Paseo',
 u'Paz',
 u'Prolongacion',
 u'Puente',
 u'Rio',
 u'Rufino',
 u'San',
 u'Santa',
 u'Satelite',
 u'Sergio',
 u'Tupac',
 u'Victor',
 u'maynas'}

## Checking Phone data

We implement a function to check if phone data is correctly formatted.

In [24]:
def get_phone(element):
    """Helper function to get all the available phones in the data set."""
    phone = None
    for tag in element.findall('tag'):
        if tag.attrib['k'] == 'phone':
            phone = tag.attrib['v']
    return phone



In [72]:
def phone_cleaner(phone):
    """Checks if entry constitutes a valid phone and uniformizes data.
    >>>phone_cleaner("(+51) 01 3766489")
    '+51 1 3766489'
    >>>phone_cleaner("(+511 4612344 Anexo 25")
    '+51 1 4612344 ext. 25'
    """
    if phone:
        phone_list = re.split("[;\,\/]", phone)
        phone = []
        for number in phone_list:
            # Taking out 511 in the beginning
            matching = re.sub(r"^\s?\(?\+?\s?5\s?1\)?\s?1?\s?\)?", "", number) 
            #Taking out (01) and - and white spaces
            matching_2 = re.sub(r"^\(?\-?0?\-?1\)?|-|\s|\.|\(|\)", "", matching) 
            taking_ext = re.split(r"Anex[os]|ex[t\.]", matching_2, flags=re.IGNORECASE)
            extension_num = None
            if len(taking_ext)==2:
                extension_num = taking_ext[1]
            #Checking if phones are only composed of numbers
            if re.match(r"\d", taking_ext[0]): 
                #Checking if has exact digits to be a cell phone or fixed line
                if len(taking_ext[0]) == 9:
                    cell_number = "+51 " + taking_ext[0]
                    phone.append(cell_number)
                elif len(taking_ext[0]) == 7:
                    fix_num = "+51 1 " + taking_ext[0]
                    if extension_num and len(extension_num) <= 4:
                        fix_num += " " "ext. " + extension_num
                    phone.append(fix_num)
        return phone

In [73]:
#Checking function / This function was already tested with all the data set. here just with sample file
for element in get_element(SAMPLE_FILE, tags=('node', 'way', 'relation')):
    if get_phone(element):
        print get_phone(element), phone_cleaner(get_phone(element))

+51 1 6104000 ['+51 1 6104000']
+51 1 5192000 ['+51 1 5192000']
+51 1 6197000 ['+51 1 6197000']
+51 1 3563916 ['+51 1 3563916']
+51 1 211 8800 (ext. 2233) ['+51 1 2118800 ext. 2233']
012191000 ['+51 1 2191000']
6610446 ['+51 1 6610446']
4632727 ['+51 1 4632727']
+51980910194 ['+51 980910194']
+515290535 ['+51 1 5290535']
+51 1 4428828 ['+51 1 4428828']
012913613 ['+51 1 2913613']
+51 1 4596890 ['+51 1 4596890']
(511) 433-4724 / (511) 424-1988 ['+51 1 4334724', '+51 1 4241988']
634-2700 ['+51 1 6342700']
418 0710 ['+51 1 4180710']
424 6634 ['+51 1 4246634']
2762753 ['+51 1 2762753']


# Checking amenities and tag contents for banks and fast food.

We wanted to make a more rigourous analysis of banks and fast foods in the data. We started checking what are the available amenity tags. From there we went to analyzing what type of data is associated with banks and fast food. We observed and try to correct problems realted with multiple names for a single entities and cases of bad tagging.

In [74]:
def get_amenity(element):
    """gets name of amenities of a given element"""
    amen_name = None
    type_amen = None
    for tag in element.findall('tag'):
        if tag.attrib['k'] == 'name':
                amen_name = tag.attrib['v']
        if tag.attrib['k'] =='amenity':
            type_amen = tag.attrib['v']
    return (normalizing_attributes(type_amen), normalizing_attributes(amen_name))
        
        

In [75]:
types_amen = set()
for element in get_element(SAMPLE_FILE, tags=('node', 'way', 'relation')):
    types_amen.add(get_amenity(element)[0])

In [76]:
types_amen

{None,
 u'bank',
 u'bar',
 u'bench',
 u'bureau_de_change',
 u'bus_station',
 u'cafe',
 u'car_rental',
 u'car_wash',
 u'casino',
 u'childcare',
 u'cinema',
 u'clinic',
 u'college',
 u'community_centre',
 u'courthouse',
 u'dentist',
 u'doctors',
 u'embassy',
 u'events_venue',
 u'fast_food',
 u'fire_station',
 u'fountain',
 u'fuel',
 u'hospital',
 u'kindergarten',
 u'marketplace',
 u'nightclub',
 u'parking',
 u'parking_entrance',
 u'pharmacy',
 u'place_of_worship',
 u'police',
 u'public_building',
 u'ranger_station',
 u'restaurant',
 u'school',
 u'social_centre',
 u'studio',
 u'swimming_pool',
 u'telephone',
 u'theatre',
 u'toilets',
 u'veterinary',
 u'waste_basket'}

In [77]:
bank_names = set()
for element in get_element(SAMPLE_FILE, tags=('node', 'way', 'relation')):
    type_amen, amen_name = get_amenity(element)
    if type_amen == 'bank':
            bank_names.add(amen_name)

In [78]:
bank_names

{None,
 u'BBVA',
 u'BCP',
 u'BanBif',
 u'Banbif',
 u'Banco Falabella',
 u'Banco Financiero',
 u'Banco Interbank',
 u'Banco de la Nacion',
 u'Caja Palta',
 u'Interbank',
 u'Mi Banco',
 u'Mibanco',
 u'Scotiabank'}

In [79]:
def rechecker_names(dict_names):
    """creates a set of names from dict values"""
    checking_list = []
    for item in dict_names.values():
        checking_list.extend(item)
    return set(checking_list)

In [80]:
#Bank Clean    
bank_dict = {
        "BBVA": [],
        "BCP": [],
        "Banco de la Nacion": [],
        "Interbank": [],
        "GNB": [],
        "Falabella": [],
        "Azteca": [],
        "Mi Banco": [],
        "Financiero": [],
        "Scotiabank": [],
        "Edyficar": [],
        "Citibank": [],
        "BIF": [],
        "Compartamos": [],
        "Others": []
    }

for bank in bank_names:
    if bank:
        if "continental" in bank.lower() or "bbva" in bank.lower():
            bank_dict["BBVA"].append(bank)
        elif ("banco" in bank.lower() and "credito" in bank.lower()) or ("bcp" in bank.lower()):
            bank_dict["BCP"].append(bank)
        elif "mibanco" in bank.lower().replace(" ", ""):
            bank_dict["Mi Banco"].append(bank)
        elif ("interbank" in bank.lower() or "internacional" in bank.lower() 
              or "intel" in bank.lower()):
            bank_dict["Interbank"].append(bank)
        elif "nacion" in bank.lower() or "multired" in bank.lower():
            bank_dict["Banco de la Nacion"].append(bank)
        elif "gnb" in bank.lower():
            bank_dict["GNB"].append(bank)
        elif "financiero" in bank.lower():
            bank_dict["Financiero"].append(bank)
        elif "azteca" in bank.lower():
            bank_dict["Azteca"].append(bank)
        elif "falabella" in bank.lower():
            bank_dict["Falabella"].append(bank)
        elif "scotiabank" in bank.lower():
            bank_dict["Scotiabank"].append(bank)
        elif "edyficar" in bank.lower():
            bank_dict["Edyficar"].append(bank)
        elif "citi" in bank.lower():
            bank_dict["Citibank"].append(bank)
        elif "bif" in bank.lower():
            bank_dict["BIF"].append(bank)
        elif "compartamos" in bank.lower():
            bank_dict["Compartamos"].append(bank)
        else:
            bank_dict["Others"].append(bank)   
bank_dict 

{'Azteca': [],
 'BBVA': [u'BBVA'],
 'BCP': [u'BCP'],
 'BIF': [u'Banbif', u'BanBif'],
 'Banco de la Nacion': [u'Banco de la Nacion'],
 'Citibank': [],
 'Compartamos': [],
 'Edyficar': [],
 'Falabella': [u'Banco Falabella'],
 'Financiero': [u'Banco Financiero'],
 'GNB': [],
 'Interbank': [u'Banco Interbank', u'Interbank'],
 'Mi Banco': [u'Mibanco', u'Mi Banco'],
 'Others': [u'Caja Palta'],
 'Scotiabank': [u'Scotiabank']}

In [81]:
#Checking correctness of tag. Only choose those tags that we have already selected as important.
def dict_no_other(dictio_items):
    """Takes away Other as the dict key so we can check only for the cases we have a solution."""
    selected_items = {item: value for item, value in dictio_items.items() if item != "Others"}
    return selected_items
selected_banks = dict_no_other(bank_dict)  


In [82]:
#Inverting bank dictionary so mapping is easier
def reverse_dict_items(dict_to_reverse):
    """Points the incorrect bank names (dict keys) to the correct entries (dict values)"""
    reverse_dict = {item: value for value, items in dict_to_reverse.items() for item in items}
    for value in set(reverse_dict.values()):
        if value not in reverse_dict.keys():
            reverse_dict[value] = value
    return reverse_dict

reverse_bank_dict = reverse_dict_items(selected_banks)
curated_bank_set = reverse_bank_dict.keys()
curated_bank_set

[u'BanBif',
 u'Banco Interbank',
 u'Banco de la Nacion',
 'BIF',
 u'Banco Financiero',
 u'BBVA',
 u'Mibanco',
 'Falabella',
 u'Banbif',
 u'Scotiabank',
 u'BCP',
 u'Mi Banco',
 'Financiero',
 u'Banco Falabella',
 u'Interbank']

In [83]:
# Normalizing bank names
def name_checker(element, reverse_dict, curated_list):
    """Normalize the name of the banks given a dictionary that points to the 
    corrected list (reverse dict) and the correct name of banks (curated_list)"""
    type_amen, amen_name = get_amenity(element)
    if amen_name and amen_name in curated_list:
        amen_name = reverse_dict[amen_name]
    return amen_name

In [84]:

def bank_tag_checker(element):
    """Checks if the amenity is correctly classified as a bank when there is no label given."""
    type_amen, amen_name = get_amenity(element)
    if amen_name and amen_name in curated_bank_set and not type_amen:
        type_amen = "bank"
    return type_amen

In [85]:
def bank_name_tag_checker(element):
    """Function for double checking if bank names and tags are Ok."""
    type_amen = bank_tag_checker(element)
    amen_name = name_checker(element, reverse_bank_dict, curated_bank_set)
    return type_amen, amen_name

Here we check that our function is running as expected. As it is possible to see in the output below, the code is working as expected. For the banks that we have, it corrects the tag and labels it as a bank.

In [90]:
for element in get_element(SAMPLE_FILE, tags=('node', 'way', 'relation')):
    tag, name = bank_name_tag_checker(element) 
    if tag == "bank" and name:
        print tag, name
        

bank Interbank
bank Financiero
bank Interbank
bank Banco de la Nacion
bank BBVA
bank Falabella
bank Mi Banco
bank BCP
bank Mi Banco
bank Mi Banco
bank BBVA
bank Caja Palta
bank Interbank
bank Scotiabank
bank BIF
bank Scotiabank
bank BCP
bank Mi Banco
bank BIF


## Checking fast food names

Here we check the main differences with fast food names. First we get the list of fast foods and check for different instances that refer to the same restaurant. We correct the list to modify these inconsistencies. We compare this list to the list of restaurants to check if there are restaurants that are double listed as restaurants and fast food. If this is the case, we include them in the fast food and take it off from the restaurant list.

In [91]:
#Checking fast food restaurants
fast_food_set = set()
for element in get_element(SAMPLE_FILE, tags=('node', 'way', 'relation')):
    type_amen, amen_name = get_amenity(element)
    if type_amen=="fast_food" or type_amen == "restaurant":
            fast_food_set.add(amen_name) 

In [92]:
fast_food_set

{None,
 u'Bembos',
 u'Bereket Doner',
 u'Caleo',
 u'Camila',
 u'Caravana',
 u'Cevicheria Micalo',
 u'Chifa Chang Shong',
 u'Chifa El Delicioso',
 u'Chifa Fu Hua',
 u'Chifa Guo',
 u'Chifa Hong Yi',
 u'Chifa La Muralla',
 u'Chifa Montesur',
 u'Chifa Sabor Cantones',
 u'Chifa Sax Say',
 u'Chifa Shin Wing',
 u'Chifa Shun De',
 u'Chifa Susu Huang',
 u'Chifa Tiansheng',
 u'Chifa Xing Lung',
 u'Chifa Yue Hua',
 u'Chifa Zhen',
 u'Chifa Zhen Yun',
 u'Combo Marino',
 u"D'Candela",
 u'Donde Alfredo',
 u'EDO Sushi Bar',
 u'Eco Pizza',
 u'El Mistiano',
 u'El Olimpico',
 u'El Peruano',
 u'El Sabor de Royer',
 u'House Pizzas',
 u'KFC',
 u'La Casa Del Pollo',
 u'La Diabla',
 u'La Panka',
 u'La Posada del Angel III',
 u'Las Canastas',
 u'Las Tinajas',
 u'Los Incas',
 u'Marcelino Pizza & Vino',
 u"Mr. Noky's",
 u'Mr. Sushi',
 u'Nautilius',
 u"Norky's",
 u'Norkys',
 u'OiSHii',
 u'Osharz',
 u'Palco Restobar',
 u'Perroquet',
 u'Pisa 2',
 u'Pizzeria El Italiano',
 u'Polleria Hikari',
 u'Pollos A La Lena',
 

In [399]:
#Bank Clean    
fast_food_dict = {
        "KFC": [],
        "Bembos": [],
        "Burger King": [],
        "Pizza Hut": [],
        "Papa John's": [],
        "Popeye's": [],
        "Norky's": [],
        "Rocky's": [],
        "Subway": [],
        "McDonald's": [],
        "China Wok": [],
        "D'nnos Pizza": [],
        "Telepizza": [],
        "Tip Top": [],
        "Dunkin Donuts": [],
        "Domino's Pizza": [],
        "Others": []
    }

def renaming_fast_food(fast_food_list): 
    for fast_food in fast_food_list:
        if fast_food:
            if "bembo" in fast_food.lower():
                fast_food_dict["Bembos"].append(fast_food)
            if "kfc" in fast_food.lower() or "kentucky" in fast_food.lower():
                fast_food_dict["KFC"].append(fast_food)
            if "burger king" in fast_food.lower():
                fast_food_dict["Burger King"].append(fast_food)
            if "pizza hut" in fast_food.lower():
                fast_food_dict["Pizza Hut"].append(fast_food)
            if "papa john" in fast_food.lower():
                fast_food_dict["Papa John's"].append(fast_food)
            if "popeye" in fast_food.lower():
                fast_food_dict["Popeye's"].append(fast_food)
            if "norky" in fast_food.lower():
                fast_food_dict["Norky's"].append(fast_food)
            if "rocky" in fast_food.lower() or "roky" in fast_food.lower() :
                fast_food_dict["Rocky's"].append(fast_food)
            if "subway" in fast_food.lower():
                fast_food_dict["Subway"].append(fast_food)
            if "mcdonald" in fast_food.lower() or "mc don" in fast_food.lower():
                fast_food_dict["McDonald's"].append(fast_food)
            if "china wok" in fast_food.lower() or "chinawok" in fast_food.lower():
                fast_food_dict["China Wok"].append(fast_food)
            if "d'nnos " in fast_food.lower() or "dinnos" in fast_food.lower():
                fast_food_dict["D'nnos Pizza"].append(fast_food)
            if "telepizz" in fast_food.lower():
                fast_food_dict["Telepizza"].append(fast_food)
            if "tip top" in fast_food.lower():
                fast_food_dict["Tip Top"].append(fast_food)
            if "dunki" in fast_food.lower():
                fast_food_dict["Dunkin Donuts"].append(fast_food)
            if "domino" in fast_food.lower():
                fast_food_dict["Domino's Pizza"].append(fast_food)
            else:
                fast_food_dict["Others"].append(fast_food)
    return fast_food_dict

fast_food_dict = renaming_fast_food(fast_food_set)
fast_food_dict["KFC"].append("KfC")

In [400]:
#Reversing restaurant names
selected_rest = dict_no_other(fast_food_dict) 
reverse_rest_dict = reverse_dict_items(selected_rest)
curated_rest = reverse_rest_dict.keys()

In [401]:
#Changing Nonen and restaurant tag to fast food tag
def fast_food_tag_checker(element):
    type_amen, amen_name = get_amenity(element)
    if amen_name and amen_name in curated_rest and type_amen=='restaurant':
        type_amen = "fast_food"
    elif amen_name and amen_name in curated_rest and not type_amen:
        type_amen = "fast_food"
    return type_amen

In [402]:
def rest_name_tag_checker(element):
    """Function for checking if tag and names match specifications"""
    type_amen = fast_food_tag_checker(element)
    amen_name = name_checker(element, reverse_rest_dict, curated_rest)
    return type_amen, amen_name

In [403]:
for element in get_element(SAMPLE_FILE, tags=('node', 'way', 'relation')):
    tag, amen_name = get_amenity(element)
    if tag == "fast_food" or tag == "restaurant":
        print rest_name_tag_checker(element)

('fast_food', "Rocky's")
(u'restaurant', u'Villa Chicken')
(u'restaurant', u'Palco Restobar')
(u'restaurant', u'Chifa Hong Yi')
(u'restaurant', u'Camila')
(u'restaurant', u'Chifa Sabor Cantones')
(u'restaurant', u'Marcelino Pizza & Vino')
(u'restaurant', u'Las Canastas')
(u'restaurant', u'Pollos A La Lena')
(u'restaurant', None)


# Passing data to CSV

Having written our functions that will help us clean the data, we pass the data to the appropriate CSV files, checking consistency.

In order to pass our data to csv files, we use this code. We update it appropriately so the changes introduced by our defined functions are used. 

In [363]:
import csv
import codecs
import re
import xml.etree.cElementTree as ET

import cerberus

import schema



OSM_PATH = "lima_peru.osm"

NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

SCHEMA = schema.schema

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']



def tag_info_getter(tag, element):
    """Given an element it gives it a format so it can be passed to a csv file."""
    if PROBLEMCHARS.search(tag.attrib['k']):
        return None
    else:
        dictio_tag = {}
        dictio_tag['id'] = element.attrib['id']
        k_value_split = tag.attrib['k'].split(":")
        if len(k_value_split) > 1:
            key_dict = ":".join(k_value_split[:-1])
            type_dict = k_value_split[-1]
            dictio_tag['key'] = key_dict
            dictio_tag['type'] = type_dict
        else:
            key_dict = tag.attrib['k']
            type_dict = 'regular'
            dictio_tag['key'] = tag.attrib['k']
            dictio_tag['type'] = type_dict
        if key_dict.startswith("addr") or type_dict == "street":
            dictio_tag['value'] = correcting_street_names(tag.attrib['v'])
        norm_values = normalizing_attributes(tag.attrib['v'])
        dictio_tag['value'] = norm_values
        if tag.attrib['k'] == "amenity":
            dictio_tag['value'] = bank_tag_checker(element)
            if dictio_tag['value'] == tag.attrib['v']:
                dictio_tag['value'] = fast_food_tag_checker(element)
        if tag.attrib['k'] == "name":
            dictio_tag['value'] = name_checker(element, reverse_bank_dict, curated_bank_set)
            if dictio_tag['value'] == tag.attrib['v']:
                dictio_tag['value'] = name_checker(element, reverse_rest_dict, curated_rest)
        if tag.attrib['k'] == "phone":
            list_phone = []
            for phone in  phone_cleaner(tag.attrib['v']):
                dict_phone = {}
                dict_phone['id'] = element.attrib['id']
                dict_phone['value'] = phone
                dict_phone['key'] = tag.attrib['k']
                dict_phone["type"] = 'regular'
                list_phone.append(dict_phone)
            return list_phone
        return dictio_tag

def shape_element(element):
    """Clean and shape node or way XML element to Python dict"""

    node_attribs = {}
    way_attribs = {}

    # YOUR CODE HERE
    if element.tag == 'node':
        node_attribs['id'] = element.attrib['id']
        node_attribs['uid'] = element.attrib['uid']
        node_attribs['user'] = element.attrib['user']
        node_attribs['version'] = element.attrib['version']
        node_attribs['lat'] = element.attrib['lat']
        node_attribs['lon'] = element.attrib['lon']
        node_attribs['timestamp'] = element.attrib['timestamp']
        node_attribs['changeset'] = element.attrib['changeset']
        tags = [] 
        for tag in element.findall('tag'):
            dictio_tag = tag_info_getter(tag, element)
            if type(dictio_tag) == list:
                for item in dictio_tag:
                    tags.append(item)
            else:
                tags.append(dictio_tag)
        return {'node': node_attribs, 'node_tags': tags}
    
    elif element.tag == 'way':
        way_attribs['id'] = element.attrib['id']
        way_attribs['user'] = element.attrib['user']
        way_attribs['version'] = element.attrib['version']
        way_attribs['uid'] = element.attrib['uid']
        way_attribs['timestamp'] = element.attrib['timestamp']
        way_attribs['changeset'] = element.attrib['changeset']
        tags = []
        way_nodes = []
        counter = 0
        for node in element.findall('nd'):
            dictio_nodes = {}
            dictio_nodes['id'] = element.attrib['id']
            dictio_nodes['node_id'] = node.attrib['ref']
            dictio_nodes['position'] = counter
            counter += 1
            way_nodes.append(dictio_nodes)
        for tag in element.findall('tag'):
            dictio_tag = tag_info_getter(tag, element)
            if type(dictio_tag) == list:
                for item in dictio_tag:
                    tags.append(item)
            else: 
                tags.append(dictio_tag)
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}

# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag"""

    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise ValidationError if element does not match schema"""
    if validator.validate(element, schema) is not True:
        field, errors = next(validator.errors.iteritems())
        message_string = "\nElement of type '{0}' has the following errors:\n{1}"
        error_strings = (
            "{0}: {1}".format(k, v if isinstance(v, str) else ", ".join(v))
            for k, v in errors.iteritems()
        )
        raise cerberus.ValidationError(
            message_string.format(field, "\n".join(error_strings))
        )


class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input"""

    def writerow(self, row):
        super(UnicodeDictWriter, self).writerow({
            k: (v.encode('utf-8') if isinstance(v, unicode) else v) for k, v in row.iteritems()
        })

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s)"""

    with codecs.open(NODES_PATH, 'w') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'w') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        nodes_writer.writeheader()
        node_tags_writer.writeheader()
        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            el = shape_element(element)
            if el:
                try:
                    if validate is True:
                        if "node_tags" in el.keys():
                            if None in el["node_tags"]:
                                el["node_tags"].remove(None)
                        elif "way_tags" in el.keys():
                            if None in el["way_tags"]:
                                el["way_tags"].remove(None)                        
                        validate_element(el, validator)
                except AttributeError:
                    break
                if element.tag == 'node':
                    nodes_writer.writerow(el['node'])
                    node_tags_writer.writerows(el['node_tags'])
                elif element.tag == 'way':
                    ways_writer.writerow(el['way'])
                    way_nodes_writer.writerows(el['way_nodes'])
                    way_tags_writer.writerows(el['way_tags'])


#if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
#process_map(OSM_FILE, validate=True)