In [2]:
%%writefile audit.py
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import string
import pprint
from optparse import OptionParser

# Pattern the audit uses to pull one token out of a street name.  With
# `search` it yields the first run of non-whitespace characters that starts
# at a word boundary.
# NOTE(review): there is no `$` anchor, so this is the first token of the
# name rather than the last one -- confirm that this is intended.
regex = re.compile(r'\b\S+\.?', re.IGNORECASE)

# OSM XML extract that the audit reads.
OSM_FILE = 'hyd_sample_new.osm'

# Tokens that are accepted as-is; the audit does not report them.
expected = [
    "Hyderabad",
    "Road",
    "Gunj",
    "Nagar",
    "Junction",
    "Hills",
    "Society",
]

# Token -> replacement table.  Keys are looked up exactly (case-sensitive),
# which is why several spellings of the same abbreviation are listed.
mapping = {
    "hyderabad": "Hyderabad",
    "Nr.": "NR",
    "nr": "NR",
    "Ave.": "Avenue",
    "sbk": "SBK",
    "gandhi": "Gandhi",
    "bridge": "Bridge",
    "road": "Road",
    "Ft.": "Feet",
    "ft": "Feet",
    "Rd": "Road",
    "Rd.": "Road",
    "Rd,": "Road,",
    "rd": "Road",
    "Roads": "Road",
    "society": "Society",
    "soc.": "Society",
    "jn": "Junction",
    "Jn.": "Junction",
    "Jn": "Junction",
}

def audit_street(street_types, street_name):
    """Record `street_name` under the token that `regex` extracts from it.

    Nothing is recorded when `regex` finds no match in the name, or when the
    matched token is already listed in `expected`.

    Args:
        street_types: Mapping from token to a set of street names.  It is
            updated in place via ``street_types[token].add(...)``, so a
            missing key must be created on access (e.g. ``defaultdict(set)``).
        street_name: The street name to inspect.
    """
    found = regex.search(street_name)
    if not found:
        return

    token = found.group()
    if token in expected:
        return

    street_types[token].add(street_name)

def is_street_name(elem):
    """Tell whether `elem` is a tag whose key is ``addr:street``.

    Args:
        elem: Element-like object exposing an ``attrib`` mapping.

    Returns:
        bool: True when the ``k`` attribute equals ``"addr:street"``; False
        otherwise, including when the element has no ``k`` attribute at all.
    """
    # .get() keeps an element without a 'k' attribute from raising KeyError
    # and aborting the whole audit.
    return elem.attrib.get('k') == "addr:street"

def audit(osmfile):
    """Collect the street names of an OSM XML file, grouped by audited token.

    Every ``tag`` element found under a ``node`` or ``way`` element is
    checked with `is_street_name`; the ``v`` attribute of each match is
    handed to `audit_street`.

    Args:
        osmfile: Path of the OSM XML file to read.

    Returns:
        defaultdict(set): token -> set of street names, as filled in by
        `audit_street`.
    """
    street_types = defaultdict(set)

    # Binary mode lets the XML parser apply the encoding declared in the
    # document; the `with` block closes the handle even if parsing fails.
    with open(osmfile, "rb") as osm_file:
        # Use "end" events: on a "start" event the children of an element
        # are not guaranteed to be attached yet, so its <tag> children could
        # be missed.
        for _event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag in ("node", "way"):
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street(street_types, tag.attrib['v'])

    return street_types

def findWholeWord(w):
    """Build a case-insensitive whole-word searcher for `w`.

    Useful for street names in which the same word occurs more than once.

    Args:
        w: The word to look for; it is treated as literal text.

    Returns:
        The bound ``search`` method of the compiled pattern.  Call it with a
        string to get a match object (group 1 holds the matched word) or
        None when the word does not occur as a whole word.
    """
    # re.escape stops characters such as "." or "(" in `w` from being read
    # as regex syntax (e.g. "Rd." would otherwise also match "Rds").
    return re.compile(r'\b({0})\b'.format(re.escape(w)), flags=re.IGNORECASE).search

#pprint.pprint(dict(audit(OSM_FILE))) # print the existing names

def string_case(s):
    """Title-case `s`, leaving strings that are entirely upper case untouched.

    Args:
        s: The string to convert.

    Returns:
        `s` itself when ``s.isupper()`` is true, otherwise ``s.title()``.
    """
    return s if s.isupper() else s.title()

def update_name(name, mapping):
    """Return a cleaned-up version of the street name `name`.

    The name is split on single spaces and every token is rewritten by the
    first rule that applies:

    1. A token that is a key of `mapping` is replaced by its mapped value,
       passed through `string_case`.
    2. A token that is a substring of ``string.punctuation`` (this includes
       the empty tokens produced by leading or repeated spaces) has its
       commas removed.
    3. A token that starts with a character other than ``;`` or whitespace
       keeps only the text after its first ``;`` (it is unchanged when it
       contains none).
    4. Any other token is passed through `string_case`.

    The tokens are joined with single spaces again and exactly one final
    fix-up is applied:

    * if the text contains ``'Sikh Road,'`` that fragment and every double
      space are removed and trailing commas are stripped;
    * otherwise, if the text starts with a space, that one space is dropped;
    * otherwise the whole text is passed through `string_case`, which
      title-cases it unless it is entirely upper case.

    Args:
        name: The raw street name.
        mapping: Token -> replacement table used by rule 1.

    Returns:
        The rewritten street name.
    """
    tokens = []
    for token in name.split(' '):
        if token in mapping:
            token = string_case(mapping[token])
        elif token in string.punctuation:
            token = token.replace(',', '')
        elif re.match(r'[^;\s]+', token):
            token = token.split(';', 1)[-1]
        else:
            token = string_case(token)
        tokens.append(token)
    cleaned = ' '.join(tokens)

    # Further cleaning of the address string; only one of these applies.
    if 'Sikh Road,' in cleaned:
        without_fragment = cleaned.replace('Sikh Road,', '')
        return without_fragment.replace('  ', '').rstrip(',')
    if cleaned.startswith(" "):
        return cleaned.replace(' ', '', 1)
    return string_case(cleaned)



# Audit the OSM file and print every reported street name next to its
# cleaned-up form.  This runs whenever the module is executed or imported.
update_street = audit(OSM_FILE)
# `.items()` and a parenthesised single-argument print behave the same on
# Python 2 and Python 3; the print statement and `.iteritems()` are
# Python 2 only.
for street_type, ways in update_street.items():
    for name in ways:
        better_name = update_name(name, mapping)
        print("%s => %s" % (name, better_name))



Overwriting audit.py
