In [2]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

# Path of the OpenStreetMap XML extract that gets audited.
OSMFILE = "/Users/indur/Downloads/new-delhi_india.osm"

# Matches the trailing run of non-whitespace characters of a street name,
# starting at a word boundary (typically the last word, optionally ending in
# a period). That trailing token is treated as the "street type".
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

# Street types considered well-formed; names ending in one of these are not
# reported by the audit.
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road",
            "Trail", "Parkway", "Commons"]

# Abbreviated street type -> full spelling used when fixing a name.
# Each key appears exactly once: a dict literal silently keeps only the last
# value for a repeated key, so duplicates would just hide future typos.
mapping = {
    "St": "Street",
    "St.": "Street",
    "Ave": "Avenue",
    "Ave.": "Avenue",
    "Rd": "Road",
    "Rd.": "Road",
}


def audit_street_type(street_types, street_name):
    """Record *street_name* under its street type if that type is unexpected.

    The street type is whatever ``street_type_re`` matches at the end of
    *street_name*. Names with no match, or whose type is listed in
    ``expected``, are ignored.

    Args:
        street_types: mapping of street type -> set of street names; it is
            updated in place (the caller passes a ``defaultdict(set)``).
        street_name: the street name to inspect.
    """
    match = street_type_re.search(street_name)
    if match is None:
        return

    suffix = match.group()
    # Only unusual/abbreviated street types are worth reporting.
    if suffix in expected:
        return

    street_types[suffix].add(street_name)

def is_street_name(elem):
    """Return True when *elem* carries the key ``addr:street``.

    Args:
        elem: an element exposing an ``attrib`` mapping (e.g. an OSM
            ``<tag>`` element).

    Returns:
        True if the element's ``k`` attribute equals ``"addr:street"``;
        False otherwise, including when the element has no ``k`` attribute
        at all (instead of raising ``KeyError``).
    """
    return elem.attrib.get('k') == "addr:street"

def audit(osmfile):
    """Collect the unexpected street types found in an OSM XML file.

    Every ``<tag>`` nested in a ``<node>`` or ``<way>`` is checked with
    ``is_street_name``; matching values are handed to ``audit_street_type``.

    Args:
        osmfile: path of the OSM XML file to read.

    Returns:
        A ``defaultdict(set)`` mapping each unexpected street type to the set
        of street names that end with it.
    """
    street_types = defaultdict(set)

    # Binary mode leaves decoding to the XML parser; the ``with`` block
    # guarantees the handle is closed even if parsing fails.
    with open(osmfile, "rb") as osm_file:
        # As the given file can be quite large, we process it iteratively.
        # "end" events are used because on "start" the element's children
        # are not guaranteed to have been parsed yet, so <tag> children
        # could be silently missed.
        for _event, elem in ET.iterparse(osm_file, events=("end",)):
            # Audit only street data
            if elem.tag not in ("node", "way"):
                continue

            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])

            # The element is fully processed; drop its children and
            # attributes so memory does not grow with the file size.
            elem.clear()

    return street_types


def update_name(name, mapping, st_type):
    """Return *name* with its trailing street type expanded, when possible.

    Args:
        name: the street name to fix.
        mapping: dict of abbreviated street type -> replacement text.
        st_type: the street type previously extracted from *name*.

    Returns:
        *name* unchanged when *st_type* has no entry in *mapping*; otherwise
        *name* with the text matched by ``street_type_re`` replaced by
        ``mapping[st_type]``.
    """
    if st_type not in mapping:
        return name

    replacement = mapping[st_type]
    return street_type_re.sub(replacement, name)

def audit_fix():
    """Audit the street names in ``OSMFILE`` and print the proposed fixes.

    First pretty-prints every unexpected street type together with the
    street names that use it, then prints one ``old => new`` line for each
    name that ``update_name`` is able to change via ``mapping``.
    """
    st_types = audit(OSMFILE)
    pprint.pprint(dict(st_types))

    # For each possible problematic street name fix it if we can.
    # ``items()`` and a single pre-formatted string given to ``print`` behave
    # the same on Python 2 and Python 3, unlike ``iteritems()`` and the
    # ``print`` statement.
    for st_type, ways in st_types.items():
        for name in ways:
            better_name = update_name(name, mapping, st_type)
            if name != better_name:
                print("%s => %s" % (name, better_name))


# Script entry point: run the audit only when this module is executed
# directly rather than imported.
if __name__ == "__main__":
    audit_fix()


{'1': set(['Gali No. 1',
           'Greater Kailash 1',
           'Udyog Vihar Phase -1',
           'south city 1']),
 '10': set(['Sector 10', 'h/no 1/55 sadar bazar delhi cantt 10']),
 '100': set(['Suite 100']),
 '11': set(['Sector - 11', 'sector 11']),
 '11,': set(['Sector - 11,']),
 '110/105': set(['110/105']),
 '12': set(['Gali No 12',
            'Sector - 12',
            'Sector 12',
            'Sector 12 Block V Road, Sector 12']),
 '120': set(['sector 120']),
 '126': set(['Sector NO. 22, nr A- 126']),
 '13': set(['Rohini sector 13', 'Sector 13']),
 '17C': set(['Sector 17C']),
 '18': set(['Captain Vijyant Thapar Marg, N Block, Pocket C, Sector 18']),
 '19': set(['Sector 19']),
 '2': set(['DLF Phase 2', 'South City 2', 'Sushant Lok 2']),
 '201304': set(['Bhangel, Salarpur Noida - 201304']),
 '22': set(['Marg 22']),
 '225': set(['Avtaar Enclave, Paschim Vihar, Opp Metro Pillar No 225']),
 '24': set(['Sector 24']),
 '25': set(['Building 5, DLF City II, Sector 25', 'DLF City II