# Clean Data Example
This example shows how to use regular expressions to audit and clean data in an OSM file (an OpenStreetMap data extract) downloaded from [Mapzen](https://mapzen.com/data/metro-extracts/).

In [1]:
from zipfile import ZipFile
import xml.etree.ElementTree as ET
from collections import defaultdict
import re

# Zip archive holding the OSM extract; the relative path is resolved against
# the current working directory.
filename = 'southampton_england.osm.zip'

# Alternative (disabled): extract every member of the archive to the current
# working directory and point osm_file at the extracted path instead of a stream.
# def extract_zip(fname):
#     with ZipFile(fname, 'r') as myzip:
#         myzip.extractall()
# extract_zip(filename)
# osm_file = 'southampton_england.osm'

# Open the archive member as a file-like object so it can be parsed without
# extracting it to disk.
# NOTE(review): the member handle is read later by audit(), i.e. after this
# `with` block has already closed the ZipFile, and the handle itself is never
# closed -- confirm this lifetime is intended.
with ZipFile(filename, 'r') as zfile:
    unzippedfile = zfile.open('southampton_england.osm')

# Source that audit() passes to ET.iterparse.
osm_file = unzippedfile

# Matches the last run of non-whitespace characters in a string, which for a
# street name is its final word (e.g. "Road", "Rd.", "Street)").
# The pattern contains no letters, so re.IGNORECASE has no effect, and the
# optional trailing `\.?` is already covered by `\S+`.
street_type_re = re.compile(r'\S+\.?$', re.IGNORECASE)

# Running tally of street-type token -> number of occurrences.
# defaultdict(int) starts unseen keys at 0, so `+= 1` needs no existence check.
street_types = defaultdict(int)

def audit_street_type(street_types, street_name):
    """Tally the final token of ``street_name`` in ``street_types``.

    Args:
        street_types: Mutable mapping of token -> count; it is updated in
            place and must tolerate ``+= 1`` on a key it has not seen yet
            (e.g. ``defaultdict(int)``).
        street_name: The street name string to inspect.

    Returns:
        None. When ``street_type_re`` finds no match, the mapping is left
        untouched.
    """
    found = street_type_re.search(street_name)
    if found is None:
        return
    # The whole match is the trailing token, e.g. "Road" in "Hill Road".
    street_types[found.group()] += 1

def print_sorted_dict(d):
    """Print every ``key: value`` pair of ``d``, one per line.

    Keys are ordered case-insensitively (by their lower-cased form) and
    values are formatted as integers.

    Args:
        d: Mapping from string keys to integer counts.
    """
    # Sorting on the lower-cased key keeps "road" next to "Road" rather than
    # pushing all lower-case keys after the upper-case ones.
    for name in sorted(d, key=lambda text: text.lower()):
        print("%s: %d" % (name, d[name]))

def is_street_name(elem):
    """Return True if ``elem`` is a ``<tag>`` element keyed ``addr:street``.

    Args:
        elem: An ElementTree element (anything exposing ``tag`` and a
            dict-like ``attrib``).

    Returns:
        bool: False for non-``tag`` elements and for ``tag`` elements that
        have no ``k`` attribute.
    """
    # attrib.get() yields None for a missing attribute, so a <tag> without
    # "k" is reported as "not a street name" instead of raising KeyError.
    return elem.tag == "tag" and elem.attrib.get('k') == "addr:street"

def audit():
    """Count street types across the OSM data and print the tally.

    Streams ``osm_file`` through ``ET.iterparse``, passes the ``v`` attribute
    of every ``addr:street`` tag to ``audit_street_type`` (which updates the
    module-level ``street_types``), then prints the counts via
    ``print_sorted_dict``.

    NOTE(review): ``osm_file`` is a stream opened once at module level and
    ``street_types`` is shared state, so a second call presumably finds the
    stream exhausted and would add to the existing counts -- confirm before
    calling this more than once.
    """
    # No `events` argument is given, so only "end" events are reported and
    # each element is fully parsed by the time it is yielded.
    for _event, elem in ET.iterparse(osm_file):
        if is_street_name(elem):
            audit_street_type(street_types, elem.attrib['v'])
        # iterparse keeps building the tree as it goes; dropping each
        # element's contents once handled keeps memory use down on large
        # extracts without affecting the counts.
        elem.clear()
    print_sorted_dict(street_types)

# Run the audit only when this code is executed as the main program, not when
# it is imported as a module.
if __name__ == '__main__':
    audit()

387: 1
access: 2
Avenue: 2024
Bridge: 7
Broadway: 22
Buildings: 13
Centre: 4
Cloisters: 8
Close: 1548
Cottages: 4
Court: 143
Crescent: 741
Dell: 5
Drive: 162
Drove: 8
East: 97
Esplanade: 8
Estate: 23
Finches: 4
Firs: 8
Gardens: 982
Green: 57
Greenways: 22
Grove: 247
High-Rise: 1
Hill: 104
Holt: 8
House: 13
Lane: 860
Loop: 1
Mayflowers: 8
Meadow: 16
Mews: 101
Mount: 19
North: 109
Parade: 3
Park: 21
Place: 236
Polygon: 37
Precinct: 1
Quay: 15
Queensway: 3
Raod: 1
Rd: 3
re: 1
Redhill: 12
Rise: 6
Road: 13585
road: 3
S: 1
Saltmead: 75
South: 91
Square: 122
Street: 806
Street): 1
Terrace: 60
View: 15
Village: 3
Walk: 34
Way: 609
West: 105
Westal: 1
