In [1]:
import xml.etree.cElementTree as ET
import pprint
from collections import defaultdict
import re

rawOSMfile = "Tulsa map.osm"

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
number_in_string = re.compile('\d+')
street_type_in_string = re.compile(' Avenue| Ave| Lane| Ln| Road| Rd| Boulevard| Blvd| Drive| Dr| Street| St| Highway| Hwy', re.IGNORECASE)


expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]

#function to audit street types
def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)

def is_street_name(elem):
    if elem.attrib['k'] == "addr:street":
        return True
    elif elem.attrib['k'] == "name":
        #check if string contains a number and a street type
        if re.search(street_type_in_string, elem.attrib['v']):
            if re.search(number_in_string, elem.attrib['v']):
                return True
    else:
        return False
           


def audit(osmfile):
    osm_file = open(rawOSMfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types

#run the audit street type function to see results of the Tulsa openstreetmap dataset
st_types = audit(rawOSMfile)
pprint.pprint(dict(st_types))

{'1': set(['Catoosa Fire Station #1',
           'Denver Avenue Station Bay 1',
           'Memorial Midtown Station Bay 1',
           'Rolling Hills Fire Protection District Station #1']),
 '10': set(['Denver Avenue Station Bay 10',
            'Memorial Midtown Station Bay 10']),
 '1045': set(['1045']),
 '11': set(['Denver Avenue Station Bay 11',
            'Memorial Midtown Station Bay 11']),
 '12': set(['Memorial Midtown Station Bay 12']),
 '15': set(['Tulsa Fire Station Number 15']),
 '167': set(['Oklahoma State Highway 167']),
 '2': set(['Catoosa Fire Station #2',
           'Denver Avenue Station Bay 2',
           'Memorial Midtown Station Bay 2',
           'Rolling Hills Fire Protection District Fire Station #2']),
 '2502': set(['2502']),
 '3': set(['Denver Avenue Station Bay 3', 'Memorial Midtown Station Bay 3']),
 '4': set(['Denver Avenue Station Bay 4', 'Memorial Midtown Station Bay 4']),
 '5': set(['Broken Arrow Fire Station #5',
           'Denver Avenue Station Bay 5'

In [2]:
#function to update street name
mapping = { "St": "Street",
            "St.": "Street",
            "Ave": "Avenue",
            "Rd.": "Road",
            "Rd": "Road",
            "Dr.": "Drive",
            "Avenu": "Avenue",
            "West Florence": "West Florence Street",
            'independence': "Independence Street",
            'sheridan': "Sheridan Street",
            'Main': 'Main Street'
            }
def update_name(name, mapping):
    for key,value in mapping.iteritems():
        if key in name:
            name = re.sub(key+"$", value, name)
    return name

#test the update street name function against Tulsa openstreetmap dataset
for st_type, ways in st_types.iteritems():
    for name in ways:
        better_name = update_name(name, mapping)
        print name, "=>", better_name

East Main => East Main Street
S Cherokee => S Cherokee
South Cherokee => South Cherokee
South 73rd Avenue West => South 73rd Avenue West
South 40th Avenue West => South 40th Avenue West
South 51st Avenue West => South 51st Avenue West
South 49th Avenue West => South 49th Avenue West
Pebble Brook => Pebble Brook
E BA Frontage Rd => E BA Frontage Road
S. Sheridan Rd => S. Sheridan Road
East 48th street => East 48th street
North Old Highway 66 => North Old Highway 66
South Memorial Dr. => South Memorial Drive
North Hemlock Circle => North Hemlock Circle
South 108th Avenue East => South 108th Avenue East
South 71st Avenue East => South 71st Avenue East
South Toledo Avenue East => South Toledo Avenue East
South 106th Avenue East => South 106th Avenue East
South 92nd Avenue East => South 92nd Avenue East
South 94th Avenue East => South 94th Avenue East
South 83rd Avenue East => South 83rd Avenue East
South 101st Avenue East => South 101st Avenue East
288th Avenue East => 288th Avenue East
So

In [3]:
#function to check if postcode is a valid Tulsa postcode
def checkTulsazip(postcode):
    simplepostcode = postcode.split("-")[0]
    if len(simplepostcode) == 5 and int(simplepostcode[:2]) == 74:
        return True
    else:
        return False

In [9]:
import codecs
import json

#shaping and write data to json file

lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

CREATED = [ "version", "changeset", "timestamp", "user", "uid"]


def shape_element(element):
    node = {}
    created = {}
    address = {}
    if element.tag == "node" or element.tag == "way" :
        for key, value in element.attrib.iteritems():
            if key in CREATED:
                created[key] = value
            else:
                node[key] = value
        node["created"] = created
        node['type'] = element.tag
        if 'lat' and 'lon' in node:
            node['pos'] = [ float(node['lat']), float(node['lon']) ]
            #remove lat and lon keys in node since they are now in pos
            map(node.pop, ['lat','lon'])
        else:
            node['pos'] = None
        
        #get the k value for address
        for tag in element.iter('tag'):
            #ignore tag k with problem characters
            if re.search(problemchars, tag.attrib['k']):
                continue
            #add tag tag "k" value starts with "addr:" to a dictionary "address"
            elif re.search("addr:", tag.attrib['k']):
                #ignore the second ":" that separates the type/direction of a street
                if len(tag.attrib['k'].split(":")) < 3:
                    addresstag = tag.attrib['k'].split(":")[1]
                    address[addresstag] = tag.attrib['v']
            elif re.search("name:", tag.attrib['k']):
                if is_street_name(tag):
                    address["street"] = tag.attrib['v']
            else:
                address[tag.attrib['k']] = tag.attrib['v']
        #if there is an address, correct the street name and add address dictionery to the node dictionery
        if address:
            #update street name to get rid of typo, abbreviation
            if "street" in address:
                address["street"] = update_name(address["street"], mapping)
            #if postcode is not valid, delete it from address
            if "postcode" in address:
                if checkTulsazip(address['postcode']) == False:
                    address.pop('postcode')
            node['address'] = address
        
        
        
        #get the nd value
        nodereference = []
        for tag in element.iter('nd'):
            nodereference.append(tag.attrib['ref'])
        
        if nodereference:
            node['node_refs'] = nodereference
        return node
    else:
        return None


def process_map(file_in, pretty = False):
    file_out = "Cleaned {0}.json".format(file_in)
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")


process_map(rawOSMfile, False)
