# This notebook is used to create a samller sample data and audit the data

In [1]:
# python 2 environment
# -*- coding: utf-8 -*-

import xml.etree.cElementTree as ET
from pprint import pprint as pp
import re
from collections import defaultdict

OSM_FILE = "houston_texas.osm" 
SAMPLE_FILE = "sample.osm"


In [3]:
# Create Sample Data
k = 80 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')


In [4]:
#Check the attributes of nodes and ways
#print them out if they do not following the following format
nom=re.compile(r'^([a-z]|_)+:?([a-z]|_)+$')

def check_attributes(f):
    
    node_attributes = defaultdict(int)
    way_attributes = defaultdict(int)  
    for event, elem in ET.iterparse(f, events=("start",)):
        if elem.tag == "node":
            for tag in elem.iter("tag"): #find children tags
                m = nom.search(tag.attrib['k'].lower())
                if not m:      
                    if tag.get('k') not in node_attributes:
                        node_attributes[tag.get('k')]=1
                    else:   
                        
                        node_attributes[tag.get('k')]+=1
                
        if elem.tag == "way":
            for tag in elem.iter("tag"): #find children tags
                m = nom.search(tag.attrib['k'].lower())
                if not m:   
                    if tag.get('k') not in node_attributes:
                        way_attributes[tag.get('k')]=1
                    else:   
                        way_attributes[tag.get('k')]+=1

    return node_attributes,way_attributes
        
node_attributes,way_attributes = check_attributes(SAMPLE_FILE)


In [5]:
# print out the weird attributes of nodes
l=sorted( ((v,k) for k,v in node_attributes.iteritems()), reverse=True)
for v,k in l:
    print k,v

socket:type1 4
voltage-high 2
service:bicycle:rental 2
service:bicycle:chain_tool 1
name_1 1


In [6]:
# print out the weird attributes of ways
l=sorted( ((v,k) for k,v in way_attributes.iteritems()), reverse=True)
for v,k in l:
    print k,v

name_1 1508
turn:lanes:forward 1
turn:lanes:both_ways 1
turn:lanes:backward 1
tiger:zip_right_4 1
tiger:zip_right_3 1
tiger:zip_right_2 1
tiger:zip_right_1 1
tiger:zip_left_4 1
tiger:zip_left_3 1
tiger:zip_left_2 1
tiger:zip_left_1 1
tiger:name_type_4 1
tiger:name_type_3 1
tiger:name_type_2 1
tiger:name_type_1 1
tiger:name_direction_suffix_3 1
tiger:name_direction_suffix_2 1
tiger:name_direction_suffix_1 1
tiger:name_direction_prefix_3 1
tiger:name_direction_prefix_2 1
tiger:name_direction_prefix_1 1
tiger:name_base_4 1
tiger:name_base_3 1
tiger:name_base_2 1
tiger:name_base_1 1
roof:slope:direction 1
plant:output:electricity 1
parking:lane:right 1
old_ref:pre_1939 1
old_name_1 1
note:toll:hov 1
name_3 1
name_2 1
name:2 1
name:1 1
multi-story 1
mtb:scale:uphill 1
mtb:scale:imba 1
leisure_1 1
layer1 1
is_in:iso_3166_2 1
gnis:feature_id:old 1
gnis:created:old 1
generator:output:electricity 1
addr:street_1 1
access:lanes:both_ways 1


In [24]:
# check if there are nodes/ways calling for fixes.
def checkfixme(file):
    for _, element in ET.iterparse(file):
        if element.tag == "tag":
            if 'fixme' in element.get('k', None) or 'FIXME' in element.get('k', None) :
                print element.attrib
        
checkfixme(SAMPLE_FILE)

{'k': 'FIXME', 'v': 'This is an empty field hereabouts'}
{'k': 'FIXME', 'v': 'This school does not exist at this location.  This location is a block of row houses.'}
{'k': 'FIXME', 'v': 'Needs exact location; this is at least closer'}
{'k': 'FIXME', 'v': 'confirm location'}
{'k': 'FIXME', 'v': 'verify'}
{'k': 'fixme', 'v': '33'}
{'k': 'FIXME', 'v': 'Does Alief Clodine Rd change to Harwin Dr here?'}
{'k': 'FIXME', 'v': 'continue'}
{'k': 'fixme', 'v': 'Su-Th 11-22, Fr-Sa 11:00-23:00'}
{'k': 'fixme', 'v': 'Double check this when construction is done'}
{'k': 'FIXME', 'v': 'remove noexit when extending'}
{'k': 'FIXME', 'v': 'continue path'}
{'k': 'FIXME', 'v': 'Location approximated'}
{'k': 'fixme', 'v': 'some structure there, guessing at dam from imagery'}
{'k': 'FIXME', 'v': 'Location approximated'}
{'k': 'fixme', 'v': 'confirm hours, may be open until 22:00'}
{'k': 'FIXME', 'v': 'Add a gate or gap as appropriate'}
{'k': 'FIXME', 'v': 'positions hidden by trees'}
{'k': 'fixme', 'v': 'cont

In [25]:
# checck if the nodes are within the Greater Houston Area
def checklocation():
    lat=set()
    lon=set()
    for _, element in ET.iterparse('sample.osm'):
        if element.tag == "node":
            lat.add(element.get('lat', None) )
            lon.add(element.get('lon', None) )
    return lat,lon
lat,lon=checklocation()

print min(lat),min(lon)
print max(lat),max(lon)

28.8560051 -94.378062
30.2609976 -96.063993


In [26]:
# check zipcode, left and right
def checkzipcode():
    zip=set()
    for _, element in ET.iterparse('sample.osm'):
        if element.tag == "tag":
            if element.get('k', None)=='tiger:zip_left':
                n=element.get('v', None)
                zip.add(n)           
            if element.get('k', None)=='tiger:zip_right':
                n=element.get('v', None)
                zip.add(n) 
            if element.get('k', None)=='addr:postcode':
                n=element.get('v', None)
                zip.add(n)                 
    problemzip=[]
    for z in zip:
        try:
            z=int(z)
            # if the zipcode is not Houston area, add it into the list
            if z < 77000 or z > 77999:
                problemzip.append(z)
        except:
            problemzip.append(z)
                
    return problemzip

zip=checkzipcode()
pp(zip)

['77002; 77004',
 '77024; 77024; 77036',
 '77338:77346',
 '77045;77489',
 '77489; 77085',
 '77099:77478',
 '77077:77084',
 '77004:77023',
 '77031:77074',
 '77037;77076',
 '77088:77091',
 '77031:77035',
 '77022;77076;77009',
 '77024; 77063',
 '77009:77022',
 '77571; 77587',
 '77026:77050',
 '77016:77026',
 '77091;77076',
 '77373:77388',
 '77429; 77095',
 '77002;77004']


In [27]:
# check the keys of Tiger GPS data
def checktiger(file):
    tigers=defaultdict(int)
    for _, element in ET.iterparse(file):
        if element.tag == "tag":
            if 'tiger' in element.get('k'):
                tigers[element.get('k')]+=1
    return tigers

tigers=checktiger(SAMPLE_FILE)

l=sorted( ((v,k) for k,v in tigers.iteritems()), reverse=True)
for v,k in l:
    print k,v

tiger:county 16441
tiger:cfcc 16335
tiger:reviewed 13565
tiger:name_base 10702
tiger:name_type 9534
tiger:source 7267
tiger:tlid 7211
tiger:zip_left 7185
tiger:zip_right 6733
tiger:separated 6571
tiger:upload_uuid 3321
tiger:name_base_1 1526
tiger:name_direction_prefix 852
tiger:name_type_1 732
tiger:zip_left_1 324
tiger:name_base_2 212
tiger:name_direction_prefix_1 128
tiger:name_direction_suffix 126
tiger:zip_right_1 125
tiger:name_type_2 125
tiger:name_direction_suffix_1 88
tiger:state_id 75
tiger:zip_left_2 67
tiger:mtfcc 59
tiger:name_base_3 30
tiger:rttyp 28
tiger:zip_right_2 27
tiger:zip_left_3 21
tiger:name_direction_suffix_2 20
tiger:STATEFP 16
tiger:PLCIDFP 16
tiger:PLACENS 16
tiger:PLACEFP 16
tiger:PCINECTA 16
tiger:PCICBSA 16
tiger:NAMELSAD 16
tiger:NAME 16
tiger:MTFCC 16
tiger:LSAD 16
tiger:FUNCSTAT 16
tiger:CPI 16
tiger:CLASSFP 16
tiger:zip_right_3 13
tiger:name_direction_prefix_2 12
tiger:name_type_3 10
tiger:zip_left_4 7
tiger:zip 3
tiger:name_base_4 3
tiger:zip_right_4

In [28]:
# check if there will be tags of add:street and tiger:name_base under the same way element
def checkaddress(osmfile):
    osm_file = open(osmfile, "r")
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            n=0
            for tag in elem.iter("tag"): #find children tags
                if tag.get('k') =='addr:street' :
                    n+=1
                if tag.get('k') == 'tiger:name_base':
                    n+=1
            if n==2:
                print elem
        
    osm_file.close()

checkaddress(SAMPLE_FILE)

In [36]:
#Audit Street Names

street_type_re = re.compile(r'\S+\.?$', re.IGNORECASE)
street_type_prefix = re.compile(r'^([NSWE])\s', re.IGNORECASE)
street_type_suffix = re.compile(r'\s([NSWE])$', re.IGNORECASE)
street_type_number = re.compile(r'\s#?\d+[a-zA-Z]?$', re.IGNORECASE)


expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons","Freeway","Highway",'Circle','Speedway','Way','Loop']

mapping = { "Cir": "Circle","Ct": "Court",'Dr':'Drive','Ln':'Lane','Fwy':'Freeway',
           'Pkwy':"Parkway",'Pky':"Parkway","Ave": "Avenue", 'Blvd':'Boulevard','Hwy':'Highway',
           'Rd':'Road','St':'Street','Pl':'Place','Trl':"Trail",'Blvd.':'Boulevard',
           'Byp':'Bypass'}

def update_name(name, mapping):
    subwords=name.split()
    lastword=subwords[-1]
    if lastword in mapping:
        subwords[-1]=mapping[lastword]
        name=' '.join(subwords)
    return name

def audit_street_type(street_types, street_name):
    #if there is a suffix, leave it out
    if street_type_suffix.search(street_name): 
        street_name=(' ').join(street_name.split()[:-1])
    #after moving the suffix, if there is a number, leave it out
    if street_type_number.search(street_name): 
        street_name=(' ').join(street_name.split()[:-1])
    #print out the street name if its updated last word is not in the mapping
    if street_type_re.search(street_name):
        street_name = update_name(street_name,mapping)
        street_type = street_type_re.search(street_name).group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)  #defaultdict
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            iftiger=False
            name_base,name_type,name_direction_prefix,name_direction_suffix=None,None,None,None
            for tag in elem.iter("tag"): #find children tags
                if tag.get('k') == 'addr:street':
                    audit_street_type(street_types, tag.attrib['v'])
                #complie the Tiger GPS name
                if tag.get('k') == 'tiger:name_base':
                    iftiger=True
                    name_base=re.split('[:;]',tag.get('v'))[0] 
                if tag.get('k') == 'tiger:name_type':
                    name_type=re.split('[:;]',tag.get('v'))[0] 
                if tag.get('k') == 'tiger:name_direction_prefix':
                    name_direction_prefix=re.split('[:;]',tag.get('v'))[0] 
                if tag.get('k') == 'tiger:name_direction_suffix':
                    name_direction_suffix=re.split('[:;]',tag.get('v'))[0] 
            if iftiger: 
                streetlist=[name_direction_prefix,name_base,name_type,name_direction_suffix]
                street=' '.join(filter(None,streetlist)) 
                audit_street_type(street_types,street)
    osm_file.close()
    return street_types



if __name__ == '__main__':
    st_types = audit(SAMPLE_FILE)
    pp(dict(st_types))


{'#N': set(['Kingwood Drive #N']),
 '(Alt)': set(['United States Highway 90 (Alt)']),
 '(Bus)': set(['State Highway 288B (Bus)', 'United States Highway 59 (Bus)']),
 '(Ssr)': set(['South Service Rd (Ssr)']),
 '1/2': set(['Avenue G 1/2',
             'Avenue M 1/2',
             'Avenue N 1/2',
             'Avenue O 1/2',
             'Avenue P 1/2',
             'Avenue Q 1/2',
             'Avenue T 1/2']),
 '13384': set(['13384']),
 '191C': set(['191C']),
 '2004': set(['2004']),
 '206': set(['206']),
 '228': set(['228']),
 '24A': set(['24A']),
 '300': set(['300']),
 '348': set(['348']),
 '456': set(['456']),
 '545': set(['545']),
 '566': set(['566']),
 '610A': set(['610A']),
 '868E': set(['868E']),
 'A': set(['Avenue A', 'N Avenue A']),
 'Acres': set(['Acres']),
 'Afloat': set(['Afloat']),
 'Agee': set(['Agee']),
 'Airline': set(['Airline', 'Stuebner Airline']),
 'Allyne': set(['Allyne']),
 'Aly': set(['Hogans Aly']),
 'Arcadia': set(['Arcadia']),
 'Ashe': set(['Ashe']),
 'Atascader