In [40]:
import xml.etree.cElementTree as ET
from collections import defaultdict 
import re 
import pprint
import sample_file as sf #sample_file.py in ./osm_src
import data as d

sf.get_element('sacramento.osm') #write a sample
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons", 'Terrace', 'Ingoglia','Broadway','Circle','Way']
mapping = { "St"  : "Street",
            "St." : "Street",
            'ST'  : 'Street',
            "Ave" : "Avenue",
            "Ave.": "Avenue",
            "Rd." : "Road",
            'Rd'  : 'Road',
            'Ct'  : 'Court',
            'Ct.' : 'Court', 
            'Blvd': 'Boulevard', 
            'Blvd.' : 'Boulevard',
            'Dr'  : 'Drive', 
            'Dr.' : 'Drive',
            'PlaceZ': 'Plaza',
            'Ln.' : 'Lane',
            'PKWY': 'Parkway'}




In [2]:
element = d.get_element('sample.osm', tags=('node','way')) # get element. 

In [18]:
def is_post_code(elem): 
    return (elem.attrib['k'] == 'addr:postcode')

def audit_inconsistent_zip(osmfile):
    inconsistent_zip = []
    osm_file = open(osmfile, 'r')
    for event, elem in ET.iterparse(osm_file,events=('start',)): 
        
        if elem.tag == 'node' or elem.tag == 'way': 
            for tag in elem.iter('tag'): 
                if is_post_code(tag) and len(tag.attrib['v']) != 5: 
                    inconsistent_zip.append(tag.attrib['v'])
    osm_file.close()
    return inconsistent_zip
                    

In [27]:
audit_inconsistent_zip('sacramento.osm')

['95832-1447',
 'CA 95826',
 'CA 95819',
 'CA 95834',
 'CA 95834',
 '2557',
 '95819-6055',
 '95819-6055',
 '95819-6055',
 'CA 95832',
 'CA 95822',
 '95832-1447',
 'CA 95626',
 '95826-2625',
 '95819-6024',
 '95819-6138']

In [30]:
def update_zip(zip_code,mapping):
    m = re.search(r'95', tag, re.M|re.I)
    if m.group in mapping.keys():
        zip_code = re.sub(m.group(),mapping[m.group()], zip_code)
    return zip_code
    

<_sre.SRE_Match object at 0x1039157e8>


In [83]:
zip_type_re = re.compile("(^\w*\s.*\d$)")
zip_mapping = {'95832-1447' : '95832',
               'CA 95826'   : '95826',
               'CA 95819'   : '95819',
               'CA 95834'   : '95834',
               '95819-6055' : '95819',
               'CA 95832'   : '95832',
               'CA 95822'   : '95822',
               '95832-1447' : '95832',
               'CA 95626'   : '95626',
               '95826-2625' : '95826',
               '95819-6024' : '95819',
               '95819-6138' : '95819'}

expected = ['95816']
def audit_inconsistent_zip(osmfile): #collects inconsist zip_codes in a list. 
    inconsistent_zip = []
    osm_file = open(osmfile, 'r')
    for event, elem in ET.iterparse(osm_file,events=('start',)): 
        
        if elem.tag == 'node' or elem.tag == 'way': 
            for tag in elem.iter('tag'): 
                if is_post_code(tag) and len(tag.attrib['v']) != 5: 
                    inconsistent_zip.append(tag.attrib['v'])
    osm_file.close()
    return inconsistent_zip

def update_zip(name,mapping): 
    z = zip_type_re.search(name)
    if z not in expected:
        if z.group() in zip_mapping.keys(): #create a map that takes anything with '95' and replaces it.
            name = re.sub(z.group(),zip_mapping[z.group()], name)
    return name

for zip_codes in audit_inconsistent_zip('sacramento.osm'): 
        better_zip = update_zip(zip_codes,zip_mapping)
        print zip_codes, '=>', better_zip


AttributeError: 'NoneType' object has no attribute 'group'

In [75]:
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons"]



def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")

def is_post_code(elem): 
    return (elem.attrib['k'] == 'addr:postcode')

def audit_inconsistent_zip(osmfile):
    inconsistent_zip = []
    osm_file = open(osmfile, 'r')
    for event, elem in ET.iterparse(osm_file,events=('start',)): 
        
        if elem.tag == 'node' or elem.tag == 'way': 
            for tag in elem.iter('tag'): 
                if is_post_code(tag) and len(tag.attrib['v']) != 5: 
                    inconsistent_zip.append(tag.attrib['v'])
    osm_file.close()
    return inconsistent_zip

def check_zip(osmfile): 
    osm_file = open(osmfile,'r') #open file
    zip_sac = [] #list of all zip codes in Sacramento County 
    for event, elem in ET.iterparse(osm_file,events=('start',)): 
        
        if elem.tag == 'node' or elem.tag == 'way': 
            for tag in elem.iter('tag'): 
                if is_post_code(tag): # use the element attrib for 'addr:postcode'
                    zip_sac.append(tag.attrib['v']) # append them all
    osm_file.close()
    return zip_sac

def audit_incorrect_zip(osmfile):
    incorrect_zip = set() #for unique inconsistent zips we use a set.
    for zip_codes in check_zip(osmfile): 
        m = re.search(r'95', zip_codes, re.M|re.I) #checks for '95' in zip codes
        if m: 
            pass #passing the correct ones 
        else: 
            incorrect_zip.add(zip_codes)
    return incorrect_zip
    
def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)
            
def audit_check_street(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types
# audit_zip inconsistent: 
# ['95832-1447','CA 95826','CA 95819','CA 95834','CA 95834','2557','95819-6055','95819-6055','95819-6055','CA 95832',
# 'CA 95822','95832-1447','CA 95626','95826-2625','95819-6024','95819-6138']


#audit_incorrect_zip: 
#{'2557', '85834', '96816', '98584'}

In [49]:
regex = re.compile(r"(\w+) World")
result = regex.search("Hello World is the easiest")
if result:
    # This will print:
    #   0 11
    # for the start and end of the match
    print result.start(), result.end()

0 11


In [88]:
zip_type_re = re.compile("r(^\w*\s.*\d$)")
zip_mapping = {'95832-1447' : '95832',
               'CA 95826'   : '95826',
               'CA 95819'   : '95819',
               'CA 95834'   : '95834',
               '95819-6055' : '95819',
               'CA 95832'   : '95832',
               'CA 95822'   : '95822',
               '95832-1447' : '95832',
               'CA 95626'   : '95626',
               '95826-2625' : '95826',
               '95819-6024' : '95819',
               '95819-6138' : '95819'}

z = zip_type_re.search('CA 95826')
if z: 
    if (z.group() in zip_mapping) == True: 
        name = re.sub(z.group(),zip_mapping[z.group()], 'hello')
        print name
    else: 
        print False

hello


In [111]:
#UPDATE THIS DICT
expected = ['95816'] #place holder 
zip_type_re = re.compile("(^\d*$)")
zip_mapping = {'95832-1447' : '95832',
               'CA 95826'   : '95826',
               'CA 95819'   : '95819',
               'CA 95834'   : '95834',
               '95819-6055' : '95819',
               'CA 95832'   : '95832',
               'CA 95822'   : '95822',
               '95832-1447' : '95832',
               'CA 95626'   : '95626',
               '95826-2625' : '95826',
               '95819-6024' : '95819',
               '95819-6138' : '95819'}

def audit_zip_type(zip_types, zip_name):
    m = zip_type_re.search(zip_name)
    if m:
        zip_type = m.group()
        if zip_type not in expected:
            zip_types[zip_type].add(zip_name)

def audit_inconsistent_zip(osmfile): #collects inconsist zip_codes in a list. 
    osm_file = open(osmfile, 'r')
    zip_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file,events=('start',)): 
        
        if elem.tag == 'node' or elem.tag == 'way': 
            for tag in elem.iter('tag'): 
                if is_post_code(tag): 
                    audit_zip_type(zip_types, tag.attrib['v'])
    osm_file.close()
    return zip_types


def is_post_code(elem): 
    return (elem.attrib['k'] == 'addr:postcode')

def update_zip(name,mapping): 
    z = zip_type_re.search(name)
    if z not in expected:
        if z.group() in zip_mapping.keys(): #create a map that takes anything with '95' and replaces it.
            name = re.sub(z.group(), zip_mapping[z.group(), name])
    return name


{'2557': set(['2557']),
 '85834': set(['85834']),
 '95605': set(['95605']),
 '95608': set(['95608']),
 '95624': set(['95624']),
 '95652': set(['95652']),
 '95660': set(['95660']),
 '95673': set(['95673']),
 '95691': set(['95691']),
 '95757': set(['95757']),
 '95758': set(['95758']),
 '95811': set(['95811']),
 '95814': set(['95814']),
 '95815': set(['95815']),
 '95817': set(['95817']),
 '95818': set(['95818']),
 '95819': set(['95819']),
 '95820': set(['95820']),
 '95821': set(['95821']),
 '95822': set(['95822']),
 '95823': set(['95823']),
 '95824': set(['95824']),
 '95825': set(['95825']),
 '95826': set(['95826']),
 '95827': set(['95827']),
 '95828': set(['95828']),
 '95829': set(['95829']),
 '95831': set(['95831']),
 '95832': set(['95832']),
 '95833': set(['95833']),
 '95834': set(['95834']),
 '95835': set(['95835']),
 '95838': set(['95838']),
 '95841': set(['95841']),
 '95842': set(['95842']),
 '95864': set(['95864']),
 '96816': set(['96816']),
 '98584': set(['98584'])}
95838 => 95838

In [113]:
'CA 95819'

'CA 95819'

In [117]:
'CA' in 'CA 95819' 

True

In [118]:
'-' in '95832-1447'

True

In [119]:
for i in zip_mapping: 
    print i

95819-6024
95819-6138
CA 95826
CA 95834
CA 95819
CA 95832
CA 95822
95819-6055
95832-1447
CA 95626
95826-2625


In [131]:
node_tags= {}
for i in zip_mapping: 
    if 'CA' in i: 
        split = i.split(' ')
        node_tags['value'] = split[1]
    elif '-' in i: 
        split = i.split('-')
        node_tags['value'] = split[0]
    print node_tags

{'value': '95819'}
{'value': '95819'}
{'value': '95826'}
{'value': '95834'}
{'value': '95819'}
{'value': '95832'}
{'value': '95822'}
{'value': '95819'}
{'value': '95832'}
{'value': '95626'}
{'value': '95826'}


In [161]:
def audit_inconsistent_zip(osmfile):
    inconsistent_zip = []
    nodes_tags = {}
    osm_file = open(osmfile, 'r')
    for event, elem in ET.iterparse(osm_file,events=('start',)): 
        
        if elem.tag == 'node' or elem.tag == 'way': 
            for tag in elem.iter('tag'): 
                if is_post_code(tag):
                    if 'CA' in tag.attrib['v']: 
                        split = tag.attrib['v'].split(' ')
                        nodes_tags['value'] = split[1]
    osm_file.close()
    return nodes_tags

In [162]:
for i in audit_inconsistent_zip('sacramento.osm'):
    print i

value
