In [1]:
## Import all necessary libraries for auditing data, correcting data and mapping into json format for MongoDB upload

import xml.etree.cElementTree as ET
from collections import defaultdict
import pprint
import re
import codecs
import json

In [2]:
## Create Global variables and list needed for character checking and for json format  


lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
double_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


CREATED = [ "version", "changeset", "timestamp", "user", "uid"]

In [3]:
## Step 1 in auditing counting the amount of different tags in .osm file from openstreetmap

def count_tags(filename):
        tags = {}
        for event, element in ET.iterparse(filename):
            if element.tag in tags:
                tags[element.tag] += 1
            else:
                tags.update({element.tag:0})
                tags[element.tag] += 1            

        return tags

In [4]:
count_tags("dallas_test.osm")

{'bounds': 1,
 'member': 3946,
 'meta': 1,
 'nd': 80622,
 'node': 69640,
 'note': 1,
 'osm': 1,
 'relation': 101,
 'tag': 36719,
 'way': 7902}

In [5]:
## Step 2 in auditing counting the amount of subordinates of tags in .osm file from openstreetmap

def count_subs(filename):
        subs = {}
        for event, element in ET.iterparse(filename):
            for attrName, attrValue in element.attrib.items():
                if attrName in subs:
                    subs[attrName] += 1
                else:
                    subs.update({attrName:0})
                    subs[attrName] += 1
                
        return subs

In [6]:
count_subs("dallas_test.osm")

{'changeset': 77643,
 'generator': 1,
 'id': 77643,
 'k': 36719,
 'lat': 69640,
 'lon': 69640,
 'maxlat': 1,
 'maxlon': 1,
 'minlat': 1,
 'minlon': 1,
 'osm_base': 1,
 'ref': 84568,
 'role': 3946,
 'timestamp': 77643,
 'type': 3946,
 'uid': 77643,
 'user': 77643,
 'v': 36719,
 'version': 77644}

In [7]:
## Step 3 audit iterate through street names and return list and count of endings

osm_file =open('dallas_test.osm', 'r')

street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
street_types = defaultdict(int)


def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        street_types[street_type] +=1

def print_sorted_dict(d):
    keys = d.keys()
    keys = sorted(keys, key = lambda s: s.lower())
    for k in keys:
        v = d[k]
        print '%s: %d' % (k , v)

def is_street_name(elem):
    return (elem.tag == 'tag') and (elem.attrib['k'] == 'addr:street')

def audit():
    for event, elem in ET.iterparse(osm_file):
        if is_street_name(elem):
            audit_street_type(street_types, elem.attrib['v'])
    print_sorted_dict(street_types)  


In [8]:
audit()

200: 1
300: 1
500E: 1
75240: 2
Avenue: 1
Boulevard: 1
Dr.: 2
Drive: 1
Expressway: 5
Highway: 3
Lane: 12
Rd: 1
Rd.: 2
Road: 21
Street: 1
Way: 1


In [9]:

## From audit above we can see issues with Dr./Drive and Rd/Rd./Road as well as numbers included in street Names
## which is corrected via the following code.
## Following first run of this code further issues were identified. Street names including "Dallas" / "tx" as well as starting 
## with numbers, all which have been included in the code for cleaning.

mapping = { "Dr.": "Drive",'Rd.': 'Road','Rd': 'Road', '#500E': '', 'dallas': '', 'tx': '', 'W': 'West'}

            
def update_name(file, mapping):
    for event, elem in ET.iterparse(file):
        if (elem.tag == 'tag') and (elem.attrib['k'] == 'addr:street'):
            name = elem.attrib['v']
            n = street_type_re.search(name)
            if n:
                street_type = n.group()
                
                if street_type.isdigit():
                    m = street_type_re.split(name)
                    name = m[0]
                    
                
            for word in name.split():
                if word in mapping.keys():
                    name = name.replace(word,mapping[word])
                elif word.isdigit():
                    name = name.replace(word,'')
                    name = name.lstrip(' ')
                    name = name.capitalize()
                            
            return name



In [10]:
update_name('dallas_test.osm',mapping)

'Hillcrest Road'

In [11]:
## The following code provides a list of house numbers from which I can audit the data

def check_house(filename):
    for event, elem in ET.iterparse(filename):
        if elem.tag == 'tag':
            if elem.attrib['k'] == 'addr:housenumber':
                print elem.attrib['v']



In [12]:
check_house('dallas_test.osm')

8500
5959
7171
972-788-2591
12720
11611
11551
10909
12720
4319
4315
4323
4327
10207
5365
6807
7804
13881
13601
13355
8687
13319
8611
5811
5803
1490 W Spring Valley Rd
12950
13021
7171
13155
13319
11920
10720
10455
3757
14280
11880
10600
6831
8604
8135
13770
10410 Stone Canyon Rd Dallas, TX 75230
10410
6315
12120
8343
6724
14021
11722
10056
6423
4900
11600
3333
8687


In [13]:
## From audit above we can see issues with telephone no. instead of house number and house number with address following
## In order to correct this the following code will iterate over the house number and update with "To be updated" if including
## "-" reflecting it is a telephone number and if lenght of data is >5 digits, remove everything after 5th digit. 


def update_house(file):
    for event, elem in ET.iterparse(file):
        if (elem.tag == 'tag') and (elem.attrib['k'] == 'addr:housenumber'):
            house_no = elem.attrib['v']
            if '-' in house_no:
                house_no = 'To be updated'
            
            elif len(house_no) > 5:
                house_no = house_no[:5]
                            
            return house_no


In [14]:
update_house('dallas_test.osm')

'8500'

In [15]:
## The following code provides a list of postcodes from which I can audit the data

def check_post(filename):
    postcode = []
    for event, elem in ET.iterparse(filename):
        if elem.tag == 'tag':
            if elem.attrib['k'] == 'addr:postcode':
                postcode.append(elem.attrib['v'])
    
    postcode = set(postcode)
    print postcode



In [16]:
check_post('dallas_test.osm')

set(['75243', 'TX 75229', '75240', '75001', '75231', '75244', '75229', '75230', 'TX 75230', '75225', '75234', '75214', '75080'])


In [17]:
## From audit above of postcodes we can see issues with a preceeding 'TX'
## In order to correct this the following code will iterate over the postcodes and remove 'TX' when found.


def update_postcode(file):
    for event, elem in ET.iterparse(file):
        if (elem.tag == 'tag') and (elem.attrib['k'] == 'addr:postcode'):
            post_co = elem.attrib['v']
            if 'TX' in post_co:
                post_co = post_co.replace('TX ','')
            
                                        
            return post_co


In [18]:
update_postcode('dallas_test.osm')

'75225'

# The remaining code blocks are some additional auditing and code used for converting openstreet map data to a .json format
# This .json file is then loaded into a MongoDB database/collection needed for queries and answers to questions from project rubric.
# The actual upload to MongoDB is done via Command prompt commands in the MongoDB file library.


In [19]:

def key_type(element, keys):
    if element.tag == "tag":
        for tag in element.iter("tag"):
            if lower.search(element.attrib['v']):
                keys['lower'] += 1
            elif lower_colon.search(element.attrib['v']):
                keys['lower_colon'] += 1
            elif problemchars.search(element.attrib['v']):
                keys['problemchars'] += 1
            else:
                keys['other'] += 1

    return keys


In [20]:
def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys


In [21]:
keys = process_map('dallas_test.osm')
pprint.pprint(keys)

{'lower': 15059, 'lower_colon': 4, 'other': 12797, 'problemchars': 8859}


In [22]:
def shape_element(element):
    node = {}
    created = {}
    pos = []
    pos_sort = []
    address = {}
    node_refs = []
    if element.tag == "node" or element.tag == "way" :
        for attrName, attrValue in element.attrib.items():
            
            
            if attrName in CREATED:
                created[attrName] = attrValue
            elif attrName == 'lon' or attrName == 'lat':
                pos.append(float(attrValue))
            else:
                node[attrName] = attrValue
            
            for tag in element.iter("tag"):
                if tag.attrib['k'][:5] == 'addr:':
                    address[tag.attrib['k'][5:]] = tag.attrib['v']
                elif problemchars.search(tag.attrib['v']):
                    continue
                elif double_colon.search(tag.attrib['k']):
                    continue
                else:
                    node[tag.attrib['k']] = tag.attrib['v']
                    
            node['type'] = element.tag
            
        for nd in element.iter('nd'):
                node_refs.append(nd.attrib['ref'])
            
        node['created'] = created
        
        if len(pos) > 0:
            pos.sort(reverse=True)
            node['pos'] = pos
            
        if len(address) > 0:
            node['address'] = address
        
        if len(node_refs) > 0:
            node['node_refs'] = node_refs
        
        
        return node
    else:
        return None
    
    

In [23]:

def process_map_mongo(file_in, pretty = False):
    # You do not need to change this file
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2)+"\n")
                else:
                    fo.write(json.dumps(el) + "\n")
    return data


In [24]:
data = process_map_mongo('dallas_texas.osm', False)