In [1]:
# Path of the OSM XML file that the audit and conversion cells below read via `datafile`.
datafile = "../raleigh_north-carolina.osm"
# Alternative input, left disabled (the name suggests a sample extract -- TODO confirm).
#datafile = "../Submission/04-sample.osm"

In [2]:
import json
import xml.etree.cElementTree as ET
import re
from collections import defaultdict, Counter

# Audit Data for some fields

## Count Tags

In [3]:
def count_tags(filename):
    """Count how often each element tag occurs in an XML file.

    The document is streamed with ``ET.iterparse`` and every element is
    released as soon as it has been counted, so memory use does not grow
    with the size of the file.

    Args:
        filename: Path (or binary file object) accepted by ``ET.iterparse``.

    Returns:
        dict mapping each tag name to the number of elements with that tag.
    """
    tags = {}
    # iterparse emits only "end" events by default, so no event filter is needed.
    for _, elem in ET.iterparse(filename):
        tags[elem.tag] = tags.get(elem.tag, 0) + 1
        # Only the tag name is needed; drop attributes and children so the
        # partially built tree does not accumulate in memory.
        elem.clear()
    return tags

In [4]:
# Tally every element type in the input file.
tags = count_tags(datafile)
print(tags)
# Nodes plus ways; these lookups raise KeyError if the file has no 'node' or no 'way' element.
print('Total Nodes and Ways = {:,}'.format(tags['node']+tags['way']))

{'way': 211467, 'tag': 813843, 'node': 2524263, 'osm': 1, 'nd': 2784745, 'member': 7647, 'bounds': 1, 'relation': 732}
Total Nodes and Ways = 2,735,730


## Create functions to audit specific kinds of data

In [5]:
#Generator function to yield specific tags
def audit(osmfile, tagname):
    """Yield the value of every ``tagname`` tag attached to a node or way.

    Args:
        osmfile: Path of the OSM XML file to scan.
        tagname: Tag key to look for (the ``k`` attribute of a ``<tag>``).

    Yields:
        The ``v`` attribute of each matching ``<tag>``, in document order.
    """
    # Binary mode lets the XML parser apply the document's own encoding
    # instead of the platform's default text encoding.
    with open(osmfile, "rb") as osm_file:
        # Children are only guaranteed to be present once the element's "end"
        # event fires; on "start" the <tag> children may not be parsed yet.
        for _, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if tag.attrib['k'] == tagname:
                        yield tag.attrib['v']
                # The node/way is fully processed; release its children so the
                # tree does not grow with the file.
                elem.clear()

In [6]:
##############
#Street Names
##############

# Generator over every 'addr:street' value on nodes and ways; being a generator,
# it can be consumed only once.
imported_street_names = audit(datafile,'addr:street')
# Captures the trailing run of non-whitespace characters that starts at a word
# boundary (normally the last word of the name), allowing a final period.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

#Known Street Types: endings that audit_street_type accepts without recording them
expected_street_names = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons", "Loop", "Way", "Run", "Circle", "Hill", "Fork", "Plaza",
           "Point","Terrace", "Crescent", "Crossing"]

#Known Mappings: abbreviation -> full word, applied by correct_streetname to each
#space-separated word of a street name
mapping_street_names = {"St": "Street", "St.": "Street", "St,":"Street", "ST":"Street", 
            "Rd": "Road", "Rd.": "Road", "Ave":"Avenue", "Ave.":"Avenue", "Blvd":"Boulevard",
            "Blvd.":"Boulevard", "Pkwy":"Parkway", "Pky":"Parkway", "Dr":"Drive", "Ln":"Lane",
            "Ct":"Court", "Pl":"Place", "Cir":"Circle", "N":"North","E":"East","S":"South","W":"West"}

#Whole-name corrections; correct_streetname checks these before the per-word mapping
specific_streetname = {"Meadowmont Village CIrcle":"Meadowmont Village Circle",
                       "LaurelcherryStreet":"Laurel Cherry Street",
                      "Garrett Driver":"Garrett Drive"}

def correct_streetname(streetname):
    """Return a normalised street name.

    A name listed in ``specific_streetname`` is replaced wholesale.  Any other
    name is split on single spaces and every word that appears in
    ``mapping_street_names`` is swapped for its mapped value.
    """
    if streetname in specific_streetname:
        return specific_streetname[streetname]
    words = streetname.split(' ')
    return " ".join(mapping_street_names.get(word, word) for word in words)

def audit_street_type(street_names):
    """Group street names whose corrected ending is not an expected street type.

    Each name is corrected with ``correct_streetname`` and its ending is
    extracted with ``street_type_re``.  Endings missing from
    ``expected_street_names`` are collected.

    Returns:
        defaultdict mapping each unexpected ending to the set of original
        (uncorrected) street names that produced it.
    """
    unexpected = defaultdict(set)
    for original in street_names:
        found = street_type_re.search(correct_streetname(original))
        if found is None:
            continue
        ending = found.group()
        if ending in expected_street_names:
            continue
        unexpected[ending].add(original)
    return unexpected
            

# Consumes the street-name generator and prints each unexpected ending together
# with the original street names that produced it.
street_types = audit_street_type(imported_street_names)
for k,v in street_types.items():
    print(k,v)

Highway {'Wake Forest Highway', 'Apex Highway'}
Grove {'Newton Grove'}
Ext {'New Hope Commons Boulevard Ext'}
100 {'100'}
Extension {'Weaver Dairy Road Extension'}
Bypass {'US 15 501 Bypass'}
Suite {'N Duke St Suite'}
70 {'US 70'}
East {'US Highway 70 East'}
17 {'US Highway 17'}
Practice {'Triangle Family Practice'}
Hills {'The Circle at North Hills'}
West {'Highway 54 West', 'NC Highway 55 West', 'Highway 55 West', 'Highway West'}
PI {'Alexander Promenade PI'}
751 {'NC Highway 751'}
501 {'US 15;US 501'}
55 {'NC Highway 55', 'Highway 55', 'US 55'}
54 {'Highway 54', 'West NC Highway 54', 'West Highway 54', 'State Highway 54'}
1000 {'Six Forks Road #1000'}


In [7]:
##############
#Postal Codes
##############

# Generator over every 'addr:postcode' value on nodes and ways (single use).
imported_postcodes = audit(datafile,'addr:postcode')
# Exactly five digits.
postcode_re = re.compile(r'^[0-9]{5}$')
# Five digits, a hyphen, then four digits.
extended_postcode_re = re.compile(r'^[0-9]{5}-[0-9]{4}$')

def correct_postcode(postcode):
    """Normalise a postcode string to an int of its five leading digits.

    Returns:
        The int value for a plain ``#####`` code or for the first five digits
        of a ``#####-####`` code; ``None`` for any other input.
    """
    # Extended form: keep only the part before the "-####" suffix.
    if extended_postcode_re.match(postcode):
        return int(postcode[:5])
    # Plain five-digit form.
    if postcode_re.match(postcode):
        return int(postcode)
    return None

def audit_postcodes(postcodes):
    """Split postcodes into those ``correct_postcode`` can normalise and the rest.

    Args:
        postcodes: Iterable of raw postcode strings.

    Returns:
        Tuple ``(valid, invalid)``: ``valid`` is a Counter keyed by the
        corrected code, counting occurrences; ``invalid`` is the set of raw
        strings that could not be corrected.
    """
    valid = Counter()
    invalid = set()
    for postcode in postcodes:
        postcode_fix = correct_postcode(postcode)
        # Compare with None explicitly: a code such as "00000" corrects to the
        # int 0, which a plain truthiness test would misfile as invalid.
        if postcode_fix is not None:
            valid[postcode_fix] += 1
        else:
            invalid.add(postcode)

    return valid, invalid

# valid_postcode: Counter of corrected codes; invalid_postcode: set of raw strings.
valid_postcode, invalid_postcode = audit_postcodes(imported_postcodes)

# Total occurrences of valid codes, then the per-code counts, most frequent first.
print("{:,} total validpostal codes found".format(sum(valid_postcode.values())))
print(valid_postcode.most_common())

print('-'*30)

# invalid_postcode is a set, so this counts distinct invalid strings, not occurrences.
print("{} invalid postal codes were found".format(len(invalid_postcode)))
print(invalid_postcode)

6,564 total validpostal codes found
[(27612, 1683), (27609, 1120), (27519, 839), (27701, 659), (27705, 478), (27615, 344), (27510, 267), (27514, 165), (27511, 113), (27606, 98), (27513, 92), (27707, 89), (27601, 84), (27517, 72), (27560, 65), (27704, 54), (27516, 53), (27713, 49), (27703, 48), (27617, 33), (27613, 28), (27603, 20), (27604, 19), (27607, 19), (27610, 15), (27605, 11), (27614, 10), (27608, 9), (27695, 5), (27616, 4), (27162, 4), (27518, 4), (27602, 2), (27599, 2), (27895, 2), (27708, 2), (28616, 1), (27710, 1), (27502, 1)]
------------------------------
6 invalid postal codes were found
{'275198404', '275199', 'NC', '2612-6401', '277030', '275609194'}


# Correct and Convert to JSON

## Formatting Function for Node

In [8]:
def process_node(element):
    """Shape an OSM ``<node>`` element into a JSON-serialisable dict.

    The result always has ``type`` (``'node'``) and ``id``.  Depending on what
    the element carries it may also have:

    * ``pos`` -- ``[lat, lon]`` as floats, when both attributes are present;
    * ``created`` -- whichever of version/changeset/timestamp/user/uid exist;
    * any other attribute whose name contains no problematic character;
    * ``address`` -- ``addr:<x>`` tags stored under ``<x>``, with street and
      postcode values passed through ``correct_streetname`` /
      ``correct_postcode``;
    * ``gnis`` -- ``gnis:<x>`` tags stored under ``<x>``;
    * every other tag whose key consists of lower-case letters/underscores
      with at most one colon, stored under the key itself.

    Args:
        element: ElementTree element for a ``<node>``.

    Returns:
        dict describing the node.

    Raises:
        KeyError: if the element has no ``id`` attribute or a ``<tag>`` child
            lacks ``k`` or ``v``.
    """
    # The patterns are defined first because the attribute loop below already
    # needs ``problemchars``.
    problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
    address = re.compile(r'^addr:([a-z]|_)+$')  # 'addr', one colon, some sub-key
    gnis = re.compile(r'^gnis:([a-z]|_)+$')  # same structure as address
    lower = re.compile(r'^([a-z]|_)*$')
    lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
    CREATED = ["version", "changeset", "timestamp", "user", "uid"]

    node = {'type': 'node'}

    # Work on a copy of the attributes; entries are removed as they are
    # consumed so that whatever is left can be copied over generically.
    attributes = dict(element.attrib)
    # A list of tuples rather than a dict because keys may be duplicated.
    tags = [(t.attrib['k'], t.attrib['v']) for t in element.findall('tag')]

    # ID
    node['id'] = attributes.pop('id')

    # Position
    if 'lat' in attributes and 'lon' in attributes:
        node['pos'] = [float(attributes.pop('lat')), float(attributes.pop('lon'))]

    # Created: only the bookkeeping attributes that are actually present.
    created = {a: attributes.pop(a) for a in list(attributes) if a in CREATED}
    if created:
        node['created'] = created

    # Remaining attributes; search() rejects a problematic character anywhere
    # in the name, not just in first position.
    for name, value in attributes.items():
        if not problemchars.search(name):
            node[name] = value

    # Tags
    # NOTE(review): a tag whose key equals a structural field ('type', 'id',
    # 'pos', 'created', 'address', 'gnis') overwrites that field -- confirm
    # whether that is intended.
    for k, v in tags:
        if problemchars.search(k):
            continue
        elif address.match(k):
            subnode = k.split(':')[1]
            # Apply the audit corrections to the fields that have one.
            if subnode == 'street':
                v = correct_streetname(v)
            elif subnode == 'postcode':
                v = correct_postcode(v)
            if 'address' not in node:
                node['address'] = dict()
            node['address'][subnode] = v
        elif gnis.match(k):
            if 'gnis' not in node:
                node['gnis'] = dict()
            node['gnis'][k.split(':')[1]] = v
        elif lower.match(k) or lower_colon.match(k):
            node[k] = v

    return node

## Formatting function for Way

In [9]:
def process_way(element):
    """Shape an OSM ``<way>`` element into a JSON-serialisable dict.

    The result always has ``type`` (``'way'``) and ``id``.  Depending on what
    the element carries it may also have:

    * ``created`` -- whichever of version/changeset/timestamp/user/uid exist;
    * any other attribute whose name contains no problematic character;
    * ``node_refs`` -- the ``ref`` of every ``<nd>`` child, in order;
    * ``tiger`` -- ``tiger:<x>`` tags stored under ``<x>``;
    * every other tag whose key consists of lower-case letters/underscores
      with at most one colon, stored under the key itself.

    Args:
        element: ElementTree element for a ``<way>``.

    Returns:
        dict describing the way.

    Raises:
        KeyError: if the element has no ``id`` attribute, an ``<nd>`` child
            lacks ``ref``, or a ``<tag>`` child lacks ``k`` or ``v``.
    """
    # The patterns are defined first because the attribute loop below already
    # needs ``problemchars``.
    problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
    tiger = re.compile(r'^tiger:([a-z]|_)+$')  # 'tiger', one colon, some sub-key
    lower = re.compile(r'^([a-z]|_)*$')
    lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
    CREATED = ["version", "changeset", "timestamp", "user", "uid"]

    way = {'type': 'way'}

    # Work on a copy of the attributes; entries are removed as they are
    # consumed so that whatever is left can be copied over generically.
    attributes = dict(element.attrib)
    noderefs = [n.attrib['ref'] for n in element.findall('nd')]
    tags = [(t.attrib['k'], t.attrib['v']) for t in element.findall('tag')]

    # ID
    way['id'] = attributes.pop('id')

    # Created: only the bookkeeping attributes that are actually present.
    created = {a: attributes.pop(a) for a in list(attributes) if a in CREATED}
    if created:
        way['created'] = created

    # Remaining attributes; search() rejects a problematic character anywhere
    # in the name, not just in first position.
    for name, value in attributes.items():
        if not problemchars.search(name):
            way[name] = value

    # Node refs
    if noderefs:
        way['node_refs'] = noderefs

    # Tags
    for k, v in tags:
        if problemchars.search(k):
            continue
        elif tiger.match(k):
            if 'tiger' not in way:  # add the tiger dict if not present
                way['tiger'] = dict()
            way['tiger'][k.split(':')[1]] = v
        elif lower.match(k) or lower_colon.match(k):
            way[k] = v

    return way

## Iterate through XML file.

In [10]:
def shape_element(element):
    """Return the shaped dict for a node or way element, or None for any other tag."""
    if element.tag == "node":
        return process_node(element)
    if element.tag == 'way':
        return process_way(element)
    return None
        

def process_map(file_in):
    """Convert an OSM XML file into line-delimited JSON.

    Every element that ``shape_element`` turns into a document is written as
    one JSON object per line to ``<file_in>.json``.

    Args:
        file_in: Path of the OSM XML file; also the stem of the output name.
    """
    file_out = "{0}.json".format(file_in)
    with open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                fo.write(json.dumps(el)+"\n")
            # Release finished top-level elements so the parsed tree does not
            # grow with the file.  Their children (tag/nd/member) must not be
            # cleared on their own "end" event, because the parent is shaped
            # later, when its own "end" event arrives.
            if element.tag in ("node", "way", "relation"):
                element.clear()

In [11]:
# Writes one JSON document per node/way to "<datafile>.json".
process_map(datafile)

# Import into MongoDB

In [12]:
#Using mongoimport
!mongoimport -d osm -c raleigh --drop --file="../raleigh_north-carolina.osm.json"

2015-07-07T00:38:40.359-0400	connected to: localhost
2015-07-07T00:38:40.359-0400	dropping: osm.raleigh
2015-07-07T00:38:43.334-0400	[#.......................] osm.raleigh	25.0 MB/562.6 MB (4.4%)
2015-07-07T00:38:46.334-0400	[##......................] osm.raleigh	51.5 MB/562.6 MB (9.2%)
2015-07-07T00:38:49.334-0400	[###.....................] osm.raleigh	79.0 MB/562.6 MB (14.0%)
2015-07-07T00:38:52.334-0400	[####....................] osm.raleigh	107.4 MB/562.6 MB (19.1%)
2015-07-07T00:38:55.334-0400	[#####...................] osm.raleigh	135.4 MB/562.6 MB (24.1%)
2015-07-07T00:38:58.334-0400	[######..................] osm.raleigh	154.4 MB/562.6 MB (27.4%)
2015-07-07T00:39:01.334-0400	[#######.................] osm.raleigh	180.3 MB/562.6 MB (32.0%)
2015-07-07T00:39:04.334-0400	[########................] osm.raleigh	207.6 MB/562.6 MB (36.9%)
2015-07-07T00:39:07.334-0400	[#########...............] osm.raleigh	234.4 MB/562.6 MB (41.7%)
2015-07-07T00:39:10.334-0400	[###########.............]

# Auditing of the imported data

In [1]:
from pymongo import MongoClient
# No arguments: the driver's default host and port are used.
client = MongoClient()
# Database 'osm' and collection 'raleigh' -- the names passed to mongoimport's -d and -c options.
db = client.osm
collection = db.raleigh

In [2]:
#Number of documents
#count_documents({}) replaces Cursor.count(), which current PyMongo releases no longer provide
collection.count_documents({})

2735730

### Education amenity tags

In [3]:
#Ten most frequent names among documents tagged amenity=university
pipeline = [
    {'$match':{'amenity':'university'}},
    {'$group':{'_id':'$name', 'count':{'$sum':1}}},
    {'$sort':{'count':-1}},
    {'$limit':10}
]
#aggregate() returns a cursor in PyMongo 3.0+ (it no longer returns a dict
#with a 'result' list), so the result is iterated directly
documents = collection.aggregate(pipeline)
for r in documents:
    print(r)

{'_id': 'Duke University East Campus', 'count': 3}
{'_id': 'Duke University Central Campus', 'count': 1}
{'_id': 'North Carolina State University (Centennial Campus)', 'count': 1}
{'_id': None, 'count': 1}
{'_id': 'Duke University West Campus', 'count': 1}
{'_id': "St. Augustine's University", 'count': 1}
{'_id': 'Duke University Medical Center', 'count': 1}
{'_id': 'JC Raulston Arboretum at NC State University', 'count': 1}
{'_id': 'Campbell University: Norman Adrian Wiggins School of Law', 'count': 1}
{'_id': 'North Carolina Central University', 'count': 1}


In [4]:
#Ten most frequent names among documents tagged amenity=college
pipeline = [
    {'$match':{'amenity':'college'}},
    {'$group':{'_id':'$name', 'count':{'$sum':1}}},
    {'$sort':{'count':-1}},
    {'$limit':10}
]
#aggregate() returns a cursor in PyMongo 3.0+ (it no longer returns a dict
#with a 'result' list), so the result is iterated directly
documents = collection.aggregate(pipeline)
for r in documents:
    print(r)

{'_id': None, 'count': 28}
{'_id': 'White Building', 'count': 1}
{'_id': 'Collins Building', 'count': 1}
{'_id': 'Durham Tech Community College', 'count': 1}
{'_id': 'Wake Technical Community College: Perry Health Sciences Campus', 'count': 1}
{'_id': 'Meredith College', 'count': 1}
{'_id': 'AKG Guitar Lessons', 'count': 1}


In [5]:
#Ten most frequent names among documents tagged amenity=school
pipeline = [
    {'$match':{'amenity':'school'}},
    {'$group':{'_id':'$name', 'count':{'$sum':1}}},
    {'$sort':{'count':-1}},
    {'$limit':10}
]
#aggregate() returns a cursor in PyMongo 3.0+ (it no longer returns a dict
#with a 'result' list), so the result is iterated directly
documents = collection.aggregate(pipeline)
for r in documents:
    print(r)

{'_id': None, 'count': 21}
{'_id': 'Carrboro Elementary School', 'count': 2}
{'_id': "Lowe's Grove Middle School", 'count': 2}
{'_id': 'Durham Technical Community College', 'count': 2}
{'_id': 'Durham Academy', 'count': 2}
{'_id': 'Eastway Elementary School', 'count': 2}
{'_id': 'The Goddard School', 'count': 2}
{'_id': 'Ravenscroft School', 'count': 2}
{'_id': 'Panther Creek High School', 'count': 2}
{'_id': 'C C Spaulding Elementary School', 'count': 2}


### Multiple points on the same position

In [21]:
#The three positions shared by the most documents
pipeline = [
    #Keep only documents that carry a position, instead of relying on the
    #no-position group sorting first and skipping it
    {'$match':{'pos':{'$exists':True}}},
    {'$group':{'_id':'$pos', 'count':{'$sum':1}}},
    {'$sort':{'count':-1}},
    {'$limit':3}
]
#aggregate() returns a cursor in PyMongo 3.0+, so the result is iterated directly
documents = collection.aggregate(pipeline, allowDiskUse=True)

for r in documents:
    pos = r['_id']
    print("-"*30,"\n",pos,"\n","-"*30,"\n")
    #A plain equality filter needs no aggregation pipeline
    for o in collection.find({'pos':pos}):
        print(o)

------------------------------ 
 [35.8950081, -79.0633303] 
 ------------------------------ 

{'id': '373215082', 'type': 'node', 'created': {'uid': '38487', 'user': 'jumbanho', 'version': '1', 'changeset': '444838', 'timestamp': '2009-04-12T14:28:50Z'}, '_id': ObjectId('559b57d50d0edd8d31948483'), 'pos': [35.8950081, -79.0633303]}
{'id': '373216876', 'type': 'node', 'created': {'uid': '38487', 'user': 'jumbanho', 'version': '1', 'changeset': '444838', 'timestamp': '2009-04-12T14:31:27Z'}, '_id': ObjectId('559b57d50d0edd8d31948506'), 'pos': [35.8950081, -79.0633303]}
{'id': '373217228', 'type': 'node', 'created': {'uid': '38487', 'user': 'jumbanho', 'version': '1', 'changeset': '444838', 'timestamp': '2009-04-12T14:31:56Z'}, '_id': ObjectId('559b57d50d0edd8d31948546'), 'pos': [35.8950081, -79.0633303]}
{'id': '373220087', 'type': 'node', 'created': {'uid': '38487', 'user': 'jumbanho', 'version': '1', 'changeset': '444838', 'timestamp': '2009-04-12T14:36:37Z'}, '_id': ObjectId('559b57d5